summation.cxx 74.3 KB
Newer Older
1
#include "summation.h"
2 3 4 5 6 7 8
#include "../scaling/strp_tsr.h"
#include "../mapping/mapping.h"
#include "../mapping/distribution.h"
#include "../tensor/untyped_tensor.h"
#include "../shared/util.h"
#include "../shared/memcontrol.h"
#include "sym_seq_sum.h"
9 10
#include "../symmetry/sym_indices.h"
#include "../symmetry/symmetrization.h"
11
#include "../redistribution/nosym_transp.h"
12
#include "../redistribution/redist.h"
solomon's avatar
solomon committed
13
#include "../scaling/scaling.h"
14 15 16

namespace CTF_int {

17 18
  using namespace CTF;

19
  summation::~summation(){
solomon's avatar
solomon committed
20 21
    if (idx_A != NULL) cdealloc(idx_A);
    if (idx_B != NULL) cdealloc(idx_B);
22 23
  }

solomon's avatar
solomon committed
24
  summation::summation(summation const & other){
25
    A     = other.A;
26
    idx_A = (int*)alloc(sizeof(int)*other.A->order);
27
    memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
28
    B     = other.B;
29
    idx_B = (int*)alloc(sizeof(int)*other.B->order);
30
    memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
solomon's avatar
solomon committed
31
    if (other.is_custom){
32
      func      = other.func;
solomon's avatar
solomon committed
33
      is_custom = 1;
34 35 36
    } else is_custom = 0; 
    alpha = other.alpha;
    beta  = other.beta;
solomon's avatar
solomon committed
37 38
  }

39 40 41 42 43 44 45 46 47
  summation::summation(tensor *     A_,
                       int const *  idx_A_,
                       char const * alpha_,
                       tensor *     B_,
                       int const *  idx_B_,
                       char const * beta_){
    A         = A_;
    alpha     = alpha_;
    B         = B_;
48 49 50
    beta      = beta_;
    is_custom = 0;

51 52
    idx_A     = (int*)alloc(sizeof(int)*A->order);
    idx_B     = (int*)alloc(sizeof(int)*B->order);
53 54

    memcpy(idx_A, idx_A_, sizeof(int)*A->order);
55
    memcpy(idx_B, idx_B_, sizeof(int)*B->order);
56 57 58 59 60 61 62 63 64 65 66
  }

  summation::summation(tensor *     A_,
                       char const * cidx_A,
                       char const * alpha_,
                       tensor *     B_,
                       char const * cidx_B,
                       char const * beta_){
    A         = A_;
    alpha     = alpha_;
    B         = B_;
67
    beta      = beta_;
68
    is_custom = 0;
69 70
    
    conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
71
  }
72

73
 
74 75 76 77 78 79 80
  summation::summation(tensor *                A_,
                       int const *             idx_A_,
                       char const *            alpha_,
                       tensor *                B_,
                       int const *             idx_B_,
                       char const *            beta_,
                       univar_function const * func_){
81
    A         = A_;
82
    alpha     = alpha_;
83
    B         = B_;
84 85 86 87
    beta      = beta_;
    func      = func_;
    is_custom = 1;

88 89
    idx_A     = (int*)alloc(sizeof(int)*A->order);
    idx_B     = (int*)alloc(sizeof(int)*B->order);
90 91

    memcpy(idx_A, idx_A_, sizeof(int)*A->order);
92
    memcpy(idx_B, idx_B_, sizeof(int)*B->order);
93 94 95
  }

 
96 97 98 99 100 101 102
  summation::summation(tensor *                A_,
                       char const *            cidx_A,
                       char const *            alpha_,
                       tensor *                B_,
                       char const *            cidx_B,
                       char const *            beta_,
                       univar_function const * func_){
103 104 105
    A         = A_;
    alpha     = alpha_;
    B         = B_;
106
    beta      = beta_;
107
    func      = func_;
108
    is_custom = 1;
109 110

    conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
111 112
  }

113
  void summation::execute(bool run_diag){
114 115 116 117
#if DEBUG >= 2
    if (A->wrld->cdt.rank == 0) printf("Summation::execute (head):\n");
    print();
#endif
118
    if (A->is_sparse || B->is_sparse){
119 120
      int stat = sym_sum_tsr(run_diag);
      assert(stat == SUCCESS); 
121 122 123 124
    } else {
      int stat = home_sum_tsr(run_diag);
      assert(stat == SUCCESS); 
    }
125 126 127 128 129 130 131
  }
  
  double summation::estimate_time(){
    assert(0); //FIXME
    return 0.0;
  }

132 133
  void summation::get_fold_indices(int *  num_fold,
                                   int ** fold_idx){
134 135 136
    int i, in, num_tot, nfold, broken;
    int iA, iB, inA, inB, iiA, iiB;
    int * idx_arr, * idx;
137 138
    inv_idx(A->order, idx_A,
            B->order, idx_B,
139
            &num_tot, &idx_arr);
140
    CTF_int::alloc_ptr(num_tot*sizeof(int), (void**)&idx);
141 142 143 144 145

    for (i=0; i<num_tot; i++){
      idx[i] = 1;
    }
    
146
    for (iA=0; iA<A->order; iA++){
147 148
      i      = idx_A[iA];
      iB     = idx_arr[2*i+1];
149
      broken = 0;
150
      inA    = iA;
151
      do {
152
        in = idx_A[inA];
153 154 155
        inB = idx_arr[2*in+1];
        if (((inA>=0) + (inB>=0) != 2) ||
            (iB != -1 && inB - iB != in-i) ||
156
            (iB != -1 && A->sym[inA] != B->sym[inB])){
157
          broken = 1;
158
      //    printf("index in = %d inA = %d inB = %d is broken symA = %d symB = %d\n",in, inA, inB, A->sym[inA], B->sym[inB]);
159 160
        }
        inA++;
161
      } while (A->sym[inA-1] != NS);
162 163
      if (broken){
        for (iiA=iA;iiA<inA;iiA++){
164
          idx[idx_A[iiA]] = 0;
165 166 167 168
        }
      }
    }

169
    for (iB=0; iB<B->order; iB++){
170 171
      i      = idx_B[iB];
      iA     = idx_arr[2*i+0];
172
      broken = 0;
173
      inB    = iB;
174
      do {
175
        in = idx_B[inB];
176 177 178
        inA = idx_arr[2*in+0];
        if (((inB>=0) + (inA>=0) != 2) ||
            (iA != -1 && inA - iA != in-i) ||
179
            (iA != -1 && B->sym[inB] != A->sym[inA])){
180 181 182
          broken = 1;
        }
        inB++;
183
      } while (B->sym[inB-1] != NS);
184 185
      if (broken){
        for (iiB=iB;iiB<inB;iiB++){
186
          idx[idx_B[iiB]] = 0;
187 188 189 190 191 192 193 194 195 196 197 198 199 200
        }
      }
    }
    

    nfold = 0;
    for (i=0; i<num_tot; i++){
      if (idx[i] == 1){
        idx[nfold] = i;
        nfold++;
      }
    }
    *num_fold = nfold;
    *fold_idx = idx;
solomon's avatar
solomon committed
201
    CTF_int::cdealloc(idx_arr);
202 203 204 205
  }

  int summation::can_fold(){
    int i, j, nfold, * fold_idx;
206
    //FIXME: fold sparse tensors into CSR form
207
    if (A->is_sparse || B->is_sparse) return 0;
208

209 210 211
    for (i=0; i<A->order; i++){
      for (j=i+1; j<A->order; j++){
        if (idx_A[i] == idx_A[j]) return 0;
212 213
      }
    }
214 215 216
    for (i=0; i<B->order; i++){
      for (j=i+1; j<B->order; j++){
        if (idx_B[i] == idx_B[j]) return 0;
217 218
      }
    }
219
    get_fold_indices(&nfold, &fold_idx);
solomon's avatar
solomon committed
220
    CTF_int::cdealloc(fold_idx);
221 222 223
    /* FIXME: 1 folded index is good enough for now, in the future model */
    return nfold > 0;
  }
224 225 226 227 228 229 230
 
  void summation::get_fold_sum(summation *& fold_sum,
                               int &        all_fdim_A,
                               int &        all_fdim_B,
                               int *&       all_flen_A,
                               int *&       all_flen_B){
    int i, j, nfold, nf;
231 232 233 234 235
    int * fold_idx, * fidx_map_A, * fidx_map_B;
    tensor * ftsr_A, * ftsr_B;

    get_fold_indices(&nfold, &fold_idx);
    if (nfold == 0){
solomon's avatar
solomon committed
236
      CTF_int::cdealloc(fold_idx);
237
      assert(0);
238 239 240
    }

    /* overestimate this space to not bother with it later */
241 242
    CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_map_A);
    CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_map_B);
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271


    A->fold(nfold, fold_idx, idx_A, 
            &all_fdim_A, &all_flen_A);
    B->fold(nfold, fold_idx, idx_B, 
            &all_fdim_B, &all_flen_B);

    nf = 0;
    for (i=0; i<A->order; i++){
      for (j=0; j<nfold; j++){
        if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
          fidx_map_A[nf] = j;
          nf++;
        }
      }
    }
    nf = 0;
    for (i=0; i<B->order; i++){
      for (j=0; j<nfold; j++){
        if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
          fidx_map_B[nf] = j;
          nf++;
        }
      }
    }

    ftsr_A = A->rec_tsr;
    ftsr_B = B->rec_tsr;

272
    int * sidx_A, * sidx_B;
273
    CTF_int::conv_idx<int>(ftsr_A->order, fidx_map_A, &sidx_A,
Edgar Solomonik's avatar
Edgar Solomonik committed
274 275 276 277 278
                           ftsr_B->order, fidx_map_B, &sidx_B);
    
    cdealloc(fidx_map_A);
    cdealloc(fidx_map_B);
    cdealloc(fold_idx);
279

280
    fold_sum = new summation(A->rec_tsr, sidx_A, alpha, B->rec_tsr, sidx_B, beta);
solomon's avatar
solomon committed
281 282
    cdealloc(sidx_A);
    cdealloc(sidx_B);
283 284 285 286 287 288 289 290 291 292 293
  }

  int summation::map_fold(){
    int i, all_fdim_A, all_fdim_B;
    int nvirt_A, nvirt_B;
    int * fnew_ord_A, * fnew_ord_B;
    int * all_flen_A, * all_flen_B;
    int inr_stride;

    summation * fold_sum;
    get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
294
  #if DEBUG>=2
295
    if (A->wrld->rank == 0){
296 297
      printf("Folded summation type:\n");
    }
298
    fold_sum->print();//print_sum(&fold_type,0.0,0.0);
299 300 301
  #endif
   
    //for type order 1 to 3 
302 303 304
    fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B); 
    permute_target(fold_sum->A->order, fnew_ord_A, A->inner_ordering);
    permute_target(fold_sum->B->order, fnew_ord_B, B->inner_ordering);
305 306
    

solomon's avatar
solomon committed
307
    nvirt_A = A->calc_nvirt();
308
    for (i=0; i<nvirt_A; i++){
solomon's avatar
solomon committed
309
      nosym_transpose(all_fdim_A, A->inner_ordering, all_flen_A, 
310
                      A->data + A->sr->el_size*i*(A->size/nvirt_A), 1, A->sr);
311
    }
solomon's avatar
solomon committed
312
    nvirt_B = B->calc_nvirt();
313
    for (i=0; i<nvirt_B; i++){
solomon's avatar
solomon committed
314
      nosym_transpose(all_fdim_B, B->inner_ordering, all_flen_B, 
315
                      B->data + B->sr->el_size*i*(B->size/nvirt_B), 1, B->sr);
316 317 318
    }

    inr_stride = 1;
319 320
    for (i=0; i<fold_sum->A->order; i++){
      inr_stride *= fold_sum->A->pad_edge_len[i];
321 322
    }

solomon's avatar
solomon committed
323 324 325 326
    CTF_int::cdealloc(fnew_ord_A);
    CTF_int::cdealloc(fnew_ord_B);
    CTF_int::cdealloc(all_flen_A);
    CTF_int::cdealloc(all_flen_B);
327

Edgar Solomonik's avatar
Edgar Solomonik committed
328 329
    delete fold_sum;

solomon's avatar
solomon committed
330
    return inr_stride; 
331 332
  }

333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
  double summation::est_time_fold(){
    int all_fdim_A, all_fdim_B;
    int * fnew_ord_A, * fnew_ord_B;
    int * all_flen_A, * all_flen_B;
    int * tAiord, * tBiord;

    summation * fold_sum;
    get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
    fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B); 
    CTF_int::alloc_ptr(all_fdim_A*sizeof(int), (void**)&tAiord);
    CTF_int::alloc_ptr(all_fdim_B*sizeof(int), (void**)&tBiord);
    memcpy(tAiord, A->inner_ordering, all_fdim_A*sizeof(int));
    memcpy(tBiord, B->inner_ordering, all_fdim_B*sizeof(int));

    permute_target(fold_sum->A->order, fnew_ord_A, tAiord);
    permute_target(fold_sum->B->order, fnew_ord_B, tBiord);
  
    A->is_folded = 0; 
    delete A->rec_tsr; 
    cdealloc(A->inner_ordering); 
    B->is_folded = 0; 
    delete B->rec_tsr; 
    cdealloc(B->inner_ordering); 

    double esttime = 0.0;

    esttime += A->calc_nvirt()*est_time_transp(all_fdim_A, tAiord, all_flen_A, 1, A->sr);
    esttime += 2.*B->calc_nvirt()*est_time_transp(all_fdim_B, tBiord, all_flen_B, 1, B->sr);
Edgar Solomonik's avatar
Edgar Solomonik committed
361 362 363 364 365 366 367 368 369

    delete fold_sum;

    cdealloc(all_flen_A);
    cdealloc(all_flen_B);
    cdealloc(tAiord);
    cdealloc(tBiord);
    cdealloc(fnew_ord_A);
    cdealloc(fnew_ord_B);
370 371 372 373 374
    return esttime;
  }


 
375 376
  void summation::get_len_ordering(int ** new_ordering_A,
                                   int ** new_ordering_B){
377
    int num_tot;
378 379
    int * ordering_A, * ordering_B, * idx_arr;
    
380 381
    CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&ordering_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&ordering_B);
382

383 384
    inv_idx(A->order, idx_A,
            B->order, idx_B,
385
            &num_tot, &idx_arr);
386 387 388
    ASSERT(num_tot == A->order);
    ASSERT(num_tot == B->order);
    /*for (i=0; i<num_tot; i++){
389 390
      ordering_A[i] = idx_arr[2*i];
      ordering_B[i] = idx_arr[2*i+1];
391 392 393 394 395 396 397
    }*/
    for (int i=0; i<num_tot; i++){
      ordering_B[i] = i;
      for (int j=0; j<num_tot; j++){
        if (idx_A[j] == idx_B[i])
          ordering_A[i] = j;
      }
398
    }
solomon's avatar
solomon committed
399
    CTF_int::cdealloc(idx_arr);
400 401 402 403
    *new_ordering_A = ordering_A;
    *new_ordering_B = ordering_B;
  }

404 405 406

  tspsum * summation::construct_sparse_sum(int const * phys_mapped){
    int nvirt, i, iA, iB, order_tot, is_top, need_rep;
407 408
    int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
    int nphys_dim;
409 410
    int * virt_dim;
    int * idx_arr;
411 412 413
    int * virt_blk_len_A, * virt_blk_len_B;
    int * blk_len_A, * blk_len_B;
    mapping * map;
414
    tspsum * htsum = NULL , ** rec_tsum = NULL;
415 416

    is_top = 1;
solomon's avatar
solomon committed
417 418
    inv_idx(A->order, idx_A,
            B->order, idx_B,
419 420
            &order_tot, &idx_arr);

solomon's avatar
solomon committed
421
    nphys_dim = A->topo->order;
422

423
    CTF_int::alloc_ptr(sizeof(int)*order_tot,   (void**)&virt_dim);
424 425 426 427
    CTF_int::alloc_ptr(sizeof(int)*A->order,    (void**)&blk_len_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order,    (void**)&blk_len_B);
    CTF_int::alloc_ptr(sizeof(int)*A->order,    (void**)&virt_blk_len_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order,    (void**)&virt_blk_len_B);
428 429

    /* Determine the block dimensions of each local subtensor */
solomon's avatar
solomon committed
430 431 432
    blk_sz_A = A->size;
    blk_sz_B = B->size;
    calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
433
             &vrt_sz_A, virt_blk_len_A, blk_len_A);
solomon's avatar
solomon committed
434
    calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
435 436 437 438 439 440 441
             &vrt_sz_B, virt_blk_len_B, blk_len_B);

    nvirt = 1;
    for (i=0; i<order_tot; i++){
      iA = idx_arr[2*i];
      iB = idx_arr[2*i+1];
      if (iA != -1){
solomon's avatar
solomon committed
442
        map = &A->edge_map[iA];
443 444 445 446 447 448 449
        while (map->has_child) map = map->child;
        if (map->type == VIRTUAL_MAP){
          virt_dim[i] = map->np;
        }
        else virt_dim[i] = 1;
      } else {
        ASSERT(iB!=-1);
solomon's avatar
solomon committed
450
        map = &B->edge_map[iB];
451 452 453 454 455 456 457 458 459
        while (map->has_child) map = map->child;
        if (map->type == VIRTUAL_MAP){
          virt_dim[i] = map->np;
        }
        else virt_dim[i] = 1;
      }
      nvirt *= virt_dim[i];
    }

solomon's avatar
solomon committed
460 461


462
    if (A->is_sparse){
463 464 465 466 467 468 469 470 471 472 473 474
      if (A->wrld->np > 1){
        tspsum_pin_keys * sksum = new tspsum_pin_keys(this, 1);
        if (is_top){
          htsum = sksum;
          is_top = 0;
        } else {
          *rec_tsum = sksum;
        }
        rec_tsum = &sksum->rec_tsum;
      }

      tspsum_permute * pmsum = new tspsum_permute(this, 1, virt_blk_len_A);
475
      if (is_top){
476
        htsum = pmsum;
477 478
        is_top = 0;
      } else {
479
        *rec_tsum = pmsum;
480
      }
481 482 483
      rec_tsum = &pmsum->rec_tsum;
    }
    if (B->is_sparse){
484 485 486 487 488 489 490 491 492 493 494 495
      if (B->wrld->np > 1){
        tspsum_pin_keys * sksum = new tspsum_pin_keys(this, 0);
        if (is_top){
          htsum = sksum;
          is_top = 0;
        } else {
          *rec_tsum = sksum;
        }
        rec_tsum = &sksum->rec_tsum;
      }

      tspsum_permute * pmsum = new tspsum_permute(this, 0, virt_blk_len_B);
496
      if (is_top){
497
        htsum = pmsum;
498 499
        is_top = 0;
      } else {
500
        *rec_tsum = pmsum;
solomon's avatar
solomon committed
501
      }
502
      rec_tsum = &pmsum->rec_tsum;
503 504
    }

solomon's avatar
solomon committed
505
/*    bool need_sp_map = false;
506 507
    if (A->is_sparse || B->is_sparse){
      for (int i=0; i<B->order; i++){
508
        bool found_match = false;
509 510
        for (int j=0; j<A->order; j++){
          if (idx_B[i] == idx_A[j]) found_match = true;
511 512
        }
        if (!found_match) need_sp_map = true;
513 514 515
      }
    }

516
    if (need_sp_map){
517
      tspsum_map * smsum = new tspsum_map(this);
518 519 520 521 522 523 524
      if (is_top){
        htsum = smsum;
        is_top = 0;
      } else {
        *rec_tsum = smsum;
      }
      rec_tsum = &smsum->rec_tsum;
solomon's avatar
solomon committed
525
    }*/
526 527


528 529 530 531 532 533 534 535
    need_rep = 0;
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 ||
          phys_mapped[2*i+1] == 0){
        need_rep = 1;
        break;
      }
    }
536

537
    if (need_rep){
538 539 540 541 542
/*      if (A->wrld->cdt.rank == 0)
        DPRINTF(1,"Replicating tensor\n");*/

      tspsum_replicate * rtsum = new tspsum_replicate(this, phys_mapped, blk_sz_A, blk_sz_B);

543 544 545 546 547 548
      if (is_top){
        htsum = rtsum;
        is_top = 0;
      } else {
        *rec_tsum = rtsum;
      }
549
      rec_tsum      = &rtsum->rec_tsum;
550 551
    }

552 553 554 555 556 557 558 559 560 561 562 563 564 565 566
    /* Multiply over virtual sub-blocks */
    tspsum_virt * tsumv = new tspsum_virt(this);
    if (is_top) {
      htsum = tsumv;
      is_top = 0;
    } else {
      *rec_tsum = tsumv;
    }
    rec_tsum         = &tsumv->rec_tsum;

    tsumv->num_dim   = order_tot;
    tsumv->virt_dim  = virt_dim;
    tsumv->blk_sz_A  = vrt_sz_A;
    tsumv->blk_sz_B  = vrt_sz_B;

567
    int * new_sym_A, * new_sym_B;
568
    CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&new_sym_A);
solomon's avatar
solomon committed
569
    memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
570
    CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&new_sym_B);
solomon's avatar
solomon committed
571
    memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
572

573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702
    seq_tsr_spsum * tsumseq = new seq_tsr_spsum(this);
    tsumseq->is_inner = 0;
    tsumseq->edge_len_A  = virt_blk_len_A;
    tsumseq->sym_A       = new_sym_A;
    tsumseq->edge_len_B  = virt_blk_len_B;
    tsumseq->sym_B       = new_sym_B;
    tsumseq->is_custom   = is_custom;
    if (is_custom){
      tsumseq->is_inner  = 0;
      tsumseq->func      = func;
    } else tsumseq->func = NULL;

    if (is_top) {
      htsum = tsumseq;
      is_top = 0;
    } else {
      *rec_tsum = tsumseq;
    }

    CTF_int::cdealloc(idx_arr);
    CTF_int::cdealloc(blk_len_A);
    CTF_int::cdealloc(blk_len_B);
    return htsum;
  }

  tsum * summation::construct_dense_sum(int         inner_stride,
                                        int const * phys_mapped){
    int i, iA, iB, order_tot, is_top, sA, sB, need_rep, i_A, i_B, j, k;
    int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
    int nphys_dim, nvirt;
    int * idx_arr, * virt_dim;
    int * virt_blk_len_A, * virt_blk_len_B;
    int * blk_len_A, * blk_len_B;
    tsum * htsum = NULL , ** rec_tsum = NULL;
    mapping * map;
    strp_tsr * str_A, * str_B;

    is_top = 1;
    inv_idx(A->order, idx_A,
            B->order, idx_B,
            &order_tot, &idx_arr);

    nphys_dim = A->topo->order;

    CTF_int::alloc_ptr(sizeof(int)*order_tot,   (void**)&virt_dim);
    CTF_int::alloc_ptr(sizeof(int)*A->order,    (void**)&blk_len_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order,    (void**)&blk_len_B);
    CTF_int::alloc_ptr(sizeof(int)*A->order,    (void**)&virt_blk_len_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order,    (void**)&virt_blk_len_B);

    /* Determine the block dimensions of each local subtensor */
    blk_sz_A = A->size;
    blk_sz_B = B->size;
    calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
             &vrt_sz_A, virt_blk_len_A, blk_len_A);
    calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
             &vrt_sz_B, virt_blk_len_B, blk_len_B);
    /* Strip out the relevant part of the tensor if we are contracting over diagonal */
    sA = strip_diag(A->order, order_tot, idx_A, vrt_sz_A,
                           A->edge_map, A->topo, A->sr,
                           blk_len_A, &blk_sz_A, &str_A);
    sB = strip_diag(B->order, order_tot, idx_B, vrt_sz_B,
                           B->edge_map, B->topo, B->sr,
                           blk_len_B, &blk_sz_B, &str_B);
    if (sA || sB){
      if (A->wrld->cdt.rank == 0)
        DPRINTF(1,"Stripping tensor\n");
      strp_sum * ssum = new strp_sum(this);
      ssum->sr_A = A->sr;
      ssum->sr_B = B->sr;
      htsum = ssum;
      is_top = 0;
      rec_tsum = &ssum->rec_tsum;

      ssum->rec_strp_A = str_A;
      ssum->rec_strp_B = str_B;
      ssum->strip_A = sA;
      ssum->strip_B = sB;
    }


    nvirt = 1;
    for (i=0; i<order_tot; i++){
      iA = idx_arr[2*i];
      iB = idx_arr[2*i+1];
      if (iA != -1){
        map = &A->edge_map[iA];
        while (map->has_child) map = map->child;
        if (map->type == VIRTUAL_MAP){
          virt_dim[i] = map->np;
          if (sA) virt_dim[i] = virt_dim[i]/str_A->strip_dim[iA];
        }
        else virt_dim[i] = 1;
      } else {
        ASSERT(iB!=-1);
        map = &B->edge_map[iB];
        while (map->has_child) map = map->child;
        if (map->type == VIRTUAL_MAP){
          virt_dim[i] = map->np;
          if (sB) virt_dim[i] = virt_dim[i]/str_B->strip_dim[iA];
        }
        else virt_dim[i] = 1;
      }
      nvirt *= virt_dim[i];
    }

    need_rep = 0;
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 ||
          phys_mapped[2*i+1] == 0){
        need_rep = 1;
        break;
      }
    }

    if (need_rep){
/*      if (A->wrld->cdt.rank == 0)
        DPRINTF(1,"Replicating tensor\n");*/

      tsum_replicate * rtsum = new tsum_replicate(this, phys_mapped, blk_sz_A, blk_sz_B);

      if (is_top){
        htsum = rtsum;
        is_top = 0;
      } else {
        *rec_tsum = rtsum;
      }
      rec_tsum      = &rtsum->rec_tsum;
    }

703
    /* Multiply over virtual sub-blocks */
704
    if (nvirt > 1){
705
      tsum_virt * tsumv = new tsum_virt(this);
706 707 708 709 710 711
      if (is_top) {
        htsum = tsumv;
        is_top = 0;
      } else {
        *rec_tsum = tsumv;
      }
712 713 714 715 716 717
      rec_tsum         = &tsumv->rec_tsum;

      tsumv->num_dim   = order_tot;
      tsumv->virt_dim  = virt_dim;
      tsumv->blk_sz_A  = vrt_sz_A;
      tsumv->blk_sz_B  = vrt_sz_B;
solomon's avatar
solomon committed
718
    } else CTF_int::cdealloc(virt_dim);
719 720 721 722 723
    int * new_sym_A, * new_sym_B;
    CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&new_sym_A);
    memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
    CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&new_sym_B);
    memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
724

725
    seq_tsr_sum * tsumseq = new seq_tsr_sum(this);
726 727 728 729 730
    if (inner_stride == -1){
      tsumseq->is_inner = 0;
    } else {
      tsumseq->is_inner = 1;
      tsumseq->inr_stride = inner_stride;
solomon's avatar
solomon committed
731 732
      tensor * itsr;
      itsr = A->rec_tsr;
733
      i_A = 0;
solomon's avatar
solomon committed
734 735
      for (i=0; i<A->order; i++){
        if (A->sym[i] == NS){
736
          for (j=0; j<itsr->order; j++){
solomon's avatar
solomon committed
737
            if (A->inner_ordering[j] == i_A){
738 739 740
              j=i;
              do {
                j--;
solomon's avatar
solomon committed
741
              } while (j>=0 && A->sym[j] != NS);
742 743 744 745 746 747 748 749 750 751
              for (k=j+1; k<=i; k++){
                virt_blk_len_A[k] = 1;
                new_sym_A[k] = NS;
              }
              break;
            }
          }
          i_A++;
        }
      }
solomon's avatar
solomon committed
752
      itsr = B->rec_tsr;
753
      i_B = 0;
solomon's avatar
solomon committed
754 755
      for (i=0; i<B->order; i++){
        if (B->sym[i] == NS){
756
          for (j=0; j<itsr->order; j++){
solomon's avatar
solomon committed
757
            if (B->inner_ordering[j] == i_B){
758 759 760
              j=i;
              do {
                j--;
solomon's avatar
solomon committed
761
              } while (j>=0 && B->sym[j] != NS);
762 763 764 765 766 767 768 769 770 771 772
              for (k=j+1; k<=i; k++){
                virt_blk_len_B[k] = 1;
                new_sym_B[k] = NS;
              }
              break;
            }
          }
          i_B++;
        }
      }
    }
773 774 775 776 777
    tsumseq->edge_len_A  = virt_blk_len_A;
    tsumseq->sym_A       = new_sym_A;
    tsumseq->edge_len_B  = virt_blk_len_B;
    tsumseq->sym_B       = new_sym_B;
    tsumseq->is_custom   = is_custom;
solomon's avatar
solomon committed
778
    if (is_custom){
779 780 781
      tsumseq->is_inner  = 0;
      tsumseq->func      = func;
    } else tsumseq->func = NULL;
782 783 784 785 786 787
    if (is_top) {
      htsum = tsumseq;
      is_top = 0;
    } else {
      *rec_tsum = tsumseq;
    }
788

solomon's avatar
solomon committed
789 790 791
    CTF_int::cdealloc(idx_arr);
    CTF_int::cdealloc(blk_len_A);
    CTF_int::cdealloc(blk_len_B);
792 793 794 795 796 797
    return htsum;

  }


  tsum * summation::construct_sum(int inner_stride){
Edgar Solomonik's avatar
Edgar Solomonik committed
798
    int i;
799
    int nphys_dim;
Edgar Solomonik's avatar
Edgar Solomonik committed
800 801
    int * phys_mapped;
    tsum * htsum;
802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840
    mapping * map;

    nphys_dim = A->topo->order;

    CTF_int::alloc_ptr(sizeof(int)*nphys_dim*2, (void**)&phys_mapped);
    memset(phys_mapped, 0, sizeof(int)*nphys_dim*2);



    for (i=0; i<A->order; i++){
      map = &A->edge_map[i];
      if (map->type == PHYSICAL_MAP){
        phys_mapped[2*map->cdt+0] = 1;
      }
      while (map->has_child) {
        map = map->child;
        if (map->type == PHYSICAL_MAP){
          phys_mapped[2*map->cdt+0] = 1;
        }
      }
    }
    for (i=0; i<B->order; i++){
      map = &B->edge_map[i];
      if (map->type == PHYSICAL_MAP){
        phys_mapped[2*map->cdt+1] = 1;
      }
      while (map->has_child) {
        map = map->child;
        if (map->type == PHYSICAL_MAP){
          phys_mapped[2*map->cdt+1] = 1;
        }
      }
    }
    if (A->is_sparse || B->is_sparse){
      htsum = construct_sparse_sum(phys_mapped);
    } else {
      htsum = construct_dense_sum(inner_stride, phys_mapped);
    }

solomon's avatar
solomon committed
841
    CTF_int::cdealloc(phys_mapped);
842 843 844 845 846 847

    return htsum;
  }

  int summation::home_sum_tsr(bool run_diag){
    int ret, was_home_A, was_home_B;
solomon's avatar
solomon committed
848
    tensor * tnsr_A, * tnsr_B;
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873
    // code below turns summations into scaling, but never seems to be invoked in AQ or test_suite, so commenting it out for now
/*    if (A==B && !is_custom){
      bool is_scal = true;
      for (int i=0; i<A->order; i++){
        if (idx_A[i] != idx_B[i]) is_scal = false;
      }
      if (is_scal){
        if (alpha == NULL && beta == NULL){
          scaling scl = scaling(A, idx_A, NULL);
          scl.execute();
        } else {
          char nalpha[A->sr->el_size];
          if (alpha == NULL) A->sr->copy(nalpha, A->sr->mulid());
          else A->sr->copy(nalpha, alpha);

          if (beta == NULL) A->sr->add(nalpha, A->sr->mulid(), nalpha);
          else A->sr->add(nalpha, beta, nalpha);

          scaling scl = scaling(A, idx_A, nalpha);
          scl.execute();
        }
        return SUCCESS;
      }
    }*/

solomon's avatar
solomon committed
874 875
    summation osum = summation(*this);
   
876
    CTF_int::contract_mst();
877

878 879
    A->unfold();
    B->unfold();
880
    // FIXME: if custom function, we currently don't know whether its odd, even or neither, so unpack everything
881
    /*if (is_custom){
882 883 884 885 886 887 888 889 890 891 892 893 894 895 896
      bool is_nonsym=true;
      for (int i=0; i<A->order; i++){
        if (A->sym[i] != NS){
          is_nonsym = false;
        }
      }
      if (!is_nonsym){
        int sym_A[A->order];
        std::fill(sym_A, sym_A+A->order, NS);
        int idx_A[A->order];
        for (int i=0; i<A->order; i++){
          idx_A[i] = i;
        }
        tensor tA(A->sr, A->order, A->lens, sym_A, A->wrld, 1);
        tA.is_home = 0;
897
        tA.has_home = 0;
898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921
        summation st(A, idx_A, A->sr->mulid(), &tA, idx_A, A->sr->mulid());
        st.execute();
        summation stme(*this);
        stme.A = &tA;
        stme.execute();
        return SUCCESS;
      }
    }
    if (is_custom){
      bool is_nonsym=true;
      for (int i=0; i<B->order; i++){
        if (B->sym[i] != NS){
          is_nonsym = false;
        }
      }
      if (!is_nonsym){
        int sym_B[B->order];
        std::fill(sym_B, sym_B+B->order, NS);
        int idx_B[B->order];
        for (int i=0; i<B->order; i++){
          idx_B[i] = i;
        }
        tensor tB(B->sr, B->order, B->lens, sym_B, B->wrld, 1);
        tB.is_home = 0;
922
        tB.has_home = 0;
923 924 925 926 927 928 929 930 931 932 933
        if (!B->sr->isequal(B->sr->addid(), beta)){
          summation st(B, idx_B, B->sr->mulid(), &tB, idx_B, B->sr->mulid());
          st.execute();
        }
        summation stme(*this);
        stme.B = &tB;
        stme.execute();
        summation stme2(&tB, idx_B, B->sr->mulid(), B, idx_B, B->sr->addid());
        stme2.execute();
        return SUCCESS;
      }
934
    }*/
935

936 937
  #ifndef HOME_CONTRACT
    #ifdef USE_SYM_SUM
solomon's avatar
solomon committed
938
      ret = sym_sum_tsr(run_diag);
939 940
      return ret;
    #else
solomon's avatar
solomon committed
941
      ret = sum_tensors(run_diag);
942 943 944
      return ret;
    #endif
  #else
solomon's avatar
solomon committed
945 946
    if (A->has_zero_edge_len || 
        B->has_zero_edge_len){
947
      if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){ 
solomon's avatar
solomon committed
948
        int sub_idx_map_B[B->order];
949
        int sm_idx=0;
solomon's avatar
solomon committed
950
        for (int i=0; i<B->order; i++){
951 952 953
          sub_idx_map_B[i]=sm_idx;
          sm_idx++;
          for (int j=0; j<i; j++){
solomon's avatar
solomon committed
954
            if (idx_B[i]==idx_B[j]){
955 956 957 958 959 960
              sub_idx_map_B[i]=sub_idx_map_B[j];
              sm_idx--;
              break;
            }
          }
        }
solomon's avatar
solomon committed
961 962
        scaling scl = scaling(B, sub_idx_map_B, beta);
        scl.execute();
963
      }
solomon's avatar
solomon committed
964
      return SUCCESS;
965
    }
solomon's avatar
solomon committed
966 967 968 969
    if (A == B){
      tensor * cpy_tsr_A = new tensor(A);
      osum.A = cpy_tsr_A;
      osum.execute();
970
      delete cpy_tsr_A;
solomon's avatar
solomon committed
971
      return SUCCESS;
972
    }
solomon's avatar
solomon committed
973 974
    was_home_A = A->is_home;
    was_home_B = B->is_home;
975
    if (was_home_A){
976 977
      tnsr_A              = new tensor(A,0,0);
      tnsr_A->data        = A->data;
solomon's avatar
solomon committed
978
      tnsr_A->home_buffer = A->home_buffer;
979 980 981
      tnsr_A->is_home     = 1;
      tnsr_A->is_mapped   = 1;
      tnsr_A->topo        = A->topo;
solomon's avatar
solomon committed
982 983
      copy_mapping(A->order, A->edge_map, tnsr_A->edge_map);
      tnsr_A->set_padding();
984
      osum.A              = tnsr_A;
985
    } else tnsr_A = NULL;     
986
    if (was_home_B){
987 988
      tnsr_B              = new tensor(B,0,0);
      tnsr_B->data        = B->data;
solomon's avatar
solomon committed
989
      tnsr_B->home_buffer = B->home_buffer;
990 991 992
      tnsr_B->is_home     = 1;
      tnsr_B->is_mapped   = 1;
      tnsr_B->topo        = B->topo;
solomon's avatar
solomon committed
993 994
      copy_mapping(B->order, B->edge_map, tnsr_B->edge_map);
      tnsr_B->set_padding();
995
      osum.B              = tnsr_B;
996
    } else tnsr_B = NULL;
997
  #if DEBUG >= 2
solomon's avatar
solomon committed
998
    if (A->wrld->cdt.rank == 0)
999 1000 1001 1002
      printf("Start head sum:\n");
  #endif
    
    #ifdef USE_SYM_SUM
solomon's avatar
solomon committed
1003
    ret = osum.sym_sum_tsr(run_diag);
1004
    #else
solomon's avatar
solomon committed
1005
    ret = osum.sum_tensors(run_diag);
1006
    #endif
1007
  #if DEBUG >= 2
solomon's avatar
solomon committed
1008
    if (A->wrld->cdt.rank == 0)
1009 1010 1011
      printf("End head sum:\n");
  #endif

solomon's avatar
solomon committed
1012
    if (ret!= SUCCESS) return ret;
1013 1014
    if (was_home_A) tnsr_A->unfold(); 
    else A->unfold();
solomon's avatar
solomon committed
1015
    if (was_home_B) tnsr_B->unfold();
1016
    else B->unfold();
1017

solomon's avatar
solomon committed
1018 1019
    if (was_home_B && !tnsr_B->is_home){
      if (A->wrld->cdt.rank == 0)
1020
        DPRINTF(2,"Migrating tensor %s back to home\n", B->name);
1021
      distribution odst(tnsr_B);
solomon's avatar
solomon committed
1022 1023
      B->data = tnsr_B->data;
      B->is_home = 0;
1024
      TAU_FSTART(redistribute_for_sum_home);
solomon's avatar
solomon committed
1025
      B->redistribute(odst);
1026
      TAU_FSTOP(redistribute_for_sum_home);
1027
      memcpy(B->home_buffer, B->data, B->size*B->sr->el_size);
solomon's avatar
solomon committed
1028
      CTF_int::cdealloc(B->data);
solomon's avatar
solomon committed
1029 1030 1031 1032
      B->data = B->home_buffer;
      B->is_home = 1;
      tnsr_B->is_data_aliased = 1;
      delete tnsr_B;
1033
    } else if (was_home_B){
solomon's avatar
solomon committed
1034 1035
      if (tnsr_B->data != B->data){
        printf("Tensor %s is a copy of %s and did not leave home but buffer is %p was %p\n", tnsr_B->name, B->name, tnsr_B->data, B->data);
1036 1037 1038
        ABORT;

      }
solomon's avatar
solomon committed
1039 1040 1041
      tnsr_B->has_home = 0;
      tnsr_B->is_data_aliased = 1;
      delete tnsr_B;
1042
    }
solomon's avatar
solomon committed
1043 1044 1045
    if (was_home_A && !tnsr_A->is_home){
      tnsr_A->has_home = 0;
      delete tnsr_A;
1046
    } else if (was_home_A) {
solomon's avatar
solomon committed
1047 1048 1049
      tnsr_A->has_home = 0;
      tnsr_A->is_data_aliased = 1;
      delete tnsr_A;
1050 1051 1052 1053 1054 1055
    }
    return ret;
  #endif
  }

  int summation::sym_sum_tsr(bool run_diag){
1056
    int sidx, i, nst_B, * new_idx_map;
solomon's avatar
solomon committed
1057
    int * map_A, * map_B;
1058
    int ** dstack_map_B;
1059
    tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
solomon's avatar
solomon committed
1060
    std::vector<summation> perm_types;
1061
    std::vector<int> signs;
solomon's avatar
solomon committed
1062
    char const * dbeta;
1063
  #if (DEBUG >= 2)
1064 1065
    print();
  #endif
1066
    check_consistency();
1067 1068 1069

    A->unfold();
    B->unfold();
1070
    if (A->has_zero_edge_len || B->has_zero_edge_len){
1071
      if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){ 
1072
        int sub_idx_map_B[B->order];
1073
        int sm_idx=0;
1074
        for (int i=0; i<B->order; i++){
1075 1076 1077
          sub_idx_map_B[i]=sm_idx;
          sm_idx++;
          for (int j=0; j<i; j++){
1078
            if (idx_B[i]==idx_B[j]){
1079 1080 1081 1082 1083 1084
              sub_idx_map_B[i]=sub_idx_map_B[j];
              sm_idx--;
              break;
            }
          }
        }
1085 1086
        scaling scl = scaling(B, sub_idx_map_B, beta);
        scl.execute();
1087
      }
solomon's avatar
solomon committed
1088
      return SUCCESS;
1089
    }
1090
    // If we have sparisity, use separate mechanism
1091
    /*if (A->is_sparse || B->is_sparse){
1092 1093
      sp_sum();
      return SUCCESS;
1094
    }*/
1095 1096
    tnsr_A = A;
    tnsr_B = B;
1097
    char * new_alpha = (char*)alloc(tnsr_B->sr->el_size);
1098 1099 1100 1101
    CTF_int::alloc_ptr(sizeof(int)*tnsr_A->order,     (void**)&map_A);
    CTF_int::alloc_ptr(sizeof(int)*tnsr_B->order,     (void**)&map_B);
    CTF_int::alloc_ptr(sizeof(int*)*tnsr_B->order,    (void**)&dstack_map_B);
    CTF_int::alloc_ptr(sizeof(tensor*)*tnsr_B->order, (void**)&dstack_tsr_B);
1102 1103 1104 1105
    memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
    memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
    while (!run_diag && tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx_map) == SUCCESS){
      if (tnsr_A != A) delete tnsr_A;
solomon's avatar
solomon committed
1106
      CTF_int::cdealloc(map_A);
1107
      tnsr_A = new_tsr;
1108 1109 1110
      map_A = new_idx_map;
    }
    nst_B = 0;
1111
    while (!run_diag && tnsr_B->extract_diag(map_B, 1, new_tsr, &new_idx_map) == SUCCESS){
1112
      dstack_map_B[nst_B] = map_B;
1113
      dstack_tsr_B[nst_B] = tnsr_B;
1114
      nst_B++;
1115
      tnsr_B = new_tsr;
1116 1117 1118
      map_B = new_idx_map;
    }

Edgar Solomonik's avatar
Edgar Solomonik committed
1119 1120 1121
    summation new_sum = summation(*this);
    new_sum.A = tnsr_A;
    new_sum.B = tnsr_B;
1122 1123
    memcpy(new_sum.idx_A, map_A, sizeof(int)*tnsr_A->order);
    memcpy(new_sum.idx_B, map_B, sizeof(int)*tnsr_B->order);
1124
    if (tnsr_A == tnsr_B){
1125
      tensor nnew_tsr = tensor(tnsr_A);
Edgar Solomonik's avatar
Edgar Solomonik committed
1126 1127
      new_sum.A = &nnew_tsr;
      new_sum.B = tnsr_B;
1128
      new_sum.sym_sum_tsr(run_diag);
1129 1130
      
      /*clone_tensor(ntid_A, 1, &new_tid);
1131 1132 1133 1134
      new_type = *type;
      new_type.tid_A = new_tid;
      stat = sym_sum_tsr(alpha_, beta, &new_type, ftsr, felm, run_diag);
      del_tsr(new_tid);
1135
      return stat;*/
1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150
    } else {
      
  /*    new_type.tid_A = ntid_A;
      new_type.tid_B = ntid_B;
      new_type.idx_map_A = map_A;
      new_type.idx_map_B = map_B;*/
  
      //FIXME: make these typefree...
      int sign = align_symmetric_indices(tnsr_A->order,
                                         new_sum.idx_A,
                                         tnsr_A->sym,
                                         tnsr_B->order,
                                         new_sum.idx_B,
                                         tnsr_B->sym);
      int ocfact = overcounting_factor(tnsr_A->order,
1151
                                       new_sum.idx_A,
1152 1153
                                       tnsr_A->sym,
                                       tnsr_B->order,
1154
                                       new_sum.idx_B,
1155
                                       tnsr_B->sym);
1156 1157 1158
  
      if (ocfact != 1 || sign != 1){
        if (ocfact != 1){
1159
          tnsr_B->sr->safecopy(new_alpha, tnsr_B->sr->addid());
1160 1161 1162 1163 1164 1165 1166
          
          for (int i=0; i<ocfact; i++){
            tnsr_B->sr->add(new_alpha, alpha, new_alpha);
          }
          alpha = new_alpha;
        }
        if (sign == -1){
1167
          tnsr_B->sr->safeaddinv(alpha, new_alpha);
1168
          alpha = new_alpha;
1169 1170
        }
      }
1171
  
1172
      /*if (A->sym[0] == SY && B->sym[0] == AS){
solomon's avatar
solomon committed
1173 1174
        print();
        ASSERT(0); 
1175
      }*/
1176
      if (new_sum.unfold_broken_sym(NULL) != -1){
solomon's avatar
solomon committed
1177
        if (A->wrld->cdt.rank == 0)
1178
          DPRINTF(1, "Permutational symmetry is broken\n");
1179 1180 1181 1182 1183
  
        summation * unfold_sum;
        sidx = new_sum.unfold_broken_sym(&unfold_sum);
        int sy;
        sy = 0;
1184
        int sidx2 = unfold_sum->unfold_broken_sym(NULL);
1185 1186
        if (sidx%2 == 0 && (A->sym[sidx/2] == SY || unfold_sum->A->sym[sidx/2] == SY)) sy = 1;
        if (sidx%2 == 1 && (B->sym[sidx/2] == SY || unfold_sum->B->sym[sidx/2] == SY)) sy = 1;
1187
        //if (sy && sidx%2 == 0){
1188 1189
        if (!A->is_sparse && !B->is_sparse && (sidx2 != -1 || 
            (sy && (sidx%2 == 0  || !tnsr_B->sr->isequal(new_sum.beta, tnsr_B->sr->addid()))))){
1190 1191 1192
          if (sidx%2 == 0){
            if (unfold_sum->A->sym[sidx/2] == NS){
              if (A->wrld->cdt.rank == 0)
1193
                DPRINTF(1,"Performing operand desymmetrization for summation of A idx=%d\n",sidx/2);
1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231
              desymmetrize(tnsr_A, unfold_sum->A, 0);
            } else {
              if (A->wrld->cdt.rank == 0)
                DPRINTF(1,"Performing operand symmetrization for summation\n");
              symmetrize(unfold_sum->A, tnsr_A);
            }
            //unfold_sum->B = tnsr_B;
            unfold_sum->sym_sum_tsr(run_diag);
    //        sym_sum_tsr(alpha, beta, &unfold_type, ftsr, felm, run_diag);
            if (tnsr_A != unfold_sum->A){
              unfold_sum->A->unfold();
              tnsr_A->pull_alias(unfold_sum->A);
              delete unfold_sum->A;
            }
          } else {
            //unfold_sum->A = tnsr_A;
            if (A->wrld->cdt.rank == 0)
              DPRINTF(1,"Performing product desymmetrization for summation\n");
            desymmetrize(tnsr_B, unfold_sum->B, 1);
            unfold_sum->sym_sum_tsr(run_diag);
            if (A->wrld->cdt.rank == 0)
              DPRINTF(1,"Performing product symmetrization for summation\n");
            if (tnsr_B->data != unfold_sum->B->data && !tnsr_B->sr->isequal(tnsr_B->sr->mulid(), unfold_sum->beta)){
              int sidx_B[tnsr_B->order];
              for (int iis=0; iis<tnsr_B->order; iis++){
                sidx_B[iis] = iis;
              }
              scaling sscl = scaling(tnsr_B, sidx_B, unfold_sum->beta);
              sscl.execute();
            }
            symmetrize(tnsr_B, unfold_sum->B);

    //        sym_sum_tsr(alpha, beta, &unfold_type, ftsr, felm, run_diag);
            if (tnsr_B != unfold_sum->B){
              unfold_sum->B->unfold();
              tnsr_B->pull_alias(unfold_sum->B);
              delete unfold_sum->B;
            }
1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245
          }
        } else {
          if (sidx != -1 && sidx%2 == 1){
            delete unfold_sum->B;
          } else if (sidx != -1 && sidx%2 == 0){
            delete unfold_sum->A;
          }
          //get_sym_perms(&new_type, alpha, perm_types, signs);
          get_sym_perms(new_sum, perm_types, signs);
          if (A->wrld->cdt.rank == 0)
            DPRINTF(1,"Performing %d summation permutations\n",
                    (int)perm_types.size());
          dbeta = beta;
          char * new_alpha = (char*)alloc(tnsr_B->sr->el_size);
1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258

          tensor * inv_tsr_A;
          bool need_inv = false;
          // if we have no multiplicative operator, must inverse sign manually
          if (tnsr_B->sr->mulid() == NULL){
            for (i=0; i<(int)perm_types.size(); i++){
              need_inv = true;
            }
            if (need_inv){
              inv_tsr_A = new tensor(tnsr_A);
              inv_tsr_A->addinv();
            }
          }
1259
          for (i=0; i<(int)perm_types.size(); i++){
1260 1261 1262 1263 1264 1265 1266 1267 1268 1269
            // if group apply additive inverse manually
            if (signs[i] == -1 && need_inv){
              perm_types[i].A = inv_tsr_A;
            } else {
              if (signs[i] == 1)
                tnsr_B->sr->safecopy(new_alpha, alpha);
              else 
                tnsr_B->sr->safeaddinv(alpha, new_alpha);
              perm_types[i].alpha = new_alpha;
            }
1270 1271 1272 1273
            perm_types[i].beta = dbeta;
            perm_types[i].sum_tensors(run_diag);
            dbeta = new_sum.B->sr->mulid();
          }
solomon's avatar
solomon committed
1274
          cdealloc(new_alpha);
1275 1276
          if (need_inv)
            delete inv_tsr_A;
1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288
  /*        for (i=0; i<(int)perm_types.size(); i++){
            free_type(&perm_types[i]);
          }*/
          perm_types.clear();
          signs.clear();
        }
        delete unfold_sum;
      } else {
        new_sum.alpha = alpha;
        new_sum.sum_tensors(run_diag);
  /*      sum_tensors(alpha, beta, new_type.tid_A, new_type.tid_B, new_type.idx_map_A,
                    new_type.idx_map_B, ftsr, felm, run_diag);*/
1289 1290
      }
    }
1291
    if (tnsr_A != A) delete tnsr_A;
1292
    for (i=nst_B-1; i>=0; i--){
1293 1294 1295 1296
//      extract_diag(dstack_tid_B[i], dstack_map_B[i], 0, &ntid_B, &new_idx_map);
      dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
      //del_tsr(ntid_B);
      delete tnsr_B;
Edgar Solomonik's avatar
Edgar Solomonik committed
1297 1298
      cdealloc(dstack_map_B[i]);
      cdealloc(new_idx_map);
1299
      tnsr_B = dstack_tsr_B[i];
1300
    }
1301
    ASSERT(tnsr_B == B);
solomon's avatar
solomon committed
1302 1303 1304 1305 1306
    CTF_int::cdealloc(new_alpha);
    CTF_int::cdealloc(map_A);
    CTF_int::cdealloc(map_B);
    CTF_int::cdealloc(dstack_map_B);
    CTF_int::cdealloc(dstack_tsr_B);
1307

solomon's avatar
solomon committed
1308
    return SUCCESS;
1309 1310 1311 1312 1313 1314 1315 1316
  }


  /**
   * \brief PDAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B).
   * \param[in] run_diag if 1 run diagonal sum
   */
  int summation::sum_tensors(bool run_diag){
1317
    int stat, * new_idx_map;
1318
    int * map_A, * map_B;
1319
    int nst_B;
1320
    int ** dstack_map_B;
1321 1322
    tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
//    tsum<dtype> * sumf;
solomon's avatar
solomon committed
1323
    tsum * sumf;
1324 1325 1326
    //check_sum(tid_A, tid_B, idx_map_A, idx_map_B);
    //FIXME: hmm all of the below already takes place in sym_sum
    check_consistency();
1327 1328
    A->unfold();
    B->unfold();
1329
    if (A->has_zero_edge_len || B->has_zero_edge_len){
1330
      if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){ 
1331
    /*    fseq_scl<dtype> fs;
1332 1333
        fs.func_ptr=sym_seq_scl_ref<dtype>;
        fseq_elm_scl<dtype> felm;
1334
        felm.func_ptr = NULL;*/
1335
        int sub_idx_map_B[B->order];
1336
        int sm_idx=0;
1337
        for (int i=0; i<B->order; i++){
1338 1339 1340
          sub_idx_map_B[i]=sm_idx;
          sm_idx++;
          for (int j=0; j<i; j++){
1341
            if (idx_B[i]==idx_B[j]){
1342 1343 1344 1345 1346 1347
              sub_idx_map_B[i]=sub_idx_map_B[j];
              sm_idx--;
              break;
            }
          }
        }
1348 1349
        scaling scl = scaling(B, sub_idx_map_B, beta);
        scl.execute();
1350
      }
solomon's avatar
solomon committed
1351
      return SUCCESS;
1352 1353 1354
    }


1355
    //FIXME: remove all of the below, sum_tensors should never be called without sym_sum
1356 1357 1358 1359
    CTF_int::alloc_ptr(sizeof(int)*A->order,  (void**)&map_A);
    CTF_int::alloc_ptr(sizeof(int)*B->order,  (void**)&map_B);
    CTF_int::alloc_ptr(sizeof(int*)*B->order, (void**)&dstack_map_B);
    CTF_int::alloc_ptr(sizeof(tensor*)*B->order, (void**)&dstack_tsr_B);
1360 1361 1362 1363 1364 1365
    tnsr_A = A;
    tnsr_B = B;
    memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
    memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
    while (!run_diag && tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx_map) == SUCCESS){
      if (tnsr_A != A) delete tnsr_A;
solomon's avatar
solomon committed
1366
      CTF_int::cdealloc(map_A);
1367
      tnsr_A = new_tsr;
1368 1369 1370
      map_A = new_idx_map;
    }
    nst_B = 0;
1371
    while (!run_diag && tnsr_B->extract_diag(map_B, 1, new_tsr, &new_idx_map) == SUCCESS){
1372
      dstack_map_B[nst_B] = map_B;
1373
      dstack_tsr_B[nst_B] = tnsr_B;
1374
      nst_B++;
1375
      tnsr_B = new_tsr;
1376 1377
      map_B = new_idx_map;
    }
1378 1379 1380 1381 1382 1383 1384 1385 1386

    if (!tnsr_A->is_sparse && tnsr_B->is_sparse){
      tensor * stnsr_A = tnsr_A;
      tnsr_A = new tensor(stnsr_A);
      tnsr_A->sparsify(); 
      if (A != stnsr_A) delete stnsr_A;

    }

1387
    // FIXME: if A has self indices and function is distributive, presum A first, otherwise if it is dense and B is sparse, sparsify A
Edgar Solomonik's avatar
Edgar Solomonik committed
1388 1389 1390
    summation new_sum = summation(*this);
    new_sum.A = tnsr_A;
    new_sum.B = tnsr_B;
1391
    if (tnsr_A == tnsr_B){
solomon's avatar
solomon committed
1392
      tensor * nnew_tsr = new tensor(tnsr_A);
Edgar Solomonik's avatar
Edgar Solomonik committed
1393 1394
      new_sum.A = nnew_tsr;
      new_sum.B = tnsr_B;
1395
    } else{ 
1396 1397
     //FIXME: remove the below, sum_tensors should never be called without sym_sum
     int sign = align_symmetric_indices(tnsr_A->order,
1398
                                        new_sum.idx_A,
1399 1400
                                        tnsr_A->sym,
                                        tnsr_B->order,
1401
                                        new_sum.idx_B,
1402
                                        tnsr_B->sym);
1403 1404 1405 1406 1407

      #if DEBUG >= 1
        new_sum.print();
      #endif

1408
      ASSERT(sign == 1);
solomon's avatar
solomon committed
1409
/*        if (sign == -1){
1410 1411
          char * new_alpha = (char*)malloc(tnsr_B->sr->el_size);
          tnsr_B->sr->addinv(alpha, new_alpha);
solomon's avatar
solomon committed
1412 1413
          alpha = new_alpha;
        }*/
1414

1415
  #if 0 //VERIFY
1416 1417 1418 1419 1420 1421 1422 1423
      int64_t nsA, nsB;
      int64_t nA, nB;
      dtype * sA, * sB;
      dtype * uA, * uB;
      int order_A, order_B,  i;
      int * edge_len_A, * edge_len_B;
      int * sym_A, * sym_B;
      stat = allread_tsr(ntid_A, &nsA, &sA);
solomon's avatar
solomon committed
1424
      assert(stat == SUCCESS);
1425 1426

      stat = allread_tsr(ntid_B, &nsB, &sB);
solomon's avatar
solomon committed
1427
      assert(stat == SUCCESS);
1428 1429 1430 1431 1432 1433 1434 1435
  #endif

      TAU_FSTART(sum_tensors);

      /* Check if the current tensor mappings can be summed on */
  #if REDIST
      if (1) {
  #else
1436
      if (new_sum.check_mapping() == 0) {
1437 1438
  #endif
        /* remap if necessary */
1439
        stat = new_sum.map();
1440
        if (stat == ERROR) {
1441
          printf("Failed to map tensors to physical grid\n");
1442
          return ERROR;
1443 1444
        }
      } else {
1445
  #if DEBUG > 2
solomon's avatar
solomon committed
1446
        if (A->wrld->cdt.rank == 0){
1447 1448
          printf("Keeping mappings:\n");
        }
1449 1450
        tnsr_A->print_map(stdout);
        tnsr_B->print_map(stdout);
1451 1452 1453
  #endif
      }
      /* Construct the tensor algorithm we would like to use */
1454
      ASSERT(new_sum.check_mapping());
1455
  #if FOLD_TSR
1456
      if (is_custom == false && new_sum.can_fold()){
1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
        //FIXME bit of a guess, no?
        double est_time_nofold = 4.*(A->size + B->size)*COST_MEMBW;
        if (new_sum.est_time_fold() + (A->size + B->size)*COST_MEMBW < est_time_nofold){
          /*if (A->wrld->cdt.rank == 0)
            printf("Decided to fold\n");*/
          int inner_stride;
          TAU_FSTART(map_fold);
          inner_stride = new_sum.map_fold();
          TAU_FSTOP(map_fold);
          sumf = new_sum.construct_sum(inner_stride);
        } else {
          /*if (A->wrld->cdt.rank == 0)
            printf("Decided not to fold\n");*/
        
          sumf = new_sum.construct_sum();
        }
1473 1474 1475 1476 1477 1478
      } else{
  #if DEBUG >= 1
        if (A->wrld->cdt.rank == 0){
          printf("Could not fold summation, is_custom = %d, new_sum.can_fold = %d\n", is_custom, new_sum.can_fold());
        }
  #endif
Edgar Solomonik's avatar
Edgar Solomonik committed
1479
        sumf = new_sum.construct_sum();
1480
      }
solomon's avatar
solomon committed
1481 1482
        /*sumf = construct_sum(alpha, beta, ntid_A, map_A, ntid_B, map_B,
                             ftsr, felm);*/
1483
  #else
Edgar Solomonik's avatar
Edgar Solomonik committed
1484
      sumf = new_sum.construct_sum();
solomon's avatar
solomon committed
1485 1486
      /*sumf = construct_sum(alpha, beta, ntid_A, map_A, ntid_B, map_B,
                           ftsr, felm);*/
1487 1488 1489 1490 1491 1492 1493
  #endif
      /*TAU_FSTART(zero_sum_padding);
      stat = zero_out_padding(ntid_A);
      TAU_FSTOP(zero_sum_padding);
      TAU_FSTART(zero_sum_padding);
      stat = zero_out_padding(ntid_B);
      TAU_FSTOP(zero_sum_padding);*/
solomon's avatar
solomon committed
1494
      DEBUG_PRINTF("[%d] performing tensor sum\n", A->wrld->cdt.rank);
1495
  #if DEBUG >=3
1496
      /*if (A->wrld->cdt.rank == 0){
1497 1498 1499 1500 1501 1502
        for (int i=0; i<tensors[ntid_A]->order; i++){
          printf("padding[%d] = %d\n",i, tensors[ntid_A]->padding[i]);
        }
        for (int i=0; i<tensors[ntid_B]->order; i++){
          printf("padding[%d] = %d\n",i, tensors[ntid_B]->padding[i]);
        }
1503
      }*/
1504 1505
  #endif

1506 1507 1508
  #if DEBUG >= 2
      if (tnsr_B->wrld->rank==0)
        sumf->print();
1509 1510
      tnsr_A->print_map();
      tnsr_B->print_map();
1511
  #endif
1512 1513
      TAU_FSTART(sum_func);
      /* Invoke the contraction algorithm */
1514
      tnsr_A->topo->activate();
1515
      MPI_Barrier(tnsr_B->wrld->comm);
1516
      sumf->run();
1517
      if (tnsr_B->is_sparse){
1518 1519
        tspsum * spsumf = (tspsum*)sumf;
        if (tnsr_B->data != spsumf->new_B){
1520
          cdealloc(tnsr_B->data);
1521 1522
          tnsr_B->data = spsumf->new_B;
          //tnsr_B->nnz_loc = spsumf->new_nnz_B;
1523
        }
solomon's avatar
solomon committed
1524
        tnsr_B->set_new_nnz_glb(tnsr_B->nnz_blk);
1525
        ASSERT(tnsr_B->nnz_loc == spsumf->new_nnz_B);
1526
      }
1527
      /*tnsr_B->unfold();
1528 1529 1530 1531 1532 1533 1534 1535
      tnsr_B->print();
      MPI_Barrier(tnsr_B->wrld->comm);
      if (tnsr_B->wrld->rank==1){
      for (int i=0; i<tnsr_B->size; i++){
        printf("[%d] %dth element  ",tnsr_B->wrld->rank,i);
        tnsr_B->sr->print(tnsr_B->data+i*tnsr_B->sr->el_size);
        printf("\n");
      }
      }*/
      tnsr_A->topo->deactivate();
      tnsr_A->unfold();
      tnsr_B->unfold();
#ifndef SEQ
      stat = tnsr_B->zero_out_padding();
#endif
      TAU_FSTOP(sum_func);

  #if 0 //VERIFY
      stat = allread_tsr(ntid_A, &nA, &uA);
      assert(stat == SUCCESS);
      stat = get_info(ntid_A, &order_A, &edge_len_A, &sym_A);
      assert(stat == SUCCESS);

      stat = allread_tsr(ntid_B, &nB, &uB);
      assert(stat == SUCCESS);
      stat = get_info(ntid_B, &order_B, &edge_len_B, &sym_B);
      assert(stat == SUCCESS);

      if (nsA != nA) { printf("nsA = %" PRId64 ", nA = %" PRId64 "\n",nsA,nA); ABORT; }
      if (nsB != nB) { printf("nsB = %" PRId64 ", nB = %" PRId64 "\n",nsB,nB); ABORT; }
      for (i=0; (int64_t)i<nA; i++){
        if (fabs(uA[i] - sA[i]) > 1.E-6){
          printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
        }
      }

      cpy_sym_sum(alpha, uA, order_A, edge_len_A, edge_len_A, sym_A, map_A,
                  beta, sB, order_B, edge_len_B, edge_len_B, sym_B, map_B);
      assert(stat == SUCCESS);

      for (i=0; (int64_t)i<nB; i++){
        if (fabs(uB[i] - sB[i]) > 1.E-6){
          printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
        }
        assert(fabs(sB[i] - uB[i]) < 1.E-6);
      }
      CTF_int::cdealloc(uA);
      CTF_int::cdealloc(uB);
      CTF_int::cdealloc(sA);
      CTF_int::cdealloc(sB);
  #endif

      delete sumf;
      if (tnsr_A != A){
        delete tnsr_A;
      }
      for (int i=nst_B-1; i>=0; i--){
        int ret = dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
        ASSERT(ret == SUCCESS);
        delete tnsr_B;
        tnsr_B = dstack_tsr_B[i];
      }
      ASSERT(tnsr_B == B);
    }
  //#ifndef SEQ
    //stat = B->zero_out_padding();
  //#endif
    CTF_int::cdealloc(map_A);
    CTF_int::cdealloc(map_B);
    CTF_int::cdealloc(dstack_map_B);
    CTF_int::cdealloc(dstack_tsr_B);

    TAU_FSTOP(sum_tensors);
    return SUCCESS;
  }

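  // Scans A and B for a symmetric dimension whose symmetry this summation breaks.
  // Returns 2*i if dimension i of A must be unpacked, 2*i+1 if dimension i of B must be,
  // or -1 if every symmetry is preserved. When nnew_sum is non-NULL, it receives a copy
  // of this summation in which the offending tensor is replaced by a copy whose symmetry
  // at that dimension has been relaxed.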
  int summation::unfold_broken_sym(summation ** nnew_sum){
    int sidx, i, num_tot, iA, iA2, iB;
    int * idx_arr;
    int bsym = NS;
    summation * new_sum;
   
    if (nnew_sum != NULL){
      new_sum = new summation(*this);
      *nnew_sum = new_sum;
    } else new_sum = NULL;

    inv_idx(A->order, idx_A,
            B->order, idx_B,
            &num_tot, &idx_arr);

    sidx = -1;
    for (i=0; i<A->order; i++){
      if (A->sym[i] != NS){
        iA = idx_A[i];
        if (idx_arr[2*iA+1] != -1){
          if (B->sym[idx_arr[2*iA+1]] == NS ||
              ((B->sym[idx_arr[2*iA+1]] == AS) ^ (A->sym[i] == AS)) ||
              idx_arr[2*idx_A[i+1]+1] == -1 ||
              idx_A[i+1] != idx_B[idx_arr[2*iA+1]+1]){
            sidx = 2*i;
            break;
          }
        } else if (idx_arr[2*idx_A[i+1]+1] != -1){
          sidx = 2*i;
          break;
        }
      }
    } 
    if (sidx == -1){
      for (i=0; i<B->order; i++){
        if (B->sym[i] != NS){
          iB = idx_B[i];
          if (idx_arr[2*iB+0] != -1){
            if (A->sym[idx_arr[2*iB+0]] == NS ||
                ((A->sym[idx_arr[2*iB+0]] == AS) ^ (B->sym[i] == AS)) ||
                idx_arr[2*idx_B[i+1]+0] == -1 ||
                idx_B[i+1] != idx_A[idx_arr[2*iB+0]+1]){
              sidx = 2*i+1;
              break;
            } else if (A->sym[idx_arr[2*iB+0]] == NS){
              sidx = 2*i;
              bsym = B->sym[i];
              break;
            }
          } else if (idx_arr[2*idx_B[i+1]+0] != -1){
            sidx = 2*i+1;
            break;
          }
        }
      }
    }
    //if we have e.g. b[""] = A["ij"] with SY A, the symmetry is preserved but we need to account for the diagonal, so this just unpacks (FIXME: suboptimal)
    if (sidx == -1){
      for (i=0; i<A->order; i++){
        if (A->sym[i] == SY){
          iA = idx_A[i];
          iA2 = idx_A[i+1];
          if (idx_arr[2*iA+1] == -1 &&
              idx_arr[2*iA2+1] == -1){
            sidx = 2*i;
            break;
          }
        }
      }
    } 
    if (nnew_sum != NULL && sidx != -1){
      if(sidx%2 == 0){
        new_sum->A = new tensor(A, 0, 0);
        int nA_sym[A->order];
        memcpy(nA_sym, new_sum->A->sym, sizeof(int)*new_sum->A->order);
        nA_sym[sidx/2] = bsym;
        new_sum->A->set_sym(nA_sym);
      } else {
        new_sum->B = new tensor(B, 0, 0);
        int nB_sym[B->order];
        memcpy(nB_sym, new_sum->B->sym, sizeof(int)*new_sum->B->order);
        nB_sym[sidx/2] = NS;
        new_sum->B->set_sym(nB_sym);
      }
    }
    CTF_int::cdealloc(idx_arr);
    return sidx;
  }

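  // Verifies that every index appearing in both idx_A and idx_B has matching edge
  // lengths in A and B; prints a diagnostic and aborts on a mismatch.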
  void summation::check_consistency(){
    int i, num_tot, len;
    int iA, iB;
    int * idx_arr;
       
    inv_idx(A->order, idx_A,
            B->order, idx_B,
            &num_tot, &idx_arr);

    for (i=0; i<num_tot; i++){
      len = -1;
      iA = idx_arr[2*i+0];
      iB = idx_arr[2*i+1];
      if (iA != -1){
        len = A->lens[iA];
      }
      if (len != -1 && iB != -1 && len != B->lens[iB]){
        if (A->wrld->cdt.rank == 0){
          printf("i = %d Error in sum call: The %dth edge length (%d) of tensor %s does not ",
                  i, iA, len, A->name);
          printf("match the %dth edge length (%d) of tensor %s.\n",
                  iB, B->lens[iB], B->name);
        }
        ABORT;
      }
    }
    CTF_int::cdealloc(idx_arr);

  }


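  // Returns 1 if os describes the same summation as this one (same tensor operands
  // and identical index maps for A and B), 0 otherwise.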
  int summation::is_equal(summation const & os){
    int i;

    if (A != os.A) return 0;
    if (B != os.B) return 0;

    for (i=0; i<A->order; i++){
      if (idx_A[i] != os.idx_A[i]) return 0;
    }
    for (i=0; i<B->order; i++){
      if (idx_B[i] != os.idx_B[i]) return 0;
    }
    return 1;
  }

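  // Returns 1 if the current mappings of A and B allow this summation to be executed
  // without remapping: both tensors mapped (and not folded) onto the same topology,
  // self-consistent mappings for repeated indices, and compatible edge maps for every
  // shared index; replication of B is additionally disallowed when its algebraic
  // structure has no additive identity.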
  int summation::check_mapping(){
    int i, pass, order_tot, iA, iB;
    int * idx_arr; //, * phys_map;
    //mapping * map;

    TAU_FSTART(check_sum_mapping);
    pass = 1;
    
    if (A->is_mapped == 0) pass = 0;
    if (B->is_mapped == 0) pass = 0;
    
    
    if (A->is_folded == 1) pass = 0;
    if (B->is_folded == 1) pass = 0;
    
    if (A->topo != B->topo) pass = 0;

    if (pass==0){
      DPRINTF(4,"failed confirmation here\n");
      TAU_FSTOP(check_sum_mapping);
      return 0;
    }
    
    //CTF_int::alloc_ptr(sizeof(int)*A->topo->order, (void**)&phys_map);
    //memset(phys_map, 0, sizeof(int)*A->topo->order);

    inv_idx(A->order, idx_A,
            B->order, idx_B,
            &order_tot, &idx_arr);

    if (!check_self_mapping(A, idx_A))
      pass = 0;
    if (!check_self_mapping(B, idx_B))
      pass = 0;
    if (pass == 0)
      DPRINTF(4,"failed confirmation here\n");

    for (i=0; i<order_tot; i++){
      iA = idx_arr[2*i];
      iB = idx_arr[2*i+1];
      if (iA != -1 && iB != -1) {
        if (!comp_dim_map(&A->edge_map[iA], &B->edge_map[iB])){
          pass = 0;
          DPRINTF(4,"failed confirmation here i=%d\n",i);
        }
      }
      /*if (iA != -1) {
        map = &A->edge_map[iA];
        if (map->type == PHYSICAL_MAP)
          phys_map[map->cdt] = 1;
        while (map->has_child) {
          map = map->child;
          if (map->type == PHYSICAL_MAP)
            phys_map[map->cdt] = 1;
        }
      }
      if (iB != -1){
        map = &B->edge_map[iB];
        if (map->type == PHYSICAL_MAP)
          phys_map[map->cdt] = 1;
        while (map->has_child) {
          map = map->child;
          if (map->type == PHYSICAL_MAP)
            phys_map[map->cdt] = 1;
        }
      }*/
    }
    /* Ensure that something is mapped to each dimension, since replication
       does not make sense in sum for all tensors */
  /*  for (i=0; i<topovec[A->itopo].order; i++){
      if (phys_map[i] == 0) {
        pass = 0;
        DPRINTF(3,"failed confirmation here i=%d\n",i);
      }
    }*/

    //CTF_int::cdealloc(phys_map);
    CTF_int::cdealloc(idx_arr);

    //if we don't have an additive id we can't replicate
    if (B->sr->addid() == NULL){
      int ndim_mapped = 0;
      for (int j=0; j<B->order; j++){
        mapping * map = &B->edge_map[j];
        if (map->type == PHYSICAL_MAP) ndim_mapped++;
        while (map->has_child) {
          map = map->child;
          if (map->type == PHYSICAL_MAP)
            ndim_mapped++;
        }
        if (ndim_mapped < B->topo->order) pass = 0;
      }
    }
       
    TAU_FSTOP(check_sum_mapping);

    return pass;
  }

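  // Maps the indices shared by A and B onto the given topology: builds an auxiliary
  // tensor whose dimensions are the shared (summed) indices, seeds it with any existing
  // edge mappings, runs map_tensor on it, and copies the resulting mapping back onto
  // the corresponding dimensions of A and B.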
  int summation::map_sum_indices(topology const * topo){
    int tsr_order, isum, iA, iB, i, j, jsum, jX, stat;
    int * tsr_edge_len, * tsr_sym_table, * restricted;
    int * idx_arr, * idx_sum;
    int num_sum, num_tot, idx_num;
    idx_num = 2;
    mapping * sum_map;

    TAU_FSTART(map_sum_indices);

    inv_idx(A->order, idx_A,
            B->order, idx_B,
            &num_tot, &idx_arr);

    CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_sum);
    
    num_sum = 0;
    for (i=0; i<num_tot; i++){
      if (idx_arr[2*i] != -1 && idx_arr[2*i+1] != -1){
        idx_sum[num_sum] = i;
        num_sum++;
      }
    }

    tsr_order = num_sum;


    CTF_int::alloc_ptr(tsr_order*sizeof(int),           (void**)&restricted);
    CTF_int::alloc_ptr(tsr_order*sizeof(int),           (void**)&tsr_edge_len);
    CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
    CTF_int::alloc_ptr(tsr_order*sizeof(mapping),       (void**)&sum_map);

    memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
    memset(restricted, 0, tsr_order*sizeof(int));

    for (i=0; i<tsr_order; i++){ 
      sum_map[i].type             = NOT_MAPPED; 
      sum_map[i].has_child        = 0;
      sum_map[i].np               = 1;
    }
    for (i=0; i<num_sum; i++){
      isum = idx_sum[i];
      iA = idx_arr[isum*2+0];
      iB = idx_arr[isum*2+1];

      if (A->edge_map[iA].type != NOT_MAPPED){
        ASSERT(B->edge_map[iB].type == NOT_MAPPED);
        copy_mapping(1, &A->edge_map[iA], &sum_map[i]);
      } else if (B->edge_map[iB].type != NOT_MAPPED){
        copy_mapping(1, &B->edge_map[iB], &sum_map[i]);
      }
    }

    /* Map an auxiliary tensor over the summation (shared) indices.
     * Set the edge lengths and symmetries according to those in sum dims of A and B.
     * This gives us a mapping for the common mapped dimensions of tensors A and B. */
    for (i=0; i<num_sum; i++){
      isum = idx_sum[i];
      iA = idx_arr[isum*idx_num+0];
      iB = idx_arr[isum*idx_num+1];

      tsr_edge_len[i] = A->pad_edge_len[iA];

      /* Check if A has symmetry among the dimensions being summed over.
       * Ignore symmetry with non-summation dimensions.
       * FIXME: this algorithm can be more efficient but should not be a bottleneck */
      if (A->sym[iA] != NS){
        for (j=0; j<num_sum; j++){
          jsum = idx_sum[j];
          jX = idx_arr[jsum*idx_num+0];
          if (jX == iA+1){
            tsr_sym_table[i*tsr_order+j] = 1;
            tsr_sym_table[j*tsr_order+i] = 1;
          }
        }
      }
      if (B->sym[iB] != NS){
        for (j=0; j<num_sum; j++){
          jsum = idx_sum[j];
          jX = idx_arr[jsum*idx_num+1];
          if (jX == iB+1){
            tsr_sym_table[i*tsr_order+j] = 1;
            tsr_sym_table[j*tsr_order+i] = 1;
          }
        }
      }
    }
    /* Run the mapping algorithm on this construct */
    stat = map_tensor(topo->order,        tsr_order, 
                      tsr_edge_len,       tsr_sym_table,
                      restricted,         topo->dim_comm,
                      NULL,               0,
                      sum_map);

    if (stat == ERROR){
      TAU_FSTOP(map_sum_indices);
      return ERROR;
    }
    
    /* define mapping of tensors A and B according to the mapping of sum dims */
    if (stat == SUCCESS){
      for (i=0; i<num_sum; i++){
        isum = idx_sum[i];
        iA = idx_arr[isum*idx_num+0];
        iB = idx_arr[isum*idx_num+1];

        copy_mapping(1, &sum_map[i], &A->edge_map[iA]);
        copy_mapping(1, &sum_map[i], &B->edge_map[iB]);
      }
    }
    CTF_int::cdealloc(restricted);
    CTF_int::cdealloc(tsr_edge_len);
    CTF_int::cdealloc(tsr_sym_table);
    for (i=0; i<num_sum; i++){
      sum_map[i].clear();
    }
    CTF_int::cdealloc(sum_map);
    CTF_int::cdealloc(idx_sum);
    CTF_int::cdealloc(idx_arr);

    TAU_FSTOP(map_sum_indices);
    return stat;

  }

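  // Chooses a processor mapping for this summation: each rank tries a strided subset of
  // the candidate topologies (two variants per topology, differing in whether A's or B's
  // self-indices are mapped first), scores every valid mapping by memory footprint plus
  // an estimated redistribution cost, selects the globally best candidate via
  // get_best_topo, then reconstructs that mapping and redistributes A and B if needed.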
  int summation::map(){
    int i, ret, need_remap;
    int need_remap_A, need_remap_B;
    int d;
    topology * old_topo_A, * old_topo_B;
    int btopo;
    int gtopo;

    ASSERT(A->wrld == B->wrld);
    World * wrld = A->wrld;
   
    TAU_FSTART(map_tensor_pair);
  #if DEBUG >= 2
    if (wrld->rank == 0)
      printf("Initial mappings:\n");
    A->print_map(stdout);
    B->print_map(stdout);
  #endif

    //FIXME: try to avoid unfolding immediately, as its not always necessary
    A->unfold();
    B->unfold();
    A->set_padding();
    B->set_padding();


    distribution dA(A);
    distribution dB(B);
    old_topo_A = A->topo;
    old_topo_B = B->topo;
    mapping * old_map_A = new mapping[A->order];
    mapping * old_map_B = new mapping[B->order];
    copy_mapping(A->order, A->edge_map, old_map_A);
    copy_mapping(B->order, B->edge_map, old_map_B);
    btopo = -1;
    int64_t size;
    int64_t min_size = INT64_MAX;
    /* Attempt to map to all possible permutations of processor topology */
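    // each topology is tried twice: even iterations map A's self-indices first,
    // odd iterations map B's first; candidates are strided across ranks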
    for (i=A->wrld->cdt.rank; i<2*(int)A->wrld->topovec.size(); i+=A->wrld->cdt.np){
  //  for (i=global_comm.rank*topovec.size(); i<2*(int)topovec.size(); i++){
      A->clear_mapping();
      B->clear_mapping();
      A->set_padding();
      B->set_padding();

      A->topo = wrld->topovec[i/2];
      B->topo = wrld->topovec[i/2];
      A->is_mapped = 1;
      B->is_mapped = 1;

      if (i%2 == 0){
        ret = map_self_indices(A, idx_A);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      } else {
        ret = map_self_indices(B, idx_B);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      }
      ret = map_sum_indices(A->topo);
      if (ret == NEGATIVE) continue;
      else if (ret != SUCCESS){
        return ret;
      }
      if (i%2 == 0){
        ret = map_self_indices(A, idx_A);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      } else {
        ret = map_self_indices(B, idx_B);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      }

      if (i%2 == 0){
2038 2039
        ret = map_self_indices(A, idx_A);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
        ret = A->map_tensor_rem(A->topo->order, 
                                A->topo->dim_comm);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
        copy_mapping(A->order, B->order,
                     idx_A, A->edge_map, 
                     idx_B, B->edge_map,0);
        ret = B->map_tensor_rem(B->topo->order, 
                                B->topo->dim_comm);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      } else {
        ret = map_self_indices(B, idx_B);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
        ret = B->map_tensor_rem(B->topo->order, 
                                B->topo->dim_comm);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
        copy_mapping(B->order, A->order,
                     idx_B, B->edge_map, 
                     idx_A, A->edge_map,0);
        ret = A->map_tensor_rem(A->topo->order, 
                                A->topo->dim_comm);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      }
      if (i%2 == 0){
        ret = map_self_indices(B, idx_B);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      } else {
        ret = map_self_indices(A, idx_A);
        if (ret == NEGATIVE) continue;
        else if (ret != SUCCESS) return ret;
      }

  /*    ret = map_symtsr(A->order, A->sym_table, A->edge_map);
      ret = map_symtsr(B->order, B->sym_table, B->edge_map);
      if (ret!=SUCCESS) return ret;
      return SUCCESS;*/

  #if DEBUG >= 4
      A->print_map(stdout,0);
      B->print_map(stdout,0);
  #endif
      if (!check_mapping()) continue;
      A->set_padding();
      B->set_padding();
      size = A->size + B->size;
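      // add an estimated redistribution cost to the footprint: cheap if the tensor can
      // be block-reshuffled into the new mapping, more expensive if a general cyclic
      // reshuffle is needed (weighted higher for sparse and home-buffered tensors)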

      need_remap_A = 0;
      need_remap_B = 0;

      if (A->topo == old_topo_A){
        for (d=0; d<A->order; d++){
          if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
            need_remap_A = 1;
        }
      } else
        need_remap_A = 1;
      if (need_remap_A){
        if (can_block_reshuffle(A->order, dA.phase, A->edge_map)){
          size += A->size*log2(wrld->cdt.np);
        } else {
          if (A->is_sparse)
            size += 25.*A->size*log2(wrld->cdt.np);
          else
            size += 5.*A->size*log2(wrld->cdt.np);
        }
      }
      if (B->topo == old_topo_B){
        for (d=0; d<B->order; d++){
          if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
            need_remap_B = 1;
        }
      } else
        need_remap_B = 1;
      if (need_remap_B){
        if (can_block_reshuffle(B->order, dB.phase, B->edge_map)){
          size += B->size*log2(wrld->cdt.np);
        } else {
          if (B->is_home){
            if (B->is_sparse)
              size += 50.*B->size*log2(wrld->cdt.np);
            else
              size += 10.*B->size*log2(wrld->cdt.np);
          } else {
            if (B->is_sparse)
              size += 25.*B->size*log2(wrld->cdt.np);
            else
              size += 5.*B->size*log2(wrld->cdt.np);
          }
        }
      }

      /*nvirt = (int64_t)calc_nvirt(A);
      tnvirt = nvirt*(int64_t)calc_nvirt(B);
      if (tnvirt < nvirt) nvirt = UINT64_MAX;
      else nvirt = tnvirt;
      if (btopo == -1 || nvirt < bnvirt ) {
        bnvirt = nvirt;
        btopo = i;      
      }*/
      if (btopo == -1 || size < min_size){
        min_size = size;
        btopo = i;      
      }
    }
    if (btopo == -1)
      min_size = INT64_MAX;
    /* pick lower dimensional mappings, if equivalent */
    gtopo = get_best_topo(min_size, btopo, wrld->cdt);
    TAU_FSTOP(map_tensor_pair);
    if (gtopo == -1){
      printf("ERROR: Failed to map pair!\n");
      ABORT;
      return ERROR;
    }
    
    A->clear_mapping();
    B->clear_mapping();
    A->set_padding();
    B->set_padding();

    A->topo = wrld->topovec[gtopo/2];
    B->topo = wrld->topovec[gtopo/2];

    A->is_mapped = 1;
    B->is_mapped = 1;
      
    if (gtopo%2 == 0){
      ret = map_self_indices(A, idx_A);
      ASSERT(ret == SUCCESS);
    } else {
      ret = map_self_indices(B, idx_B);
      ASSERT(ret == SUCCESS);
    }
    ret = map_sum_indices(A->topo);
    ASSERT(ret == SUCCESS);
    if (gtopo%2 == 0){
      ret = map_self_indices(A, idx_A);
      ASSERT(ret == SUCCESS);
    } else {
      ret = map_self_indices(B, idx_B);
      ASSERT(ret == SUCCESS);
    }

    if (gtopo%2 == 0){
      ret = map_self_indices(A, idx_A);
      ASSERT(ret == SUCCESS);
      ret = A->map_tensor_rem(A->topo->order, 
                              A->topo->dim_comm);
      ASSERT(ret == SUCCESS);
      copy_mapping(A->order, B->order,
                   idx_A, A->edge_map, 
                   idx_B, B->edge_map,0);
      ret = B->map_tensor_rem(B->topo->order, 
                              B->topo->dim_comm);
      ASSERT(ret == SUCCESS);
    } else {
      ret = map_self_indices(B, idx_B);
      ASSERT(ret == SUCCESS);
      ret = B->map_tensor_rem(B->topo->order, 
                              B->topo->dim_comm);
      ASSERT(ret == SUCCESS);
      copy_mapping(B->order, A->order,
                   idx_B, B->edge_map, 
                   idx_A, A->edge_map,0);
      ret = A->map_tensor_rem(A->topo->order, 
                              A->topo->dim_comm);
      ASSERT(ret == SUCCESS);
    }
    if (gtopo%2 == 0){
      ret = map_self_indices(B, idx_B);
      ASSERT(ret == SUCCESS);
    } else {
      ret = map_self_indices(A, idx_A);
      ASSERT(ret == SUCCESS);
    }

    ASSERT(check_mapping());

    A->set_padding();
    B->set_padding();
  #if DEBUG > 2
    if (wrld->cdt.rank == 0)
      printf("New mappings:\n");
    A->print_map(stdout);
    B->print_map(stdout);
  #endif

    TAU_FSTART(redistribute_for_sum);
   
    A->is_cyclic = 1;
    B->is_cyclic = 1;
    need_remap = 0;
    if (A->topo == old_topo_A){
      for (d=0; d<A->order; d++){
        if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
          need_remap = 1;
      }
    } else
      need_remap = 1;
    if (need_remap)
      A->redistribute(dA);
    need_remap = 0;
    if (B->topo == old_topo_B){
      for (d=0; d<B->order; d++){
        if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
          need_remap = 1;
      }
    } else
      need_remap = 1;
    if (need_remap)
      B->redistribute(dB);

    TAU_FSTOP(redistribute_for_sum);
    delete [] old_map_A;
    delete [] old_map_B;

    return SUCCESS;
  }

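  // Prints, on rank 0, a one-line description of the summation of the form
  // "CTF: Summation <B name>[idx_B] <-<A name>[idx_A]" using integer index labels.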
  void summation::print(){
    int i;
    //max = A->order+B->order;

    CommData global_comm = A->wrld->cdt;
    MPI_Barrier(global_comm.cm);
    if (global_comm.rank == 0){
      char sname[200];
      sname[0] = '\0';
      sprintf(sname, "%s", B->name);
      sprintf(sname+strlen(sname),"[");
      for (i=0; i<B->order; i++){
        if (i>0)
          sprintf(sname+strlen(sname)," %d",idx_B[i]);
        else
          sprintf(sname+strlen(sname),"%d",idx_B[i]);
      }
      sprintf(sname+strlen(sname),"] <-");
      sprintf(sname+strlen(sname), "%s", A->name);
      sprintf(sname+strlen(sname),"[");
      for (i=0; i<A->order; i++){
        if (i>0)
          sprintf(sname+strlen(sname)," %d",idx_A[i]);
        else
          sprintf(sname+strlen(sname),"%d",idx_A[i]);
      }
      sprintf(sname+strlen(sname),"]");
      printf("CTF: Summation %s\n",sname);


/*      printf("Summing Tensor %s into %s\n", A->name, B->name);
      printf("alpha is "); 
      if (alpha != NULL) A->sr->print(alpha);
      else printf("NULL");
      printf("\nbeta is "); 
      if (beta != NULL) B->sr->print(beta);
      else printf("NULL"); 
      printf("\n");
      printf("Summation index table:\n");
      printf("     A     B\n");
      for (i=0; i<max; i++){
        ex_A=0;
        ex_B=0;
        printf("%d:   ",i);
        for (j=0; j<A->order; j++){
          if (idx_A[j] == i){
            ex_A++;
            if (A->sym[j] == SY)
              printf("%dY ",j);
            else if (A->sym[j] == SH)
              printf("%dH ",j);
            else if (A->sym[j] == AS)
              printf("%dS ",j);
            else
              printf("%d  ",j);
          }
        }
        if (ex_A == 0)
          printf("      ");
        if (ex_A == 1)
          printf("   ");
        for (j=0; j<B->order; j++){
          if (idx_B[j] == i){
            ex_B=1;
            if (B->sym[j] == SY)
              printf("%dY ",j);
            else if (B->sym[j] == SH)
              printf("%dH ",j);
            else if (B->sym[j] == AS)
              printf("%dS ",j);
            else
              printf("%d  ",j);
          }
        }
        printf("\n");
        if (ex_A + ex_B== 0) break;
      }*/
    }
  }
              

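  // Sparse summation via key-value pairs: reads A's local nonzeros and, if the index
  // maps of A and B differ, rewrites each key from A's global ordering into B's, sorts
  // and partially reduces duplicate keys (for indices of A absent from B), applies any
  // custom elementwise function (folding in alpha), and replicates pairs along indices
  // that appear only in B; the resulting pairs are then written into B with alpha/beta.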
  void summation::sp_sum(){
    int64_t num_pair;
    char * mapped_data;
    
    bool is_idx_matched = true;
    if (A->order != B->order)
      is_idx_matched = false;
    else {
      for (int o=0; o<A->order; o++){
        if (idx_A[o] != idx_B[o]){
          is_idx_matched = false;
        }
      }
    }


    //read data from A    
    A->read_local_nnz(&num_pair, &mapped_data);

    if (!is_idx_matched){
      int64_t lda_A[A->order];
      int64_t lda_B[B->order];
      lda_A[0] = 1;
      for (int o=1; o<A->order; o++){
        lda_A[o] = lda_A[o-1]*A->lens[o];
      }
      lda_B[0] = 1;
      for (int o=1; o<B->order; o++){
        lda_B[o] = lda_B[o-1]*B->lens[o];
      }
      PairIterator pi(A->sr, mapped_data);
#ifdef USE_OMP
      #pragma omp parallel for
#endif
      for (int64_t i=0; i<num_pair; i++){
        int64_t k = pi[i].k();
        int64_t k_new = 0;
        for (int o=0; o<A->order; o++){
          int64_t kpart = (k/lda_A[o])%A->lens[o];
          //FIXME: slow, but handles diagonal indexing, probably worth having separate versions
          for (int q=0; q<B->order; q++){
            if (idx_A[o] == idx_B[q]){
              k_new += kpart*lda_B[q];
            }
          }
        }
        ((int64_t*)(pi[i].ptr))[0] = k_new;
      }

      // when idx_A has indices that idx_B does not, we need to reduce; this can be done partially here since the elements of A should be sorted
      bool is_reduce = false;
      for (int oA=0; oA<A->order; oA++){
        bool inB = false;
        for (int oB=0; oB<B->order; oB++){
          if (idx_A[oA] == idx_B[oB]){
            inB = true;
          }
        }
        if (!inB) is_reduce = true;
      }
  
      if (is_reduce && num_pair > 0){
        pi.sort(num_pair);
        int64_t nuniq=1;
        for (int64_t i=1; i<num_pair; i++){
          if (pi[i].k() != pi[i-1].k()) nuniq++;
        }
        if (nuniq != num_pair){
          char * swap_data = mapped_data;
          alloc_ptr(A->sr->pair_size()*nuniq, (void**)&mapped_data);
          PairIterator pi_new(A->sr, mapped_data);
          int64_t cp_st = 0;
          int64_t acc_st = -1;
          int64_t pfx = 0;
          for (int64_t i=1; i<num_pair; i++){
            if (pi[i].k() == pi[i-1].k()){
              if (cp_st < i){ 
                memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(i-cp_st));
                pfx += i-cp_st;
              }
              cp_st = i+1;

              if (acc_st == -1) acc_st = i;
            } else {
              if (acc_st != -1){
                for (int64_t j=acc_st; j<i; j++){
                  A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
                }
              }
              acc_st = -1;
            }           
          }
          if (cp_st < num_pair)
            memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(num_pair-cp_st));
          if (acc_st != -1){
            for (int64_t j=acc_st; j<num_pair; j++){
              A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
            }
          }
          cdealloc(swap_data);
          num_pair = nuniq;
        }
      }

      // if applying custom function, apply immediately on reduced form
      if (is_custom && !func->is_accumulator()){
        char * swap_data = mapped_data;
        alloc_ptr(B->sr->pair_size()*num_pair, (void**)&mapped_data);
        PairIterator pi_new(B->sr, mapped_data);
#ifdef USE_OMP
        #pragma omp parallel for
#endif
        for (int64_t i=0; i<num_pair; i++){
          if (alpha == NULL)
            func->apply_f(pi[i].d(), pi_new[i].d());
          else  {
            char tmp_A[A->sr->el_size];
            A->sr->mul(pi[i].d(), alpha, tmp_A);
            func->apply_f(tmp_A, pi_new[i].d());
          }
        }
        cdealloc(swap_data);
        alpha = NULL;
      }
  
      // when idx_B has indices that idx_A does not, we need to map, which we do by replicating the key-value pairs over those indices
      // FIXME this is probably not the most efficient approach, but not entirely stupid, as the set of replicated pairs is not expected to be bigger than B
      int nmap_idx = 0;
      int64_t map_idx_len[B->order];
      int64_t map_idx_lda[B->order];
      int map_idx_rev[B->order];
      for (int oB=0; oB<B->order; oB++){
        bool inA = false;
        for (int oA=0; oA<A->order; oA++){
          if (idx_A[oA] == idx_B[oB]){
            inA = true;
          }
        }
        if (!inA){ 
          bool is_rep=false;
          for (int ooB=0; ooB<oB; ooB++){
            if (idx_B[ooB] == idx_B[oB]){
              is_rep = true;
              map_idx_lda[map_idx_rev[ooB]] += lda_B[oB];
              break;
            }
          }
          if (!is_rep){
            map_idx_len[nmap_idx] = B->lens[oB];
            map_idx_lda[nmap_idx] = lda_B[oB];
            map_idx_rev[nmap_idx] = oB;
            nmap_idx++;
          }
        }
      }
      if (nmap_idx > 0){
        int64_t tot_rep=1;
        for (int midx=0; midx<nmap_idx; midx++){
          tot_rep *= map_idx_len[midx];
        }
        char * swap_data = mapped_data;
        alloc_ptr(A->sr->pair_size()*num_pair*tot_rep, (void**)&mapped_data);
        PairIterator pi_new(A->sr, mapped_data);
#ifdef USE_OMP
        #pragma omp parallel for
#endif
        for (int64_t i=0; i<num_pair; i++){
          for (int64_t r=0; r<tot_rep; r++){
            memcpy(pi_new[i*tot_rep+r].ptr, pi[i].ptr, A->sr->pair_size());
          }
        }
#ifdef USE_OMP
        #pragma omp parallel for
#endif
        for (int64_t i=0; i<num_pair; i++){
          int64_t phase=1;
          for (int midx=0; midx<nmap_idx; midx++){
            int64_t stride=phase;
            phase *= map_idx_len[midx];
            for (int64_t r=0; r<tot_rep/phase; r++){
              for (int64_t m=1; m<map_idx_len[midx]; m++){
                for (int64_t s=0; s<stride; s++){
                  ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
                }
              }
            }
          }
        }
        cdealloc(swap_data);
        num_pair *= tot_rep;
      }
    }
    
    B->write(num_pair, alpha, beta, mapped_data, 'w');
    cdealloc(mapped_data);

  }

}