Got the normal, ROR, isend/irecv, put, and foMPI put redistribution paths all working and integrated into the unified code

4d6d1d6b · Edgar Solomonik · 69aee1c7 · 4d6d1d6b · 4d6d1d6b · 4d6d1d6b
Commit 4d6d1d6b authored 10 years ago by Edgar Solomonik
7 changed files
--- a/src/contraction/Makefile
+++ b/src/contraction/Makefile
@@ -3,7 +3,7 @@ include ../../config.mk
 OBJS = contraction.o sym_seq_ctr.o ctr_offload.o ctr_comm.o ctr_tsr.o ctr_2d_general.o

 #%d | r ! grep -ho "\.\..*\.h" *.cxx *.h | sort | uniq
-HDRS = ../../Makefile ../../config.mk  ../interface/functions.h ../mapping/distribution.h ../mapping/mapping.h ../redistribution/nosym_transp.h ../redistribution/redist.h ../scaling/strp_tsr.h ../shared/iter_tsr.h ../shared/memcontrol.h ../shared/offload.h ../shared/util.h ../symmetry/sym_indices.h ../symmetry/symmetrization.h ../tensor/algstrct.h ../tensor/untyped_tensor.h 
+HDRS = ../../Makefile ../../config.mk  ../interface/functions.h ../mapping/distribution.h ../mapping/mapping.h ../redistribution/nosym_transp.h ../redistribution/redist.h ../scaling/strp_tsr.h ../shared/iter_tsr.h ../shared/memcontrol.h ../shared/offload.h ../shared/util.h ../symmetry/sym_indices.h ../symmetry/symmetrization.h ../tensor/algstrct.h ../tensor/untyped_tensor.h

 ctf: $(OBJS) 


--- a/src/interface/common.h
+++ b/src/interface/common.h
@@ -13,6 +13,8 @@
 #include <unistd.h>
 #include <iostream>
 #include <limits.h>
+
+#include "../shared/fompi_wrapper.h"
 #include "mpi.h"

 /**

--- a/src/redistribution/phase_reshuffle.cxx
+++ b/src/redistribution/phase_reshuffle.cxx
@@ -6,7 +6,7 @@
 #include "nosym_transp.h"

 #define MTAG 777
-#define ROR
+//#define ROR
 #ifdef ROR
  //#define IREDIST
  //#define REDIST_PUT
@@ -149,42 +149,39 @@ namespace CTF_int {

  template <int idim>
  void calc_cnt_from_rep_cnt(int const *     rep_phase,
-                             int const *     rep_phase_lda,
-                             int const *     rank,
-                             int const *     new_pe_lda,
-                             int const *     old_phys_phase,
-                             int const *     new_phys_phase,
+                             int * const *   pe_offset, // pe_offset[d][i]: amount added to pe_off for index i of dimension d
+                             int * const *   bucket_offset, // bucket_offset[d][i]: amount added to bucket_off for index i of dimension d
                             int64_t const * old_counts,
                             int64_t *       counts,
-                             int             coff,
-                             int             roff,
+                             int             bucket_off, // bucket offset accumulated over the dimensions above idim
+                             int             pe_off, // processor offset accumulated over the dimensions above idim
                             int             dir){
    for (int i=0; i<rep_phase[idim]; i++){
-      calc_cnt_from_rep_cnt<idim-1>(rep_phase, rep_phase_lda, rank, new_pe_lda, old_phys_phase, new_phys_phase, old_counts, counts,
-                                    coff+new_pe_lda[idim]*((rank[idim]+i*old_phys_phase[idim])%new_phys_phase[idim]),
-                                    roff+rep_phase_lda[idim]*i, dir);
+      int rec_bucket_off = bucket_off+bucket_offset[idim][i]; // fold this dimension's table entry into the running bucket offset
+      int rec_pe_off = pe_off+pe_offset[idim][i]; // fold this dimension's table entry into the running processor offset
+      calc_cnt_from_rep_cnt<idim-1>(rep_phase, pe_offset, bucket_offset, old_counts, counts, rec_bucket_off, rec_pe_off, dir); // recurse into the next-lower dimension; dir is passed through unchanged
+//                                    coff+new_pe_lda[idim]*((rank[idim]+i*old_phys_phase[idim])%new_phys_phase[idim]),
+  //                                  roff+rep_phase_lda[idim]*i, dir);
    }
  }

  template <>
-  void calc_cnt_from_rep_cnt<0>(int const *     rep_phase,
-                                int const *     rep_phase_lda,
-                                int const *     rank,
-                                int const *     new_pe_lda,
-                                int const *     old_phys_phase,
-                                int const *     new_phys_phase,
-                                int64_t const * old_counts,
-                                int64_t *       counts,
-                                int             coff,
-                                int             roff,
-                                int             dir){
+  void calc_cnt_from_rep_cnt<0>
+                            (int const *     rep_phase,
+                             int * const *   pe_offset,
+                             int * const *   bucket_offset,
+                             int64_t const * old_counts,
+                             int64_t *       counts,
+                             int             bucket_off,
+                             int             pe_off,
+                             int             dir){
    if (dir){
      for (int i=0; i<rep_phase[0]; i++){
-        counts[coff+new_pe_lda[0]*((rank[0]+i*old_phys_phase[0])%new_phys_phase[0])] = old_counts[roff + i];
+        counts[pe_off+pe_offset[0][i]] = old_counts[bucket_off+i];
      }
    } else {
      for (int i=0; i<rep_phase[0]; i++){
-        counts[roff + i] = old_counts[coff+new_pe_lda[0]*((rank[0]+i*old_phys_phase[0])%new_phys_phase[0])];
+        counts[bucket_off+i] = old_counts[pe_off+pe_offset[0][i]];
      }

    }
@@ -209,6 +206,7 @@ namespace CTF_int {
      new_loc_edge_len      = (int*)alloc(order*sizeof(int));
      int nrep = 1;
      for (int i=0; i<order; i++){
+        //FIXME: computed elsewhere already
        rep_phase_lda[i]  = nrep;
        sphase[i]         = lcm(old_dist.phys_phase[i],new_dist.phys_phase[i]);
        rep_phase[i]      = sphase[i] / old_dist.phys_phase[i];
@@ -436,9 +434,9 @@ namespace CTF_int {
                 MPI_Comm         cm,
                 char *           buffer,
                 algstrct const * sr,
-                 int              bucket_off=0,
-                 int              pe_off=0,
-                 int              dir=0){
+                 int              bucket_off,
+                 int              pe_off,
+                 int              dir){
    for (int r=0; r<rep_phase[idim]; r++){
      int rec_bucket_off = bucket_off+bucket_offset[idim][r];
      int rec_pe_off = pe_off+pe_offset[idim][r];
@@ -461,7 +459,7 @@ namespace CTF_int {
                 int              pe_off,
                 int              dir){
    for (int r=0; r<rep_phase[0]; r++){
-      int bucket = bucket_off+bucket_offset[0][r];
+      int bucket = bucket_off+r;
      int pe = pe_off+pe_offset[0][r];
      if (dir)
        MPI_Irecv(buffer+displs[bucket]*sr->el_size, counts[bucket], sr->mdtype(), pe, MTAG, cm, reqs+bucket);
@@ -539,7 +537,7 @@ namespace CTF_int {
    for (int r=0; r<rep_phase[idim]; r++){
      int rec_bucket_off = bucket_off+bucket_offset[idim][r];
      int rec_pe_off = pe_off+pe_offset[idim][r];
-      put_buckets<idim-1>(rep_phase, bucket_offset, buckets, counts, sr, put_displs, win, rec_bucket_off, rec_pe_off);
+      put_buckets<idim-1>(rep_phase, pe_offset, bucket_offset, buckets, counts, sr, put_displs, win, rec_bucket_off, rec_pe_off);
    }
  }

@@ -802,15 +800,19 @@ namespace CTF_int {

    int nold_rep = 1;
    int * old_rep_phase; alloc_ptr(sizeof(int)*order, (void**)&old_rep_phase);
+    int * old_rep_phase_lda; alloc_ptr(sizeof(int)*order, (void**)&old_rep_phase_lda);
    for (int i=0; i<order; i++){
      old_rep_phase[i] = lcm(old_dist.phys_phase[i], new_dist.phys_phase[i])/old_dist.phys_phase[i];
+      old_rep_phase_lda[i] = nold_rep;
      nold_rep *= old_rep_phase[i];
    }

    int nnew_rep = 1;
    int * new_rep_phase; alloc_ptr(sizeof(int)*order, (void**)&new_rep_phase);
+    int * new_rep_phase_lda; alloc_ptr(sizeof(int)*order, (void**)&new_rep_phase_lda);
    for (int i=0; i<order; i++){
      new_rep_phase[i] = lcm(new_dist.phys_phase[i], old_dist.phys_phase[i])/new_dist.phys_phase[i];
+      new_rep_phase_lda[i] = nnew_rep;
      nnew_rep *= new_rep_phase[i];
    }
    
@@ -848,10 +850,9 @@ namespace CTF_int {

    precompute_offsets(old_dist, new_dist, sym, edge_len, old_rep_phase, old_phys_edge_len, old_virt_edge_len, old_dist.virt_phase, old_virt_lda, old_virt_nelem, send_pe_offset, send_bucket_offset, send_data_offset);

-
 #ifdef IREDIST
    if (new_idx_lyr == 0)
-      SWITCH_ORD_CALL(isendrecv, order-1, recv_pe_offset, recv_bucket_offset, new_rep_phase, recv_counts, recv_displs, recv_reqs, ord_glb_comm.cm, recv_buffer, sr);
+      SWITCH_ORD_CALL(isendrecv, order-1, recv_pe_offset, recv_bucket_offset, new_rep_phase, recv_counts, recv_displs, recv_reqs, ord_glb_comm.cm, recv_buffer, sr, 0, 0, 1);
 #endif
 #ifndef IREDIST
 #ifndef REDIST_PUT
@@ -861,16 +862,15 @@ namespace CTF_int {
      send_displs[i] = send_displs[i-1] + send_counts[i-1];
    }
 #else
-
-    int64_t * all_get_displs = (int64_t*)alloc(sizeof(int64_t)*ord_glb_comm.np);
-    SWITCH_ORD_CALL(calc_cnt_from_rep_cnt, order-1, rep_phase, rep_phase_lda, old_dist.perank, new_dist.pe_lda, old_dist.phys_phase, new_dist.phys_phase, counts, all_get_displs);
+    int64_t * all_recv_displs = (int64_t*)alloc(sizeof(int64_t)*ord_glb_comm.np);
+    SWITCH_ORD_CALL(calc_cnt_from_rep_cnt, order-1, new_rep_phase, recv_pe_offset, recv_bucket_offset, recv_displs, all_recv_displs, 0, 0, 1);

    int64_t * all_put_displs = (int64_t*)alloc(sizeof(int64_t)*ord_glb_comm.np);
-    MPI_Alltoall(all_get_displs, 1, MPI_INT64_T, all_put_displs, 1, MPI_INT64_T, ord_glb_comm.cm);
-    cfree(all_get_displs);
+    MPI_Alltoall(all_recv_displs, 1, MPI_INT64_T, all_put_displs, 1, MPI_INT64_T, ord_glb_comm.cm);
+    cfree(all_recv_displs);

    int64_t * put_displs = (int64_t*)alloc(sizeof(int64_t)*nold_rep);
-    SWITCH_ORD_CALL(calc_cnt_from_rep_cnt, order-1, rep_phase, rep_phase_lda, new_dist.perank, old_dist.pe_lda, new_dist.phys_phase, old_dist.phys_phase, all_put_displs, put_displs);
+    SWITCH_ORD_CALL(calc_cnt_from_rep_cnt, order-1, old_rep_phase, send_pe_offset, send_bucket_offset, all_put_displs, put_displs, 0, 0, 0);

    cfree(all_put_displs);

@@ -1001,6 +1001,9 @@ namespace CTF_int {
                      new_rep_phase, new_rep_idx, new_dist.virt_phase[0],
 #ifdef IREDIST
                      recv_reqs, ord_glb_comm.cm,
+#endif
+#ifdef  REDIST_PUT
+                      NULL, win,
 #endif
                      0, aux_buf, buckets, recv_counts, sr);
      cfree(new_rep_idx);

--- a/src/redistribution/phase_reshuffle.h
+++ b/src/redistribution/phase_reshuffle.h
@@ -51,19 +51,6 @@ namespace CTF_int {
                     int const * edge_len,
                     int const * loc_edge_len);

-  template <int idim>
-  void calc_cnt_from_rep_cnt(int const *     rep_phase,
-                             int const *     rep_phase_lda,
-                             int const *     rank,
-                             int const *     new_pe_lda,
-                             int const *     old_phys_phase,
-                             int const *     new_phys_phase,
-                             int64_t const * old_counts,
-                             int64_t *       counts,
-                             int             coff=0,
-                             int             roff=0,
-                             int             dir=1);
-
  void calc_drv_displs(int const *          sym,
                       int const *          edge_len,
                       int const *          loc_edge_len,

--- a/src/shared/fompi_wrapper.h
+++ b/src/shared/fompi_wrapper.h
+#ifndef __FOMPI_WRAPPER__ // include guard; NOTE(review): identifiers containing a double underscore are reserved in C++
+#define __FOMPI_WRAPPER__
+
+#ifdef USE_FOMPI // only when USE_FOMPI is defined: pull in fompi.h and redirect the MPI names below to their foMPI_ counterparts
+#include "fompi.h"
+
+#define MPI_Init(...) foMPI_Init(__VA_ARGS__)
+#define MPI_Win(...) foMPI_Win(__VA_ARGS__) // NOTE(review): function-like macro, so it expands only where MPI_Win is followed by '('; a use as a type name (e.g. `MPI_Win win;`) is left untouched -- confirm an object-like `#define MPI_Win foMPI_Win` was not intended
+#define MPI_Win_create(...) foMPI_Win_create(__VA_ARGS__)
+#define MPI_Win_fence(...) foMPI_Win_fence(__VA_ARGS__)
+#define MPI_Win_free(...) foMPI_Win_free(__VA_ARGS__)
+#define MPI_Put(...) foMPI_Put(__VA_ARGS__)
+
+#endif // USE_FOMPI
+
+#endif // __FOMPI_WRAPPER__
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -28,7 +28,7 @@ namespace CTF_int {
  /* Force redistributions always by setting to 1 */
  #define REDIST 0
  //#define VERIFY 0
-  #define VERIFY_REMAP 0
+  #define VERIFY_REMAP 1
  #define FOLD_TSR 1
  #define PERFORM_DESYM 1
  #define ALLOW_NVIRT 1024

--- a/test/test_suite.cxx
+++ b/test/test_suite.cxx
@@ -41,8 +41,9 @@ int main(int argc, char ** argv){
  int in_num = argc;
  char ** input_str = argv;

-  int nt;
-  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &nt);
+  //int nt;
+  //MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &nt);
+  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);