Allowed more nvirt (ALLOW_NVIRT raised from 8 to 32), added VERBOSITY flag (VERBOSE-gated VPRINTF macro) for better info reporting

d68f4f9d · solomon · 9cd03642 · d68f4f9d · d68f4f9d · d68f4f9d
Commit d68f4f9d authored 11 years ago by solomon
10 changed files
--- a/examples/ccsd.cxx
+++ b/examples/ccsd.cxx
@@ -61,20 +61,20 @@ class Integrals {
    aa = CTF_Vector(nv,dw_);
    ii = CTF_Vector(no,dw_);
    
-    ab = CTF_Matrix(nv,nv,AS,dw_,"V",1);
-    ai = CTF_Matrix(nv,no,NS,dw_,"V",1);
-    ia = CTF_Matrix(no,nv,NS,dw_,"V",1);
-    ij = CTF_Matrix(no,no,AS,dw_,"V",1);
-
-    abcd = CTF_Tensor(4,vvvv,shapeASAS,dw_,"V",1);
-    abci = CTF_Tensor(4,vvvo,shapeASNS,dw_,"V",1);
-    aibc = CTF_Tensor(4,vovv,shapeNSAS,dw_,"V",1);
-    aibj = CTF_Tensor(4,vovo,shapeNSNS,dw_,"V",1);
-    abij = CTF_Tensor(4,vvoo,shapeASAS,dw_,"V",1);
-    ijab = CTF_Tensor(4,oovv,shapeASAS,dw_,"V",1);
-    aijk = CTF_Tensor(4,vooo,shapeNSAS,dw_,"V",1);
-    ijak = CTF_Tensor(4,oovo,shapeASNS,dw_,"V",1);
-    ijkl = CTF_Tensor(4,oooo,shapeASAS,dw_,"V",1);
+    ab = CTF_Matrix(nv,nv,AS,dw_,"Vab",1);
+    ai = CTF_Matrix(nv,no,NS,dw_,"Vai",1);
+    ia = CTF_Matrix(no,nv,NS,dw_,"Via",1);
+    ij = CTF_Matrix(no,no,AS,dw_,"Vij",1);
+
+    abcd = CTF_Tensor(4,vvvv,shapeASAS,dw_,"Vabcd",1);
+    abci = CTF_Tensor(4,vvvo,shapeASNS,dw_,"Vabci",1);
+    aibc = CTF_Tensor(4,vovv,shapeNSAS,dw_,"Vaibc",1);
+    aibj = CTF_Tensor(4,vovo,shapeNSNS,dw_,"Vaibj",1);
+    abij = CTF_Tensor(4,vvoo,shapeASAS,dw_,"Vabij",1);
+    ijab = CTF_Tensor(4,oovv,shapeASAS,dw_,"Vijab",1);
+    aijk = CTF_Tensor(4,vooo,shapeNSAS,dw_,"Vaijk",1);
+    ijak = CTF_Tensor(4,oovo,shapeASNS,dw_,"Vijak",1);
+    ijkl = CTF_Tensor(4,oooo,shapeASAS,dw_,"Vijkl",1);
  }

  void fill_rand(){
@@ -148,9 +148,9 @@ class Amplitudes {
    int shapeASAS[] = {AS,NS,AS,NS};
    int vvoo[]      = {nv,nv,no,no};

-    ai = CTF_Matrix(nv,no,NS,dw_,"T",1);
+    ai = CTF_Matrix(nv,no,NS,dw_,"Tai",1);

-    abij = CTF_Tensor(4,vvoo,shapeASAS,dw_,"T",1);
+    abij = CTF_Tensor(4,vvoo,shapeASAS,dw_,"Tabij",1);
  }

  tCTF_Idx_Tensor<double> operator[](char const * idx_map_){

--- a/src/dist_tensor/cyclopstf.cxx
+++ b/src/dist_tensor/cyclopstf.cxx
@@ -111,7 +111,7 @@ int tCTF<dtype>::init(MPI_Comm const  global_context,

 #ifdef USE_OMP
  if (rank == 0)
-    DPRINTF(1,"CTF running with %d threads\n",omp_get_max_threads());
+    VPRINTF(1,"Running with %d threads\n",omp_get_max_threads());
 #endif
  
  mst_size = getenv("CTF_MST_SIZE");
@@ -119,12 +119,11 @@ int tCTF<dtype>::init(MPI_Comm const  global_context,
  if (mst_size == NULL && stack_size == NULL){
 #ifdef USE_MST
    if (rank == 0)
-      DPRINTF(1,"Creating CTF stack of size "PRId64"\n",1000*(long_int)1E6);
+      VPRINTF(1,"Creating stack of size "PRId64"\n",1000*(long_int)1E6);
    CTF_mst_create(1000*(long_int)1E6);
 #else
    if (rank == 0){
-      DPRINTF(1,"Running CTF without stack, define CTF_STACK_SIZE ");
-      DPRINTF(1,"environment variable to activate stack\n");
+      VPRINTF(1,"Running without stack, define CTF_STACK_SIZE environment variable to activate stack\n");
    }
 #endif
  } else {
@@ -134,7 +133,7 @@ int tCTF<dtype>::init(MPI_Comm const  global_context,
    if (stack_size != NULL)
      imst_size = MAX(imst_size,strtoull(stack_size,NULL,0));
    if (rank == 0)
-      DPRINTF(1,"Creating CTF stack of size "PRIu64" due to CTF_STACK_SIZE enviroment variable\n",
+      VPRINTF(1,"Creating stack of size "PRIu64" due to CTF_STACK_SIZE enviroment variable\n",
                imst_size);
    CTF_mst_create(imst_size);
  }
@@ -142,14 +141,14 @@ int tCTF<dtype>::init(MPI_Comm const  global_context,
  if (mem_size != NULL){
    uint64_t imem_size = strtoull(mem_size,NULL,0);
    if (rank == 0)
-      DPRINTF(1,"CTF memory size set to "PRIu64" by CTF_MEMORY_SIZE environment variable\n",
+      VPRINTF(1,"Memory size set to "PRIu64" by CTF_MEMORY_SIZE environment variable\n",
                imem_size);
    CTF_set_mem_size(imem_size);
  }
  ppn = getenv("CTF_PPN");
  if (ppn != NULL){
    if (rank == 0)
-      DPRINTF(1,"CTF assuming %d processes per node due to CTF_PPN environment variable\n",
+      VPRINTF(1,"Assuming %d processes per node due to CTF_PPN environment variable\n",
                atoi(ppn));
    LIBT_ASSERT(atoi(ppn)>=1);
    CTF_set_memcap(.75/atof(ppn));
@@ -554,14 +553,25 @@ int tCTF<dtype>::contract(CTF_ctr_type_t const *    type,
        sprintf(cname+strlen(cname),"%d",type->idx_map_B[i]);
    }
    sprintf(cname+strlen(cname),"]");
-    
+
+    double dtt;
+    if (dt->get_global_comm()->rank == 0){
+      dtt = MPI_Wtime();
+      VPRINTF(1,"Starting %s\n",cname);
+    }
   
    CTF_Timer tctr(cname);
    tctr.start(); 
    ret = dt->home_contract(type, func_ptr, felm, alpha, beta, map_inner);
    tctr.stop();
+    if (dt->get_global_comm()->rank == 0){
+      VPRINTF(1,"Ended %s in %lf seconds\n",cname,MPI_Wtime()-dtt);   }
  } else 
    ret = dt->home_contract(type, func_ptr, felm, alpha, beta, map_inner);
+  if ((*dt->get_tensors())[type->tid_A]->profile &&
+      (*dt->get_tensors())[type->tid_B]->profile &&
+      (*dt->get_tensors())[type->tid_C]->profile){
+  }
 #if DEBUG >= 1
  if (dt->get_global_comm()->rank == 0)
    printf("End head contraction :\n");

--- a/src/dist_tensor/cyclopstf.hpp
+++ b/src/dist_tensor/cyclopstf.hpp
@@ -47,6 +47,7 @@ enum CTF_OP { CTF_OP_SUM, CTF_OP_SUMABS,
 typedef int64_t long_int;
 typedef long_int key;

+static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};

 template<typename dtype>
 struct tkv_pair {
@@ -80,7 +81,7 @@ inline bool comp_tkv_pair(tkv_pair<dtype> i,tkv_pair<dtype> j) {
 #define INNER_MAP 0
 #define FOLD_TSR 1
 #define PERFORM_DESYM 1
-#define ALLOW_NVIRT 8
+#define ALLOW_NVIRT 32
 #define DIAG_RESCALE
 #define USE_SYM_SUM 
 #define HOME_CONTRACT

--- a/src/dist_tensor/dist_tensor_fold.cxx
+++ b/src/dist_tensor/dist_tensor_fold.cxx
@@ -1573,6 +1573,8 @@ void dist_tensor<dtype>::desymmetrize(int const sym_tid,
    strcpy(spf,"desymmetrize_");
    strcat(spf,tsr_sym->name);
    CTF_Timer t_pf(spf);
+    if (global_comm->rank == 0) 
+      VPRINTF(1,"Desymmetrizing %s\n", tsr_sym->name);
    t_pf.start();
  }

@@ -1716,6 +1718,16 @@ void dist_tensor<dtype>::symmetrize(int const sym_tid, int const nonsym_tid){
  
  tsr_sym = tensors[sym_tid];
  tsr_nonsym = tensors[nonsym_tid];
+  
+  if (tsr_sym->profile) {
+    char spf[80];
+    strcpy(spf,"symmetrize_");
+    strcat(spf,tsr_nonsym->name);
+    CTF_Timer t_pf(spf);
+    if (global_comm->rank == 0) 
+      VPRINTF(1,"Symmetrizing %s\n", tsr_nonsym->name);
+    t_pf.start();
+  }

  sym_dim = -1;
  is = -1;
@@ -1828,6 +1840,14 @@ idx_map_B, fss, fselm);
  CTF_free(idx_map_A);
  CTF_free(idx_map_B);

+  if (tsr_sym->profile) {
+    char spf[80];
+    strcpy(spf,"symmetrize_");
+    strcat(spf,tsr_sym->name);
+    CTF_Timer t_pf(spf);
+    t_pf.stop();
+  }
+

  TAU_FSTOP(symmetrize);
 }

--- a/src/dist_tensor/dist_tensor_internal.cxx
+++ b/src/dist_tensor/dist_tensor_internal.cxx
@@ -125,17 +125,13 @@ int dist_tensor<dtype>::initialize(CommData_t * cdt_global,
 /* FIXME: Sorting will fuck up dimensional ordering */
 //  std::sort(srt_dim_len, srt_dim_len + ndim);

-#if DEBUG >= 1
  if (cdt_global->rank == 0)
-    printf("Setting up initial torus topology:\n");
-#endif
+    VPRINTF(1,"Setting up initial torus physical topology P:\n");
  stride = 1, cut = 0;
  for (i=0; i<ndim; i++){
    LIBT_ASSERT(dim_len[i] != 1);
-#if DEBUG >= 1
    if (cdt_global->rank == 0)
-      printf("dim[%d] = %d:\n",i,srt_dim_len[i]);
-#endif
+      VPRINTF(1,"P[%d] = %d\n",i,srt_dim_len[i]);

    phys_comm[i] = (CommData_t*)CTF_alloc(sizeof(CommData_t));
    SETUP_SUB_COMM(cdt_global, phys_comm[i],
@@ -181,6 +177,10 @@ void dist_tensor<dtype>::set_phys_comm(CommData_t ** cdt, int const ndim){
  lda = 1;
  /* Figure out the lda of each dimension communicator */
  for (i=0; i<ndim; i++){
+#if DEBUG >= 1
+    if (global_comm->rank == 0)
+      printf("Added topo %d dim[%d] = %d:\n",(int)topovec.size(),i,cdt[i]->np);
+#endif
    LIBT_ASSERT(cdt[i]->np != 1);
    new_topo.lda[i] = lda;
    lda = lda*cdt[i]->np;
@@ -1970,54 +1970,12 @@ int dist_tensor<dtype>::print_map(FILE *    stream,
  mapping * map;
  tsr = tensors[tid];

-
-  if (all)
-    COMM_BARRIER(global_comm);
-  if (/*tsr->is_mapped &&*/ (!all || global_comm->rank == 0)){
-    printf("Tensor %d of dimension %d is mapped to a ", tid, tsr->ndim);
-    if (is_inner){
-      for (i=0; i<inner_topovec[tsr->itopo].ndim-1; i++){
-              printf("%d-by-", inner_topovec[tsr->itopo].dim_comm[i]->np);
-      }
-      if (inner_topovec[tsr->itopo].ndim > 0)
-              printf("%d inner topology.\n", inner_topovec[tsr->itopo].dim_comm[i]->np);
-    } else {
-      for (i=0; i<topovec[tsr->itopo].ndim-1; i++){
-              printf("%d-by-", topovec[tsr->itopo].dim_comm[i]->np);
-      }
-      if (topovec[tsr->itopo].ndim > 0)
-              printf("%d topology.\n", topovec[tsr->itopo].dim_comm[i]->np);
-    }
-    for (i=0; i<tsr->ndim; i++){
-      switch (tsr->edge_map[i].type){
-        case NOT_MAPPED:
-          printf("Dimension %d of length %d and symmetry %d is not mapped\n",i,tsr->edge_len[i],tsr->sym[i]);
-          break;
-
-        case PHYSICAL_MAP:
-          printf("Dimension %d of length %d and symmetry %d is mapped to physical dimension %d with phase %d\n",
-            i,tsr->edge_len[i],tsr->sym[i],tsr->edge_map[i].cdt,tsr->edge_map[i].np);
-          map = &tsr->edge_map[i];
-          while (map->has_child){
-            map = map->child;
-            if (map->type == VIRTUAL_MAP)
-              printf("\tDimension %d also has a virtualized child of phase %d\n", i, map->np);
-            else
-              printf("\tDimension %d also has a physical child mapped to physical dimension %d with phase %d\n",
-                      i, map->cdt, map->np);
-          }
-          break;
-
-        case VIRTUAL_MAP:
-          printf("Dimension %d of length %d and symmetry %d is mapped virtually with phase %d\n",
-            i,tsr->edge_len[i],tsr->sym[i],tsr->edge_map[i].np);
-          break;
-      }
-    }
+  if (!all || global_comm->rank == 0){
+    tsr->print_map(stdout);
  }
-  if (all)
-    COMM_BARRIER(global_comm);
+
  return DIST_TENSOR_SUCCESS;
+
 }

 /**
@@ -2383,7 +2341,7 @@ void dist_tensor<dtype>::contract_mst(){



-
+#include "tensor_object.cxx"
 #include "dist_tensor_map.cxx"
 #include "dist_tensor_op.cxx"
 #include "dist_tensor_inner.cxx"

--- a/src/dist_tensor/dist_tensor_internal.h
+++ b/src/dist_tensor/dist_tensor_internal.h
@@ -52,7 +52,8 @@ struct topology {


 template<typename dtype>
-struct tensor {
+class tensor {
+  public:
  int ndim;
  int * edge_len;
  int is_padded;
@@ -85,6 +86,8 @@ struct tensor {
  int has_home;
  char const * name;
  int profile;
+
+  void print_map(FILE * stream) const;
 };



--- a/src/dist_tensor/dt_aux_map.hxx
+++ b/src/dist_tensor/dt_aux_map.hxx
@@ -651,13 +651,6 @@ int remap_tensor(int const  tid,
  dtype * shuffled_data_corr;
 #endif

-  if (tsr->profile) {
-    char spf[80];
-    strcpy(spf,"redistribute_");
-    strcat(spf,tsr->name);
-    CTF_Timer t_pf(spf);
-    t_pf.start();
-  }

  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_phase);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_rank);
@@ -695,11 +688,20 @@ int remap_tensor(int const  tid,
    tsr->is_home = 0;
  }
 #endif
-#if DEBUG >= 1
-  if (global_comm->rank == 0){
-    printf("Remapping tensor %d with virtualization factor of %d\n",tid,new_nvirt);
-  }
+  if (tsr->profile) {
+    char spf[80];
+    strcpy(spf,"redistribute_");
+    strcat(spf,tsr->name);
+    if (global_comm->rank == 0){
+      if (can_block_shuffle) VPRINTF(1,"Remapping tensor %s via block_reshuffle\n",tsr->name);
+      else VPRINTF(1,"Remapping tensor %s via cyclic_reshuffle\n",tsr->name);
+#if VERBOSE >=1
+      tsr->print_map(stdout);
 #endif
+    }
+    CTF_Timer t_pf(spf);
+    t_pf.start();
+  }

 #if VERIFY_REMAP
    padded_reshuffle(tid,
@@ -726,9 +728,6 @@ int remap_tensor(int const  tid,
 #endif

  if (can_block_shuffle){
-    if (global_comm->rank == 0) {
-      DPRINTF(1,"remapping tensor %d via block_reshuffle\n", tid);
-    }
    block_reshuffle( tsr->ndim,
                     old_phase,
                     old_size,
@@ -743,10 +742,6 @@ int remap_tensor(int const  tid,
                     shuffled_data,
                     global_comm);
  } else {
-    if (global_comm->rank == 0) {
-      DEBUG_PRINTF("remapping with cyclic reshuffle (was padded = %d)\n",
-        tsr->is_padded);
-    }
 //    CTF_alloc_ptr(sizeof(dtype)*tsr->size, (void**)&shuffled_data);
    cyclic_reshuffle(tsr->ndim,
                     old_size,

--- a/src/dist_tensor/dt_aux_topo.hxx
+++ b/src/dist_tensor/dt_aux_topo.hxx
@@ -62,6 +62,8 @@ void fold_torus(topology *              topo,
        /* Reorder the lda, bring j lda to lower lda and adjust other ldas */
        color = glb_comm->rank - topo->dim_comm[i]->rank*topo->lda[i]
                               - topo->dim_comm[j]->rank*topo->lda[j];
+        if (j<ndim-1)
+          color = (color%topo->lda[i])+(color/topo->lda[j+1]);
      }
      np = topo->dim_comm[i]->np*topo->dim_comm[j]->np;


--- a/src/shared/timer.cxx
+++ b/src/shared/timer.cxx
@@ -10,7 +10,7 @@
 #include "timer.h"
 #include "util.h"

-#define MAX_NAME_LENGTH 43
+#define MAX_NAME_LENGTH 53

 int main_argc = 0;
 const char * const * main_argv;

--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -154,6 +154,20 @@ do { printf("error:%s:%d ",__FILE__,__LINE__); printf(__VA_ARGS__); printf("\n")
 do { printf("warning: "); printf(__VA_ARGS__); printf("\n"); } while(0)
 #endif

+#if defined(VERBOSE)
+  #ifndef VPRINTF
+  #define VPRINTF(i,...) \
+    do { if (i<=VERBOSE) { \
+      printf("CTF: "__VA_ARGS__); } \
+    } while (0)
+  #endif
+#else
+  #ifndef VPRINTF
+  #define VPRINTF(...) do { } while (0)
+  #endif
+#endif
+
+
 #ifdef DEBUG
  #ifndef DPRINTF
  #define DPRINTF(i,...) \
@@ -191,16 +205,6 @@ do { printf("warning: "); printf(__VA_ARGS__); printf("\n"); } while(0)
  #endif
 #endif

-#ifdef VERBOSE
-  #ifndef VERBOSE_PRINTF
-  #define VERBOSE_PRINTF(...) \
-    do { LOC; printf(__VA_ARGS__); } while(0)
-  #endif
-#else
-  #ifndef VERBOSE_PRINTF
-  #define VERBOSE_PRINTF
-  #endif
-#endif

 #ifdef DUMPDEBUG
  #ifndef DUMPDEBUG_PRINTF