Got rid of compiler warnings, added performance models for upload/download/offload_gemm

fea513c7 · Edgar Solomonik · df4a209f · fea513c7 · fea513c7 · fea513c7
Commit fea513c7 authored 9 years ago by Edgar Solomonik
11 changed files
--- a/src/contraction/contraction.cxx
+++ b/src/contraction/contraction.cxx
@@ -1843,7 +1843,7 @@ namespace CTF_int {
 //      printf("new_order[%d/%d] = %d, new_lens[%d] = %d\n", i, topo->order, new_order[i], new_order[i], new_lens[new_order[i]]);
    }
    topology * new_topo = NULL;
-    for (int i=0; i<A->wrld->topovec.size(); i++){
+    for (int i=0; i<(int)A->wrld->topovec.size(); i++){
      if (A->wrld->topovec[i]->order == topo->order){
        bool has_same_len = true;
        for (int j=0; j<topo->order; j++){
@@ -2586,7 +2586,7 @@ namespace CTF_int {
            &num_tot, &idx_arr);
    cdealloc(idx_arr);
    int64_t tot_num_choices = 0;
-    for (int i=0; i<wrld->topovec.size(); i++){
+    for (int i=0; i<(int)wrld->topovec.size(); i++){
     // tot_num_choices += pow(num_choices,(int)wrld->topovec[i]->order);
      tot_num_choices += get_num_map_variants(wrld->topovec[i]);
    }
@@ -2594,7 +2594,7 @@ namespace CTF_int {
    int64_t choice_offset = 0;
    int64_t max_memuse = proc_bytes_available();
    TAU_FSTOP(init_select_ctr_map);
-    for (int i=0; i<wrld->topovec.size(); i++){
+    for (int i=0; i<(int)wrld->topovec.size(); i++){
 //      int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
      int tnum_choices = get_num_map_variants(wrld->topovec[i]);

@@ -2767,10 +2767,6 @@ namespace CTF_int {

  int contraction::map(ctr ** ctrf, bool do_remap){
    int ret, j, need_remap, d;
-    int64_t memuse;//, bmemuse;
-    double best_time;
-    int btopo;
-    //int * idx_arr, * idx_ctr, * idx_no_ctr, * idx_extra, * idx_weigh;
    int * old_phase_A, * old_phase_B, * old_phase_C;
    topology * old_topo_A, * old_topo_B, * old_topo_C;
    distribution * dA, * dB, * dC;
@@ -2831,12 +2827,9 @@ namespace CTF_int {
      old_phase_C[j]   = C->edge_map[j].calc_phase();
    }

-    //}
-    btopo = -1;
-    best_time = DBL_MAX;
    //bmemuse = UINT64_MAX;
    int ttopo, ttopo_sel, ttopo_exh;
-    double gbest_time, gbest_time_sel, gbest_time_exh;
+    double gbest_time_sel, gbest_time_exh;
  
    TAU_FSTART(get_best_sel_map);
    get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_sel, gbest_time_sel);
@@ -2850,10 +2843,8 @@ namespace CTF_int {
      TAU_FSTOP(get_best_exh_map);
    }
    if (gbest_time_sel <= gbest_time_exh){
-      gbest_time = gbest_time_sel;
      ttopo = ttopo_sel;
    } else {
-      gbest_time = gbest_time_exh;
      ttopo = ttopo_exh;
    }

@@ -2914,7 +2905,7 @@ namespace CTF_int {
      int64_t choice_offset = 0;
      int i=0;
      int64_t old_off;
-      for (i=0; i<wrld->topovec.size(); i++){
+      for (i=0; i<(int)wrld->topovec.size(); i++){
        //int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
        int tnum_choices = get_num_map_variants(wrld->topovec[i]);
        old_off = choice_offset;
@@ -2962,10 +2953,10 @@ namespace CTF_int {
    #endif
     
    //FIXME: adhoc? 
-    memuse = MAX((int64_t)(*ctrf)->mem_rec(), (int64_t)(A->size*A->sr->el_size+B->size*B->sr->el_size+C->size*C->sr->el_size)*3);
+    /*memuse = MAX((int64_t)(*ctrf)->mem_rec(), (int64_t)(A->size*A->sr->el_size+B->size*B->sr->el_size+C->size*C->sr->el_size)*3);
    if (global_comm.rank == 0)
      VPRINTF(1,"Contraction will use %E bytes per processor out of %E available memory and take an estimated of %lf sec\n",
-              (double)memuse,(double)proc_bytes_available(),gbest_time);
+              (double)memuse,(double)proc_bytes_available(),gbest_time);*/

    if (A->is_cyclic == 0 &&
        B->is_cyclic == 0 &&
@@ -4039,7 +4030,9 @@ namespace CTF_int {
  #if DEBUG >=2
    ctrf->print();
  #endif
+  #if VERBOSE >= 1
  double dtt = MPI_Wtime();
+  #endif
  #ifdef DEBUG
    if (global_comm.rank == 0){
      //DPRINTF(1,"[%d] performing contraction\n",
@@ -4823,9 +4816,8 @@ namespace CTF_int {
  }

  void contraction::print(){
-//    int j,ex_A, ex_B,ex_C;
-    int i,max;
-    max = A->order+B->order+C->order;
+    int i;
+    //max = A->order+B->order+C->order;
    CommData global_comm = A->wrld->cdt;
    MPI_Barrier(global_comm.cm);
    if (global_comm.rank == 0){

--- a/src/contraction/ctr_2d_general.cxx
+++ b/src/contraction/ctr_2d_general.cxx
@@ -267,7 +267,7 @@ namespace CTF_int {
  void ctr_2d_general::run(char * A, char * B, char * C){
    int owner_A, owner_B, owner_C, ret;
    int64_t ib;
-    char * buf_A, * buf_B, * buf_C, * buf_aux; 
+    char * buf_A, * buf_B, * buf_C; 
    char * op_A, * op_B, * op_C; 
    int rank_A, rank_B, rank_C;
    int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;

--- a/src/contraction/ctr_offload.cxx
+++ b/src/contraction/ctr_offload.cxx
@@ -64,9 +64,9 @@ namespace CTF_int {

  double ctr_offload::est_time_fp(int nlyr){
    double tot_time = 0.0;
-    tot_time += size_A*sr_A->el_size*(total_iter/upload_phase_A)*COST_OFFLOADBW;
-    tot_time += size_B*sr_B->el_size*(total_iter/upload_phase_B)*COST_OFFLOADBW;
-    tot_time += size_C*sr_C->el_size*(total_iter/download_phase_C)*COST_OFFLOADBW;
+    tot_time += estimate_upload_time(size_A*sr_A->el_size)*(total_iter/upload_phase_A);
+    tot_time += estimate_upload_time(size_B*sr_B->el_size)*(total_iter/upload_phase_B);
+    tot_time += estimate_download_time(size_C*sr_C->el_size)*(total_iter/download_phase_C);
    return tot_time;
  }


--- a/src/contraction/ctr_tsr.cxx
+++ b/src/contraction/ctr_tsr.cxx
@@ -386,6 +386,7 @@ namespace CTF_int {
  LinModel<3> seq_tsr_ctr_mdl_cst(seq_tsr_ctr_mdl_cst_init,"seq_tsr_ctr_mdl_cst");
  LinModel<3> seq_tsr_ctr_mdl_ref(seq_tsr_ctr_mdl_ref_init,"seq_tsr_ctr_mdl_ref");
  LinModel<3> seq_tsr_ctr_mdl_inr(seq_tsr_ctr_mdl_inr_init,"seq_tsr_ctr_mdl_inr");
+  LinModel<3> seq_tsr_ctr_mdl_off(seq_tsr_ctr_mdl_off_init,"seq_tsr_ctr_mdl_off");

  uint64_t seq_tsr_ctr::est_membw(){
    uint64_t size_A = sy_packed_size(order_A, edge_len_A, sym_A)*sr_A->el_size;
@@ -430,9 +431,12 @@ namespace CTF_int {
 //    printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps));
    if (is_custom)
      return seq_tsr_ctr_mdl_cst.est_time(ps);
-    else if (is_inner)          
-      return seq_tsr_ctr_mdl_inr.est_time(ps);
-    else                        
+    else if (is_inner){
+      if (inner_params.offload)
+        return seq_tsr_ctr_mdl_off.est_time(ps);
+      else
+        return seq_tsr_ctr_mdl_inr.est_time(ps);
+    } else                        
      return seq_tsr_ctr_mdl_ref.est_time(ps);
  }

@@ -497,7 +501,10 @@ namespace CTF_int {
      double exe_time = MPI_Wtime()-st_time;
 //     printf("exe_time = %E est_time = %E abs_err = %e rel_err = %lf\n", exe_time,est_time,fabs(exe_time-est_time),fabs(exe_time-est_time)/exe_time);
      double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()};
-      seq_tsr_ctr_mdl_inr.observe(tps);
+      if (inner_params.offload)
+        seq_tsr_ctr_mdl_off.observe(tps);
+      else 
+        seq_tsr_ctr_mdl_inr.observe(tps);
 //      seq_tsr_ctr_mdl_inr.print_param_guess();
    } else {
      double st_time = MPI_Wtime();

--- a/src/shared/init_models.cxx
+++ b/src/shared/init_models.cxx
@@ -2,6 +2,7 @@ namespace CTF_int{
  double seq_tsr_ctr_mdl_cst_init[] = {3.8587E-13, 5.0935E-09, 4.2446E-10};
  double seq_tsr_ctr_mdl_ref_init[] = {1.5382E-16, 9.2293E-10, 7.6911E-11};
  double seq_tsr_ctr_mdl_inr_init[] = {3.2489E-04, 1.2692E-10, 1.0686E-10};
+  double seq_tsr_ctr_mdl_off_init[] = {2.5413E-04, 1.5889E-10, 9.6735E-12};
  double long_contig_transp_mdl_init[] = {-1.8639E-03, 1.8467E-08};
  double shrt_contig_transp_mdl_init[] = {5.5768E-03, 1.7220E-08};
  double non_contig_transp_mdl_init[] = {-4.8961E-04, 1.0225E-08};
@@ -11,5 +12,7 @@ namespace CTF_int{
  double bcast_mdl_init[] = {2.0139E-04, -4.1326E-05, 2.8453E-10, 2.4672E-10};
  double dgtog_res_mdl_init[] = {1.1994E-04, 3.1005E-04, 9.6259E-10};
  double blres_mdl_init[] = {1.4728E-05, 1.7673E-10};
+  double upload_mdl_init[] = {1.4604E-04, 7.3783E-10};
+  double download_mdl_init[] = {5.7942E-04, 6.0787E-10};
 }

--- a/src/shared/init_models.h
+++ b/src/shared/init_models.h
@@ -13,6 +13,9 @@ namespace CTF_int{
  extern double seq_tsr_ctr_mdl_cst_init[];
  extern double seq_tsr_ctr_mdl_ref_init[];
  extern double seq_tsr_ctr_mdl_inr_init[];
+  extern double seq_tsr_ctr_mdl_off_init[];
+  extern double upload_mdl_init[];
+  extern double download_mdl_init[];
 }

 #endif
--- a/src/shared/offload.cu
+++ b/src/shared/offload.cu
@@ -13,6 +13,7 @@

 #include "offload.h"
 #include "../tensor/algstrct.h"
+#include "../interface/timer.h"

 namespace CTF_int{
  volatile static int64_t int64_t_max = INT64_MAX;
@@ -65,20 +66,42 @@ namespace CTF_int{
    cudaError_t err = cudaFree(dev_ptr);
    assert(err == cudaSuccess);
  }
+
+  LinModel<2> upload_mdl(upload_mdl_init,"upload_mdl");
+  LinModel<2> download_mdl(download_mdl_init,"download_mdl");
+
+  double estimate_download_time(int64_t size){
+    double ps[] = {1.0, (double)size};
+    return download_mdl.est_time(ps);
+  }
+
+  double estimate_upload_time(int64_t size){
+    double ps[] = {1.0, (double)size};
+    return upload_mdl.est_time(ps);
+  }
  
  void offload_ptr::download(char * host_ptr){
    assert(initialized);
    TAU_FSTART(cuda_download);
+    double st_time = MPI_Wtime();
    cudaError_t err = cudaMemcpy(host_ptr, dev_ptr, size*sr->el_size,
                                 cudaMemcpyDeviceToHost);
+    double exe_time = MPI_Wtime()-st_time;
+    double tps[] = {exe_time, 1.0, (double)size*sr->el_size};
+    download_mdl.observe(tps);
    TAU_FSTOP(cuda_download);
    assert(err == cudaSuccess);
  }
  
  void offload_ptr::upload(char const * host_ptr){
    TAU_FSTART(cuda_upload);
+    double st_time = MPI_Wtime();
    cudaError_t err = cudaMemcpy(dev_ptr, host_ptr, size*sr->el_size,
                                 cudaMemcpyHostToDevice);
+
+    double exe_time = MPI_Wtime()-st_time;
+    double tps[] = {exe_time, 1.0, (double)size*sr->el_size};
+    upload_mdl.observe(tps);
    TAU_FSTOP(cuda_upload);
    assert(err == cudaSuccess);
  }

--- a/src/shared/offload.h
+++ b/src/shared/offload.h
@@ -12,6 +12,12 @@ namespace CTF_int{
  void offload_init();
  /** \brief exit offloading, e.g. destroy cublas */
  void offload_exit();
+
+  /** \brief estimate time it takes to download size bytes (device to host), per the download performance model */
+  double estimate_download_time(int64_t size);
+
+  /** \brief estimate time it takes to upload size bytes (host to device), per the upload performance model */
+  double estimate_upload_time(int64_t size);
  
  class offload_ptr {
    public:

--- a/src/summation/summation.cxx
+++ b/src/summation/summation.cxx
@@ -2263,8 +2263,8 @@ namespace CTF_int {
  }

  void summation::print(){
-    int i,j,max,ex_A, ex_B;
-    max = A->order+B->order;
+    int i;
+    //max = A->order+B->order;

    CommData global_comm = A->wrld->cdt;
    MPI_Barrier(global_comm.cm);

--- a/src/tensor/algstrct.cxx
+++ b/src/tensor/algstrct.cxx
@@ -639,7 +639,7 @@ namespace CTF_int {
                 swap_buffer+ptr_pairs[i].idx*(sizeof(int64_t)+sr->el_size),
                 sizeof(int64_t)+sr->el_size);
        }
-        break;
+        break; //compiler warning here seems to be gcc bug
    }
  }


--- a/src/tensor/untyped_tensor.cxx
+++ b/src/tensor/untyped_tensor.cxx
@@ -20,7 +20,7 @@ using namespace CTF;

 namespace CTF_int {

-  static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};
+//  static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};

  Idx_Tensor tensor::operator[](const char * idx_map_){
    Idx_Tensor idxtsr(this, idx_map_);