Commit 52065cd6 authored by solomon

Reconnected the ScaLAPACK back-end to the proper gemm, as it used to be, and made this gemm offload to the GPU.

parent e8071959
......@@ -32,7 +32,7 @@ ${libdir}/libctf.a: interface/ctf_world.o \
interface/ctf_sparse_tensor.o \
interface/ctf_flop_counter.o \
ctr_comm/seq_tsr.o \
ctr_seq/offload.o \
shared/offload.o \
shared/util.o \
shared/timer.o \
shared/memcontrol.o \
......@@ -40,9 +40,9 @@ ${libdir}/libctf.a: interface/ctf_world.o \
dist_tensor/distribution.o \
dist_tensor/cyclopstf.o
ctr_seq/offload.o: ctr_seq/offload.h ctr_seq/offload.cxx $(_DEPENDENCIES)
shared/offload.o: shared/offload.h shared/offload.cxx $(_DEPENDENCIES)
@mkdir -p $(DEPDIR)
$(OFFLOAD_CXX) -c ctr_seq/offload.cxx -o ctr_seq/offload.o
$(OFFLOAD_CXX) -c shared/offload.cxx -o shared/offload.o
#INCLUDES += -I${top_dir}/src/ctr_comm -I${top_dir}/src/ctr_seq -I${top_dir}/src/dist_tensor -I${top_dir}/src/util -I${top_dir}/src/interface
......@@ -403,10 +403,12 @@ int main(int argc, char **argv) {
startTime = MPI_Wtime();
for (iter=0; iter < num_iter; iter++){
//seq_square_matmul(mat_A, mat_B, mat_C, blockDim, 0);
TAU_FSTART(ctf_pgemm_bench);
myctf->pgemm('T','N', m, n, k, ALPHA,
mat_A, 1, 1, desc_a,
mat_B, 1, 1, desc_b, BETA,
mat_C, 1, 1, desc_c);
TAU_FSTOP(ctf_pgemm_bench);
// myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
if (iter == 0)
ans_verify = mat_C[2];
......
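[Editorial note] The benchmark above times CTF's ScaLAPACK-style pgemm, i.e. C := alpha*op(A)*op(B) + beta*C on block-cyclically distributed matrices; the (1, 1, desc_X) triples are the usual global row/column offsets plus array descriptor. For reference, a sketch of how such a descriptor is laid out, following the standard ScaLAPACK descinit field convention (the benchmark's actual setup code is outside this diff, and the helper name is hypothetical):

    // Fill a ScaLAPACK array descriptor for an m x n matrix distributed in
    // blockDim x blockDim blocks over the BLACS context ictxt (sketch).
    void fill_desc(int * desc, int m, int n, int blockDim, int ictxt, int lld){
      desc[0] = 1;        // DTYPE_: dense matrix
      desc[1] = ictxt;    // CTXT_ : BLACS context handle
      desc[2] = m;        // M_    : global rows
      desc[3] = n;        // N_    : global cols
      desc[4] = blockDim; // MB_   : row block size
      desc[5] = blockDim; // NB_   : column block size
      desc[6] = 0;        // RSRC_ : process row owning the first row
      desc[7] = 0;        // CSRC_ : process column owning the first column
      desc[8] = lld;      // LLD_  : local leading dimension
    }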
......@@ -6,7 +6,7 @@
#include "../shared/util.h"
#include <limits.h>
#include "sym_seq_shared.hxx"
#include "offload.h"
#include "../shared/offload.h"
/**
......
......@@ -557,7 +557,7 @@ int tCTF<dtype>::contract(CTF_ctr_type_t const * type,
dtype const alpha,
dtype const beta){
fseq_tsr_ctr<dtype> fs;
fs.func_ptr=sym_seq_ctr_ref<dtype>;
fs.func_ptr=NULL;//sym_seq_ctr_ref<dtype>;
return contract(type, fs, alpha, beta);
}
......@@ -675,7 +675,7 @@ int tCTF<dtype>::contract(CTF_ctr_type_t const * type,
dt->print_ctr(type,alpha,beta);
#endif
fseq_tsr_ctr<dtype> fs;
fs.func_ptr=sym_seq_ctr_ref<dtype>;
fs.func_ptr=NULL;//sym_seq_ctr_ref<dtype>;
int ret = dt->home_contract(type, fs, felm, alpha, beta);
#if DEBUG >= 1
if (dt->get_global_comm().rank == 0)
......
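[Editorial note] The two hunks above stop binding the reference kernel sym_seq_ctr_ref eagerly: contract() now passes func_ptr == NULL down as a sentinel meaning "no user override", and the dist_tensor hunks below substitute the reference kernel only where a sequential kernel is actually required, so the folded path can dispatch to the offloaded GEMM instead. A minimal sketch of that sentinel-with-fallback pattern (all names here are hypothetical, not CTF's):

    #include <cstddef>

    typedef int (*seq_ctr_fn)(double alpha, double const * A,
                              double const * B, double beta, double * C);

    // Stand-in for the reference contraction kernel.
    int ref_kernel(double alpha, double const * A, double const * B,
                   double beta, double * C){
      *C = alpha * (*A) * (*B) + beta * (*C);
      return 0;
    }

    int dispatch(seq_ctr_fn user_fn, bool foldable){
      if (user_fn == NULL && foldable){
        // No override and the contraction folds to a GEMM:
        // take the fast (possibly offloaded) path.
        return 0; // offloaded gemm would run here
      }
      // Otherwise fall back to the reference kernel.
      seq_ctr_fn fn = (user_fn == NULL) ? &ref_kernel : user_fn;
      double a = 2.0, b = 3.0, c = 1.0;
      return fn(1.0, &a, &b, 0.0, &c);
    }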
......@@ -165,7 +165,6 @@ struct fseq_elm_sum {
template<typename dtype>
struct fseq_tsr_ctr {
/* Function signature for sub-tensor contraction recursive call */
int (*func_ptr) ( dtype const alpha,
dtype const * A,
......
......@@ -2931,8 +2931,11 @@ int dist_tensor<dtype>::
assert(stat == DIST_TENSOR_SUCCESS);
#endif
/* Check if the current tensor mappings can be contracted on */
fseq_tsr_ctr<dtype> fftsr=ftsr;
if (ftsr.func_ptr == NULL)
fftsr.func_ptr = &sym_seq_ctr_ref<dtype>;
#if REDIST
stat = map_tensors(type, ftsr, felm, alpha, beta, &ctrf);
stat = map_tensors(type, fftsr, felm, alpha, beta, &ctrf);
if (stat == DIST_TENSOR_ERROR) {
printf("Failed to map tensors to physical grid\n");
return DIST_TENSOR_ERROR;
......@@ -2940,7 +2943,7 @@ int dist_tensor<dtype>::
#else
if (check_contraction_mapping(type) == 0) {
/* remap if necessary */
stat = map_tensors(type, ftsr, felm, alpha, beta, &ctrf);
stat = map_tensors(type, fftsr, felm, alpha, beta, &ctrf);
if (stat == DIST_TENSOR_ERROR) {
printf("Failed to map tensors to physical grid\n");
return DIST_TENSOR_ERROR;
......@@ -2954,7 +2957,7 @@ int dist_tensor<dtype>::
print_map(stdout, type->tid_B);
print_map(stdout, type->tid_C);
#endif
ctrf = construct_contraction(type, ftsr, felm, alpha, beta);
ctrf = construct_contraction(type, fftsr, felm, alpha, beta);
if (global_comm.rank == 0){
uint64_t memuse = ctrf->mem_rec();
VPRINTF(1,"Contraction does not require redistribution, will use %E bytes per processor out of %E available memory and take an estimated %lf sec\n",
......@@ -2964,7 +2967,9 @@ int dist_tensor<dtype>::
#endif
LIBT_ASSERT(check_contraction_mapping(type));
#if FOLD_TSR
if (felm.func_ptr == NULL && can_fold(type)){
if (felm.func_ptr == NULL &&
ftsr.func_ptr == NULL && //sym_seq_ctr_ref<dtype> &&
can_fold(type)){
iparam prm;
TAU_FSTART(map_fold);
stat = map_fold(type, &prm);
......@@ -2974,9 +2979,9 @@ int dist_tensor<dtype>::
}
if (stat == DIST_TENSOR_SUCCESS){
delete ctrf;
ctrf = construct_contraction(type, ftsr, felm, alpha, beta, 2, &prm);
ctrf = construct_contraction(type, fftsr, felm, alpha, beta, 2, &prm);
}
}
}
#endif
#if DEBUG >=2
if (get_global_comm().rank == 0)
......
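[Editorial note] The widened guard above makes folding conditional on both functors being unset: the contraction is rewritten as a single GEMM (which the offload path can then execute) only when the user supplied neither an elementwise functor nor a sequential contraction kernel. Compactly, the condition is:

    // Fold (and hence offload) only when no user kernels are installed.
    bool may_fold = (felm.func_ptr == NULL)   // no elementwise override
                 && (ftsr.func_ptr == NULL)   // no sequential-kernel override
                 && can_fold(type);           // index structure maps to a GEMM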
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
#include "dist_tensor_internal.h"
#include "../shared/offload.h"
#if (defined BGP || defined BGQ)
#define BLACS_GRIDINFO blacs_gridinfo
......@@ -16,25 +17,22 @@ inline
void BLACS_GRIDINFO(int *, int *, int *, int *, int *) { assert(0); }
#endif
template<typename dtype, int is_herm_A, int is_herm_B>
int gemm_ctr( dtype const alpha,
dtype const * A,
int gemm_ctr( dtype const alpha,
dtype const * A,
int const ndim_A,
int const * edge_len_A,
int const * lda_A,
int const * sym_A,
int const * idx_map_A,
dtype const * B,
dtype const * B,
int const ndim_B,
int const * edge_len_B,
int const * lda_B,
int const * sym_B,
int const * idx_map_B,
dtype const beta,
dtype * C,
dtype const beta,
dtype * C,
int const ndim_C,
int const * edge_len_C,
int const * lda_C,
......@@ -79,13 +77,36 @@ int gemm_ctr( dtype const alpha,
LIBT_ASSERT(n==edge_len_C[1]);
la_C = m;
#ifdef OFFLOAD
TAU_FSTART(offload_alloc);
offload_ptr<dtype> ptr_A(m*k);
offload_ptr<dtype> ptr_B(k*n);
offload_ptr<dtype> ptr_C(m*n);
TAU_FSTOP(offload_alloc);
TAU_FSTART(offload_upload);
ptr_A.upload(A);
ptr_B.upload(B);
ptr_C.upload(C);
TAU_FSTOP(offload_upload);
TAU_FSTART(offload_gemm);
TAU_FSTART(dgemm);
offload_gemm<dtype>(ta, tb, m, n, k, alpha,
ptr_A, la_A,
ptr_B, la_B, beta,
ptr_C, la_C);
TAU_FSTOP(dgemm);
TAU_FSTART(offload_download);
ptr_C.download(C);
TAU_FSTOP(offload_download);
#else
TAU_FSTART(dgemm);
cxgemm(ta, tb, m, n, k, alpha, A, la_A, B, la_B, beta, C, la_C);
TAU_FSTOP(dgemm);
#endif
return 0;
}
/*
#define DECLARE_GEMM_CTR(type, herm_A, herm_B) \
template \
......
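[Editorial note] The new OFFLOAD branch above wraps the GEMM in an allocate / upload / compute / download cycle on the device, with TAU timers around each phase. For illustration, a minimal sketch of what an offload_ptr / offload_gemm pair could look like on a CUDA + cuBLAS back-end, restricted to double precision; this is a hedged sketch of the pattern, not the repository's shared/offload.cxx:

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // Device-resident buffer with host<->device copy helpers (sketch).
    template <typename dtype>
    struct offload_ptr {
      dtype * dev;
      size_t  n;
      explicit offload_ptr(size_t n_) : n(n_) {
        cudaMalloc((void**)&dev, n * sizeof(dtype));
      }
      ~offload_ptr(){ cudaFree(dev); }
      void upload(dtype const * host){   // host -> device
        cudaMemcpy(dev, host, n * sizeof(dtype), cudaMemcpyHostToDevice);
      }
      void download(dtype * host){       // device -> host
        cudaMemcpy(host, dev, n * sizeof(dtype), cudaMemcpyDeviceToHost);
      }
    };

    // Double-precision GEMM on device buffers; a real implementation would
    // dispatch on dtype and reuse a cached cuBLAS handle.
    void offload_gemm_d(char ta, char tb, int m, int n, int k, double alpha,
                        offload_ptr<double> const & A, int lda,
                        offload_ptr<double> const & B, int ldb, double beta,
                        offload_ptr<double> & C, int ldc){
      cublasHandle_t h;
      cublasCreate(&h);
      cublasOperation_t opA = (ta=='T'||ta=='t') ? CUBLAS_OP_T : CUBLAS_OP_N;
      cublasOperation_t opB = (tb=='T'||tb=='t') ? CUBLAS_OP_T : CUBLAS_OP_N;
      cublasDgemm(h, opA, opB, m, n, k, &alpha,
                  A.dev, lda, B.dev, ldb, &beta, C.dev, ldc);
      cublasDestroy(h);
    }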
File moved: ctr_seq/offload.cxx → shared/offload.cxx
File moved: ctr_seq/offload.h → shared/offload.h