/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/ #include "dist_tensor_internal.h" #include "dt_aux_permute.hxx" #include "dt_aux_sort.hxx" #include "dt_aux_rw.hxx" #include "dt_aux_map.hxx" #include "dt_aux_topo.hxx" #include "cyclopstf.hpp" #include "../shared/util.h" #include "../shared/memcontrol.h" #include "../ctr_comm/ctr_comm.h" #include "../ctr_comm/ctr_tsr.h" #include "../ctr_comm/sum_tsr.h" #include "../ctr_comm/strp_tsr.h" #include <limits.h> #include <float.h> #include <stdint.h> #include <vector> #include <list> #include <algorithm> #include <errno.h> #define MAX_NVIRT 256 #ifndef MIN_NVIRT #define MIN_NVIRT 1 #endif //#define USE_VIRT_25D /* accessors */ template<typename dtype> CommData_t * dist_tensor<dtype>::get_global_comm(){ return global_comm; } template<typename dtype> void dist_tensor<dtype>::set_global_comm(CommData_t * cdt){ global_comm = cdt; } /** * \brief deallocates all internal state */ template<typename dtype> int dist_tensor<dtype>::dist_cleanup(){ int j, rank; std::vector<topology>::iterator iter; for (iter=topovec.begin(); iter<topovec.end(); iter++){ for (j=0; j<iter->ndim; j++){ FREE_CDT(iter->dim_comm[j]); } CTF_free(iter->lda); } for (iter=rejected_topos.begin(); iter<rejected_topos.end(); iter++){ for (j=0; j<iter->ndim; j++){ FREE_CDT(iter->dim_comm[j]); } } for (iter=rejected_topos.begin(); iter<rejected_topos.end(); iter++){ for (j=0; j<iter->ndim; j++){ // folded communicator pointers are replicated CTF_free_cond(iter->dim_comm[j]); } CTF_free(iter->dim_comm); } for (iter=topovec.begin(); iter<topovec.end(); iter++){ for (j=0; j<iter->ndim; j++){ // folded communicator pointers are replicated // CTF_free_cond(iter->dim_comm[j]); } CTF_free(iter->dim_comm); } topovec.clear(); #if INNER_MAP for (iter=inner_topovec.begin(); iter<inner_topovec.end(); iter++){ CTF_free(iter->dim_comm); } inner_topovec.clear(); #endif rank = global_comm->rank; FREE_CDT(global_comm); CTF_free(global_comm); CTF_mem_exit(rank); return DIST_TENSOR_SUCCESS; } /** * \brief destructor */ template<typename dtype> dist_tensor<dtype>::~dist_tensor(){ int ret; ret = dist_cleanup(); LIBT_ASSERT(ret == DIST_TENSOR_SUCCESS); } /** * \brief initializes library. 
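 * \param[in] cdt_global global communicator to use for all library calls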
 * \param[in] ndim is the number of dimensions in the topology
 * \param[in] dim_len is the number of processors along each dimension
 * \param[in] inner_sz is the total block size of dgemm calls
 */
template<typename dtype>
int dist_tensor<dtype>::initialize(CommData_t * cdt_global,
                                   int const    ndim,
                                   int const *  dim_len,
                                   int const    inner_sz){
  int i, rank, stride, cut;
  int * srt_dim_len;

  CTF_mem_create();

  CTF_alloc_ptr(ndim*sizeof(int), (void**)&srt_dim_len);
  memcpy(srt_dim_len, dim_len, ndim*sizeof(int));

  rank = cdt_global->rank;

  /* setup global communicator */
  set_global_comm(cdt_global);

  /* setup dimensional communicators */
  CommData_t ** phys_comm = (CommData_t**)CTF_alloc(ndim*sizeof(CommData_t*));

  /* FIXME: sorting would break the dimensional ordering */
//  std::sort(srt_dim_len, srt_dim_len + ndim);
  if (cdt_global->rank == 0)
    VPRINTF(1,"Setting up initial torus physical topology P:\n");
  stride = 1, cut = 0;
  for (i=0; i<ndim; i++){
    LIBT_ASSERT(dim_len[i] != 1);
    if (cdt_global->rank == 0)
      VPRINTF(1,"P[%d] = %d\n",i,srt_dim_len[i]);

    phys_comm[i] = (CommData_t*)CTF_alloc(sizeof(CommData_t));
    SETUP_SUB_COMM(cdt_global, phys_comm[i],
                   ((rank/stride)%srt_dim_len[ndim-i-1]),
                   (((rank/(stride*srt_dim_len[ndim-i-1]))*stride)+cut),
                   srt_dim_len[ndim-i-1], NREQ, NBCAST);
    stride*=srt_dim_len[ndim-i-1];
    cut = (rank - (rank/stride)*stride);
  }
  set_phys_comm(phys_comm, ndim);
  CTF_free(srt_dim_len);

#if INNER_MAP
  return init_inner_topology(inner_sz);
#else
  return DIST_TENSOR_SUCCESS;
#endif
}

/**
 * \brief sets the physical torus topology
 * \param[in] cdt grid communicator
 * \param[in] ndim number of dimensions
 */
template<typename dtype>
void dist_tensor<dtype>::set_phys_comm(CommData_t ** cdt, int const ndim){
  int i, lda;
  topology new_topo;

  new_topo.ndim = ndim;
  new_topo.dim_comm = cdt;

  /* do not duplicate topologies */
  if (find_topology(&new_topo, topovec) != -1){
    rejected_topos.push_back(new_topo);
    return;
  }

  new_topo.lda = (int*)CTF_alloc(sizeof(int)*ndim);
  lda = 1;
  /* Figure out the lda of each dimension communicator */
  for (i=0; i<ndim; i++){
#if DEBUG >= 1
    if (global_comm->rank == 0)
      printf("Added topo %d dim[%d] = %d:\n",(int)topovec.size(),i,cdt[i]->np);
#endif
    LIBT_ASSERT(cdt[i]->np != 1);
    new_topo.lda[i] = lda;
    lda = lda*cdt[i]->np;
    LIBT_ASSERT(cdt[i]->np > 0);
  }
  topovec.push_back(new_topo);

  if (ndim > 1)
    fold_torus(&new_topo, global_comm, this);
}

/**
 * \brief defines a tensor and retrieves handle
 *
 * \param[in] ndim number of tensor dimensions
 * \param[in] edge_len global edge lengths of tensor
 * \param[in] sym symmetry relations of tensor
 * \param[out] tensor_id the tensor index (handle)
 * \param[in] alloc_data whether this tensor's data should be alloced
 * \param[in] name string name for tensor (optional)
 * \param[in] profile whether to make profile calls for the tensor
 */
template<typename dtype>
int dist_tensor<dtype>::define_tensor( int const    ndim,
                                       int const *  edge_len,
                                       int const *  sym,
                                       int *        tensor_id,
                                       int const    alloc_data,
                                       char const * name,
                                       int          profile){
  int i;

  tensor<dtype> * tsr = (tensor<dtype>*)CTF_alloc(sizeof(tensor<dtype>));
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&tsr->padding);
  memset(tsr->padding, 0, ndim*sizeof(int));

  tsr->is_padded          = 1;
  tsr->is_scp_padded      = 0;
  tsr->is_mapped          = 0;
  tsr->itopo              = -1;
  tsr->is_alloced         = 1;
  tsr->is_cyclic          = 1;
  tsr->size               = 0;
  tsr->is_inner_mapped    = 0;
  tsr->is_folded          = 0;
  tsr->is_matrix          = 0;
  tsr->is_data_aliased    = 0;
  tsr->has_zero_edge_len  = 0;
  tsr->is_home            = 0;
  tsr->has_home           = 0;
  tsr->profile            = profile;
  if (name != NULL){
    tsr->name = name;
  } else
tsr->name = NULL; tsr->pairs = NULL; tsr->ndim = ndim; tsr->edge_len = (int*)CTF_alloc(ndim*sizeof(int)); memcpy(tsr->edge_len, edge_len, ndim*sizeof(int)); tsr->sym = (int*)CTF_alloc(ndim*sizeof(int)); memcpy(tsr->sym, sym, ndim*sizeof(int)); // memcpy(inner_sym, sym, ndim*sizeof(int)); /* for (i=0; i<ndim; i++){ if (tsr->sym[i] != NS) tsr->sym[i] = SY; }*/ tsr->sym_table = (int*)CTF_alloc(ndim*ndim*sizeof(int)); memset(tsr->sym_table, 0, ndim*ndim*sizeof(int)); tsr->edge_map = (mapping*)CTF_alloc(sizeof(mapping)*ndim); (*tensor_id) = tensors.size(); /* initialize map array and symmetry table */ #if DEBUG >= 2 if (global_comm->rank == 0) printf("Tensor %d of dimension %d defined with edge lengths", *tensor_id, ndim); #endif for (i=0; i<ndim; i++){ #if DEBUG >= 1 int maxlen; ALLREDUCE(tsr->sym+i,&maxlen,1,MPI_INT,MPI_MAX,global_comm); LIBT_ASSERT(maxlen==sym[i]); ALLREDUCE(tsr->edge_len+i,&maxlen,1,MPI_INT,MPI_MAX,global_comm); LIBT_ASSERT(maxlen==edge_len[i]); #endif #if DEBUG >= 2 if (global_comm->rank == 0) printf(" %d", edge_len[i]); #endif if (tsr->edge_len[i] <= 0) tsr->has_zero_edge_len = 1; tsr->edge_map[i].type = NOT_MAPPED; tsr->edge_map[i].has_child = 0; tsr->edge_map[i].np = 1; if (tsr->sym[i] != NS) { tsr->sym_table[(i+1)+i*ndim] = 1; tsr->sym_table[(i+1)*ndim+i] = 1; } } #if DEBUG >= 2 if (global_comm->rank == 0) printf("\n"); #endif tensors.push_back(tsr); /* Set tensor data to zero. */ if (alloc_data) return set_zero_tsr(*tensor_id); else return DIST_TENSOR_SUCCESS; } template<typename dtype> std::vector< tensor<dtype>* > * dist_tensor<dtype>::get_tensors(){ return &tensors; } /** * \brief sets the data in the tensor * \param[in] tensor_id tensor handle, * \param[in] num_val new number of data values * \param[in] tsr_data new tensor data */ template<typename dtype> int dist_tensor<dtype>::set_tsr_data( int const tensor_id, int const num_val, dtype * tsr_data){ tensors[tensor_id]->data = tsr_data; tensors[tensor_id]->size = num_val; return DIST_TENSOR_SUCCESS; } template<typename dtype> topology * dist_tensor<dtype>::get_topo(int const itopo) { return &topovec[itopo]; } template<typename dtype> int dist_tensor<dtype>::get_dim(int const tensor_id) const { return tensors[tensor_id]->ndim; } template<typename dtype> int * dist_tensor<dtype>::get_edge_len(int const tensor_id) const { int i; int * edge_len; CTF_alloc_ptr(tensors[tensor_id]->ndim*sizeof(int), (void**)&edge_len); if (tensors[tensor_id]->is_padded){ for (i=0; i<tensors[tensor_id]->ndim; i++){ edge_len[i] = tensors[tensor_id]->edge_len[i] -tensors[tensor_id]->padding[i]; } } else { memcpy(edge_len, tensors[tensor_id]->edge_len, tensors[tensor_id]->ndim*sizeof(int)); } return edge_len; } template<typename dtype> int dist_tensor<dtype>::get_name(int const tensor_id, char const ** name){ *name = tensors[tensor_id]->name; return DIST_TENSOR_SUCCESS; } template<typename dtype> int dist_tensor<dtype>::set_name(int const tensor_id, char const * name){ tensors[tensor_id]->name = name; return DIST_TENSOR_SUCCESS; } template<typename dtype> int dist_tensor<dtype>::profile_on(int const tensor_id){ tensors[tensor_id]->profile = 1; return DIST_TENSOR_SUCCESS; } template<typename dtype> int dist_tensor<dtype>::profile_off(int const tensor_id){ tensors[tensor_id]->profile = 0; return DIST_TENSOR_SUCCESS; } template<typename dtype> int * dist_tensor<dtype>::get_sym(int const tensor_id) const { int * sym; CTF_alloc_ptr(tensors[tensor_id]->ndim*sizeof(int), (void**)&sym); memcpy(sym, tensors[tensor_id]->sym, 
tensors[tensor_id]->ndim*sizeof(int));
  return sym;
}

/* \brief get raw data pointer WARNING: includes padding
 * \param[in] tensor_id id of tensor
 * \return raw local data
 */
template<typename dtype>
dtype * dist_tensor<dtype>::get_raw_data(int const tensor_id, long_int * size) {
  if (tensors[tensor_id]->has_zero_edge_len){
    *size = 0;
    return NULL;
  }
  *size = tensors[tensor_id]->size;
  return tensors[tensor_id]->data;
}

/**
 * \brief pulls out information about a tensor
 * \param[in] tensor_id tensor handle
 * \param[out] ndim the dimensionality of the tensor
 * \param[out] edge_len the number of processors along each dimension
 *             of the tensor
 * \param[out] sym the symmetries of the tensor
 */
template<typename dtype>
int dist_tensor<dtype>::get_tsr_info( int const tensor_id,
                                      int *     ndim,
                                      int **    edge_len,
                                      int **    sym) const{
  int i;
  int nd;
  int * el, * s;

  const tensor<dtype> * tsr = tensors[tensor_id];
  nd = tsr->ndim;
  CTF_alloc_ptr(nd*sizeof(int), (void**)&el);
  CTF_alloc_ptr(nd*sizeof(int), (void**)&s);
  if (tsr->is_padded){
    for (i=0; i<nd; i++){
      el[i] = tsr->edge_len[i] - tsr->padding[i];
    }
  } else
    memcpy(el, tsr->edge_len, nd*sizeof(int));
  memcpy(s, tsr->sym, nd*sizeof(int));

  *ndim = nd;
  *edge_len = el;
  *sym = s;

  return DIST_TENSOR_SUCCESS;
}

/**
 * \brief copies scl object
 */
template<typename dtype>
seq_tsr_scl<dtype>::seq_tsr_scl(scl<dtype> * other) : scl<dtype>(other) {
  seq_tsr_scl<dtype> * o = (seq_tsr_scl<dtype>*)other;

  ndim     = o->ndim;
  idx_map  = o->idx_map;
  sym      = o->sym;
  edge_len = (int*)CTF_alloc(sizeof(int)*ndim);
  memcpy(edge_len, o->edge_len, sizeof(int)*ndim);

  func_ptr = o->func_ptr;
}

/**
 * \brief copies scl object
 */
template<typename dtype>
scl<dtype> * seq_tsr_scl<dtype>::clone() {
  return new seq_tsr_scl<dtype>(this);
}

template<typename dtype>
long_int seq_tsr_scl<dtype>::mem_fp(){ return 0; }

/**
 * \brief wraps user sequential function signature
 */
template<typename dtype>
void seq_tsr_scl<dtype>::run(){
  func_ptr.func_ptr(this->alpha,
                    this->A,
                    ndim,
                    edge_len,
                    edge_len,
                    sym,
                    idx_map);
}

template<typename dtype>
void seq_tsr_scl<dtype>::print(){
  int i;
  printf("seq_tsr_scl:\n");
  for (i=0; i<ndim; i++){
    /* edge lengths are ints, so %d is the correct format */
    printf("edge_len[%d]=%d\n",i,edge_len[i]);
  }
}

/**
 * \brief copies sum object
 */
template<typename dtype>
seq_tsr_sum<dtype>::seq_tsr_sum(tsum<dtype> * other) : tsum<dtype>(other) {
  seq_tsr_sum<dtype> * o = (seq_tsr_sum<dtype>*)other;

  ndim_A     = o->ndim_A;
  idx_map_A  = o->idx_map_A;
  sym_A      = o->sym_A;
  edge_len_A = (int*)CTF_alloc(sizeof(int)*ndim_A);
  memcpy(edge_len_A, o->edge_len_A, sizeof(int)*ndim_A);

  ndim_B     = o->ndim_B;
  idx_map_B  = o->idx_map_B;
  sym_B      = o->sym_B;
  edge_len_B = (int*)CTF_alloc(sizeof(int)*ndim_B);
  memcpy(edge_len_B, o->edge_len_B, sizeof(int)*ndim_B);

  is_inner   = o->is_inner;
  inr_stride = o->inr_stride;
  /* run() dispatches on is_custom, so the custom-function state must be
     copied along with the rest of the object */
  is_custom     = o->is_custom;
  custom_params = o->custom_params;

  func_ptr = o->func_ptr;
}

template<typename dtype>
void seq_tsr_sum<dtype>::print(){
  int i;
  printf("seq_tsr_sum:\n");
  for (i=0; i<ndim_A; i++){
    printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
  }
  for (i=0; i<ndim_B; i++){
    printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
  }
}

/**
 * \brief copies sum object
 */
template<typename dtype>
tsum<dtype> * seq_tsr_sum<dtype>::clone() {
  return new seq_tsr_sum<dtype>(this);
}

template<typename dtype>
long_int seq_tsr_sum<dtype>::mem_fp(){ return 0; }

/**
 * \brief wraps user sequential function signature
 */
template<typename dtype>
void seq_tsr_sum<dtype>::run(){
  if (is_custom){
    LIBT_ASSERT(is_inner == 0);
    sym_seq_sum_cust( this->alpha,
                      this->A,
                      ndim_A,
                      edge_len_A,
                      edge_len_A,
                      sym_A,
                      idx_map_A,
                      this->beta,
                      this->B,
                      ndim_B,
edge_len_B, edge_len_B, sym_B, idx_map_B, &custom_params); } else if (is_inner){ sym_seq_sum_inr(this->alpha, this->A, ndim_A, edge_len_A, edge_len_A, sym_A, idx_map_A, this->beta, this->B, ndim_B, edge_len_B, edge_len_B, sym_B, idx_map_B, inr_stride); } else { func_ptr.func_ptr(this->alpha, this->A, ndim_A, edge_len_A, edge_len_A, sym_A, idx_map_A, this->beta, this->B, ndim_B, edge_len_B, edge_len_B, sym_B, idx_map_B); } } template<typename dtype> void seq_tsr_ctr<dtype>::print(){ int i; printf("seq_tsr_ctr:\n"); for (i=0; i<ndim_A; i++){ printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]); } for (i=0; i<ndim_B; i++){ printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]); } for (i=0; i<ndim_C; i++){ printf("edge_len_C[%d]=%d\n",i,edge_len_C[i]); } } /** * \brief copies ctr object */ template<typename dtype> seq_tsr_ctr<dtype>::seq_tsr_ctr(ctr<dtype> * other) : ctr<dtype>(other) { seq_tsr_ctr<dtype> * o = (seq_tsr_ctr<dtype>*)other; alpha = o->alpha; ndim_A = o->ndim_A; idx_map_A = o->idx_map_A; sym_A = (int*)CTF_alloc(sizeof(int)*ndim_A); memcpy(sym_A, o->sym_A, sizeof(int)*ndim_A); edge_len_A = (int*)CTF_alloc(sizeof(int)*ndim_A); memcpy(edge_len_A, o->edge_len_A, sizeof(int)*ndim_A); ndim_B = o->ndim_B; idx_map_B = o->idx_map_B; sym_B = (int*)CTF_alloc(sizeof(int)*ndim_B); memcpy(sym_B, o->sym_B, sizeof(int)*ndim_B); edge_len_B = (int*)CTF_alloc(sizeof(int)*ndim_B); memcpy(edge_len_B, o->edge_len_B, sizeof(int)*ndim_B); ndim_C = o->ndim_C; idx_map_C = o->idx_map_C; sym_C = (int*)CTF_alloc(sizeof(int)*ndim_C); memcpy(sym_C, o->sym_C, sizeof(int)*ndim_C); edge_len_C = (int*)CTF_alloc(sizeof(int)*ndim_C); memcpy(edge_len_C, o->edge_len_C, sizeof(int)*ndim_C); is_inner = o->is_inner; inner_params = o->inner_params; is_custom = o->is_custom; custom_params = o->custom_params; func_ptr = o->func_ptr; } /** * \brief copies ctr object */ template<typename dtype> ctr<dtype> * seq_tsr_ctr<dtype>::clone() { return new seq_tsr_ctr<dtype>(this); } template<typename dtype> long_int seq_tsr_ctr<dtype>::mem_fp(){ return 0; } /** * \brief wraps user sequential function signature */ template<typename dtype> void seq_tsr_ctr<dtype>::run(){ if (is_custom){ LIBT_ASSERT(is_inner == 0); sym_seq_ctr_cust( this->alpha, this->A, ndim_A, edge_len_A, edge_len_A, sym_A, idx_map_A, this->B, ndim_B, edge_len_B, edge_len_B, sym_B, idx_map_B, this->beta, this->C, ndim_C, edge_len_C, edge_len_C, sym_C, idx_map_C, &custom_params); } else if (is_inner){ sym_seq_ctr_inr(this->alpha, this->A, ndim_A, edge_len_A, edge_len_A, sym_A, idx_map_A, this->B, ndim_B, edge_len_B, edge_len_B, sym_B, idx_map_B, this->beta, this->C, ndim_C, edge_len_C, edge_len_C, sym_C, idx_map_C, &inner_params); } else { func_ptr.func_ptr(this->alpha, this->A, ndim_A, edge_len_A, edge_len_A, sym_A, idx_map_A, this->B, ndim_B, edge_len_B, edge_len_B, sym_B, idx_map_B, this->beta, this->C, ndim_C, edge_len_C, edge_len_C, sym_C, idx_map_C); } } /* \brief clone a tensor object * \param[in] tensor_id id of old tensor * \param[in] copy_data if 0 then leave tensor blank, if 1 copy data from old * \param[out] new_tensor_id id of new tensor */ template<typename dtype> int dist_tensor<dtype>::clone_tensor( int const tensor_id, int const copy_data, int * new_tensor_id, int const alloc_data){ int ndim, * edge_len, * sym; get_tsr_info(tensor_id, &ndim, &edge_len, &sym); define_tensor(ndim, edge_len, sym, new_tensor_id, alloc_data, tensors[tensor_id]->name, tensors[tensor_id]->profile); if (global_comm->rank == 0) DPRINTF(2,"Cloned tensor %d from tensor %d\n", 
*new_tensor_id,tensor_id);
  CTF_free(edge_len), CTF_free(sym);
  if (copy_data){
    return cpy_tsr(tensor_id, *new_tensor_id);
  }
  return DIST_TENSOR_SUCCESS;
}

/**
 * \brief copies tensor A to tensor B
 * \param[in] tid_A handle to A
 * \param[in] tid_B handle to B
 */
template<typename dtype>
int dist_tensor<dtype>::cpy_tsr(int const tid_A, int const tid_B){
  int i;
  tensor<dtype> * tsr_A, * tsr_B;

  if (global_comm->rank == 0)
    DPRINTF(2,"Copying tensor %d to tensor %d\n", tid_A, tid_B);

  tsr_A = tensors[tid_A];
  tsr_B = tensors[tid_B];

  tsr_B->has_zero_edge_len = tsr_A->has_zero_edge_len;

  if (tsr_A->is_folded) unfold_tsr(tsr_A);

  if (tsr_A->is_mapped){
    if (tsr_B->is_mapped){
      if (tsr_B->size < tsr_A->size || tsr_B->size > 2*tsr_A->size){
        CTF_free(tsr_B->data);
        CTF_alloc_ptr(tsr_A->size*sizeof(dtype), (void**)&tsr_B->data);
      }
    } else {
      if (tsr_B->pairs != NULL)
        CTF_free(tsr_B->pairs);
      CTF_alloc_ptr(tsr_A->size*sizeof(dtype), (void**)&tsr_B->data);
    }
#ifdef HOME_CONTRACT
    if (tsr_A->has_home){
      if (tsr_B->has_home &&
          (!tsr_B->is_home && tsr_B->home_size != tsr_A->home_size)){
        CTF_free(tsr_B->home_buffer);
      }
      if (tsr_A->is_home){
        tsr_B->home_buffer = tsr_B->data;
        tsr_B->is_home = 1;
      } else {
        if (tsr_B->is_home || tsr_B->home_size != tsr_A->home_size){
          tsr_B->home_buffer = (dtype*)CTF_alloc(tsr_A->home_size);
        }
        tsr_B->is_home = 0;
        memcpy(tsr_B->home_buffer, tsr_A->home_buffer, tsr_A->home_size);
      }
      tsr_B->has_home = 1;
    } else {
      if (tsr_B->has_home && !tsr_B->is_home){
        CTF_free(tsr_B->home_buffer);
      }
      tsr_B->has_home = 0;
      tsr_B->is_home = 0;
    }
    tsr_B->home_size = tsr_A->home_size;
#endif
    memcpy(tsr_B->data, tsr_A->data, sizeof(dtype)*tsr_A->size);
  } else {
    if (tsr_B->is_mapped){
      CTF_free(tsr_B->data);
      CTF_alloc_ptr(tsr_A->size*sizeof(tkv_pair<dtype>),
                    (void**)&tsr_B->pairs);
    } else {
      if (tsr_B->size < tsr_A->size || tsr_B->size > 2*tsr_A->size){
        CTF_free(tsr_B->pairs);
        CTF_alloc_ptr(tsr_A->size*sizeof(tkv_pair<dtype>),
                      (void**)&tsr_B->pairs);
      }
    }
    memcpy(tsr_B->pairs, tsr_A->pairs,
           sizeof(tkv_pair<dtype>)*tsr_A->size);
  }
  if (tsr_B->is_inner_mapped || tsr_B->is_folded){
    del_tsr(tsr_B->rec_tid);
  }
  tsr_B->is_inner_mapped = tsr_A->is_inner_mapped;
  if (tsr_A->is_inner_mapped){
    int new_tensor_id;
    tensor<dtype> * itsr = tensors[tsr_A->rec_tid];
    define_tensor(tsr_A->ndim, itsr->edge_len, tsr_A->sym,
                  &new_tensor_id, 0);
    cpy_tsr(tsr_A->rec_tid, new_tensor_id);
    tsr_B->rec_tid = new_tensor_id;
  }
  tsr_B->is_folded = tsr_A->is_folded;
  if (tsr_A->is_folded){
    int new_tensor_id;
    tensor<dtype> * itsr = tensors[tsr_A->rec_tid];
    define_tensor(tsr_A->ndim, itsr->edge_len, tsr_A->sym,
                  &new_tensor_id, 0);
    CTF_alloc_ptr(sizeof(int)*tsr_A->ndim,
                  (void**)&tsr_B->inner_ordering);
    for (i=0; i<tsr_A->ndim; i++){
      tsr_B->inner_ordering[i] = tsr_A->inner_ordering[i];
    }
    tsr_B->rec_tid = new_tensor_id;
  }

  if (tsr_A->ndim != tsr_B->ndim){
    CTF_free(tsr_B->edge_len);
    if (tsr_B->is_padded)
      CTF_free(tsr_B->padding);
    CTF_free(tsr_B->sym);
    CTF_free(tsr_B->sym_table);
    if (tsr_B->is_mapped)
      CTF_free(tsr_B->edge_map);

    /* each allocation must go through the address of the member pointer */
    CTF_alloc_ptr(tsr_A->ndim*sizeof(int), (void**)&tsr_B->edge_len);
    CTF_alloc_ptr(tsr_A->ndim*sizeof(int), (void**)&tsr_B->padding);
    CTF_alloc_ptr(tsr_A->ndim*sizeof(int), (void**)&tsr_B->sym);
    CTF_alloc_ptr(tsr_A->ndim*tsr_A->ndim*sizeof(int),
                  (void**)&tsr_B->sym_table);
    CTF_alloc_ptr(tsr_A->ndim*sizeof(mapping), (void**)&tsr_B->edge_map);
  }

  tsr_B->ndim = tsr_A->ndim;
  memcpy(tsr_B->edge_len, tsr_A->edge_len, sizeof(int)*tsr_A->ndim);
  tsr_B->is_padded = tsr_A->is_padded;
  if (tsr_A->is_padded)
    memcpy(tsr_B->padding, tsr_A->padding, sizeof(int)*tsr_A->ndim);
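  /* replicate the symmetry descriptors and, when A is mapped, its
     distribution mapping, so that B can later be remapped independently */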
  memcpy(tsr_B->sym, tsr_A->sym, sizeof(int)*tsr_A->ndim);
  memcpy(tsr_B->sym_table, tsr_A->sym_table,
         sizeof(int)*tsr_A->ndim*tsr_A->ndim);
  tsr_B->is_mapped = tsr_A->is_mapped;
  tsr_B->is_cyclic = tsr_A->is_cyclic;
  tsr_B->itopo     = tsr_A->itopo;
  if (tsr_A->is_mapped)
    copy_mapping(tsr_A->ndim, tsr_A->edge_map, tsr_B->edge_map);
  tsr_B->size = tsr_A->size;

  return DIST_TENSOR_SUCCESS;
}

/**
 * \brief permutes a tensor along each dimension, skipping values whose
 *        permutation entry is set to -1; generalizes slice. Exactly one of
 *        permutation_A and permutation_B must be NULL; for a multi-world
 *        read, the parent-world tensor must not be the one permuted.
 *
 * \param[in] tid_A id of first tensor
 * \param[in] permutation_A permutation for each edge length of the first
 *            tensor, can be NULL, and subpointers can be NULL
 * \param[in] alpha scaling factor for the A tensor
 * \param[in] dt_A the dist_tensor object (world) on which A lives
 * \param[in] tid_B analogous to tid_A
 * \param[in] permutation_B analogous to permutation_A
 * \param[in] beta analogous to alpha
 * \param[in] dt_B analogous to dt_A
 */
template<typename dtype>
int dist_tensor<dtype>::permute_tensor(int const            tid_A,
                                       int * const *        permutation_A,
                                       dtype const          alpha,
                                       dist_tensor<dtype> * dt_A,
                                       int const            tid_B,
                                       int * const *        permutation_B,
                                       dtype const          beta,
                                       dist_tensor<dtype> * dt_B){
  long_int sz_A, blk_sz_A, sz_B, blk_sz_B;
  /* initialize to NULL: the zero-dimension paths below pass these through
     without reading any pairs */
  tkv_pair<dtype> * all_data_A = NULL;
  tkv_pair<dtype> * all_data_B = NULL;
  tensor<dtype> * tsr_A, * tsr_B;
  int ndim_A, * len_A, * sym_A;
  int ndim_B, * len_B, * sym_B;
  int ret;

  tsr_A = dt_A->tensors[tid_A];
  tsr_B = dt_B->tensors[tid_B];

  dt_A->get_tsr_info(tid_A, &ndim_A, &len_A, &sym_A);
  dt_B->get_tsr_info(tid_B, &ndim_B, &len_B, &sym_B);

  if (permutation_B != NULL){
    LIBT_ASSERT(permutation_A == NULL);
    LIBT_ASSERT(dt_B->get_global_comm()->np <= dt_A->get_global_comm()->np);
    if (ndim_B == 0 || tsr_B->has_zero_edge_len){
      blk_sz_B = 0;
    } else {
      dt_B->read_local_pairs(tid_B, &sz_B, &all_data_B);
      //permute all_data_B
      permute_keys(ndim_B, sz_B, len_B, len_A,
                   permutation_B, all_data_B, &blk_sz_B);
    }
    ret = dt_A->write_pairs(tid_A, blk_sz_B, 1.0, 0.0, all_data_B, 'r');
    if (blk_sz_B > 0)
      depermute_keys(ndim_B, blk_sz_B, len_B, len_A,
                     permutation_B, all_data_B);
    all_data_A = all_data_B;
    blk_sz_A = blk_sz_B;
  } else {
    LIBT_ASSERT(permutation_A != NULL);
    LIBT_ASSERT(permutation_B == NULL);
    LIBT_ASSERT(dt_B->get_global_comm()->np >= dt_A->get_global_comm()->np);
    if (ndim_A == 0 || tsr_A->has_zero_edge_len){
      blk_sz_A = 0;
    } else {
      dt_A->read_local_pairs(tid_A, &sz_A, &all_data_A);
      //permute all_data_A
      permute_keys(ndim_A, sz_A, len_A, len_B,
                   permutation_A, all_data_A, &blk_sz_A);
    }
  }
  ret = dt_B->write_pairs(tid_B, blk_sz_A, alpha, beta, all_data_A, 'w');

  CTF_free(len_A);
  CTF_free(len_B);
  CTF_free(sym_A);
  CTF_free(sym_B);
  if (blk_sz_A > 0)
    CTF_free(all_data_A);

  return ret;
}

/**
 * Add tensor data from A to a block of B,
 * B[offsets_B:ends_B] = beta*B[offsets_B:ends_B]
 *                       + alpha*A[offsets_A:ends_A]
 * \param[in] tid_A id of tensor A
 * \param[in] offsets_A closest corner of tensor block in A
 * \param[in] ends_A furthest corner of tensor block in A
 * \param[in] alpha scaling factor of A
 * \param[in] dt_A the dist_tensor object (world) on which A lives
 * \param[in] tid_B id of tensor B
 * \param[in] offsets_B closest corner of tensor block in B
 * \param[in] ends_B furthest corner of tensor block in B
 * \param[in] beta scaling factor of B
 * \param[in] dt_B the dist_tensor object (world) on which B lives
 */
template<typename dtype>
int
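/* Implementation note: when B's world is smaller, B's local pairs are read,
   depadded down to the [offsets_B,ends_B) block, re-keyed into A's index
   space, and written into A; the A-block is then read back, re-keyed into
   B's index space, and written into B scaled by alpha and beta. */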
dist_tensor<dtype>::slice_tensor(int const tid_A, int const * offsets_A, int const * ends_A, dtype const alpha, dist_tensor<dtype> * dt_A, int const tid_B, int const * offsets_B, int const * ends_B, dtype const beta, dist_tensor<dtype> * dt_B){ long_int i, sz_A, blk_sz_A, sz_B, blk_sz_B; tkv_pair<dtype> * all_data_A, * blk_data_A; tkv_pair<dtype> * all_data_B, * blk_data_B; tensor<dtype> * tsr_A, * tsr_B; int ndim_A, * len_A, * sym_A; int ndim_B, * len_B, * sym_B; int ret; tsr_A = dt_A->tensors[tid_A]; tsr_B = dt_B->tensors[tid_B]; dt_A->get_tsr_info(tid_A, &ndim_A, &len_A, &sym_A); dt_B->get_tsr_info(tid_B, &ndim_B, &len_B, &sym_B); int * padding_A = (int*)CTF_alloc(sizeof(int)*tsr_A->ndim); int * toffset_A = (int*)CTF_alloc(sizeof(int)*tsr_A->ndim); int * padding_B = (int*)CTF_alloc(sizeof(int)*tsr_B->ndim); int * toffset_B = (int*)CTF_alloc(sizeof(int)*tsr_B->ndim); /*dt_B->read_local_pairs(tid_B, &sz_B, &all_data_B); CTF_alloc_ptr(sizeof(tkv_pair<dtype>)*sz_B, (void**)&blk_data_B); for (i=0; i<tsr_B->ndim; i++){ padding_B[i] = len_B[i] - ends_B[i]; } depad_tsr(ndim_B, sz_B, ends_B, sym_B, padding_B, offsets_B, all_data_B, blk_data_B, &blk_sz_B); if (sz_B > 0) CTF_free(all_data_B);*/ if (dt_A == dt_B){ //use remap_tensor() } //else { if (dt_B->get_global_comm()->np < dt_A->get_global_comm()->np){ if (ndim_B == 0 || tsr_B->has_zero_edge_len){ blk_sz_B = 0; } else { dt_B->read_local_pairs(tid_B, &sz_B, &all_data_B); CTF_alloc_ptr(sizeof(tkv_pair<dtype>)*sz_B, (void**)&blk_data_B); for (i=0; i<tsr_B->ndim; i++){ padding_B[i] = len_B[i] - ends_B[i]; } depad_tsr(ndim_B, sz_B, ends_B, sym_B, padding_B, offsets_B, all_data_B, blk_data_B, &blk_sz_B); if (sz_B > 0) CTF_free(all_data_B); for (i=0; i<ndim_B; i++){ toffset_B[i] = -offsets_B[i]; padding_B[i] = ends_B[i]-offsets_B[i]-len_B[i]; } pad_key(ndim_B, blk_sz_B, len_B, padding_B, blk_data_B, toffset_B); for (i=0; i<ndim_A; i++){ toffset_A[i] = ends_A[i] - offsets_A[i]; padding_A[i] = len_A[i] - toffset_A[i]; } pad_key(ndim_A, blk_sz_B, toffset_A, padding_A, blk_data_B, offsets_A); } ret = dt_A->write_pairs(tid_A, blk_sz_B, 1.0, 0.0, blk_data_B, 'r'); all_data_A = blk_data_B; sz_A = blk_sz_B; } else { dt_A->read_local_pairs(tid_A, &sz_A, &all_data_A); } if (ndim_A == 0 || tsr_A->has_zero_edge_len){ blk_sz_A = 0; } else { CTF_alloc_ptr(sizeof(tkv_pair<dtype>)*sz_A, (void**)&blk_data_A); for (i=0; i<tsr_A->ndim; i++){ padding_A[i] = len_A[i] - ends_A[i]; } depad_tsr(ndim_A, sz_A, ends_A, sym_A, padding_A, offsets_A, all_data_A, blk_data_A, &blk_sz_A); if (sz_A > 0) CTF_free(all_data_A); for (i=0; i<ndim_A; i++){ toffset_A[i] = -offsets_A[i]; padding_A[i] = ends_A[i]-offsets_A[i]-len_A[i]; } pad_key(ndim_A, blk_sz_A, len_A, padding_A, blk_data_A, toffset_A); for (i=0; i<ndim_B; i++){ toffset_B[i] = ends_B[i] - offsets_B[i]; padding_B[i] = len_B[i] - toffset_B[i]; } pad_key(ndim_B, blk_sz_A, toffset_B, padding_B, blk_data_A, offsets_B); } ret = dt_B->write_pairs(tid_B, blk_sz_A, alpha, beta, blk_data_A, 'w'); CTF_free(len_A); CTF_free(len_B); CTF_free(sym_A); CTF_free(sym_B); if (blk_sz_A > 0) CTF_free(blk_data_A); CTF_free(padding_A); CTF_free(padding_B); CTF_free(toffset_A); CTF_free(toffset_B); return ret; } /** * \brief Read or write tensor data by <key, value> pairs where key is the * global index for the value. 
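 * Keys encode the global index with dimension 0 varying fastest:
 * k = i_0 + i_1*len_0 + i_2*len_0*len_1 + ... (the inverse of the
 * index-decoding loop in print_tsr).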
 * \param[in] tensor_id tensor handle
 * \param[in] num_pair number of pairs to write
 * \param[in] alpha multiplier for new value
 * \param[in] beta multiplier for old value
 * \param[in,out] mapped_data pairs to read/write
 * \param[in] rw whether to read (r) or write (w)
 */
template<typename dtype>
int dist_tensor<dtype>::write_pairs(int const         tensor_id,
                                    long_int const    num_pair,
                                    dtype const       alpha,
                                    dtype const       beta,
                                    tkv_pair<dtype> * mapped_data,
                                    char const        rw){
  int i, num_virt, need_pad;
  int * phys_phase, * virt_phase, * bucket_lda;
  int * virt_phys_rank;
  mapping * map;
  tensor<dtype> * tsr;

#if DEBUG >= 1
  int ndim, * len, * sym;
  get_tsr_info(tensor_id, &ndim, &len, &sym);
  if (global_comm->rank == 0){
    if (rw == 'w')
      printf("Writing data to tensor %d\n", tensor_id);
    else
      printf("Reading data from tensor %d\n", tensor_id);
    print_map(stdout, tensor_id, 0, 0);
  }
  long_int total_tsr_size = 1;
  for (i=0; i<ndim; i++){
    total_tsr_size *= len[i];
  }
  for (i=0; i<num_pair; i++){
    LIBT_ASSERT(mapped_data[i].k >= 0);
    LIBT_ASSERT(mapped_data[i].k < total_tsr_size);
  }
  CTF_free(len);
  CTF_free(sym);
#endif

  tsr = tensors[tensor_id];

  if (tsr->has_zero_edge_len) return DIST_TENSOR_SUCCESS;
  TAU_FSTART(write_pairs);
  unmap_inner(tsr);
  set_padding(tsr);

  if (tsr->is_mapped){
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&phys_phase);
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&virt_phys_rank);
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&bucket_lda);
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&virt_phase);
    num_virt = 1;
    need_pad = tsr->is_padded;
    /* Setup rank/phase arrays, given current mapping */
    for (i=0; i<tsr->ndim; i++){
      map               = tsr->edge_map + i;
      phys_phase[i]     = calc_phase(map);
      virt_phase[i]     = phys_phase[i]/calc_phys_phase(map);
      virt_phys_rank[i] = calc_phys_rank(map, &topovec[tsr->itopo])
                          *virt_phase[i];
      num_virt          = num_virt*virt_phase[i];
      if (map->type == PHYSICAL_MAP)
        bucket_lda[i] = topovec[tsr->itopo].lda[map->cdt];
      else
        bucket_lda[i] = 0;
    }
    wr_pairs_layout(tsr->ndim,
                    global_comm->np,
                    num_pair,
                    alpha,
                    beta,
                    need_pad,
                    rw,
                    num_virt,
                    tsr->sym,
                    tsr->edge_len,
                    tsr->padding,
                    phys_phase,
                    virt_phase,
                    virt_phys_rank,
                    bucket_lda,
                    mapped_data,
                    tsr->data,
                    global_comm);
    CTF_free(phys_phase);
    CTF_free(virt_phys_rank);
    CTF_free(bucket_lda);
    CTF_free(virt_phase);
  } else {
    DEBUG_PRINTF("SHOULD NOT BE HERE, ALWAYS MAP ME\n");
    TAU_FSTOP(write_pairs);
    return DIST_TENSOR_ERROR;
  }
  TAU_FSTOP(write_pairs);
  return DIST_TENSOR_SUCCESS;
}

/**
 * \brief Retrieve local data in the form of key value pairs
 * \param[in] tensor_id tensor handle
 * \param[out] num_pair number of pairs read
 * \param[out] mapped_data pairs read
 */
template<typename dtype>
int dist_tensor<dtype>::read_local_pairs(int                tensor_id,
                                         long_int *         num_pair,
                                         tkv_pair<dtype> ** mapped_data){
  int i, num_virt, idx_lyr;
  long_int np;
  int * virt_phase, * virt_phys_rank, * phys_phase;
  tensor<dtype> * tsr;
  tkv_pair<dtype> * pairs;
  mapping * map;

  TAU_FSTART(read_local_pairs);
  tsr = tensors[tensor_id];
  if (tsr->has_zero_edge_len){
    *num_pair = 0;
    TAU_FSTOP(read_local_pairs);
    return DIST_TENSOR_SUCCESS;
  }
  unmap_inner(tsr);
  set_padding(tsr);

  if (!tsr->is_mapped){
    *num_pair = tsr->size;
    *mapped_data = tsr->pairs;
    TAU_FSTOP(read_local_pairs);
    return DIST_TENSOR_SUCCESS;
  } else {
    np = tsr->size;

    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&virt_phase);
    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&phys_phase);
    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&virt_phys_rank);

    num_virt = 1;
    idx_lyr = global_comm->rank;
    for (i=0; i<tsr->ndim; i++){
      /* Calculate rank and phase arrays */
      map           = tsr->edge_map + i;
      phys_phase[i] = calc_phase(map);
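      /* the virtual phase is the total phase over the physical phase; the
         process-local block count num_virt is the product of the
         per-dimension virtual phases */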
      virt_phase[i]     = phys_phase[i]/calc_phys_phase(map);
      virt_phys_rank[i] = calc_phys_rank(map, &topovec[tsr->itopo])
                          *virt_phase[i];
      num_virt          = num_virt*virt_phase[i];
      if (map->type == PHYSICAL_MAP)
        idx_lyr -= topovec[tsr->itopo].lda[map->cdt]
                   *virt_phys_rank[i]/virt_phase[i];
    }
    if (idx_lyr == 0){
      read_loc_pairs(tsr->ndim, np, tsr->is_padded, num_virt,
                     tsr->sym, tsr->edge_len, tsr->padding,
                     virt_phase, phys_phase, virt_phys_rank, num_pair,
                     tsr->data, &pairs);
      *mapped_data = pairs;
    } else {
      *mapped_data = NULL;
      *num_pair = 0;
    }

    CTF_free((void*)virt_phase);
    CTF_free((void*)phys_phase);
    CTF_free((void*)virt_phys_rank);

    TAU_FSTOP(read_local_pairs);
    return DIST_TENSOR_SUCCESS;
  }
}

/**
 * \brief read entire tensor with each processor (in packed layout).
 *        WARNING: will use a lot of memory.
 * \param[in] tid tensor handle
 * \param[out] num_val number of values read
 * \param[in,out] all_data values read
 * \param[in] is_prealloced set to 1 if all_data is already allocated, 0 by
 *            default
 */
template<typename dtype>
int dist_tensor<dtype>::allread_tsr(int const  tid,
                                    long_int * num_val,
                                    dtype **   all_data,
                                    int const  is_prealloced){
  int numPes;
  int * nXs;
  int nval, n, i;
  int * pXs;
  tkv_pair<dtype> * my_pairs, * all_pairs;

  numPes = global_comm->np;
  if (tensors[tid]->has_zero_edge_len){
    *num_val = 0;
    return DIST_TENSOR_SUCCESS;
  }
  CTF_alloc_ptr(numPes*sizeof(int), (void**)&nXs);
  CTF_alloc_ptr(numPes*sizeof(int), (void**)&pXs);
  pXs[0] = 0;

  long_int ntt = 0;
  my_pairs = NULL;
  read_local_pairs(tid, &ntt, &my_pairs);
  n = (int)ntt;
  n*=sizeof(tkv_pair<dtype>);
  ALLGATHER(&n, 1, MPI_INT, nXs, 1, MPI_INT, global_comm);
  for (i=1; i<numPes; i++){
    pXs[i] = pXs[i-1]+nXs[i-1];
  }
  nval = pXs[numPes-1] + nXs[numPes-1];
  CTF_alloc_ptr(nval, (void**)&all_pairs);
  /* gather over the library's global communicator, not MPI_COMM_WORLD */
  MPI_Allgatherv(my_pairs, n, MPI_CHAR,
                 all_pairs, nXs, pXs, MPI_CHAR, global_comm->cm);
  nval = nval/sizeof(tkv_pair<dtype>);
  std::sort(all_pairs,all_pairs+nval);
  if (n>0)
    CTF_free(my_pairs);
  /* allocate the output buffer only if the caller did not preallocate it */
  if (!is_prealloced)
    CTF_alloc_ptr(nval*sizeof(dtype), (void**)all_data);
  for (i=0; i<nval; i++){
    (*all_data)[i] = all_pairs[i].d;
  }
  *num_val = (long_int)nval;

  CTF_free(nXs);
  CTF_free(pXs);
  CTF_free(all_pairs);

  return DIST_TENSOR_SUCCESS;
}

template<typename dtype>
int dist_tensor<dtype>::get_max_abs(int const tid,
                                    int const n,
                                    dtype *   data){
  printf("CTF: Currently unable to get largest values of non-double type array, exiting.\n");
  return DIST_TENSOR_ERROR;
}

/**
 * \brief obtains a small number of the biggest elements of the
 *        tensor in sorted order (e.g.
 *        eigenvalues)
 * \param[in] tid index of tensor
 * \param[in] n number of elements to collect
 * \param[in] data output data (should be preallocated to size at least n)
 */
template<>
int dist_tensor<double>::get_max_abs(int const tid,
                                     int const n,
                                     double *  data){
  int i, j, con, np, rank;
  tensor<double> * tsr;
  double val, swp;
  double * recv_data, * merge_data;
  MPI_Status stat;

  CTF_alloc_ptr(n*sizeof(double), (void**)&recv_data);
  CTF_alloc_ptr(n*sizeof(double), (void**)&merge_data);

  tsr = tensors[tid];
  std::fill(data, data+n, get_zero<double>());
  /* insertion pass: keep the n largest local magnitudes, sorted in
     descending order */
  for (i=0; i<tsr->size; i++){
    val = std::abs(tsr->data[i]);
    for (j=0; j<n; j++){
      if (val > data[j]){
        swp = val;
        val = data[j];
        data[j] = swp;
      }
    }
  }
  np = global_comm->np;
  rank = global_comm->rank;
  con = np/2;
  while (con>0){
    if (np%2 == 1) con++;
    if (rank+con < np){
      MPI_Recv(recv_data, n*sizeof(double), MPI_CHAR, rank+con, 0,
               global_comm->cm, &stat);
      i=0, j=0;
      /* both arrays are sorted in descending order; take the larger head
         so the merge keeps the n largest values overall */
      while (i+j<n){
        if (data[i] > recv_data[j]){
          merge_data[i+j] = data[i];
          i++;
        } else {
          merge_data[i+j] = recv_data[j];
          j++;
        }
      }
      memcpy(data, merge_data, sizeof(double)*n);
    } else if (rank-con >= 0 && rank < np){
      MPI_Send(data, n*sizeof(double), MPI_CHAR, rank-con, 0,
               global_comm->cm);
    }
    np = np/2 + (np%2);
    con = np/2;
  }
  MPI_Bcast(data, n*sizeof(double), MPI_CHAR, 0, global_comm->cm);
  CTF_free(merge_data);
  CTF_free(recv_data);
  return DIST_TENSOR_SUCCESS;
}

/* \brief deletes a tensor and deallocs the data */
template<typename dtype>
int dist_tensor<dtype>::del_tsr(int const tid){
  tensor<dtype> * tsr;

  tsr = tensors[tid];
  if (tsr != NULL){
    if (global_comm->rank == 0){
      DPRINTF(1,"Deleting tensor %d\n",tid);
    }
    //unfold_tsr(tsr);
    if (tsr->is_folded){
      del_tsr(tsr->rec_tid);
      CTF_free(tsr->inner_ordering);
    }
    CTF_free(tsr->edge_len);
    if (tsr->is_padded)
      CTF_free(tsr->padding);
    if (tsr->is_scp_padded)
      CTF_free(tsr->scp_padding);
    CTF_free(tsr->sym);
    CTF_free(tsr->sym_table);
    if (tsr->is_mapped){
      if (!tsr->is_data_aliased){
        CTF_free(tsr->data);
        if (tsr->has_home && !tsr->is_home)
          CTF_free(tsr->home_buffer);
      }
      clear_mapping(tsr);
    }
    CTF_free(tsr->edge_map);
    tsr->is_alloced = 0;
    CTF_free(tsr);
    tensors[tid] = NULL;
  }
  return DIST_TENSOR_SUCCESS;
}

/* WARNING: not a standard spec function, know what you are doing */
template<typename dtype>
int dist_tensor<dtype>::elementalize(int const      tid,
                                     int const      x_rank,
                                     int const      x_np,
                                     int const      y_rank,
                                     int const      y_np,
                                     long_int const blk_sz,
                                     dtype *        data){
  tensor<dtype> * tsr;
  int * old_phase, * old_rank, * old_virt_dim, * old_pe_lda, * old_padding;
  int * new_phase, * new_rank, * new_virt_dim, * new_pe_lda, * new_edge_len;
  int * new_padding, * old_edge_len;
  dtype * shuffled_data;
  int repad, is_pad, i, j, pad, my_x_dim, my_y_dim, was_padded;
  long_int old_size;

  /* get_tensors() hands back a vector of tensor pointers */
  std::vector< tensor<dtype>* > * tensors = get_tensors();
  tsr = (*tensors)[tid];
  unmap_inner(tsr);
  set_padding(tsr);

  assert(tsr->is_mapped);
  assert(tsr->ndim == 2);
  assert(tsr->sym[0] == NS);
  assert(tsr->sym[1] == NS);

  save_mapping(tsr, &old_phase, &old_rank, &old_virt_dim, &old_pe_lda,
               &old_size, &was_padded, &old_padding, &old_edge_len,
               &topovec[tsr->itopo]);

  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_phase);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_rank);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_pe_lda);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_virt_dim);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_padding);
  CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&new_edge_len);

  repad = 1;
  is_pad = 1;
  new_phase[0]    = x_np;
  new_rank[0]     = x_rank;
  new_virt_dim[0] = 1;
  new_pe_lda[0]   = 1;

  new_phase[1]    =
y_np; new_rank[1] = y_rank; new_virt_dim[1] = 1; new_pe_lda[1] = x_np; for (j=0; j<tsr->ndim; j++){ new_edge_len[j] = tsr->edge_len[j]; if (tsr->is_padded){ pad = (tsr->edge_len[j]-tsr->padding[j])%new_phase[j]; if (pad != 0) pad = new_phase[j]-pad; if (pad != tsr->padding[j]){ repad = 1; } if (pad != 0) is_pad = 1; new_padding[j] = pad; } else { pad = tsr->edge_len[j]%new_phase[j]; if (pad != 0){ pad = new_phase[j]-pad; is_pad = 1; repad = 1; } new_padding[j] = pad; } } if (false){ padded_reshuffle(tid, tsr->ndim, old_size, new_edge_len, tsr->sym, old_phase, old_rank, old_pe_lda, was_padded, old_padding, old_edge_len, new_phase, new_rank, new_pe_lda, is_pad, new_padding, old_virt_dim, new_virt_dim, tsr->data, &shuffled_data, get_global_comm()); } else { ABORT; /* CTF_alloc_ptr(sizeof(dtype)*tsr->size, (void**)&shuffled_data); cyclic_reshuffle(tsr->ndim, tsr->size, new_edge_len, tsr->sym, old_phase, old_rank, old_pe_lda, new_phase, new_rank, new_pe_lda, old_virt_dim, new_virt_dim, tsr->data, shuffled_data, get_global_comm()); new_blk_sz = tsr->size; new_padding = tsr->padding;*/ } if (!is_pad && !repad){ // assert(new_blk_sz == blk_sz); memcpy(data, shuffled_data, blk_sz*sizeof(dtype)); } else { my_x_dim = (new_edge_len[0]-new_padding[0])/x_np; if (x_rank < (new_edge_len[0]-new_padding[0])%x_np) my_x_dim++; my_y_dim = (new_edge_len[1]-new_padding[1])/y_np; if (y_rank < (new_edge_len[1]-new_padding[1])%y_np) my_y_dim++; assert(my_x_dim*my_y_dim == blk_sz); for (i=0; i<my_y_dim; i++){ for (j=0; j<my_x_dim; j++){ data[j+i*my_x_dim] = shuffled_data[j+i*new_edge_len[0]]; } } } CTF_free((void*)new_phase); CTF_free((void*)new_rank); CTF_free((void*)new_virt_dim); CTF_free((void*)new_edge_len); CTF_free((void*)shuffled_data); return DIST_TENSOR_SUCCESS; } /* \brief set the tensor to zero, called internally for each defined tensor * \param tensor_id id of tensor to set to zero */ template<typename dtype> int dist_tensor<dtype>::set_zero_tsr(int tensor_id){ tensor<dtype> * tsr; int * restricted; int i, map_success, btopo; uint64_t nvirt, bnvirt; uint64_t memuse, bmemuse; tsr = tensors[tensor_id]; if (tsr->is_mapped){ std::fill(tsr->data, tsr->data + tsr->size, get_zero<dtype>()); } else { if (tsr->pairs != NULL){ for (i=0; i<tsr->size; i++) tsr->pairs[i].d = get_zero<dtype>(); } else { CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&restricted); // memset(restricted, 0, tsr->ndim*sizeof(int)); /* Map the tensor if necessary */ bnvirt = UINT64_MAX, btopo = -1; bmemuse = UINT64_MAX; for (i=global_comm->rank; i<(int)topovec.size(); i+=global_comm->np){ clear_mapping(tsr); set_padding(tsr); memset(restricted, 0, tsr->ndim*sizeof(int)); map_success = map_tensor(topovec[i].ndim, tsr->ndim, tsr->edge_len, tsr->sym_table, restricted, topovec[i].dim_comm, NULL, 0, tsr->edge_map); if (map_success == DIST_TENSOR_ERROR) { LIBT_ASSERT(0); return DIST_TENSOR_ERROR; } else if (map_success == DIST_TENSOR_SUCCESS){ tsr->itopo = i; set_padding(tsr); memuse = (uint64_t)tsr->size; if ((uint64_t)memuse >= proc_bytes_available()){ DPRINTF(1,"Not enough memory to map tensor on topo %d\n", i); continue; } nvirt = (uint64_t)calc_nvirt(tsr); LIBT_ASSERT(nvirt != 0); if (btopo == -1 || nvirt < bnvirt){ bnvirt = nvirt; btopo = i; bmemuse = memuse; } else if (nvirt == bnvirt && memuse < bmemuse){ btopo = i; bmemuse = memuse; } } } if (btopo == -1) bnvirt = UINT64_MAX; /* pick lower dimensional mappings, if equivalent */ ///btopo = get_best_topo(bnvirt, btopo, global_comm, 0, bmemuse); btopo = get_best_topo(bmemuse, btopo, 
global_comm);
        if (btopo == -1 || btopo == INT_MAX) {
          if (global_comm->rank==0)
            printf("ERROR: FAILED TO MAP TENSOR\n");
          return DIST_TENSOR_ERROR;
        }

        memset(restricted, 0, tsr->ndim*sizeof(int));
        clear_mapping(tsr);
        set_padding(tsr);
        map_success = map_tensor(topovec[btopo].ndim, tsr->ndim,
                                 tsr->edge_len, tsr->sym_table, restricted,
                                 topovec[btopo].dim_comm, NULL, 0,
                                 tsr->edge_map);
        LIBT_ASSERT(map_success == DIST_TENSOR_SUCCESS);

        tsr->itopo = btopo;

        CTF_free(restricted);

        tsr->is_mapped = 1;
        set_padding(tsr);

#if 0
        CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&phys_phase);
        CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&sub_edge_len);

        /* Pad the tensor */
        need_pad = 1;
        nvirt = 1;
        for (i=0; i<tsr->ndim; i++){
          if (tsr->edge_map[i].type == PHYSICAL_MAP){
            phys_phase[i] = tsr->edge_map[i].np;
            if (tsr->edge_map[i].has_child){
              phys_phase[i] = phys_phase[i]*tsr->edge_map[i].child->np;
              nvirt *= tsr->edge_map[i].child->np;
            }
          } else {
            LIBT_ASSERT(tsr->edge_map[i].type == VIRTUAL_MAP);
            phys_phase[i] = tsr->edge_map[i].np;
            nvirt *= tsr->edge_map[i].np;
          }
          if (tsr->edge_len[i] % phys_phase[i] != 0){
            need_pad = 1;
          }
        }

        old_size = packed_size(tsr->ndim, tsr->edge_len,
                               tsr->sym, tsr->sym_type);

        if (need_pad){
          tsr->is_padded = 1;
          CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&tsr->padding);
          for (i=0; i<tsr->ndim; i++){
            tsr->padding[i] = tsr->edge_len[i] % phys_phase[i];
            if (tsr->padding[i] != 0)
              tsr->padding[i] = phys_phase[i] - tsr->padding[i];
            tsr->edge_len[i] += tsr->padding[i];
          }
        }
        for (i=0; i<tsr->ndim; i++){
          sub_edge_len[i] = tsr->edge_len[i]/phys_phase[i];
        }
        /* Alloc and set the data to zero */
        DEBUG_PRINTF("tsr->size = nvirt = %"PRIu64" * packed_size = %"PRIu64"\n",
                     (uint64_t)nvirt,
                     packed_size(tsr->ndim, sub_edge_len,
                                 tsr->sym, tsr->sym_type));
        if (global_comm->rank == 0){
          printf("Tensor %d initially mapped with virtualization factor of %"PRIu64"\n",tensor_id,nvirt);
        }
        tsr->size = nvirt*packed_size(tsr->ndim, sub_edge_len,
                                      tsr->sym, tsr->sym_type);
        if (global_comm->rank == 0){
          printf("Tensor %d is of size %"PRId64", has factor of %lf growth due to padding\n",
                 tensor_id, tsr->size,
                 global_comm->np*(tsr->size/(double)old_size));
        }
#endif
#ifdef HOME_CONTRACT
        if (tsr->ndim > 0){
          tsr->home_size = tsr->size; //MAX(1024+tsr->size, 1.20*tsr->size);
          tsr->is_home = 1;
          tsr->has_home = 1;
          DPRINTF(3,"Initial size of tensor %d is %"PRId64",",tensor_id,tsr->size);
          CTF_alloc_ptr(tsr->home_size*sizeof(dtype), (void**)&tsr->home_buffer);
          tsr->data = tsr->home_buffer;
        } else {
          CTF_alloc_ptr(tsr->size*sizeof(dtype), (void**)&tsr->data);
        }
#else
        CTF_mst_alloc_ptr(tsr->size*sizeof(dtype), (void**)&tsr->data);
#endif
#if DEBUG >= 2
        if (global_comm->rank == 0)
          printf("Tensor %d set to zero with mapping:\n", tensor_id);
        print_map(stdout, tensor_id);
#endif
        std::fill(tsr->data, tsr->data + tsr->size, get_zero<dtype>());
/*      CTF_free(phys_phase);
        CTF_free(sub_edge_len);*/
    }
  }
  return DIST_TENSOR_SUCCESS;
}

/*
 * \brief print tensor tid to stream
 *        WARNING: serializes ALL data to ONE processor
 * \param stream output stream (e.g. stdout)
 * \param tid tensor handle
 * \param cutoff print only values of absolute value greater than cutoff
 */
template<typename dtype>
int dist_tensor<dtype>::print_tsr(FILE * stream, int const tid, double cutoff) {
  tensor<dtype> const * tsr;
  int i, j;
  long_int my_sz, tot_sz =0;
  int * recvcnts, * displs, * adj_edge_len, * idx_arr;
  tkv_pair<dtype> * my_data;
  tkv_pair<dtype> * all_data;
  key k;

  print_map(stdout, tid, 1, 0);

  tsr = tensors[tid];

  my_sz = 0;
  read_local_pairs(tid, &my_sz, &my_data);

  if (global_comm->rank == 0){
    CTF_alloc_ptr(global_comm->np*sizeof(int), (void**)&recvcnts);
    CTF_alloc_ptr(global_comm->np*sizeof(int), (void**)&displs);
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&adj_edge_len);
    CTF_alloc_ptr(tsr->ndim*sizeof(int), (void**)&idx_arr);

    if (tsr->is_padded){
      for (i=0; i<tsr->ndim; i++){
        adj_edge_len[i] = tsr->edge_len[i] - tsr->padding[i];
      }
    } else {
      memcpy(adj_edge_len, tsr->edge_len, tsr->ndim*sizeof(int));
    }
  }

  GATHER(&my_sz, 1, COMM_INT_T, recvcnts, 1, COMM_INT_T, 0, global_comm);

  if (global_comm->rank == 0){
    for (i=0; i<global_comm->np; i++){
      recvcnts[i] *= sizeof(tkv_pair<dtype>);
    }
    displs[0] = 0;
    for (i=1; i<global_comm->np; i++){
      displs[i] = displs[i-1] + recvcnts[i-1];
    }
    tot_sz = (displs[global_comm->np-1]
              + recvcnts[global_comm->np-1])/sizeof(tkv_pair<dtype>);
    CTF_alloc_ptr(tot_sz*sizeof(tkv_pair<dtype>), (void**)&all_data);
  }

  if (my_sz == 0) my_data = NULL;
  GATHERV(my_data, my_sz*sizeof(tkv_pair<dtype>), COMM_CHAR_T,
          all_data, recvcnts, displs, COMM_CHAR_T, 0, global_comm);

  if (global_comm->rank == 0){
    std::sort(all_data, all_data + tot_sz);
    for (i=0; i<tot_sz; i++){
      if (std::abs(all_data[i].d) > cutoff) {
        k = all_data[i].k;
        for (j=0; j<tsr->ndim; j++){
          //idx_arr[tsr->ndim-j-1] = k%adj_edge_len[j];
          idx_arr[j] = k%adj_edge_len[j];
          k = k/adj_edge_len[j];
        }
        for (j=0; j<tsr->ndim; j++){
          fprintf(stream,"[%d]",idx_arr[j]);
        }
        fprintf(stream," <%20.14E>\n",GET_REAL(all_data[i].d));
      }
    }
    CTF_free(recvcnts);
    CTF_free(displs);
    CTF_free(adj_edge_len);
    CTF_free(idx_arr);
    CTF_free(all_data);
  }
  //COMM_BARRIER(global_comm);
  return DIST_TENSOR_SUCCESS;
}

/*
 * \brief print tensors tid_A and tid_B side-by-side to stream
 *        WARNING: serializes ALL data to ONE processor
 * \param stream output stream (e.g. stdout)
 * \param tid_A first tensor handle
 * \param tid_B second tensor handle
 */
template<typename dtype>
int dist_tensor<dtype>::compare_tsr(FILE * stream, int const tid_A, int const tid_B, double cutoff) {
  tensor<dtype> const * tsr_A;
  int i, j;
  long_int my_sz, tot_sz =0, my_sz_B;
  int * recvcnts, * displs, * adj_edge_len, * idx_arr;
  tkv_pair<dtype> * my_data_A;
  tkv_pair<dtype> * my_data_B;
  tkv_pair<dtype> * all_data_A;
  tkv_pair<dtype> * all_data_B;
  key k;

  print_map(stdout, tid_A, 1, 0);
  print_map(stdout, tid_B, 1, 0);

  tsr_A = tensors[tid_A];

  my_sz = 0;
  read_local_pairs(tid_A, &my_sz, &my_data_A);
  my_sz_B = 0;
  read_local_pairs(tid_B, &my_sz_B, &my_data_B);
  assert(my_sz == my_sz_B);

  if (global_comm->rank == 0){
    CTF_alloc_ptr(global_comm->np*sizeof(int), (void**)&recvcnts);
    CTF_alloc_ptr(global_comm->np*sizeof(int), (void**)&displs);
    CTF_alloc_ptr(tsr_A->ndim*sizeof(int), (void**)&adj_edge_len);
    CTF_alloc_ptr(tsr_A->ndim*sizeof(int), (void**)&idx_arr);

    if (tsr_A->is_padded){
      for (i=0; i<tsr_A->ndim; i++){
        adj_edge_len[i] = tsr_A->edge_len[i] - tsr_A->padding[i];
      }
    } else {
      memcpy(adj_edge_len, tsr_A->edge_len, tsr_A->ndim*sizeof(int));
    }
  }

  GATHER(&my_sz, 1, COMM_INT_T, recvcnts, 1, COMM_INT_T, 0, global_comm);

  if (global_comm->rank == 0){
    for (i=0; i<global_comm->np; i++){
      recvcnts[i] *= sizeof(tkv_pair<dtype>);
    }
    displs[0] = 0;
    for (i=1; i<global_comm->np; i++){
      displs[i] = displs[i-1] + recvcnts[i-1];
    }
    tot_sz = (displs[global_comm->np-1]
              + recvcnts[global_comm->np-1])/sizeof(tkv_pair<dtype>);
    CTF_alloc_ptr(tot_sz*sizeof(tkv_pair<dtype>), (void**)&all_data_A);
    CTF_alloc_ptr(tot_sz*sizeof(tkv_pair<dtype>), (void**)&all_data_B);
  }

  if (my_sz == 0) my_data_A = my_data_B = NULL;
  GATHERV(my_data_A, my_sz*sizeof(tkv_pair<dtype>), COMM_CHAR_T,
          all_data_A, recvcnts, displs, COMM_CHAR_T, 0, global_comm);
  GATHERV(my_data_B, my_sz*sizeof(tkv_pair<dtype>), COMM_CHAR_T,
          all_data_B, recvcnts, displs, COMM_CHAR_T, 0, global_comm);

  if (global_comm->rank == 0){
    std::sort(all_data_A, all_data_A + tot_sz);
    std::sort(all_data_B, all_data_B + tot_sz);
    for (i=0; i<tot_sz; i++){
      if (std::abs(all_data_A[i].d) > cutoff ||
          std::abs(all_data_B[i].d) > cutoff) {
        k = all_data_A[i].k;
        for (j=0; j<tsr_A->ndim; j++){
          //idx_arr[tsr_A->ndim-j-1] = k%adj_edge_len[j];
          idx_arr[j] = k%adj_edge_len[j];
          k = k/adj_edge_len[j];
        }
        for (j=0; j<tsr_A->ndim; j++){
          fprintf(stream,"[%d]",idx_arr[j]);
        }
        fprintf(stream," <%20.14E> <%20.14E>\n",
                GET_REAL(all_data_A[i].d),GET_REAL(all_data_B[i].d));
      }
    }
    CTF_free(recvcnts);
    CTF_free(displs);
    CTF_free(adj_edge_len);
    CTF_free(idx_arr);
    CTF_free(all_data_A);
    CTF_free(all_data_B);
  }
  //COMM_BARRIER(global_comm);
  return DIST_TENSOR_SUCCESS;
}

/*
 * \brief print mapping of tensor tid to stream
 * \param stream output stream (e.g. stdout)
 * \param tid tensor handle
 */
template<typename dtype>
int dist_tensor<dtype>::print_map(FILE *    stream,
                                  int const tid,
                                  int const all,
                                  int const is_inner) const{
  tensor<dtype> const * tsr;
  tsr = tensors[tid];

  if (!all || global_comm->rank == 0){
    /* honor the requested output stream */
    tsr->print_map(stream);
  }
  return DIST_TENSOR_SUCCESS;
}

/**
 * \brief sets the padded area of a tensor to zero, needed after contractions
 * \param[in] tensor_id identifies tensor
 */
template<typename dtype>
int dist_tensor<dtype>::zero_out_padding(int const tensor_id){
  int i, num_virt, idx_lyr;
  long_int np;
  int * virt_phase, * virt_phys_rank, * phys_phase;
  tensor<dtype> * tsr;
  mapping * map;

  TAU_FSTART(zero_out_padding);

  tsr = tensors[tensor_id];
  if (tsr->has_zero_edge_len){
    TAU_FSTOP(zero_out_padding);
    return DIST_TENSOR_SUCCESS;
  }
  unmap_inner(tsr);
  set_padding(tsr);

  if (!tsr->is_mapped){
    TAU_FSTOP(zero_out_padding);
    return DIST_TENSOR_SUCCESS;
  } else {
    np = tsr->size;

    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&virt_phase);
    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&phys_phase);
    CTF_alloc_ptr(sizeof(int)*tsr->ndim, (void**)&virt_phys_rank);

    num_virt = 1;
    idx_lyr = global_comm->rank;
    for (i=0; i<tsr->ndim; i++){
      /* Calculate rank and phase arrays */
      map               = tsr->edge_map + i;
      phys_phase[i]     = calc_phase(map);
      virt_phase[i]     = phys_phase[i]/calc_phys_phase(map);
      virt_phys_rank[i] = calc_phys_rank(map, &topovec[tsr->itopo])
                          *virt_phase[i];
      num_virt          = num_virt*virt_phase[i];
      if (map->type == PHYSICAL_MAP)
        idx_lyr -= topovec[tsr->itopo].lda[map->cdt]
                   *virt_phys_rank[i]/virt_phase[i];
    }
    if (idx_lyr == 0){
      zero_padding(tsr->ndim, np, num_virt,
                   tsr->edge_len, tsr->sym, tsr->padding,
                   phys_phase, virt_phase, virt_phys_rank, tsr->data);
    } else {
      std::fill(tsr->data, tsr->data+np, 0.0);
    }
    CTF_free(virt_phase);
    CTF_free(phys_phase);
    CTF_free(virt_phys_rank);
  }
  TAU_FSTOP(zero_out_padding);
  return DIST_TENSOR_SUCCESS;
}

template<typename dtype>
int dist_tensor<dtype>::print_ctr(CTF_ctr_type_t const * ctype,
                                  dtype const            alpha,
                                  dtype const            beta) const {
  int dim_A, dim_B, dim_C;
  int * sym_A, * sym_B, * sym_C;
  int i,j,max,ex_A, ex_B,ex_C;
  COMM_BARRIER(global_comm);
  if (global_comm->rank == 0){
    printf("Contracting Tensor %d with %d into %d\n",
           ctype->tid_A, ctype->tid_B, ctype->tid_C);
    printf("alpha = %lf, beta = %lf\n", GET_REAL(alpha), GET_REAL(beta));
    dim_A = get_dim(ctype->tid_A);
    dim_B = get_dim(ctype->tid_B);
    dim_C = get_dim(ctype->tid_C);
    max = dim_A+dim_B+dim_C;
    sym_A = get_sym(ctype->tid_A);
    sym_B = get_sym(ctype->tid_B);
    sym_C = get_sym(ctype->tid_C);
    printf("Contraction index table:\n");
    printf(" A B C\n");
    for (i=0; i<max; i++){
      ex_A=0;
      ex_B=0;
      ex_C=0;
      printf("%d: ",i);
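      /* for index i, list the positions of A, B, and C that carry it; a
         trailing prime marks a position with a symmetry relation */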
      for (j=0; j<dim_A; j++){
        if (ctype->idx_map_A[j] == i){
          ex_A++;
          if (sym_A[j] != NS)
            printf("%d' ",j);
          else
            printf("%d ",j);
        }
      }
      if (ex_A == 0)
        printf(" ");
      if (ex_A == 1)
        printf(" ");
      for (j=0; j<dim_B; j++){
        if (ctype->idx_map_B[j] == i){
          ex_B=1;
          if (sym_B[j] != NS)
            printf("%d' ",j);
          else
            printf("%d ",j);
        }
      }
      if (ex_B == 0)
        printf(" ");
      if (ex_B == 1)
        printf(" ");
      for (j=0; j<dim_C; j++){
        if (ctype->idx_map_C[j] == i){
          ex_C=1;
          if (sym_C[j] != NS)
            printf("%d' ",j);
          else
            printf("%d ",j);
        }
      }
      printf("\n");
      if (ex_A + ex_B + ex_C == 0) break;
    }
    CTF_free(sym_A);
    CTF_free(sym_B);
    CTF_free(sym_C);
  }
  return DIST_TENSOR_SUCCESS;
}

template<typename dtype>
int dist_tensor<dtype>::print_sum(CTF_sum_type_t const * stype,
                                  dtype const            alpha,
                                  dtype const            beta) const {
  int dim_A, dim_B;
  int i,j,max,ex_A,ex_B;
  int * sym_A, * sym_B;
  COMM_BARRIER(global_comm);
  if (global_comm->rank == 0){
    printf("Summing Tensor %lf*%d with %lf*%d into %d\n",
           GET_REAL(alpha), stype->tid_A,
           GET_REAL(beta), stype->tid_B, stype->tid_B);
    dim_A = get_dim(stype->tid_A);
    dim_B = get_dim(stype->tid_B);
    max = dim_A+dim_B; //MAX(MAX((dim_A), (dim_B)), (dim_C));
    sym_A = get_sym(stype->tid_A);
    sym_B = get_sym(stype->tid_B);
    printf("Sum index table:\n");
    printf(" A B \n");
    for (i=0; i<max; i++){
      ex_A=0;
      ex_B=0;
      printf("%d: ",i);
      for (j=0; j<dim_A; j++){
        if (stype->idx_map_A[j] == i){
          ex_A++;
          if (sym_A[j] != NS)
            printf("%d' ",j);
          else
            printf("%d ",j);
        }
      }
      if (ex_A == 0)
        printf(" ");
      if (ex_A == 1)
        printf(" ");
      for (j=0; j<dim_B; j++){
        if (stype->idx_map_B[j] == i){
          ex_B=1;
          if (sym_B[j] != NS)
            printf("%d' ",j);
          else
            printf("%d ",j);
        }
      }
      printf("\n");
      if (ex_A + ex_B == 0) break;
    }
    CTF_free(sym_A);
    CTF_free(sym_B);
  }
  COMM_BARRIER(global_comm);
  return DIST_TENSOR_SUCCESS;
}

template<typename dtype>
int dist_tensor<dtype>::check_contraction(CTF_ctr_type_t const * type){
  int i, num_tot, len;
  int iA, iB, iC;
  int ndim_A, ndim_B, ndim_C;
  int * len_A, * len_B, * len_C;
  int * sym_A, * sym_B, * sym_C;
  int * idx_arr;
  tensor<dtype> * tsr_A, * tsr_B, * tsr_C;

  tsr_A = tensors[type->tid_A];
  tsr_B = tensors[type->tid_B];
  tsr_C = tensors[type->tid_C];

  get_tsr_info(type->tid_A, &ndim_A, &len_A, &sym_A);
  get_tsr_info(type->tid_B, &ndim_B, &len_B, &sym_B);
  get_tsr_info(type->tid_C, &ndim_C, &len_C, &sym_C);

  inv_idx(tsr_A->ndim, type->idx_map_A, tsr_A->edge_map,
          tsr_B->ndim, type->idx_map_B, tsr_B->edge_map,
          tsr_C->ndim, type->idx_map_C, tsr_C->edge_map,
          &num_tot, &idx_arr);

  for (i=0; i<num_tot; i++){
    len = -1;
    iA = idx_arr[3*i+0];
    iB = idx_arr[3*i+1];
    iC = idx_arr[3*i+2];
    if (iA != -1){
      len = len_A[iA];
    }
    if (len != -1 && iB != -1 && len != len_B[iB]){
      if (global_comm->rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %d does not ",
               iA, type->tid_A);
        printf("match the %dth edge length of tensor %d.\n",
               iB, type->tid_B);
      }
      ABORT;
    }
    if (len != -1 && iC != -1 && len != len_C[iC]){
      if (global_comm->rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %d (%d) does not ",
               iA, type->tid_A, len);
        printf("match the %dth edge length of tensor %d (%d).\n",
               iC, type->tid_C, len_C[iC]);
      }
      ABORT;
    }
    if (iB != -1){
      len = len_B[iB];
    }
    if (len != -1 && iC != -1 && len != len_C[iC]){
      if (global_comm->rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %d does not ",
               iB, type->tid_B);
        printf("match the %dth edge length of tensor %d.\n",
               iC, type->tid_C);
      }
      ABORT;
    }
  }
  CTF_free(len_A);
  CTF_free(len_B);
  CTF_free(len_C);
  CTF_free(sym_A);
  CTF_free(sym_B);
  CTF_free(sym_C);
  CTF_free(idx_arr);
  return DIST_TENSOR_SUCCESS;
}
/**
 * \brief checks the edge lengths specified for this sum
 * \param type contains tensor ids and index maps
 */
template<typename dtype>
int dist_tensor<dtype>::check_sum(CTF_sum_type_t const * type){
  return check_sum(type->tid_A, type->tid_B,
                   type->idx_map_A, type->idx_map_B);
}

/**
 * \brief checks the edge lengths specified for this sum
 * \param tid_A id of tensor A
 * \param tid_B id of tensor B
 * \param idx_map_A indices of tensor A
 * \param idx_map_B indices of tensor B
 */
template<typename dtype>
int dist_tensor<dtype>::check_sum(int const   tid_A,
                                  int const   tid_B,
                                  int const * idx_map_A,
                                  int const * idx_map_B){
  int i, num_tot, len;
  int iA, iB;
  int ndim_A, ndim_B;
  int * len_A, * len_B;
  int * sym_A, * sym_B;
  int * idx_arr;
  tensor<dtype> * tsr_A, * tsr_B;

  tsr_A = tensors[tid_A];
  tsr_B = tensors[tid_B];

  get_tsr_info(tid_A, &ndim_A, &len_A, &sym_A);
  get_tsr_info(tid_B, &ndim_B, &len_B, &sym_B);

  inv_idx(tsr_A->ndim, idx_map_A, tsr_A->edge_map,
          tsr_B->ndim, idx_map_B, tsr_B->edge_map,
          &num_tot, &idx_arr);

  for (i=0; i<num_tot; i++){
    len = -1;
    iA = idx_arr[2*i+0];
    iB = idx_arr[2*i+1];
    if (iA != -1){
      len = len_A[iA];
    }
    if (len != -1 && iB != -1 && len != len_B[iB]){
      if (global_comm->rank == 0){
        printf("i = %d Error in sum call: The %dth edge length (%d) of tensor %d does not ",
               i, iA, len, tid_A);
        printf("match the %dth edge length (%d) of tensor %d.\n",
               iB, len_B[iB], tid_B);
      }
      ABORT;
    }
  }
  CTF_free(sym_A);
  CTF_free(sym_B);
  CTF_free(len_A);
  CTF_free(len_B);
  CTF_free(idx_arr);
  return DIST_TENSOR_SUCCESS;
}

template<typename dtype>
void dist_tensor<dtype>::contract_mst(){
  std::list<mem_transfer> tfs = CTF_contract_mst();
  if (tfs.size() > 0 && get_global_comm()->rank == 0){
    DPRINTF(1,"CTF Warning: contracting memory stack\n");
  }
  std::list<mem_transfer>::iterator it;
  int i;
  int j = 0;
  for (it=tfs.begin(); it!=tfs.end(); it++){
    j++;
    /* find the tensor whose data buffer was moved */
    for (i=0; i<(int)tensors.size(); i++){
      if (tensors[i]->data == (dtype*)it->old_ptr){
        tensors[i]->data = (dtype*)it->new_ptr;
        break;
      }
    }
    if (i == (int)tensors.size()){
      printf("CTF ERROR: pointer %d on mst is not tensor data, aborting\n",j);
      LIBT_ASSERT(0);
    }
    /* also fix up any remaining tensors whose data aliases the same buffer */
    for (i=0; i<(int)tensors.size(); i++){
      if (tensors[i]->data == (dtype*)it->old_ptr){
        tensors[i]->data = (dtype*)it->new_ptr;
      }
    }
  }
}

#include "tensor_object.cxx"
#include "dist_tensor_map.cxx"
#include "dist_tensor_op.cxx"
#include "dist_tensor_inner.cxx"
#include "dist_tensor_fold.cxx"
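
/* Usage sketch (illustrative only, not part of the library): shows the
 * typical define/write/read sequence against this interface. The instance
 * name `dt` is a placeholder; a dist_tensor<double> must have been set up
 * elsewhere via initialize().
 *
 *   int tid;
 *   int lens[2] = {4, 4};
 *   int sym[2]  = {NS, NS};
 *   // define a 4x4 nonsymmetric tensor, with its data allocated and zeroed
 *   dt->define_tensor(2, lens, sym, &tid, 1, NULL, 0);
 *
 *   // write one value: the key encodes (i=1, j=2), dimension 0 fastest
 *   tkv_pair<double> pr;
 *   pr.k = 1 + 2*lens[0];
 *   pr.d = 3.14;
 *   dt->write_pairs(tid, 1, 1.0, 0.0, &pr, 'w');
 *
 *   // read back the locally owned <key,value> pairs
 *   long_int np;
 *   tkv_pair<double> * loc;
 *   dt->read_local_pairs(tid, &np, &loc);
 */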