/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#ifndef __DIST_TENSOR_PERMUTE_HXX__
#define __DIST_TENSOR_PERMUTE_HXX__

#include "dist_tensor_internal.h"
#include "cyclopstf.hpp"
#include "../shared/util.h"
#include "../ctr_comm/strp_tsr.h"
#ifdef USE_OMP
#include "omp.h"
#endif

/**
 * \brief calculate the block-sizes of a tensor
 * \param[in] ndim number of dimensions of this tensor
 * \param[in] size size of the locally stored tensor data
 * \param[in] edge_len edge lengths of global tensor
 * \param[in] edge_map mapping of each dimension
 * \param[out] vrt_sz size of virtual block
 * \param[out] vrt_edge_len edge lengths of virtual block
 * \param[out] blk_edge_len edge lengths of local block
 */
inline
void calc_dim(int const        ndim,
              long_int const   size,
              int const *      edge_len,
              mapping const *  edge_map,
              long_int *       vrt_sz,
              int *            vrt_edge_len,
              int *            blk_edge_len){
  long_int vsz, i, cont;
  mapping const * map;
  vsz = size;

  for (i=0; i<ndim; i++){
    if (blk_edge_len != NULL)
      blk_edge_len[i] = edge_len[i];
    vrt_edge_len[i] = edge_len[i];
    map = &edge_map[i];
    do {
      if (blk_edge_len != NULL){
        if (map->type == PHYSICAL_MAP)
          blk_edge_len[i] = blk_edge_len[i] / map->np;
      }
      vrt_edge_len[i] = vrt_edge_len[i] / map->np;
      if (vrt_sz != NULL){
        if (map->type == VIRTUAL_MAP)
          vsz = vsz / map->np;
      }
      if (map->has_child){
        cont = 1;
        map = map->child;
      } else
        cont = 0;
    } while (cont);
  }
  if (vrt_sz != NULL)
    *vrt_sz = vsz;
}

/**
 * \brief invert index map
 * \param[in] ndim_A number of dimensions of A
 * \param[in] idx_A index map of A
 * \param[in] edge_map_A mapping of each dimension of A
 * \param[out] ndim_tot number of total dimensions
 * \param[out] idx_arr ndim_tot index array
 */
inline
void inv_idx(int const        ndim_A,
             int const *      idx_A,
             mapping const *  edge_map_A,
             int *            ndim_tot,
             int **           idx_arr){
  int i, dim_max;

  dim_max = -1;
  for (i=0; i<ndim_A; i++){
    if (idx_A[i] > dim_max) dim_max = idx_A[i];
  }
  dim_max++;

  *ndim_tot = dim_max;
  CTF_alloc_ptr(sizeof(int)*dim_max, (void**)idx_arr);
  std::fill((*idx_arr), (*idx_arr)+dim_max, -1);

  for (i=0; i<ndim_A; i++){
    if ((*idx_arr)[idx_A[i]] == -1)
      (*idx_arr)[idx_A[i]] = i;
    else {
//      if (edge_map_A == NULL || edge_map_A[i].type == PHYSICAL_MAP)
        (*idx_arr)[idx_A[i]] = i;
    }
  }
}

/**
 * \brief invert index map
 * \param[in] ndim_A number of dimensions of A
 * \param[in] idx_A index map of A
 * \param[in] edge_map_A mapping of each dimension of A
 * \param[in] ndim_B number of dimensions of B
 * \param[in] idx_B index map of B
 * \param[in] edge_map_B mapping of each dimension of B
 * \param[out] ndim_tot number of total dimensions
 * \param[out] idx_arr 2*ndim_tot index array
 */
inline
void inv_idx(int const        ndim_A,
             int const *      idx_A,
             mapping const *  edge_map_A,
             int const        ndim_B,
             int const *      idx_B,
             mapping const *  edge_map_B,
             int *            ndim_tot,
             int **           idx_arr){
  int i, dim_max;

  dim_max = -1;
  for (i=0; i<ndim_A; i++){
    if (idx_A[i] > dim_max) dim_max = idx_A[i];
  }
  for (i=0; i<ndim_B; i++){
    if (idx_B[i] > dim_max) dim_max = idx_B[i];
  }
  dim_max++;

  *ndim_tot = dim_max;
  CTF_alloc_ptr(sizeof(int)*2*dim_max, (void**)idx_arr);
  std::fill((*idx_arr), (*idx_arr)+2*dim_max, -1);

  for (i=0; i<ndim_A; i++){
    if ((*idx_arr)[2*idx_A[i]] == -1)
      (*idx_arr)[2*idx_A[i]] = i;
    else {
//      if (edge_map_A == NULL || edge_map_A[i].type == PHYSICAL_MAP)
        (*idx_arr)[2*idx_A[i]] = i;
    }
  }
  for (i=0; i<ndim_B; i++){
    if ((*idx_arr)[2*idx_B[i]+1] == -1)
      (*idx_arr)[2*idx_B[i]+1] = i;
    else {
//      if (edge_map_B == NULL || edge_map_B[i].type == PHYSICAL_MAP)
        (*idx_arr)[2*idx_B[i]+1] = i;
    }
  }
}

/**
 * \brief invert index map
 * \param[in] ndim_A
number of dimensions of A * \param[in] idx_A index map of A * \param[in] edge_map_B mapping of each dimension of A * \param[in] ndim_B number of dimensions of B * \param[in] idx_B index map of B * \param[in] edge_map_B mapping of each dimension of B * \param[in] ndim_C number of dimensions of C * \param[in] idx_C index map of C * \param[in] edge_map_C mapping of each dimension of C * \param[out] ndim_tot number of total dimensions * \param[out] idx_arr 3*ndim_tot index array */ inline void inv_idx(int const ndim_A, int const * idx_A, mapping const * edge_map_A, int const ndim_B, int const * idx_B, mapping const * edge_map_B, int const ndim_C, int const * idx_C, mapping const * edge_map_C, int * ndim_tot, int ** idx_arr){ int i, dim_max; dim_max = -1; for (i=0; i<ndim_A; i++){ if (idx_A[i] > dim_max) dim_max = idx_A[i]; } for (i=0; i<ndim_B; i++){ if (idx_B[i] > dim_max) dim_max = idx_B[i]; } for (i=0; i<ndim_C; i++){ if (idx_C[i] > dim_max) dim_max = idx_C[i]; } dim_max++; *ndim_tot = dim_max; CTF_alloc_ptr(sizeof(int)*3*dim_max, (void**)idx_arr); std::fill((*idx_arr), (*idx_arr)+3*dim_max, -1); for (i=0; i<ndim_A; i++){ if ((*idx_arr)[3*idx_A[i]] == -1) (*idx_arr)[3*idx_A[i]] = i; else { // if (edge_map_A == NULL || edge_map_A[i].type == PHYSICAL_MAP) (*idx_arr)[3*idx_A[i]] = i; } } for (i=0; i<ndim_B; i++){ if ((*idx_arr)[3*idx_B[i]+1] == -1) (*idx_arr)[3*idx_B[i]+1] = i; else { // if (edge_map_B == NULL || edge_map_B[i].type == PHYSICAL_MAP) (*idx_arr)[3*idx_B[i]+1] = i; } } for (i=0; i<ndim_C; i++){ if ((*idx_arr)[3*idx_C[i]+2] == -1) (*idx_arr)[3*idx_C[i]+2] = i; else { // if (edge_map_C == NULL || edge_map_C[i].type == PHYSICAL_MAP) (*idx_arr)[3*idx_C[i]+2] = i; } } } /** * \brief assigns keys to an array of values * \param[in] ndim tensor dimension * \param[in] nbuf number of global virtual buckets * \param[in] new_nvirt new total virtualization factor * \param[in] np number of processors * \param[in] old_edge_len old tensor edge lengths * \param[in] new_edge_len new tensor edge lengths * \param[in] sym symmetries of tensor * \param[in] old_phase current physical*virtual phase * \param[in] old_rank current physical rank * \param[in] new_phase new physical*virtual phase * \param[in] old_virt_dim current virtualization dimensions on each process * \param[in] new_virt_dim new virtualization dimensions on each process * \param[in] new_virt_lda prefix sum of new_virt_dim * \param[in] buf_lda prefix sum of new_phase * \param[in] pe_lda processor grid lda * \param[in] is_pad whether tensor is padded * \param[in] padding padding of tensor * \param[out] send_counts outgoing counts of pairs by pe * \param[out] recv_counts incoming counts of pairs by pe * \param[out] send_displs outgoing displs of pairs by pe * \param[out] recv_displs incoming displs of pairs by pe * \param[out] svirt_displs outgoing displs of pairs by virt bucket * \param[out] rvirt_displs incoming displs of pairs by virt bucket * \param[in] ord_glb_comm the global communicator * \param[in] idx_lyr starting processor layer (2.5D) * \param[in] was_cyclic whether the old mapping was cyclic * \param[in] is_cyclic whether the new mapping is cyclic * \param[in] bucket_offset offsets for target index for each dimension */ template<typename dtype> void calc_cnt_displs_old(int const ndim, int const nbuf, int const new_nvirt, int const np, int const * old_edge_len, int const * new_edge_len, int const * old_virt_edge_len, int const * sym, int const * old_phase, int const * old_rank, int const * new_phase, int const * old_virt_dim, int 
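// Illustrative sketch (not part of this header): how the inverted index maps
// produced by inv_idx are laid out.  For a matrix multiplication written with
// the index labels i=0, j=1, k=2, i.e. A carries (i,k) and B carries (k,j):
//
//   int idx_A[2] = {0, 2};   // A[i,k]
//   int idx_B[2] = {2, 1};   // B[k,j]
//   int ndim_tot, * idx_arr;
//   inv_idx(2, idx_A, NULL, 2, idx_B, NULL, &ndim_tot, &idx_arr);
//   // ndim_tot == 3; for each global index g, idx_arr[2*g] is the dimension
//   // of A that carries g and idx_arr[2*g+1] the dimension of B, or -1 if
//   // that tensor does not carry g:
//   //   idx_arr = {0,-1,  -1,1,  1,0}     (entries for i, j, k)
//   CTF_free(idx_arr);
//
// The three-tensor overload uses the same layout with stride 3, so the entry
// for tensor C at global index g sits at idx_arr[3*g+2].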
const * new_virt_dim, int const * new_virt_lda, int const * buf_lda, int const * pe_lda, int const is_pad, int const * padding, int * send_counts, int * recv_counts, int * send_displs, int * recv_displs, int * svirt_displs, int * rvirt_displs, CommData_t ord_glb_comm, int const idx_lyr, int const was_cyclic, int const is_cyclic, int * const * bucket_offset){ int i, j, imax, act_lda, act_max; long_int buf_offset, idx_offset; int virt_offset, tmp1, tmp2, skip; int * all_virt_counts; #ifdef USE_OMP // int max_ntd = MIN(omp_get_max_threads(),REDIST_MAX_THREADS); long_int vbs = sy_packed_size(ndim, old_virt_edge_len, sym); int max_ntd = omp_get_max_threads(); max_ntd = MAX(1,MIN(max_ntd,vbs/nbuf)); CTF_mst_alloc_ptr(nbuf*sizeof(int)*max_ntd, (void**)&all_virt_counts); #else CTF_mst_alloc_ptr(nbuf*sizeof(int), (void**)&all_virt_counts); #endif /* Count how many elements need to go to each new virtual bucket */ if (idx_lyr==0){ if (ndim == 0){ memset(all_virt_counts, 0, nbuf*sizeof(int)); all_virt_counts[0]++; } else { #ifdef USE_OMP #pragma omp parallel private (i,j,imax,act_lda,act_max,buf_offset,idx_offset,\ tmp1,tmp2,skip) num_threads (max_ntd) { int start_ldim, end_ldim, i_st, vc; int * virt_counts; int * old_virt_idx, * virt_rank; int * idx; long_int * idx_offs; int * spad; int last_len = old_edge_len[ndim-1]/old_phase[ndim-1]+1; int omp_ntd, omp_tid; omp_ntd = omp_get_num_threads(); omp_tid = omp_get_thread_num(); // if (omp_tid < omp_ntd){ virt_counts = all_virt_counts+nbuf*omp_tid; start_ldim = (last_len/omp_ntd)*omp_tid; start_ldim += MIN(omp_tid,last_len%omp_ntd); end_ldim = (last_len/omp_ntd)*(omp_tid+1); end_ldim += MIN(omp_tid+1,last_len%omp_ntd); #else { int start_ldim, end_ldim, i_st, vc; int * virt_counts; long_int * old_virt_idx, * virt_rank; long_int * idx; long_int * idx_offs; int * spad; int last_len = old_edge_len[ndim-1]/old_phase[ndim-1]+1; virt_counts = all_virt_counts; start_ldim = 0; end_ldim = last_len; #endif CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&old_virt_idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&virt_rank); CTF_alloc_ptr(ndim*sizeof(int), (void**)&spad); memset(virt_counts, 0, nbuf*sizeof(int)); memset(old_virt_idx, 0, ndim*sizeof(long_int)); /* virt_rank = physical_rank*num_virtual_ranks + virtual_rank */ for (i=0; i<ndim; i++){ virt_rank[i] = old_rank[i]*old_virt_dim[i]; } for (;;){ memset(idx, 0, ndim*sizeof(long_int)); memset(idx_offs, 0, ndim*sizeof(long_int)); idx_offset = 0; skip = 0; idx[ndim-1] = MAX(idx[ndim-1],start_ldim); for (i=0; i<ndim; i++) { /* Warning: This next if block has a history of bugs */ if (is_pad){ //spad[i] = padding[i]; if (sym[i] != NS){ LIBT_ASSERT(padding[i] < old_phase[i]); spad[i] = 1; if (sym[i] != SY && virt_rank[i] < virt_rank[i+1]) spad[i]--; if (sym[i] == SY && virt_rank[i] <= virt_rank[i+1]) spad[i]--; } } else { spad[i] = 0; } if (sym[i] != NS && idx[i] >= idx[i+1]-spad[i]){ idx[i+1] = idx[i]+spad[i]; // if (virt_rank[sym[i]] + (sym[i]==SY) <= virt_rank[i]) // idx[sym[i]]++; } if (i > 0){ imax = (old_edge_len[i]-padding[i])/old_phase[i]; if (virt_rank[i] < (old_edge_len[i]-padding[i])%old_phase[i]) imax++; if (i == ndim - 1) imax = MIN(imax, end_ldim); if (idx[i] >= imax) skip = 1; else { if (was_cyclic) idx_offs[i] = idx[i]*old_phase[i]+virt_rank[i]; else idx_offs[i] = idx[i]+virt_rank[i]*old_edge_len[i]/old_phase[i]; if (is_cyclic) idx_offs[i] = (idx_offs[i]%new_phase[i])*buf_lda[i]; else 
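// Illustrative sketch (assumed values, not part of the library): the thread
// decomposition set up above splits the range [0,last_len) of the slowest
// dimension into contiguous chunks, handing the first last_len%omp_ntd
// threads one extra element.  For last_len = 10 and omp_ntd = 4:
//
//   int last_len = 10, omp_ntd = 4;
//   for (int omp_tid = 0; omp_tid < omp_ntd; omp_tid++){
//     int start_ldim = (last_len/omp_ntd)*omp_tid     + MIN(omp_tid,   last_len%omp_ntd);
//     int end_ldim   = (last_len/omp_ntd)*(omp_tid+1) + MIN(omp_tid+1, last_len%omp_ntd);
//     // thread 0 -> [0,3), thread 1 -> [3,6), thread 2 -> [6,8), thread 3 -> [8,10)
//   }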
idx_offs[i] = (idx_offs[i]/(new_edge_len[i]/new_phase[i]))*buf_lda[i]; /* if (idx_offs[i] != bucket_offset[i][old_virt_idx[i]*old_virt_edge_len[i]+idx[i]]){ printf("%d %d\n", idx_offs[i], bucket_offset[i][old_virt_idx[i]*old_virt_edge_len[i]+idx[i]]); ABORT; }*/ idx_offset += idx_offs[i]; } } } /* determine how many elements belong to each virtual processor */ /* Iterate over a block corresponding to a virtual rank */ /* (INNER LOOP) */ if (!skip){ for (;;){ imax = (old_edge_len[0]-padding[0])/old_phase[0]; if (virt_rank[0] < (old_edge_len[0]-padding[0])%old_phase[0]) imax++; if (sym[0] != NS) { imax = MIN(imax,idx[1]+1-spad[0]); } if (ndim == 1){ imax = MIN(imax, end_ldim); i_st = start_ldim; } else i_st = 0; /* Increment virtual bucket */ for (i=i_st; i<imax; i++){ //virt_counts[idx_offset+((i*old_phase[0]+virt_rank[0])%new_phase[0])]++; if (was_cyclic) vc = i*old_phase[0]+virt_rank[0]; else vc = i+virt_rank[0]*old_edge_len[0]/old_phase[0]; if (is_cyclic) vc = (vc%new_phase[0])*buf_lda[0]; else vc = (vc/(new_edge_len[0]/new_phase[0]))*buf_lda[0]; virt_counts[idx_offset+vc]++; // vc = bucket_offset[0][old_virt_idx[0]*old_virt_edge_len[0]+i]; // svirt_displs[idx_offset+vc]++; } /* Increment indices and set up offsets */ for (act_lda=1; act_lda < ndim; act_lda++){ idx[act_lda]++; if (is_pad){ act_max = (old_edge_len[act_lda]-padding[act_lda])/old_phase[act_lda]; if (virt_rank[act_lda] < (old_edge_len[act_lda]-padding[act_lda])%old_phase[act_lda]) act_max++; } else act_max = old_edge_len[act_lda]/old_phase[act_lda]; if (act_lda == ndim - 1) act_max = MIN(act_max, end_ldim); if (sym[act_lda] != NS) act_max = MIN(act_max,idx[act_lda+1]+1-spad[act_lda]); bool ended = true; if (idx[act_lda] >= act_max){ ended = false; idx[act_lda] = 0; if (sym[act_lda-1] != NS) idx[act_lda] = idx[act_lda-1]+spad[act_lda-1]; } idx_offset -= idx_offs[act_lda]; if (was_cyclic) idx_offs[act_lda] = idx[act_lda]*old_phase[act_lda]+virt_rank[act_lda]; else idx_offs[act_lda] = idx[act_lda]+virt_rank[act_lda]*old_edge_len[act_lda]/old_phase[act_lda]; if (is_cyclic) idx_offs[act_lda] = (idx_offs[act_lda]%new_phase[act_lda])*buf_lda[act_lda]; else idx_offs[act_lda] = (idx_offs[act_lda]/(new_edge_len[act_lda]/new_phase[act_lda]))*buf_lda[act_lda]; idx_offset += idx_offs[act_lda]; if (ended)//idx[act_lda] > 0) break; } if (act_lda == ndim) break; } } /* (OUTER LOOP) Iterate over virtual ranks on this pe */ for (act_lda=0; act_lda<ndim; act_lda++){ old_virt_idx[act_lda]++; if (old_virt_idx[act_lda] >= old_virt_dim[act_lda]) old_virt_idx[act_lda] = 0; virt_rank[act_lda] = old_rank[act_lda]*old_virt_dim[act_lda] +old_virt_idx[act_lda]; if (old_virt_idx[act_lda] > 0) break; } if (act_lda == ndim) break; } CTF_free(idx); CTF_free(idx_offs); CTF_free(old_virt_idx); CTF_free(virt_rank); CTF_free(spad); } #ifdef USE_OMP for (j=1; j<max_ntd; j++){ for (i=0; i<nbuf; i++){ all_virt_counts[i] += all_virt_counts[i+nbuf*j]; } } #endif } } else memset(all_virt_counts, 0, sizeof(int)*nbuf); int * virt_counts = all_virt_counts; long_int * idx; long_int * idx_offs; /* reduce virtual counts to phyiscal processor counts */ CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx); /* FIXME: unnecessary memset */ memset(idx, 0, ndim*sizeof(long_int)); memset(idx_offs, 0, ndim*sizeof(long_int)); memset(send_counts, 0, np*sizeof(int)); idx_offset = 0; buf_offset = 0; virt_offset = 0; /* Calculate All-to-all processor grid offsets from the virtual bucket counts */ if (ndim == 0){ send_counts[0] = 
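// Illustrative sketch (assumed sizes, not part of the library): how one local
// element is mapped to a destination bucket along a single dimension,
// mirroring the was_cyclic/is_cyclic branches above.  Take edge length 12,
// old_phase = 3, new_phase = 4, and a process whose virtual rank along this
// dimension is 1:
//
//   int old_phase = 3, new_phase = 4, edge_len = 12, virt_rank = 1;
//   for (int i = 0; i < edge_len/old_phase; i++){
//     int g  = i*old_phase + virt_rank;   // global coordinate under the old cyclic layout
//     int vc = g % new_phase;             // target bucket under a new cyclic layout
//     // i = 0,1,2,3  ->  g = 1,4,7,10  ->  vc = 1,0,3,2
//   }
//
// A blocked (non-cyclic) target instead uses vc = g / (edge_len/new_phase),
// giving vc = 0,1,2,3 for the same g with block size 12/4 = 3.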
virt_counts[0]; memset(svirt_displs, 0, np*sizeof(int)); } else { for (;;){ for (i=0; i<new_phase[0]; i++){ // Accumulate total pe-to-pe counts send_counts[idx_offset + (i/new_virt_dim[0])*pe_lda[0]] += virt_counts[buf_offset+i]; // Calculate counts of virtual buckets going to each pe svirt_displs[(idx_offset + (i/new_virt_dim[0])*pe_lda[0])*new_nvirt +virt_offset+(i%new_virt_dim[0])] = virt_counts[buf_offset+i]; } buf_offset += new_phase[0];//buf_lda[1]; // Increment indices and set up offsets for (act_lda=1; act_lda<ndim; act_lda++){ idx[act_lda]++; if (idx[act_lda] >= new_phase[act_lda]){ idx[act_lda] = 0; } virt_offset -= (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; idx_offset -= (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; idx_offs[act_lda] = idx[act_lda]; virt_offset += (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; idx_offset += (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; if (idx[act_lda] > 0) break; } if (act_lda == ndim) break; } } /* Exchange counts */ ALL_TO_ALL(send_counts, 1, COMM_INT_T, recv_counts, 1, COMM_INT_T, ord_glb_comm); /* Calculate displacements out of the count arrays */ send_displs[0] = 0; recv_displs[0] = 0; for (i=1; i<np; i++){ send_displs[i] = send_displs[i-1] + send_counts[i-1]; recv_displs[i] = recv_displs[i-1] + recv_counts[i-1]; } /* Calculate displacements for virt buckets in each message */ for (i=0; i<np; i++){ tmp2 = svirt_displs[i*new_nvirt]; svirt_displs[i*new_nvirt] = 0; for (j=1; j<new_nvirt; j++){ tmp1 = svirt_displs[i*new_nvirt+j]; svirt_displs[i*new_nvirt+j] = svirt_displs[i*new_nvirt+j-1]+tmp2; tmp2 = tmp1; } } /* Exchange displacements for virt buckets */ ALL_TO_ALL(svirt_displs, new_nvirt, COMM_INT_T, rvirt_displs, new_nvirt, COMM_INT_T, ord_glb_comm); CTF_free(all_virt_counts); CTF_free(idx); CTF_free(idx_offs); } /** * \brief assigns keys to an array of values * \param[in] ndim tensor dimension * \param[in] nbuf number of global virtual buckets * \param[in] new_nvirt new total virtualization factor * \param[in] np number of processors * \param[in] old_edge_len old tensor edge lengths * \param[in] new_edge_len new tensor edge lengths * \param[in] sym symmetries of tensor * \param[in] old_phase current physical*virtual phase * \param[in] old_rank current physical rank * \param[in] new_phase new physical*virtual phase * \param[in] old_virt_dim current virtualization dimensions on each process * \param[in] new_virt_dim new virtualization dimensions on each process * \param[in] new_virt_lda prefix sum of new_virt_dim * \param[in] buf_lda prefix sum of new_phase * \param[in] pe_lda processor grid lda * \param[in] is_pad whether tensor is padded * \param[in] padding padding of tensor * \param[out] send_counts outgoing counts of pairs by pe * \param[out] recv_counts incoming counts of pairs by pe * \param[out] send_displs outgoing displs of pairs by pe * \param[out] recv_displs incoming displs of pairs by pe * \param[out] svirt_displs outgoing displs of pairs by virt bucket * \param[out] rvirt_displs incoming displs of pairs by virt bucket * \param[in] ord_glb_comm the global communicator * \param[in] idx_lyr starting processor layer (2.5D) * \param[in] was_cyclic whether the old mapping was cyclic * \param[in] is_cyclic whether the new mapping is cyclic * \param[in] bucket_offset offsets for target index for each dimension */ template<typename dtype> void calc_cnt_displs(int const ndim, int const nbuf, int const new_nvirt, int const np, int const * old_edge_len, int const * 
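// Illustrative sketch (not part of the library): the count/displacement logic
// above is the standard setup for a personalized all-to-all.  Each process
// counts how many pairs it sends to every destination, exchanges the counts,
// and forms exclusive prefix sums to obtain offsets into the packed buffers.
// In plain MPI terms (comm is assumed to be the underlying communicator):
//
//   MPI_Alltoall(send_counts, 1, MPI_INT, recv_counts, 1, MPI_INT, comm);
//   send_displs[0] = 0;  recv_displs[0] = 0;
//   for (int p = 1; p < np; p++){
//     send_displs[p] = send_displs[p-1] + send_counts[p-1];
//     recv_displs[p] = recv_displs[p-1] + recv_counts[p-1];
//   }
//   // e.g. send_counts = {2,5,1}  ->  send_displs = {0,2,7}
//
// The ALL_TO_ALL calls above perform the same exchange through the library's
// communication wrappers.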
new_edge_len, int const * old_virt_edge_len, int const * sym, int const * old_phase, int const * old_rank, int const * new_phase, int const * old_virt_dim, int const * new_virt_dim, int const * new_virt_lda, int const * buf_lda, int const * pe_lda, int const is_pad, int const * padding, int * send_counts, int * recv_counts, int * send_displs, int * recv_displs, int * svirt_displs, int * rvirt_displs, CommData_t ord_glb_comm, int const idx_lyr, int const was_cyclic, int const is_cyclic, int * const * bucket_offset){ int * all_virt_counts; #ifdef USE_OMP long_int vbs = sy_packed_size(ndim, old_virt_edge_len, sym); int max_ntd = omp_get_max_threads(); max_ntd = MAX(1,MIN(max_ntd,vbs/nbuf)); #else int max_ntd = 1; #endif CTF_mst_alloc_ptr(nbuf*sizeof(int)*max_ntd, (void**)&all_virt_counts); /* Count how many elements need to go to each new virtual bucket */ if (idx_lyr==0){ if (ndim == 0){ memset(all_virt_counts, 0, nbuf*sizeof(int)); all_virt_counts[0]++; } else { #ifdef USE_OMP #pragma omp parallel num_threads(max_ntd) { int imax, act_max, skip; int start_ldim, end_ldim, i_st, vc, dim; int * virt_counts; int * old_virt_idx, * virt_rank; int * idx; long_int idx_offset; long_int * idx_offs; int * spad; int last_len = old_edge_len[ndim-1]/old_phase[ndim-1]+1; int omp_ntd, omp_tid; omp_ntd = omp_get_num_threads(); omp_tid = omp_get_thread_num(); virt_counts = all_virt_counts+nbuf*omp_tid; start_ldim = (last_len/omp_ntd)*omp_tid; start_ldim += MIN(omp_tid,last_len%omp_ntd); end_ldim = (last_len/omp_ntd)*(omp_tid+1); end_ldim += MIN(omp_tid+1,last_len%omp_ntd); #else { int imax, act_max, skip; int start_ldim, end_ldim, i_st, vc, dim; int * virt_counts; long_int * old_virt_idx, * virt_rank; long_int * idx; long_int idx_offset; long_int * idx_offs; int * spad; int last_len = old_edge_len[ndim-1]/old_phase[ndim-1]+1; virt_counts = all_virt_counts; start_ldim = 0; end_ldim = last_len; #endif CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&old_virt_idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&virt_rank); CTF_alloc_ptr(ndim*sizeof(int), (void**)&spad); memset(virt_counts, 0, nbuf*sizeof(int)); memset(old_virt_idx, 0, ndim*sizeof(long_int)); /* virt_rank = physical_rank*num_virtual_ranks + virtual_rank */ for (int i=0; i<ndim; i++){ virt_rank[i] = old_rank[i]*old_virt_dim[i]; } for (;;){ memset(idx, 0, ndim*sizeof(long_int)); memset(idx_offs, 0, ndim*sizeof(long_int)); idx_offset = 0; skip = 0; idx[ndim-1] = MAX(idx[ndim-1],start_ldim); for (dim=0; dim<ndim; dim++) { /* Warning: This next if block has a history of bugs */ if (is_pad){ //spad[dim] = padding[dim]; if (sym[dim] != NS){ LIBT_ASSERT(padding[dim] < old_phase[dim]); spad[dim] = 1; if (sym[dim] != SY && virt_rank[dim] < virt_rank[dim+1]) spad[dim]--; if (sym[dim] == SY && virt_rank[dim] <= virt_rank[dim+1]) spad[dim]--; } } else { spad[dim] = 0; } if (sym[dim] != NS && idx[dim] >= idx[dim+1]-spad[dim]){ idx[dim+1] = idx[dim]+spad[dim]; // if (virt_rank[sym[dim]] + (sym[dim]==SY) <= virt_rank[dim]) // idx[sym[dim]]++; } if (dim > 0){ imax = (old_edge_len[dim]-padding[dim])/old_phase[dim]; if (virt_rank[dim] < (old_edge_len[dim]-padding[dim])%old_phase[dim]) imax++; if (dim == ndim - 1) imax = MIN(imax, end_ldim); if (idx[dim] >= imax) skip = 1; else { idx_offs[dim] = bucket_offset[dim][old_virt_idx[dim]*old_virt_edge_len[dim]+idx[dim]]; idx_offset += idx_offs[dim]; } } } /* determine how many elements belong to each virtual processor 
*/ /* Iterate over a block corresponding to a virtual rank */ /* (INNER LOOP) */ if (!skip){ for (;;){ imax = (old_edge_len[0]-padding[0])/old_phase[0]; if (virt_rank[0] < (old_edge_len[0]-padding[0])%old_phase[0]) imax++; if (sym[0] != NS) { imax = MIN(imax,idx[1]+1-spad[0]); } if (ndim == 1){ imax = MIN(imax, end_ldim); i_st = start_ldim; } else i_st = 0; /* Increment virtual bucket */ for (int i=i_st; i<imax; i++){ /* if (sym[0] != NS){ if ((sym[0] != SY && virt_rank[0] >= virt_rank[1]) || (sym[0] == SY && virt_rank[0] > virt_rank[1])) { LIBT_ASSERT(i<idx[1]); } else { LIBT_ASSERT(i<=idx[1]); } } for (int dim = 1; dim < ndim-1; dim++){ if (sym[dim] != NS){ if ((sym[dim] != SY && virt_rank[dim] >= virt_rank[dim+1]) || (sym[dim] == SY && virt_rank[dim] > virt_rank[dim+1])) { LIBT_ASSERT(idx[dim]<idx[dim+1]); } else LIBT_ASSERT(idx[dim]<=idx[dim+1]); } }*/ vc = bucket_offset[0][old_virt_idx[0]*old_virt_edge_len[0]+i]; virt_counts[idx_offset+vc]++; } /* Increment indices and set up offsets */ for (dim=1; dim < ndim; dim++){ idx[dim]++; if (is_pad){ act_max = (old_edge_len[dim]-padding[dim])/old_phase[dim]; if (virt_rank[dim] < (old_edge_len[dim]-padding[dim])%old_phase[dim]) act_max++; } else act_max = old_edge_len[dim]/old_phase[dim]; if (dim == ndim - 1) act_max = MIN(act_max, end_ldim); if (sym[dim] != NS) act_max = MIN(act_max,idx[dim+1]+1-spad[dim]); bool ended = true; if (idx[dim] >= act_max){ ended = false; idx[dim] = 0; if (sym[dim-1] != NS) idx[dim] = idx[dim-1]+spad[dim-1]; } idx_offset -= idx_offs[dim]; idx_offs[dim] = bucket_offset[dim][old_virt_idx[dim]*old_virt_edge_len[dim]+idx[dim]]; idx_offset += idx_offs[dim]; if (ended) break; } if (dim == ndim) break; } } /* (OUTER LOOP) Iterate over virtual ranks on this pe */ for (dim=0; dim<ndim; dim++){ old_virt_idx[dim]++; if (old_virt_idx[dim] >= old_virt_dim[dim]) old_virt_idx[dim] = 0; virt_rank[dim] = old_rank[dim]*old_virt_dim[dim] +old_virt_idx[dim]; if (old_virt_idx[dim] > 0) break; } if (dim == ndim) break; } CTF_free(idx); CTF_free(idx_offs); CTF_free(old_virt_idx); CTF_free(virt_rank); CTF_free(spad); } #ifdef USE_OMP for (int j=1; j<max_ntd; j++){ for (int i=0; i<nbuf; i++){ all_virt_counts[i] += all_virt_counts[i+nbuf*j]; } } #endif } } else memset(all_virt_counts, 0, sizeof(int)*nbuf); int tmp1, tmp2; int * virt_counts = all_virt_counts; memset(send_counts, 0, np*sizeof(int)); /* Calculate All-to-all processor grid offsets from the virtual bucket counts */ if (ndim == 0){ // send_counts[0] = virt_counts[0]; memset(svirt_displs, 0, np*sizeof(int)); } else { /* for (;;){ for (i=0; i<new_phase[0]; i++){ // Accumulate total pe-to-pe counts send_counts[idx_offset + (i/new_virt_dim[0])*pe_lda[0]] += virt_counts[buf_offset+i]; // Calculate counts of virtual buckets going to each pe svirt_displs[(idx_offset + (i/new_virt_dim[0])*pe_lda[0])*new_nvirt +virt_offset+(i%new_virt_dim[0])] = virt_counts[buf_offset+i]; } buf_offset += new_phase[0];//buf_lda[1]; // Increment indices and set up offsets for (act_lda=1; act_lda<ndim; act_lda++){ idx[act_lda]++; if (idx[act_lda] >= new_phase[act_lda]){ idx[act_lda] = 0; } virt_offset -= (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; idx_offset -= (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; idx_offs[act_lda] = idx[act_lda]; virt_offset += (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; idx_offset += (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; if (idx[act_lda] > 0) break; } if (act_lda == ndim) break; }*/ for (int i=0; i<np; 
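// Illustrative sketch (assumed sizes, not part of the library): with the
// precomputed bucket_offset tables, virt_counts is laid out as
// [destination process][virtual bucket], so the surrounding loop reduces to
// row sums plus a copy of the per-bucket counts into svirt_displs:
//
//   // np = 2 processes, new_nvirt = 3 virtual buckets per process
//   int virt_counts_ex[6] = {1, 0, 2,   4, 1, 1};   // _ex names are illustrative
//   int send_counts_ex[2];
//   for (int p = 0; p < 2; p++){
//     send_counts_ex[p] = 0;
//     for (int v = 0; v < 3; v++)
//       send_counts_ex[p] += virt_counts_ex[p*3 + v];
//   }
//   // send_counts_ex == {3, 6}; the individual entries are copied into
//   // svirt_displs and converted to offsets further below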
i++){ for (int j=0; j<new_nvirt; j++){ //printf("virt_count[%d][%d]=%d\n",i,j,virt_counts[i*new_nvirt+j]); send_counts[i] += virt_counts[i*new_nvirt+j]; svirt_displs[i*new_nvirt+j] = virt_counts[i*new_nvirt+j]; } } } /* Exchange counts */ ALL_TO_ALL(send_counts, 1, COMM_INT_T, recv_counts, 1, COMM_INT_T, ord_glb_comm); /* Calculate displacements out of the count arrays */ send_displs[0] = 0; recv_displs[0] = 0; for (int i=1; i<np; i++){ send_displs[i] = send_displs[i-1] + send_counts[i-1]; recv_displs[i] = recv_displs[i-1] + recv_counts[i-1]; } /* Calculate displacements for virt buckets in each message */ for (int i=0; i<np; i++){ tmp2 = svirt_displs[i*new_nvirt]; svirt_displs[i*new_nvirt] = 0; for (int j=1; j<new_nvirt; j++){ tmp1 = svirt_displs[i*new_nvirt+j]; svirt_displs[i*new_nvirt+j] = svirt_displs[i*new_nvirt+j-1]+tmp2; tmp2 = tmp1; } } /* Exchange displacements for virt buckets */ ALL_TO_ALL(svirt_displs, new_nvirt, COMM_INT_T, rvirt_displs, new_nvirt, COMM_INT_T, ord_glb_comm); CTF_free(all_virt_counts); } /** * \brief move data between buckets and tensor layout * * \param[in] ndim number of tensor dimensions * \param[in] nbuf number of global virtual buckets * \param[in] new_nvirt new total virtualization factor * \param[in] edge_len current edge lengths of tensor * \param[in] sym symmetries of tensor * \param[in] old_phase current physical*virtual phase * \param[in] old_rank current physical rank * \param[in] new_phase new physical*virtual phase * \param[in] new_rank new physical rank * \param[in] old_virt_dim current virtualization dimensions on each process * \param[in] new_virt_dim new virtualization dimensions on each process * \param[in] pe_displs outgoing displs of pairs by pe * \param[in] virt_displs outgoing displs of pairs by virtual bucket * \param[in] old_virt_lda prefix sum of old_virt_dim * \param[in] new_virt_lda prefix sum of new_virt_dim * \param[in] pe_lda lda of processor grid * \param[in] vbs number of elements per virtual subtensor * \param[in] is_pad whether tensor is padded * \param[in] padding padding of tensor * \param[out] tsr_data data to retrieve from * \param[out] tsr_cyclic_data data to write to * \param[in] p_or_up whether to pack (1) or unpack (0) * \param[in] was_cyclic whether the old mapping was cyclic * \param[in] is_cyclic whether the new mapping is cyclic */ template<typename dtype> void pup_virt_buff(int const ndim, int const nbuf, int const new_nvirt, int const * edge_len, int const * sym, int const * old_phase, int const * old_rank, int const * new_phase, int const * new_rank, int const * old_virt_dim, int const * new_virt_dim, int const * pe_displs, int const * virt_displs, int const * old_virt_lda, int const * new_virt_lda, int const * pe_lda, long_int const vbs, int const is_pad, int const * padding, dtype * tsr_data, dtype * tsr_cyclic_data, int const p_or_up, int const was_cyclic, int const is_cyclic){ long_int i, imax, padimax, act_lda, act_max; long_int poutside, pe_idx, virt_idx, overt_idx; long_int arr_idx, tmp1; long_int vr, vr_max, pact_max, ispm; int outside; long_int idx_offset, virt_offset; long_int * idx, * old_virt_idx, * virt_rank; int * virt_counts; long_int * idx_offs; long_int * acc_idx; int * spad; CTF_mst_alloc_ptr(nbuf*sizeof(int), (void**)&virt_counts); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx); CTF_alloc_ptr(ndim*sizeof(int), (void**)&acc_idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs); CTF_alloc_ptr(ndim*sizeof(int), (void**)&old_virt_idx); CTF_alloc_ptr(ndim*sizeof(long_int), 
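// Illustrative sketch (not part of the library): the tmp1/tmp2 rotation above
// converts, in place, the per-virtual-bucket counts of one outgoing message
// into exclusive prefix offsets within that message:
//
//   int v[4] = {3, 1, 4, 2};        // counts for one destination, new_nvirt = 4
//   int prev = v[0];  v[0] = 0;
//   for (int j = 1; j < 4; j++){
//     int cur = v[j];
//     v[j] = v[j-1] + prev;         // offset of bucket j inside the message
//     prev = cur;
//   }
//   // v becomes {0, 3, 4, 8}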
(void**)&virt_rank); CTF_alloc_ptr(ndim*sizeof(int), (void**)&spad); memset(old_virt_idx, 0, ndim*sizeof(long_int)); memset(acc_idx, 0, ndim*sizeof(long_int)); memset(virt_counts, 0, nbuf*sizeof(int)); for (i=0; i<ndim; i++){ virt_rank[i] = old_rank[i]*old_virt_dim[i]; } /* Loop accross all virtual subtensors in order, and keep order within subtensors. We should move through one of the arrays contiguously */ arr_idx = 0; overt_idx = 0; memset(idx, 0, ndim*sizeof(long_int)); memset(idx_offs, 0, ndim*sizeof(long_int)); idx_offset = 0; virt_offset = 0; for (i=1; i<ndim; i++) { idx_offs[i] = (virt_rank[i]%new_phase[i]); idx_offset += (idx_offs[i]/new_virt_dim[i])*pe_lda[i]; virt_offset += (idx_offs[i]%new_virt_dim[i])*new_virt_lda[i]; } outside = 0; poutside = -1; for (i=1; i<ndim; i++){ if (sym[i] != NS){ if (sym[i] == SY){ if (old_rank[i+1]*old_virt_dim[i+1] < old_rank[i]*old_virt_dim[i]){ outside=1; break; } } if (sym[i] != SY){ if (old_rank[i+1]*old_virt_dim[i+1] <= old_rank[i]*old_virt_dim[i]){ outside=1; break; } } } } imax = edge_len[0]/old_phase[0]; for (;;){ if (sym[0] != NS) imax = idx[1]+1; /* NOTE: Must iterate across all local data in correct global order * to make the back-transformation correct */ /* for each leading index owned by a virtual prc along an edge */ if (outside == 0){ if (is_pad) { ispm = (edge_len[0]-padding[0])/old_phase[0]; if ((edge_len[0]-padding[0])%old_phase[0] > old_rank[0]*old_virt_dim[0]) ispm++; padimax = MIN(imax,ispm); } else padimax = imax; for (i=0; i<padimax; i++){ /* for each virtual proc along leading dim */ vr_max = old_virt_dim[0]; if (is_pad){ vr_max = MIN(vr_max, ((edge_len[0]-padding[0])- (i*old_phase[0]+old_rank[0]*old_virt_dim[0]))); // printf("vr_max = %d rather than %d\n",vr_max,old_virt_dim[0]); } if (sym[0] != NS && i == idx[1]){ vr_max = MIN(vr_max, ((old_virt_idx[1]+(sym[0]==SY)+ old_rank[1]*old_virt_dim[1])- (old_rank[0]*old_virt_dim[0]))); } for (vr=0; vr<vr_max; vr++){ virt_rank[0] = old_rank[0]*old_virt_dim[0]+vr; /* Find the offset of this virtual proc */ arr_idx += (overt_idx + vr)*vbs; /* Find the processor of the new distribution */ pe_idx = (old_phase[0]*i+virt_rank[0])%new_phase[0]; virt_idx = (pe_idx%new_virt_dim[0]) + virt_offset; if (!p_or_up) virt_idx = overt_idx + vr; pe_idx = (pe_idx/new_virt_dim[0])*pe_lda[0] + idx_offset; /* Get the bucket indices and offsets */ tmp1 = pe_displs[pe_idx]; tmp1 += virt_displs[pe_idx*new_nvirt+virt_idx]; tmp1 += virt_counts[pe_idx*new_nvirt+virt_idx]; virt_counts[pe_idx*new_nvirt+virt_idx]++; /* Pack or unpack from bucket */ if (p_or_up){ tsr_cyclic_data[tmp1] = tsr_data[arr_idx+i]; } else tsr_cyclic_data[arr_idx+i] = tsr_data[tmp1]; arr_idx -= (overt_idx + vr)*vbs; } } } /* Adjust outer indices */ for (act_lda=1; act_lda < ndim; act_lda++){ overt_idx -= old_virt_idx[act_lda]*old_virt_lda[act_lda]; /* get outer virtual index */ old_virt_idx[act_lda]++; vr_max = old_virt_dim[act_lda]; if (is_pad){ vr_max = MIN(vr_max, ((edge_len[act_lda]-padding[act_lda])- (idx[act_lda]*old_phase[act_lda] +old_rank[act_lda]*old_virt_dim[act_lda]))); //LIBT_ASSERT(vr_max!=0 || outside>0); } if (sym[act_lda] != NS && idx[act_lda] == idx[act_lda+1]){ vr_max = MIN(vr_max, ((old_virt_idx[act_lda+1]+(sym[act_lda]==SY)+ old_rank[act_lda+1]*old_virt_dim[act_lda+1])- (old_rank[act_lda]*old_virt_dim[act_lda]))); //LIBT_ASSERT(vr_max!=0 || outside>0); } if (old_virt_idx[act_lda] >= vr_max) old_virt_idx[act_lda] = 0; overt_idx += old_virt_idx[act_lda]*old_virt_lda[act_lda]; /* adjust virtual rank */ 
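// Illustrative sketch (not part of the library): the destination position of
// an element in the packed buffer, as computed above, is the sum of three
// offsets -- the start of the target process's message, the start of the
// target virtual bucket within that message, and a running count of elements
// already placed into that bucket.  A hypothetical helper mirroring the tmp1
// computation:
//
//   inline long_int bucket_pos(int const * pe_displs, int const * virt_displs,
//                              int * virt_counts, int new_nvirt,
//                              int pe_idx, int virt_idx){
//     return   pe_displs[pe_idx]
//            + virt_displs[pe_idx*new_nvirt + virt_idx]
//            + virt_counts[pe_idx*new_nvirt + virt_idx]++;
//   }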
virt_rank[act_lda] = old_rank[act_lda]*old_virt_dim[act_lda] +old_virt_idx[act_lda]; /* adjust buffer offset */ if (old_virt_idx[act_lda] == 0){ /* Propogate buffer offset. When outer indices have multiple virtual ranks we must roll back the buffer to the correct position. So must keep history of the offsets. */ if (idx[act_lda] == 0) acc_idx[act_lda] = 0; if (act_lda == 1) { acc_idx[act_lda] += imax; arr_idx += imax; } else { acc_idx[act_lda] += acc_idx[act_lda-1]; arr_idx += acc_idx[act_lda-1]; } idx[act_lda]++; act_max = edge_len[act_lda]/old_phase[act_lda]; if (sym[act_lda] != NS) act_max = idx[act_lda+1]+1; if (idx[act_lda] >= act_max){ idx[act_lda] = 0; arr_idx -= acc_idx[act_lda]; } if (is_pad){ pact_max = (edge_len[act_lda]-padding[act_lda])/old_phase[act_lda]; if ((edge_len[act_lda]-padding[act_lda])%old_phase[act_lda] > old_rank[act_lda]*old_virt_dim[act_lda]) pact_max++; if (poutside == act_lda) poutside = -1; if (act_lda > poutside){ if (idx[act_lda] >= pact_max) { poutside = act_lda; } /*else if (sym[act_lda] != NS] && idx[act_lda] == idx[sym[act_lda]]) { if (old_virt_idx[sym[act_lda]]+sym_type[act_lda]+ old_rank[sym[act_lda]]*old_virt_dim[sym[act_lda]]<= (old_rank[act_lda]*old_virt_dim[act_lda])){ outside = act_lda; } }*/ } } } /* Adjust bucket locations by translating phases */ idx_offset -= (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; virt_offset -= (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; idx_offs[act_lda] = ((idx[act_lda]*old_phase[act_lda]+virt_rank[act_lda]) %new_phase[act_lda]); idx_offset += (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; virt_offset += (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; if (idx[act_lda] > 0 || old_virt_idx[act_lda] > 0) break; } if (poutside > -1) outside = 1; else { outside = 0; for (i=1; i<ndim; i++){ if (sym[i] != NS && idx[i+1] == idx[i]){ if (old_rank[i+1]*old_virt_dim[i+1]+ old_virt_idx[i+1] + (sym[i]==SY) <= old_rank[i]*old_virt_dim[i] + old_virt_idx[i]){ outside=1; break; } } } } if (act_lda == ndim) break; } /* for (i=1; i<nbuf; i++){ printf("[%d] virt_counts = %d, virt_displs = %d\n",i-1, virt_counts[i-1], virt_displs[i]-virt_displs[i-1]+ (pe_displs[i/new_nvirt]-pe_displs[(i-1)/new_nvirt])); }*/ CTF_free(virt_counts); CTF_free(idx); CTF_free(acc_idx); CTF_free(idx_offs); CTF_free(old_virt_idx); CTF_free(virt_rank); CTF_free(spad); } int ** compute_bucket_offsets(int ndim, int const * len, int const * old_phys_rank, int const * old_phys_dim, int const * old_virt_dim, int const * old_virt_lda, int const * old_offsets, int * const * old_permutation, int const * new_phys_dim, int const * new_phys_lda, int const * new_virt_dim, int const * new_virt_lda, int forward, int old_virt_np, int new_virt_np, int const * old_phys_edge_len, int const * old_virt_edge_len){ TAU_FSTART(compute_bucket_offsets); int **bucket_offset; CTF_alloc_ptr(sizeof(int*)*ndim, (void**)&bucket_offset); for (int dim = 0;dim < ndim;dim++){ CTF_alloc_ptr(sizeof(int)*old_phys_edge_len[dim], (void**)&bucket_offset[dim]); int pidx = 0; for (int vr = 0;vr < old_virt_dim[dim];vr++){ for (int vidx = 0;vidx < old_virt_edge_len[dim];vidx++,pidx++){ long_int _gidx = ((long_int)vidx*old_phys_dim[dim]+old_phys_rank[dim])*old_virt_dim[dim]+vr; long_int gidx; if (_gidx > len[dim] || (old_offsets != NULL && _gidx < old_offsets[dim])){ gidx = -1; } else { if (old_permutation == NULL || old_permutation[dim] == NULL){ gidx = _gidx; } else { gidx = old_permutation[dim][_gidx]; } } if (gidx != -1){ int total_rank = 
gidx%(new_phys_dim[dim]*new_virt_dim[dim]); int phys_rank = total_rank/new_virt_dim[dim]; if (forward){ int virt_rank = total_rank%new_virt_dim[dim]; bucket_offset[dim][pidx] = phys_rank*MAX(1,new_phys_lda[dim])*new_virt_np+ virt_rank*new_virt_lda[dim]; //printf("f %d - %d %d %d - %d - %d %d %d - %d\n", dim, vr, vidx, pidx, gidx, total_rank, // phys_rank, virt_rank, bucket_offset[dim][pidx]); } else{ bucket_offset[dim][pidx] = phys_rank*MAX(1,new_phys_lda[dim])*old_virt_np+ vr*old_virt_lda[dim]; //printf("r %d - %d %d %d - %d - %d %d - %d\n", dim, vr, vidx, pidx, gidx, total_rank, // phys_rank, bucket_offset[dim][pidx]); } } else { bucket_offset[dim][pidx] = -1; } } } } TAU_FSTOP(compute_bucket_offsets); return bucket_offset; } /** * \brief version of pup_virt_buff which assumes padding and cyclic layout * * \param[in] ndim dimension of tensor * \param[in] len non-padded edge lengths of tensor * \param[in] sym symmetries of tensor * \param[in] old_phys_rank ranks of this processor on the old processor grid * \param[in] old_phys_dim edge lengths of the old processor grid * \param[in] old_virt_dim old virtualization factors along each dimension * \param[in] old_phys_edge_len the old tensor processor block lengths * \param[in] old_virt_edge_len the old tensor block lengths * \param[in] old_virt_nelem the old number of elements per block * \param[in] old_offsets old offsets of each tensor edge (corner 1 of slice) * \param[in] old_permutation permutation array for each edge length (no perm if NULL) * \param[in] new_phys_np the new number of processors * \param[in] new_phys_rank ranks of this processor on the new processor grid * \param[in] new_phys_dim edge lengths of the new processor grid * \param[in] new_virt_dim new virtualization factors along each dimension * \param[in] new_phys_edge_len the new tensor processor block lengths * \param[in] new_virt_edge_len the new tensor block lengths * \param[in] new_virt_nelem the new number of elements per block * \param[in,out] old_data the previous set of values stored locally * \param[in,out] new_data buffers to fill with data to send to each process and virtual bucket * \param[in] forward is 0 on the receiving side and reverses the role of all the previous parameters * \param[in] bucket_offset offsets for target index for each dimension */ template<typename dtype> void pad_cyclic_pup_virt_buff(int const ndim, int const * len, int const * sym, int const * old_phys_rank, int const * old_phys_dim, int const * old_virt_dim, int const * old_phys_edge_len, int const * old_virt_edge_len, long_int const old_virt_nelem, int const * old_offsets, int * const * old_permutation, int const new_phys_np, int const * new_phys_rank, int const * new_phys_dim, int const * new_virt_dim, int const * new_phys_edge_len, int const * new_virt_edge_len, long_int const new_virt_nelem, dtype * old_data, dtype ** new_data, int const forward, int * const * bucket_offset){ if (ndim == 0){ if (forward) new_data[0][0] = old_data[0]; else old_data[0] = new_data[0][0]; return; } int old_virt_np = 1; for (int dim = 0;dim < ndim;dim++) old_virt_np *= old_virt_dim[dim]; int new_virt_np = 1; for (int dim = 0;dim < ndim;dim++) new_virt_np *= new_virt_dim[dim]; int nbucket = new_phys_np*(forward ? 
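// Illustrative sketch (assumed sizes, not part of the library): what one entry
// of the table returned by compute_bucket_offsets represents.  Along a single
// dimension with old_phys_dim = 2, old_virt_dim = 2, new_phys_dim = 4,
// new_virt_dim = 1, on the process with old_phys_rank = 1:
//
//   // local position: virtual rank vr = 1, index vidx = 2 within the block
//   long_int gidx       = ((long_int)2*2 + 1)*2 + 1;  // = 11, global coordinate
//   int      total_rank = 11 % (4*1);                 // = 3, rank in the new phase
//   int      phys_rank  = total_rank / 1;             // = 3, new processor coordinate
//   // forward direction: the stored contribution is
//   //   phys_rank*MAX(1,new_phys_lda[dim])*new_virt_np + virt_rank*new_virt_lda[dim]
//
// Summing one such contribution per dimension yields the flat index of the
// destination (process, virtual block) bucket used by the pup routines.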
new_virt_np : old_virt_np); #if DEBUG >= 1 int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); #endif TAU_FSTART(cyclic_pup_bucket); #ifdef USE_OMP int max_ntd = omp_get_max_threads(); max_ntd = MAX(1,MIN(max_ntd,new_virt_nelem/nbucket)); long_int old_size, new_size; old_size = sy_packed_size(ndim, old_virt_edge_len, sym)*old_virt_np; new_size = sy_packed_size(ndim, new_virt_edge_len, sym)*new_virt_np; /*if (forward){ } else { old_size = sy_packed_size(ndim, old_virt_edge_len, sym)*new_virt_np; new_size = sy_packed_size(ndim, new_virt_edge_len, sym)*old_virt_np; }*/ /*printf("old_size=%d, new_size=%d,old_virt_np=%d,new_virt_np=%d\n", old_size,new_size,old_virt_np,new_virt_np); */ int * bucket_store; int * count_store; int * thread_store; CTF_mst_alloc_ptr(sizeof(int)*MAX(old_size,new_size), (void**)&bucket_store); CTF_mst_alloc_ptr(sizeof(int)*MAX(old_size,new_size), (void**)&count_store); CTF_mst_alloc_ptr(sizeof(int)*MAX(old_size,new_size), (void**)&thread_store); std::fill(bucket_store, bucket_store+MAX(old_size,new_size), -1); int ** par_virt_counts; CTF_alloc_ptr(sizeof(int*)*max_ntd, (void**)&par_virt_counts); for (int t=0; t<max_ntd; t++){ CTF_mst_alloc_ptr(sizeof(int)*nbucket, (void**)&par_virt_counts[t]); std::fill(par_virt_counts[t], par_virt_counts[t]+nbucket, 0); } #pragma omp parallel num_threads(max_ntd) { #endif int *offs; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&offs); if (old_offsets == NULL) for (int dim = 0;dim < ndim;dim++) offs[dim] = 0; else for (int dim = 0;dim < ndim;dim++) offs[dim] = old_offsets[dim]; int *ends; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&ends); for (int dim = 0;dim < ndim;dim++) ends[dim] = len[dim]; #ifdef USE_OMP int tid = omp_get_thread_num(); int ntd = omp_get_num_threads(); //partition the global tensor among threads, to preserve //global ordering and load balance in partitioning int gidx_st[ndim]; int gidx_end[ndim]; if (ndim > 1){ long_int all_size = packed_size(ndim-1, len+1, sym+1); long_int chnk = all_size/ntd; long_int glb_idx_st = chnk*tid + MIN(tid,all_size%ntd); long_int glb_idx_end = glb_idx_st+chnk+(tid<(all_size%ntd)); //calculate global indices along each dimension corresponding to partition // printf("glb_idx_st = %ld, glb_idx_end = %ld\n",glb_idx_st,glb_idx_end); calc_idx_arr(ndim-1, len+1, sym+1, glb_idx_st, gidx_st+1); calc_idx_arr(ndim-1, len+1, sym+1, glb_idx_end, gidx_end+1); gidx_st[0] = 0; gidx_end[0] = 0; #if DEBUG >= 1 if (ntd == 1){ if (gidx_end[ndim-1] != len[ndim-1]){ for (int dim=0; dim<ndim; dim++){ printf("glb_idx_end = %ld, gidx_end[%d]= %d, len[%d] = %d\n", glb_idx_end, dim, gidx_end[dim], dim, len[dim]); } ABORT; } LIBT_ASSERT(gidx_end[ndim-1] <= ends[ndim-1]); } #endif } else { //FIXME the below means redistribution of a vector is non-threaded if (tid == 0){ gidx_st[0] = 0; gidx_end[0] = ends[0]; } else { gidx_st[0] = 0; gidx_end[0] = 0; } } //clip global indices to my physical cyclic phase (local tensor data) #endif // FIXME: may be better to mst_alloc, but this should ensure the // compiler knows there are no write conflicts #ifdef USE_OMP int * count = par_virt_counts[tid]; #else int *count; CTF_alloc_ptr(sizeof(int)*nbucket, (void**)&count); memset(count, 0, sizeof(int)*nbucket); #endif int *gidx; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&gidx); memset(gidx, 0, sizeof(int)*ndim); for (int dim = 0;dim < ndim;dim++){ gidx[dim] = old_phys_rank[dim]*old_virt_dim[dim]; } int *virt_offset; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&virt_offset); memset(virt_offset, 0, sizeof(int)*ndim); int *idx; 
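// Illustrative sketch (not part of the library): the OpenMP path of
// pad_cyclic_pup_virt_buff works in two passes.  The first pass (the loop that
// follows) only records, for every local element, its destination bucket, its
// ordinal within that bucket as seen by the recording thread, and the thread
// id, using bucket_store/count_store/thread_store.  The second pass, after the
// parallel region, turns the per-thread counts into starting offsets and then
// moves the data in a flat, conflict-free parallel loop, roughly:
//
//   // forward direction, for each element index e with bucket_store[e] != -1:
//   //   start = par_virt_counts[thread_store[e]][bucket_store[e]];
//   //   new_data[bucket_store[e]][start + count_store[e]] = old_data[e];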
CTF_alloc_ptr(sizeof(int)*ndim, (void**)&idx); memset(idx, 0, sizeof(int)*ndim); long_int *virt_acc; CTF_alloc_ptr(sizeof(long_int)*ndim, (void**)&virt_acc); memset(virt_acc, 0, sizeof(long_int)*ndim); long_int *idx_acc; CTF_alloc_ptr(sizeof(long_int)*ndim, (void**)&idx_acc); memset(idx_acc, 0, sizeof(long_int)*ndim); long_int *old_virt_lda; CTF_alloc_ptr(sizeof(long_int)*ndim, (void**)&old_virt_lda); old_virt_lda[0] = old_virt_nelem; for (int dim=1; dim<ndim; dim++){ old_virt_lda[dim] = old_virt_lda[dim-1]*old_virt_dim[dim-1]; } long_int offset = 0; long_int zero_len_toff = 0; #ifdef USE_OMP for (int dim=ndim-1; dim>=0; dim--){ int iist = MAX(0,(gidx_st[dim]-old_phys_rank[dim]*old_virt_dim[dim])); int ist = iist/(old_phys_dim[dim]*old_virt_dim[dim]); if (sym[dim] != NS) ist = MIN(ist,idx[dim+1]); int plen[ndim]; memcpy(plen,old_virt_edge_len,ndim*sizeof(int)); int idim = dim; do { plen[idim] = ist; idim--; } while (idim >= 0 && sym[idim] != NS); gidx[dim] += ist*old_phys_dim[dim]*old_virt_dim[dim]; idx[dim] = ist; idx_acc[dim] = sy_packed_size(dim+1, plen, sym); offset += idx_acc[dim]; LIBT_ASSERT(ist == 0 || gidx[dim] <= gidx_st[dim]); // LIBT_ASSERT(ist < old_virt_edge_len[dim]); if (gidx[dim] > gidx_st[dim]) break; int vst = iist-ist*old_phys_dim[dim]*old_virt_dim[dim]; if (vst > 0 ){ vst = MIN(old_virt_dim[dim]-1,vst); gidx[dim] += vst; virt_offset[dim] = vst*old_virt_edge_len[dim]; offset += vst*old_virt_lda[dim]; } else vst = 0; if (gidx[dim] > gidx_st[dim]) break; } #endif bool done = false; for (;!done;){ int bucket0 = 0; bool outside0 = false; int len_zero_max = ends[0]; #ifdef USE_OMP bool is_at_end = true; bool is_at_start = true; for (int dim = ndim-1;dim >0;dim--){ if (gidx[dim] > gidx_st[dim]){ is_at_start = false; break; } if (gidx[dim] < gidx_st[dim]){ outside0 = true; break; } } if (is_at_start){ zero_len_toff = gidx_st[0]; } for (int dim = ndim-1;dim >0;dim--){ if (gidx_end[dim] < gidx[dim]){ outside0 = true; done = true; break; } if (gidx_end[dim] > gidx[dim]){ is_at_end = false; break; } } if (is_at_end){ len_zero_max = MIN(ends[0],gidx_end[0]); done = true; } #endif if (!outside0){ for (int dim = 1;dim < ndim;dim++){ if (bucket_offset[dim][virt_offset[dim]+idx[dim]] == -1) outside0 = true; bucket0 += bucket_offset[dim][virt_offset[dim]+idx[dim]]; } } if (!outside0){ for (int dim = 1;dim < ndim;dim++){ if (gidx[dim] >= (sym[dim] == NS ? ends[dim] : (sym[dim] == SY ? gidx[dim+1]+1 : gidx[dim+1])) || gidx[dim] < offs[dim]){ outside0 = true; break; } } } int idx_max = (sym[0] == NS ? old_virt_edge_len[0] : idx[1]+1); int idx_st = 0; if (!outside0){ int gidx_min = MAX(zero_len_toff,offs[0]); int gidx_max = (sym[0] == NS ? ends[0] : (sym[0] == SY ? 
gidx[1]+1 : gidx[1])); gidx_max = MIN(gidx_max, len_zero_max); for (idx[0] = idx_st;idx[0] < idx_max;idx[0]++){ int virt_min = MAX(0,MIN(old_virt_dim[0],gidx_min-gidx[0])); int virt_max = MAX(0,MIN(old_virt_dim[0],gidx_max-gidx[0])); offset += old_virt_nelem*virt_min; if (forward){ for (virt_offset[0] = virt_min*old_virt_edge_len[0]; virt_offset[0] < virt_max*old_virt_edge_len[0]; virt_offset[0] += old_virt_edge_len[0]) { int bucket = bucket0+bucket_offset[0][virt_offset[0]+idx[0]]; #ifdef USE_OMP bucket_store[offset] = bucket; count_store[offset] = count[bucket]++; thread_store[offset] = tid; #else new_data[bucket][count[bucket]++] = old_data[offset]; #endif offset += old_virt_nelem; } } else{ for (virt_offset[0] = virt_min*old_virt_edge_len[0]; virt_offset[0] < virt_max*old_virt_edge_len[0]; virt_offset[0] += old_virt_edge_len[0]) { int bucket = bucket0+bucket_offset[0][virt_offset[0]+idx[0]]; #ifdef USE_OMP bucket_store[offset] = bucket; count_store[offset] = count[bucket]++; thread_store[offset] = tid; #else old_data[offset] = new_data[bucket][count[bucket]++]; #endif offset += old_virt_nelem; } } offset++; offset -= old_virt_nelem*virt_max; gidx[0] += old_phys_dim[0]*old_virt_dim[0]; } offset -= idx_max; gidx[0] -= idx_max*old_phys_dim[0]*old_virt_dim[0]; } idx_acc[0] = idx_max; idx[0] = 0; zero_len_toff = 0; /* Adjust outer indices */ if (!done){ for (int dim = 1;dim < ndim;dim++){ offset += old_virt_lda[dim]; virt_offset[dim] += old_virt_edge_len[dim]; gidx[dim]++; if (virt_offset[dim] == old_virt_dim[dim]*old_virt_edge_len[dim]){ offset -= old_virt_lda[dim]*old_virt_dim[dim]; gidx[dim] -= old_virt_dim[dim]; virt_offset[dim] = 0; offset += idx_acc[dim-1]; idx_acc[dim] += idx_acc[dim-1]; idx_acc[dim-1] = 0; gidx[dim] -= idx[dim]*old_phys_dim[dim]*old_virt_dim[dim]; idx[dim]++; if (idx[dim] == (sym[dim] == NS ? old_virt_edge_len[dim] : idx[dim+1]+1)){ offset -= idx_acc[dim]; //index should always be zero here sicne everything is SY and not SH idx[dim] = 0;//(dim == 0 || sym[dim-1] == NS ? 
0 : idx[dim-1]); //gidx[dim] += idx[dim]*old_phys_dim[dim]*old_virt_dim[dim]; if (dim == ndim-1) done = true; } else{ gidx[dim] += idx[dim]*old_phys_dim[dim]*old_virt_dim[dim]; break; } } else{ idx_acc[dim-1] = 0; break; } } if (ndim <= 1) done = true; } } CTF_free(gidx); CTF_free(idx_acc); CTF_free(virt_acc); CTF_free(idx); CTF_free(virt_offset); CTF_free(old_virt_lda); #ifndef USE_OMP #if DEBUG >= 1 bool pass = true; for (int i = 0;i < nbucket-1;i++){ if (count[i] != (long_int)(new_data[i+1]-new_data[i])){ printf("rank = %d count %d should have been %d is %d\n", rank, i, (int)(new_data[i+1]-new_data[i]), count[i]); pass = false; } } if (!pass) ABORT; #endif #endif CTF_free(offs); CTF_free(ends); #ifndef USE_OMP CTF_free(count); #else par_virt_counts[tid] = count; } //#pragma omp endfor for (int bckt=0; bckt<nbucket; bckt++){ int par_tmp = 0; for (int thread=0; thread<max_ntd; thread++){ par_tmp += par_virt_counts[thread][bckt]; par_virt_counts[thread][bckt] = par_tmp - par_virt_counts[thread][bckt]; } #if DEBUG >= 1 if (bckt < nbucket-1 && par_tmp != new_data[bckt+1]-new_data[bckt]){ printf("rank = %d count for bucket %d is %d should have been %ld\n",rank,bckt,par_tmp,(long_int)(new_data[bckt+1]-new_data[bckt])); ABORT; } #endif } TAU_FSTOP(cyclic_pup_bucket); TAU_FSTART(cyclic_pup_move); { long_int tot_sz = MAX(old_size, new_size); int i; if (forward){ #pragma omp parallel for private(i) for (i=0; i<tot_sz; i++){ if (bucket_store[i] != -1){ int pc = par_virt_counts[thread_store[i]][bucket_store[i]]; int ct = count_store[i]+pc; new_data[bucket_store[i]][ct] = old_data[i]; } } } else { #pragma omp parallel for private(i) for (i=0; i<tot_sz; i++){ if (bucket_store[i] != -1){ int pc = par_virt_counts[thread_store[i]][bucket_store[i]]; int ct = count_store[i]+pc; old_data[i] = new_data[bucket_store[i]][ct]; } } } } TAU_FSTOP(cyclic_pup_move); for (int t=0; t<max_ntd; t++){ CTF_free(par_virt_counts[t]); } CTF_free(par_virt_counts); CTF_free(count_store); CTF_free(bucket_store); CTF_free(thread_store); #endif } /** * \brief optimized version of pup_virt_buff */ template<typename dtype> void opt_pup_virt_buff(int const ndim, int const nbuf, int const numPes, long_int const hvbs, int const old_nvirt, int const new_nvirt, int const * old_edge_len, int const * new_edge_len, int const * sym, int const * old_phase, int const * old_rank, int const * new_phase, int const * new_rank, int const * old_virt_dim, int const * new_virt_dim, int const * pe_displs, int const * virt_displs, int const * old_virt_lda, int const * new_virt_lda, int const * pe_lda, long_int const vbs, int const is_pad, int const * padding, dtype * tsr_data, dtype * tsr_cyclic_data, int const was_cyclic, int const is_cyclic, int const p_or_up){ /* if (ndim==4){ if (sym[ndim-2] != NS) printf("ncase with last sym\n"); else printf("case with no last sym\n"); }*/ long_int old_size, new_size, ntd, tid, pe_idx, virt_idx; long_int i, tmp1; long_int * par_virt_counts; long_int * pe_idx_store, * target_store; int * sub_old_edge_len, * sub_new_edge_len; #ifdef USE_OMP int max_ntd = omp_get_max_threads(); max_ntd = MAX(1,MIN(max_ntd,vbs/nbuf)); ntd = max_ntd; #else /* int const max_ntd = 4; ntd = 4;*/ int const max_ntd = 1; ntd = 1; #endif //CTF_alloc_ptr(sizeof(long_int*)*max_ntd, (void**)&par_virt_counts); CTF_alloc_ptr(sizeof(long_int)*max_ntd*nbuf, (void**)&par_virt_counts); //for (i=0; i<max_ntd; i++) // CTF_mst_alloc_ptr(nbuf*sizeof(long_int), (void**)&par_virt_counts[i]); CTF_alloc_ptr(ndim*sizeof(int), (void**)&sub_old_edge_len); for 
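// Illustrative sketch (not part of the library): the par_virt_counts loop
// above replaces each thread's count for a bucket with the number of elements
// that lower-numbered threads placed into the same bucket, i.e. an exclusive
// prefix sum over threads.  For one bucket with counts {4, 2, 3} on threads
// 0..2:
//
//   int cnt[3] = {4, 2, 3};
//   int run = 0;
//   for (int t = 0; t < 3; t++){
//     int c  = cnt[t];
//     cnt[t] = run;      // starting offset of thread t within this bucket
//     run   += c;
//   }
//   // cnt becomes {0, 4, 6}; the bucket holds run == 9 elements in total
//
// The same conversion is repeated near the end of opt_pup_virt_buff below.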
(i=0; i<ndim; i++){ sub_old_edge_len[i] = old_edge_len[i]/old_phase[i]; } CTF_alloc_ptr(ndim*sizeof(int), (void**)&sub_new_edge_len); for (i=0; i<ndim; i++){ sub_new_edge_len[i] = new_edge_len[i]/new_phase[i]; } if (p_or_up){ old_size = sy_packed_size(ndim, sub_old_edge_len, sym)*old_nvirt; new_size = sy_packed_size(ndim, sub_new_edge_len, sym)*new_nvirt; } else { old_size = sy_packed_size(ndim, sub_old_edge_len, sym)*new_nvirt; new_size = sy_packed_size(ndim, sub_new_edge_len, sym)*old_nvirt; } CTF_mst_alloc_ptr(sizeof(long_int)*MAX(old_size,new_size), (void**)&pe_idx_store); CTF_mst_alloc_ptr(sizeof(long_int)*MAX(old_size,new_size), (void**)&target_store); std::fill(target_store, target_store+MAX(old_size,new_size), -1); //for (i=0; i<max_ntd; i++) // memset(par_virt_counts[i], 0, nbuf*sizeof(long_int)); memset(par_virt_counts, 0, max_ntd*nbuf*sizeof(long_int)); TAU_FSTART(opt_pup_virt_buf_calc_off); #ifdef USE_OMP #pragma omp parallel private(ntd, tid, pe_idx, virt_idx, i, tmp1) num_threads(max_ntd) #else //for (tid=0; tid<ntd; tid++) #endif { /* TAU_FSTART(thread_loop_time); if (tid != 1) TAU_FSTOP(thread_loop_time);*/ long_int imax, padimax, act_lda, act_max, ledge_len_max, ledge_len_st, cptr; long_int poutside, overt_idx, part_size, idx_offset, csize, allsize; long_int virt_offset, arr_idx; long_int vr, outside, vr_max, pact_max, ispm; long_int * virt_counts, * idx, * old_virt_idx, * virt_rank; int * off_edge_len, * sub_edge_len; long_int * idx_offs; long_int * acc_idx, * spad; #ifdef USE_OMP tid = omp_get_thread_num(); ntd = omp_get_num_threads(); #else tid = 0; ntd = 1; #endif if (ndim == 0){ if (tid == 0){ ledge_len_st = 0; ledge_len_max = 1; } else { ledge_len_st = 1; ledge_len_max = 1; } } else { if (ndim == 1){ if (tid == 0){ ledge_len_st = 0; ledge_len_max = old_edge_len[0]/old_phase[0]; } else { ledge_len_st = old_edge_len[0]/old_phase[0]; ledge_len_max = old_edge_len[0]/old_phase[0]; } } else { if (sym[ndim-1] == 0){ part_size = (old_edge_len[ndim-1]/old_phase[ndim-1])/ntd; if (tid < (old_edge_len[ndim-1]/old_phase[ndim-1])%ntd){ part_size++; ledge_len_st = ((old_edge_len[ndim-1]/old_phase[ndim-1])/ntd + 1)*tid; } else ledge_len_st = ((old_edge_len[ndim-1]/old_phase[ndim-1])/ntd)*tid + (old_edge_len[ndim-1]/old_phase[ndim-1])%ntd; ledge_len_max = ledge_len_st+part_size; } else { CTF_alloc_ptr(ndim*sizeof(int), (void**)&sub_edge_len); part_size = (old_edge_len[ndim-1]/old_phase[ndim-1])/ntd; cptr = 0; for (i=0; i<ndim; i++){ sub_edge_len[i] = old_edge_len[i]/old_phase[i]; } allsize = packed_size(ndim, sub_edge_len, sym); ledge_len_st = -1; ledge_len_max = -1; for (i=0; i<old_edge_len[ndim-1]/old_phase[ndim-1]; i++){ sub_edge_len[ndim-1] = i; cptr = ndim - 2; while (cptr >= 0 && sym[cptr] != 0){ sub_edge_len[cptr] = i; cptr--; } csize = packed_size(ndim, sub_edge_len, sym); if (ledge_len_st == -1 && csize >= (allsize/ntd)*tid){ ledge_len_st = i; } if (ledge_len_max == -1 && csize >= (allsize/ntd)*(tid+1)){ ledge_len_max = i; } } LIBT_ASSERT(ledge_len_max != -1); } } } if (ledge_len_max!=ledge_len_st) { //CTF_alloc_ptr(nbuf*sizeof(long_int), (void**)&virt_counts); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&acc_idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&old_virt_idx); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&virt_rank); CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&spad); CTF_alloc_ptr(ndim*sizeof(int), (void**)&off_edge_len); //virt_counts = 
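// Illustrative sketch (assumed sizes, not part of the library): when the
// trailing indices are symmetric, the packed volume below a value i of the
// last index grows roughly like i*(i+1)/2, so splitting the index range evenly
// would leave the highest thread with far more work.  The scan above instead
// places the split where the packed prefix volume crosses an equal share:
//
//   int n = 8, ntd = 2;
//   int allsize = n*(n+1)/2;             // 36 packed elements for a 2-index SY pair
//   int split = -1;
//   for (int i = 0; i < n && split == -1; i++){
//     if (i*(i+1)/2 >= (allsize/ntd)*1)  // share boundary of thread 1
//       split = i;                       // split == 6, rather than n/2 == 4
//   }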
par_virt_counts[tid]; virt_counts = par_virt_counts+tid*nbuf; memset(old_virt_idx, 0, ndim*sizeof(long_int)); memset(acc_idx, 0, ndim*sizeof(long_int)); // memset(virt_counts, 0, nbuf*sizeof(long_int)); for (i=0; i<ndim; i++){ virt_rank[i] = old_rank[i]*old_virt_dim[i]; } /* Loop accross all virtual subtensors in order, and keep order within subtensors. We should move through one of the arrays contiguously */ arr_idx = 0; overt_idx = 0; memset(idx, 0, ndim*sizeof(long_int)); memset(idx_offs, 0, ndim*sizeof(long_int)); if (ndim != 0){ idx[ndim-1] = ledge_len_st; for (i=0; i<ndim; i++){ off_edge_len[i] = old_edge_len[i]/old_phase[i]; } i=ndim-1; do { off_edge_len[i] = ledge_len_st; i--; } while (i>=0 && sym[i] != NS); arr_idx += sy_packed_size(ndim, off_edge_len, sym); } idx_offset = 0; virt_offset = 0; for (i=1; i<ndim; i++) { if (was_cyclic) idx_offs[i] = idx[i]*old_phase[i]+virt_rank[i]; else idx_offs[i] = idx[i]+virt_rank[i]*old_edge_len[i]/old_phase[i]; if (is_cyclic) idx_offs[i] = idx_offs[i]%new_phase[i]; else idx_offs[i] = idx_offs[i]/(new_edge_len[i]/new_phase[i]); idx_offset += (idx_offs[i]/new_virt_dim[i])*pe_lda[i]; virt_offset += (idx_offs[i]%new_virt_dim[i])*new_virt_lda[i]; } poutside = -1; if (is_pad){ for (i=1; i<ndim; i++){ pact_max = (old_edge_len[i]-padding[i])/old_phase[i]; if ((old_edge_len[i]-padding[i])%old_phase[i] > old_rank[i]*old_virt_dim[i]) pact_max++; if (poutside == i) poutside = -1; if (i > poutside){ if (idx[i] >= pact_max) { poutside = i; } } } } outside = 0; if (poutside > -1) outside = 1; for (i=1; i<ndim; i++){ if (sym[i] != NS && idx[i+1] == idx[i]){ if (old_rank[i+1]*old_virt_dim[i+1]+(sym[i]==SY)<= old_rank[i]*old_virt_dim[i]){ outside=1; break; } } } if (ndim > 0){ imax = old_edge_len[0]/old_phase[0]; for (;;){ if (sym[0] != NS) imax = idx[1]+1; /* NOTE: Must iterate across all local data in correct global order * to make the back-transformation correct */ /* for each leading index owned by a virtual prc along an edge */ if (outside == 0){ if (is_pad) { ispm = (old_edge_len[0]-padding[0])/old_phase[0]; if ((old_edge_len[0]-padding[0])%old_phase[0] > old_rank[0]*old_virt_dim[0]) ispm++; padimax = MIN(imax,ispm); } else padimax = imax; if (ndim == 1) { i=ledge_len_st; padimax = MIN(padimax,ledge_len_max); } else i=0; for (; i<padimax; i++){ /* for each virtual proc along leading dim */ vr_max = old_virt_dim[0]; if (is_pad){ vr_max = MIN(vr_max, ((old_edge_len[0]-padding[0])- (i*old_phase[0]+old_rank[0]*old_virt_dim[0]))); // printf("vr_max = %d rather than %d\n",vr_max,old_virt_dim[0]); } if (sym[0] != NS && i == idx[1]){ vr_max = MIN(vr_max, ((old_virt_idx[1]+(sym[0]==SY)+ old_rank[1]*old_virt_dim[1])- (old_rank[0]*old_virt_dim[0]))); } if (was_cyclic&&is_cyclic) { long_int pe_idx_0 = (old_phase[0]*i+old_rank[0]*old_virt_dim[0])%new_phase[0]; long_int pe_idx_1 = (pe_idx_0/new_virt_dim[0])*pe_lda[0]; long_int pe_idx_2 = pe_idx_0; long_int virt_idx_0 = (pe_idx_0%new_virt_dim[0]); for (vr=0; vr<vr_max; vr++){ virt_rank[0] = old_rank[0]*old_virt_dim[0]+vr; /* Find the offset of this virtual proc */ arr_idx += (overt_idx + vr)*vbs; /* Find the processor of the new distribution */ if (!p_or_up) virt_idx = overt_idx + vr; else virt_idx = virt_idx_0 + virt_offset; pe_idx = pe_idx_1 + idx_offset; pe_idx_store[arr_idx+i] = pe_idx*new_nvirt+virt_idx+tid*nbuf; /* Get the bucket indices and offsets */ tmp1 = pe_displs[pe_idx]; tmp1 += virt_displs[pe_idx*new_nvirt+virt_idx]; tmp1 += virt_counts[pe_idx*new_nvirt+virt_idx]; virt_counts[pe_idx*new_nvirt+virt_idx]++; 
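// Illustrative sketch (not part of the library): the was_cyclic && is_cyclic
// branch here precomputes the leading-dimension target coordinates once and
// then advances them incrementally (see the updates that follow), avoiding a
// division and a modulo per element.  The general idea, for a coordinate that
// advances by one each step:
//
//   int new_phase = 6, new_virt_dim = 2;
//   int phase_rank = 0;     // g % new_phase
//   int virt_rank  = 0;     // phase_rank % new_virt_dim
//   int proc_coord = 0;     // phase_rank / new_virt_dim
//   for (int g = 1; g <= 12; g++){
//     if (++phase_rank == new_phase){ phase_rank = 0; virt_rank = 0; proc_coord = 0; }
//     else if (++virt_rank == new_virt_dim){ virt_rank = 0; proc_coord++; }
//   }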
// /* Pack or unpack from bucket */ target_store[arr_idx+i]=tmp1; /* if (p_or_up){ tsr_cyclic_data[tmp1] = tsr_data[arr_idx+i]; } else tsr_cyclic_data[arr_idx+i] = tsr_data[tmp1];*/ arr_idx -= (overt_idx + vr)*vbs; if (pe_idx_0 == new_phase[0]-1) { virt_idx_0 = 0; pe_idx_0 = 0; pe_idx_1 = 0; pe_idx_2 = 0; } else { virt_idx_0 = (virt_idx_0 == new_virt_dim[0]-1 ? 0 : virt_idx_0+1); pe_idx_0++; pe_idx_2++; if (pe_idx_2 == new_virt_dim[0]) { pe_idx_2 = 0; pe_idx_1 += pe_lda[0]; } } } } else { for (vr=0; vr<vr_max; vr++){ virt_rank[0] = old_rank[0]*old_virt_dim[0]+vr; /* Find the offset of this virtual proc */ arr_idx += (overt_idx + vr)*vbs; /* Find the processor of the new distribution */ //pe_idx = (old_phase[0]*i+virt_rank[0])%new_phase[0]; if (was_cyclic) pe_idx = i*old_phase[0]+virt_rank[0]; else pe_idx = i+virt_rank[0]*old_edge_len[0]/old_phase[0]; if (is_cyclic) pe_idx = pe_idx%new_phase[0]; else pe_idx = pe_idx/(new_edge_len[0]/new_phase[0]); virt_idx = (pe_idx%new_virt_dim[0]) + virt_offset; if (!p_or_up) virt_idx = overt_idx + vr; pe_idx = (pe_idx/new_virt_dim[0])*pe_lda[0] + idx_offset; //pe_idx_store[arr_idx+i] = pe_idx*new_nvirt+virt_idx; //pe_idx_store[arr_idx+i] = pe_idx_store[arr_idx+i]*ntd+tid; pe_idx_store[arr_idx+i] = pe_idx*new_nvirt+virt_idx+tid*nbuf; /* Get the bucket indices and offsets */ tmp1 = pe_displs[pe_idx]; tmp1 += virt_displs[pe_idx*new_nvirt+virt_idx]; tmp1 += virt_counts[pe_idx*new_nvirt+virt_idx]; virt_counts[pe_idx*new_nvirt+virt_idx]++; // /* Pack or unpack from bucket */ target_store[arr_idx+i]=tmp1; /* if (p_or_up){ tsr_cyclic_data[tmp1] = tsr_data[arr_idx+i]; } else tsr_cyclic_data[arr_idx+i] = tsr_data[tmp1];*/ arr_idx -= (overt_idx + vr)*vbs; } } } } /* Adjust outer indices */ for (act_lda=1; act_lda < ndim; act_lda++){ overt_idx -= old_virt_idx[act_lda]*old_virt_lda[act_lda]; /* get outer virtual index */ old_virt_idx[act_lda]++; vr_max = old_virt_dim[act_lda]; if (is_pad){ vr_max = MIN(vr_max, ((old_edge_len[act_lda]-padding[act_lda])- (idx[act_lda]*old_phase[act_lda] +old_rank[act_lda]*old_virt_dim[act_lda]))); //LIBT_ASSERT(vr_max!=0 || outside>0); } if (sym[act_lda] != NS && idx[act_lda] == idx[act_lda+1]){ vr_max = MIN(vr_max, ((old_virt_idx[act_lda+1]+(sym[act_lda]==SY)+ old_rank[act_lda+1]*old_virt_dim[act_lda+1])- (old_rank[act_lda]*old_virt_dim[act_lda]))); //LIBT_ASSERT(vr_max!=0 || outside>0); } if (old_virt_idx[act_lda] >= vr_max) old_virt_idx[act_lda] = 0; overt_idx += old_virt_idx[act_lda]*old_virt_lda[act_lda]; /* adjust virtual rank */ virt_rank[act_lda] = old_rank[act_lda]*old_virt_dim[act_lda] +old_virt_idx[act_lda]; /* adjust buffer offset */ if (old_virt_idx[act_lda] == 0){ /* Propogate buffer offset. When outer indices have multiple virtual ranks we must roll back the buffer to the correct position. So must keep history of the offsets. 
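               (acc_idx[act_lda] accumulates how much has been added to arr_idx at each
               level, so arr_idx can be rolled back by exactly that amount when
               idx[act_lda] wraps back to zero below.)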
*/ if (idx[act_lda] == 0) acc_idx[act_lda] = 0; if (act_lda == 1) { acc_idx[act_lda] += imax; arr_idx += imax; } else { acc_idx[act_lda] += acc_idx[act_lda-1]; arr_idx += acc_idx[act_lda-1]; } idx[act_lda]++; if (act_lda == ndim - 1) act_max = ledge_len_max; else act_max = old_edge_len[act_lda]/old_phase[act_lda]; if (sym[act_lda] != NS) act_max = idx[act_lda+1]+1; if (idx[act_lda] >= act_max){ idx[act_lda] = 0; arr_idx -= acc_idx[act_lda]; } if (is_pad){ pact_max = (old_edge_len[act_lda]-padding[act_lda])/old_phase[act_lda]; if ((old_edge_len[act_lda]-padding[act_lda])%old_phase[act_lda] > old_rank[act_lda]*old_virt_dim[act_lda]) pact_max++; if (poutside == act_lda) poutside = -1; if (act_lda > poutside){ if (idx[act_lda] >= pact_max) { poutside = act_lda; } /*else if (sym[act_lda] != NS] && idx[act_lda] == idx[sym[act_lda]]) { if (old_virt_idx[sym[act_lda]]+sym_type[act_lda]+ +old_rank[sym[act_lda]]*old_virt_dim[sym[act_lda]]<= (old_rank[act_lda]*old_virt_dim[act_lda])){ outside = act_lda; } }*/ } } } /* Adjust bucket locations by translating phases */ idx_offset -= (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; virt_offset -= (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; if (was_cyclic) idx_offs[act_lda] = idx[act_lda]*old_phase[act_lda]+virt_rank[act_lda]; else idx_offs[act_lda] = idx[act_lda]+virt_rank[act_lda]*old_edge_len[act_lda]/old_phase[act_lda]; if (is_cyclic) idx_offs[act_lda] = idx_offs[act_lda]%new_phase[act_lda]; else idx_offs[act_lda] = idx_offs[act_lda]/(new_edge_len[act_lda]/new_phase[act_lda]); idx_offset += (idx_offs[act_lda]/new_virt_dim[act_lda])*pe_lda[act_lda]; virt_offset += (idx_offs[act_lda]%new_virt_dim[act_lda])*new_virt_lda[act_lda]; if (idx[act_lda] > 0 || old_virt_idx[act_lda] > 0) break; } if (poutside > -1) outside = 1; else { outside = 0; for (i=1; i<ndim; i++){ if (sym[i] != NS && idx[i+1] == idx[i]){ if (old_rank[i+1]*old_virt_dim[i+1]+ old_virt_idx[i+1] + (sym[i]==SY)<= old_rank[i]*old_virt_dim[i] + old_virt_idx[i]){ outside=1; break; } } } } if (act_lda == ndim) { break; } } } CTF_free(idx); CTF_free(acc_idx); CTF_free(idx_offs); CTF_free(old_virt_idx); CTF_free(virt_rank); CTF_free(spad); CTF_free(off_edge_len); } //if (tid == 1) TAU_FSTOP(thread_loop_time); } TAU_FSTOP(opt_pup_virt_buf_calc_off); long_int par_i, par_j, par_tmp; for (par_i=0; par_i<nbuf; par_i++){ par_tmp = 0; for (par_j=0; par_j<max_ntd; par_j++){ //par_tmp += par_virt_counts[par_j][par_i]; //par_virt_counts[par_j][par_i] = par_tmp - par_virt_counts[par_j][par_i]; par_tmp += par_virt_counts[par_i+par_j*nbuf]; par_virt_counts[par_i+par_j*nbuf] = par_tmp - par_virt_counts[par_i+par_j*nbuf]; } } TAU_FSTART(opt_pup_virt_buf_move); if (ndim == 0){ tsr_cyclic_data[0] = tsr_data[0]; } else { if (p_or_up){ #ifdef USE_OMP #pragma omp parallel for private(i, tmp1, virt_idx) #endif for (i=0; i<MAX(new_size,old_size); i++){ tmp1 = target_store[i]; if (tmp1 != -1){ //virt_idx = pe_idx_store[i]/ntd; /*tmp1 += par_virt_counts[(pe_idx_store[i]%ntd)*nbuf + virt_idx];*/ //tmp1 += par_virt_counts[(pe_idx_store[i]%ntd)][virt_idx]; tmp1 += par_virt_counts[pe_idx_store[i]]; tsr_cyclic_data[tmp1] = tsr_data[i]; } } } else { #ifdef USE_OMP #pragma omp parallel for private(i, tmp1, virt_idx) #endif for (i=0; i<MAX(new_size,old_size); i++){ tmp1 = target_store[i]; if (tmp1 != -1){ //virt_idx = pe_idx_store[i]/ntd; /*tmp1 += par_virt_counts[(pe_idx_store[i]%ntd)*nbuf + virt_idx];*/ //tmp1 += par_virt_counts[(pe_idx_store[i]%ntd)][virt_idx]; tmp1 += 
par_virt_counts[pe_idx_store[i]];
          tsr_cyclic_data[i] = tsr_data[tmp1];
        }
      }
    }
  }
  TAU_FSTOP(opt_pup_virt_buf_move);

  CTF_free(sub_old_edge_len);
  CTF_free(sub_new_edge_len);
  CTF_free(pe_idx_store);
  CTF_free(target_store);
  //for (i=0; i<ntd; i++)
  //  CTF_free(par_virt_counts[i]);
  CTF_free(par_virt_counts);
//#endif
}

/**
 * \brief Repad and reshuffle elements
 *
 * \param tid tensor id (used here for debug output)
 * \param ndim number of tensor dimensions
 * \param nval number of elements in each subtensor
 * \param old_edge_len old edge lengths of tensor
 * \param sym symmetries of tensor
 * \param old_phase current physical*virtual phase
 * \param old_rank current physical rank
 * \param old_pe_lda old lda of processor grid
 * \param is_old_pad whether this tensor is currently padded
 * \param old_padding the padding in each dimension
 * \param new_edge_len new edge lengths of tensor
 * \param new_phase new physical*virtual phase
 * \param new_rank new physical rank
 * \param new_pe_lda new lda of processor grid
 * \param is_new_pad tells function whether new layout is padded
 * \param new_padding the new padding to apply to edge_len
 * \param old_virt_dim current virtualization dimensions on each process
 * \param new_virt_dim new virtualization dimensions on each process
 * \param tsr_data current tensor data
 * \param tsr_cyclic_data pointer to a pointer that will point to
 *        new tensor of data that will be allocated
 * \param ord_glb_comm the global communicator
 */
template<typename dtype>
int padded_reshuffle(int const tid,
                     int const ndim,
                     long_int const nval,
                     int const * old_edge_len,
                     int const * sym,
                     int const * old_phase,
                     int const * old_rank,
                     int const * old_pe_lda,
                     int const is_old_pad,
                     int const * old_padding,
                     int const * new_edge_len,
                     int const * new_phase,
                     int const * new_rank,
                     int const * new_pe_lda,
                     int const is_new_pad,
                     int const * new_padding,
                     int const * old_virt_dim,
                     int const * new_virt_dim,
                     dtype * tsr_data,
                     dtype ** tsr_cyclic_data,
                     CommData_t ord_glb_comm){
  int i, old_num_virt, new_num_virt, numPes;
  long_int new_nval, swp_nval;
  long_int old_size;
  int idx_lyr;
  int * virt_phase_rank, * old_virt_phase_rank, * sub_edge_len;
  tkv_pair<dtype> * pairs;
  dtype * tsr_new_data;
  DEBUG_PRINTF("Performing padded reshuffle\n");

  TAU_FSTART(padded_reshuffle);

  numPes = ord_glb_comm.np;

  CTF_alloc_ptr(ndim*sizeof(int), (void**)&virt_phase_rank);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&old_virt_phase_rank);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&sub_edge_len);

  new_num_virt = 1;
  old_num_virt = 1;
  idx_lyr = ord_glb_comm.rank;
  for (i=0; i<ndim; i++){
    /* if (ord_glb_comm == NULL || ord_glb_comm.rank == 0){
      printf("old_pe_lda[%d] = %d, old_phase = %d, old_virt_dim = %d\n",
              i,old_pe_lda[i], old_phase[i], old_virt_dim[i]);
      printf("new_pe_lda[%d] = %d, new_phase = %d, new_virt_dim = %d\n",
              i,new_pe_lda[i], new_phase[i], new_virt_dim[i]);
      printf("is_new_pad = %d\n", is_new_pad);
      if (is_new_pad)
        printf("new_padding[%d] = %d\n", i, new_padding[i]);
      printf("is_old_pad = %d\n", is_old_pad);
      if (is_old_pad)
        printf("old_padding[%d] = %d\n", i, old_padding[i]);
    }*/
    old_num_virt = old_num_virt*old_virt_dim[i];
    new_num_virt = new_num_virt*new_virt_dim[i];
    virt_phase_rank[i] = new_rank[i]*new_virt_dim[i];
    old_virt_phase_rank[i] = old_rank[i]*old_virt_dim[i];
    //idx_lyr -= (old_phase[i]/old_virt_dim[i])*old_rank[i];
    idx_lyr -= old_rank[i]*old_pe_lda[i];
  }
  if (idx_lyr == 0 ){
    read_loc_pairs(ndim, nval, is_old_pad, old_num_virt, sym,
                   old_edge_len, old_padding, old_virt_dim, old_phase,
                   old_virt_phase_rank, &new_nval, tsr_data, &pairs);
  } else {
    new_nval = 0;
    pairs = NULL;
  }

  old_size = sy_packed_size(ndim, new_edge_len, sym);

  for (i=0; i<ndim; i++){
    sub_edge_len[i] = new_edge_len[i] / new_phase[i];
  }
  if (ord_glb_comm.rank == 0){
    DPRINTF(1,"Tensor %d now has virtualization factor of %d\n",tid,new_num_virt);
  }
  swp_nval = new_num_virt*sy_packed_size(ndim, sub_edge_len, sym);
  if (ord_glb_comm.rank == 0){
    DPRINTF(1,"Tensor %d is of size "PRId64", has factor of %lf growth due to padding\n",
            tid, swp_nval,
            ord_glb_comm.np*(swp_nval/(double)old_size));
  }

  CTF_alloc_ptr(swp_nval*sizeof(dtype), (void**)&tsr_new_data);
  std::fill(tsr_new_data, tsr_new_data+swp_nval, get_zero<dtype>());

  wr_pairs_layout(ndim, numPes, new_nval, (dtype)1.0, (dtype)0.0,
                  (is_old_pad | is_new_pad), 'w', new_num_virt, sym,
                  new_edge_len, new_padding, new_phase, new_virt_dim,
                  virt_phase_rank, new_pe_lda, pairs, tsr_new_data, ord_glb_comm);
  *tsr_cyclic_data = tsr_new_data;

  CTF_free(old_virt_phase_rank);
  if (pairs != NULL)
    CTF_free(pairs);
  CTF_free(virt_phase_rank);
  CTF_free(sub_edge_len);

  TAU_FSTOP(padded_reshuffle);

  return DIST_TENSOR_SUCCESS;
}

/* Goes from any set of phases to any new set of phases */
/**
 * \brief Reshuffle elements
 *
 * \param[in] ndim number of tensor dimensions
 * \param[in] nval number of elements in the subtensors each proc owns
 * \param[in] old_edge_len old edge lengths of tensor
 * \param[in] sym symmetries of tensor
 * \param[in] old_phase current physical*virtual phase
 * \param[in] old_rank current physical rank
 * \param[in] old_pe_lda old lda of processor grid
 * \param[in] is_old_pad whether tensor was padded
 * \param[in] old_padding padding of current tensor
 * \param[in] old_offsets old offsets of each tensor edge (corner 1 of slice)
 * \param[in] old_permutation permutation array for each edge length (no perm if NULL)
 * \param[in] new_edge_len new edge lengths of tensor
 * \param[in] new_phase new physical*virtual phase
 * \param[in] new_rank new physical rank
 * \param[in] new_pe_lda new lda of processor grid
 * \param[in] is_new_pad whether tensor will be padded
 * \param[in] new_padding padding we want
 * \param[in] new_offsets new offsets of each tensor edge (corner 1 of slice)
 * \param[in] new_permutation permutation array for each edge length (no perm if NULL)
 * \param[in] old_virt_dim current virtualization dimensions on each process
 * \param[in] new_virt_dim new virtualization dimensions on each process
 * \param[in,out] ptr_tsr_data current tensor data
 * \param[out] ptr_tsr_cyclic_data pointer to a new tensor of data that will be filled
 * \param[in] ord_glb_comm the global communicator
 * \param[in] was_cyclic whether the mapping was cyclic
 * \param[in] is_cyclic whether the mapping is cyclic
 */
template<typename dtype>
int cyclic_reshuffle(int const ndim,
                     long_int const nval,
                     int const * old_edge_len,
                     int const * sym,
                     int const * old_phase,
                     int const * old_rank,
                     int const * old_pe_lda,
                     int const is_old_pad,
                     int const * old_padding,
                     int const * old_offsets,
                     int * const * old_permutation,
                     int const * new_edge_len,
                     int const * new_phase,
                     int const * new_rank,
                     int const * new_pe_lda,
                     int const is_new_pad,
                     int const * new_padding,
                     int const * new_offsets,
                     int * const * new_permutation,
                     int const * old_virt_dim,
                     int const * new_virt_dim,
                     dtype ** ptr_tsr_data,
                     dtype ** ptr_tsr_cyclic_data,
                     CommData_t ord_glb_comm,
                     int const was_cyclic,
                     int const is_cyclic){
  /* If we want to use key value pairs to reshuffle */
  int i, nbuf, np, old_nvirt, new_nvirt, old_np, new_np, idx_lyr;
  long_int vbs_old, vbs_new;
  long_int hvbs_old, hvbs_new;
  long_int swp_nval;
  int * hsym;
  int * send_counts, * recv_counts, * idx, * buf_lda;
  long_int * idx_offs;
  int * svirt_displs, * rvirt_displs, * send_displs;
  int * recv_displs, * new_virt_lda, * old_virt_lda;
  int * old_sub_edge_len, * new_sub_edge_len;
  dtype * tsr_cyclic_data;
  dtype * tsr_data = *ptr_tsr_data;

  if (ndim == 0){
    CTF_alloc_ptr(sizeof(dtype), (void**)&tsr_cyclic_data);
    if (ord_glb_comm.rank == 0){
      tsr_cyclic_data[0] = tsr_data[0];
    } else {
      tsr_cyclic_data[0] = get_zero<dtype>();
    }
    *ptr_tsr_cyclic_data = tsr_cyclic_data;
    return DIST_TENSOR_SUCCESS;
  }

  TAU_FSTART(cyclic_reshuffle);
  np = ord_glb_comm.np;

  CTF_alloc_ptr(ndim*sizeof(int), (void**)&hsym);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&idx);
  CTF_alloc_ptr(ndim*sizeof(long_int), (void**)&idx_offs);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&old_virt_lda);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&new_virt_lda);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&buf_lda);

  nbuf = 1;
  new_nvirt = 1;
  old_nvirt = 1;
  old_np = 1;
  new_np = 1;
  idx_lyr = ord_glb_comm.rank;
  for (i=0; i<ndim; i++) {
    buf_lda[i] = nbuf;
    new_virt_lda[i] = new_nvirt;
    old_virt_lda[i] = old_nvirt;
    nbuf = nbuf*new_phase[i];
    /*printf("is_new_pad = %d\n", is_new_pad);
    if (is_new_pad)
      printf("new_padding[%d] = %d\n", i, new_padding[i]);
    printf("is_old_pad = %d\n", is_old_pad);
    if (is_old_pad)
      printf("old_padding[%d] = %d\n", i, old_padding[i]);*/
    old_nvirt = old_nvirt*old_virt_dim[i];
    new_nvirt = new_nvirt*new_virt_dim[i];
    new_np = new_np*new_phase[i]/new_virt_dim[i];
    old_np = old_np*old_phase[i]/old_virt_dim[i];
    idx_lyr -= old_rank[i]*old_pe_lda[i];
  }
  vbs_old = nval/old_nvirt;
  nbuf = np*new_nvirt;

  CTF_mst_alloc_ptr(np*sizeof(int), (void**)&recv_counts);
  CTF_mst_alloc_ptr(np*sizeof(int), (void**)&send_counts);
  CTF_mst_alloc_ptr(nbuf*sizeof(int), (void**)&rvirt_displs);
  CTF_mst_alloc_ptr(nbuf*sizeof(int), (void**)&svirt_displs);
  CTF_mst_alloc_ptr(np*sizeof(int), (void**)&send_displs);
  CTF_mst_alloc_ptr(np*sizeof(int), (void**)&recv_displs);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&old_sub_edge_len);
  CTF_alloc_ptr(ndim*sizeof(int), (void**)&new_sub_edge_len);

  int ** bucket_offset;

  int *real_edge_len; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&real_edge_len);
  for (i=0; i<ndim; i++) real_edge_len[i] = old_edge_len[i]-old_padding[i];

  int *old_phys_dim; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&old_phys_dim);
  for (i=0; i<ndim; i++) old_phys_dim[i] = old_phase[i]/old_virt_dim[i];

  int *new_phys_dim; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&new_phys_dim);
  for (i=0; i<ndim; i++) new_phys_dim[i] = new_phase[i]/new_virt_dim[i];

  int *old_phys_edge_len; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&old_phys_edge_len);
  for (int dim = 0;dim < ndim;dim++) old_phys_edge_len[dim] = (real_edge_len[dim]+old_padding[dim])/old_phys_dim[dim];

  int *new_phys_edge_len; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&new_phys_edge_len);
  for (int dim = 0;dim < ndim;dim++) new_phys_edge_len[dim] = (real_edge_len[dim]+new_padding[dim])/new_phys_dim[dim];

  int *old_virt_edge_len; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&old_virt_edge_len);
  for (int dim = 0;dim < ndim;dim++) old_virt_edge_len[dim] = old_phys_edge_len[dim]/old_virt_dim[dim];

  int *new_virt_edge_len; CTF_alloc_ptr(sizeof(int)*ndim, (void**)&new_virt_edge_len);
  for (int dim = 0;dim < ndim;dim++) new_virt_edge_len[dim] = new_phys_edge_len[dim]/new_virt_dim[dim];
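  /* The arrays above describe the local blocking: old/new_phys_edge_len is the padded
     edge length divided by the physical grid dimension (old/new_phys_dim), and
     old/new_virt_edge_len further divides that by the virtualization factor.  They are
     used below to compute per-bucket offsets and the counts/displacements that drive
     the all-to-all exchange. */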
LIBT_ASSERT(is_old_pad); LIBT_ASSERT(is_new_pad); // if (idx_lyr == 0){ bucket_offset = compute_bucket_offsets( ndim, real_edge_len, old_rank, old_phys_dim, old_virt_dim, old_virt_lda, old_offsets, old_permutation, new_phys_dim, new_pe_lda, new_virt_dim, new_virt_lda, 1, old_nvirt, new_nvirt, old_phys_edge_len, old_virt_edge_len); TAU_FSTART(calc_cnt_displs); /* Calculate bucket counts to begin exchange */ calc_cnt_displs<dtype>( ndim, nbuf, new_nvirt, np, old_edge_len, new_edge_len, old_virt_edge_len, sym, old_phase, old_rank, new_phase, old_virt_dim, new_virt_dim, new_virt_lda, buf_lda, new_pe_lda, is_old_pad, old_padding, send_counts, recv_counts, send_displs, recv_displs, svirt_displs, rvirt_displs, ord_glb_comm, idx_lyr, was_cyclic, is_cyclic, bucket_offset); TAU_FSTOP(calc_cnt_displs); /*for (i=0; i<np; i++){ printf("[%d] send_counts[%d] = %d recv_counts[%d] = %d\n", ord_glb_comm.rank, i, send_counts[i], i, recv_counts[i]); } for (i=0; i<nbuf; i++){ printf("[%d] svirt_displs[%d] = %d rvirt_displs[%d] = %d\n", ord_glb_comm.rank, i, svirt_displs[i], i, rvirt_displs[i]); }*/ // } for (i=0; i<ndim; i++){ new_sub_edge_len[i] = new_edge_len[i]; old_sub_edge_len[i] = old_edge_len[i]; } for (i=0; i<ndim; i++){ new_sub_edge_len[i] = new_sub_edge_len[i] / new_phase[i]; old_sub_edge_len[i] = old_sub_edge_len[i] / old_phase[i]; } for (i=1; i<ndim; i++){ hsym[i-1] = sym[i]; } hvbs_old = sy_packed_size(ndim-1, old_sub_edge_len+1, hsym); hvbs_new = sy_packed_size(ndim-1, new_sub_edge_len+1, hsym); swp_nval = new_nvirt*sy_packed_size(ndim, new_sub_edge_len, sym); CTF_mst_alloc_ptr(MAX(nval,swp_nval)*sizeof(dtype), (void**)&tsr_cyclic_data); vbs_new = swp_nval/new_nvirt; DPRINTF(4,"[%d] send = %d, had= "PRId64" recv = %d, should get = "PRId64"\n", ord_glb_comm.rank, send_displs[ord_glb_comm.np-1] + send_counts[ord_glb_comm.np-1], nval, recv_displs[ord_glb_comm.np-1] + recv_counts[ord_glb_comm.np-1], swp_nval); TAU_FSTART(pack_virt_buf); if (idx_lyr == 0){ if (was_cyclic&&is_cyclic) { dtype **new_data; CTF_alloc_ptr(sizeof(dtype*)*np*new_nvirt, (void**)&new_data); for (int p = 0,b = 0;p < np;p++) { for (int v = 0;v < new_nvirt;v++,b++) new_data[b] = tsr_cyclic_data+send_displs[p]+svirt_displs[b]; } //long_int ndata = send_displs[np-1] + send_counts[np-1]; //dtype *test_data; CTF_alloc_ptr(sizeof(dtype)*ndata, (void**)&test_data); //memset(test_data, 0, sizeof(dtype)*ndata); pad_cyclic_pup_virt_buff(ndim, real_edge_len, sym, old_rank, old_phys_dim, old_virt_dim, old_phys_edge_len, old_virt_edge_len, vbs_old, old_offsets, old_permutation, new_np, new_rank, new_phys_dim, new_virt_dim, new_phys_edge_len, new_virt_edge_len, vbs_new, tsr_data, new_data, 1, bucket_offset); /* opt_pup_virt_buff(ndim, nbuf, np, hvbs_old, old_nvirt, new_nvirt, old_edge_len, new_edge_len, sym, old_phase, old_rank, new_phase, new_rank, old_virt_dim, new_virt_dim, send_displs, svirt_displs, old_virt_lda, new_virt_lda, new_pe_lda, vbs_old, is_old_pad, old_padding, tsr_data, test_data, was_cyclic, is_cyclic, 1); for (long_int k = 0;k < ndata;k++) { if (test_data[k] != tsr_cyclic_data[k]) { for (long_int l = 0;l < ndata;l++) { printf("%lld: %f %f\n", l, test_data[l], tsr_cyclic_data[l]); } assert(0); } } CTF_free(test_data); */ CTF_free(new_data); } else { opt_pup_virt_buff(ndim, nbuf, np, hvbs_old, old_nvirt, new_nvirt, old_edge_len, new_edge_len, sym, old_phase, old_rank, new_phase, new_rank, old_virt_dim, new_virt_dim, send_displs, svirt_displs, old_virt_lda, new_virt_lda, new_pe_lda, vbs_old, is_old_pad, old_padding, tsr_data, 
tsr_cyclic_data, was_cyclic, is_cyclic, 1); } } for (int dim = 0;dim < ndim;dim++){ CTF_free(bucket_offset[dim]); } CTF_free(bucket_offset); TAU_FSTOP(pack_virt_buf); if (swp_nval > nval){ CTF_free(tsr_data); CTF_mst_alloc_ptr(swp_nval*sizeof(dtype), (void**)&tsr_data); } /* Communicate data */ TAU_FSTART(ALL_TO_ALL_V); /* FIXME: not general to all types */ if (sizeof(dtype) == 4) ALL_TO_ALLV(tsr_cyclic_data, send_counts, send_displs, MPI_FLOAT, tsr_data, recv_counts, recv_displs, MPI_FLOAT, ord_glb_comm); if (sizeof(dtype) == 8) ALL_TO_ALLV(tsr_cyclic_data, send_counts, send_displs, COMM_DOUBLE_T, tsr_data, recv_counts, recv_displs, COMM_DOUBLE_T, ord_glb_comm); if (sizeof(dtype) == 16) ALL_TO_ALLV(tsr_cyclic_data, send_counts, send_displs, MPI::DOUBLE_COMPLEX, tsr_data, recv_counts, recv_displs, MPI::DOUBLE_COMPLEX, ord_glb_comm); TAU_FSTOP(ALL_TO_ALL_V); std::fill(tsr_cyclic_data, tsr_cyclic_data+swp_nval, get_zero<dtype>()); TAU_FSTART(unpack_virt_buf); /* Deserialize data into correctly ordered virtual sub blocks */ if (recv_displs[ord_glb_comm.np-1] + recv_counts[ord_glb_comm.np-1] > 0){ if (was_cyclic&&is_cyclic) { dtype **new_data; CTF_alloc_ptr(sizeof(dtype*)*np*new_nvirt, (void**)&new_data); for (int p = 0,b = 0;p < np;p++) { for (int v = 0;v < new_nvirt;v++,b++) new_data[b] = tsr_data+recv_displs[p]+rvirt_displs[b]; } bucket_offset = compute_bucket_offsets( ndim, real_edge_len, new_rank, new_phys_dim, new_virt_dim, new_virt_lda, new_offsets, new_permutation, old_phys_dim, old_pe_lda, old_virt_dim, old_virt_lda, 0, new_nvirt, old_nvirt, new_phys_edge_len, new_virt_edge_len); /* dtype *test_data; CTF_alloc_ptr(sizeof(dtype)*swp_nval, (void**)&test_data); memset(test_data, 0, sizeof(dtype)*swp_nval); opt_pup_virt_buff(ndim, nbuf, np, hvbs_new, old_nvirt, new_nvirt, new_edge_len, old_edge_len, sym, new_phase, new_rank, old_phase, old_rank, new_virt_dim, old_virt_dim, recv_displs, rvirt_displs, new_virt_lda, old_virt_lda, old_pe_lda, vbs_new, is_new_pad, new_padding, tsr_data, test_data, is_cyclic, was_cyclic, 0); */ pad_cyclic_pup_virt_buff(ndim, real_edge_len, sym, new_rank, new_phys_dim, new_virt_dim, new_phys_edge_len, new_virt_edge_len, vbs_new, new_offsets, new_permutation, old_np, old_rank, old_phys_dim, old_virt_dim, old_phys_edge_len, old_virt_edge_len, vbs_old, tsr_cyclic_data, new_data, 0, bucket_offset); for (int dim = 0;dim < ndim;dim++){ CTF_free(bucket_offset[dim]); } CTF_free(bucket_offset); /* for (long_int k = 0;k < swp_nval;k++) { if (test_data[k] != tsr_cyclic_data[k]) { for (long_int l = 0;l < swp_nval;l++) { printf("%lld: %f %f\n", l, test_data[l], tsr_cyclic_data[l]); } assert(0); } } CTF_free(test_data); */ CTF_free(new_data); } else { opt_pup_virt_buff(ndim, nbuf, np, hvbs_new, old_nvirt, new_nvirt, new_edge_len, old_edge_len, sym, new_phase, new_rank, old_phase, old_rank, new_virt_dim, old_virt_dim, recv_displs, rvirt_displs, new_virt_lda, old_virt_lda, old_pe_lda, vbs_new, is_new_pad, new_padding, tsr_data, tsr_cyclic_data, is_cyclic, was_cyclic, 0); } } TAU_FSTOP(unpack_virt_buf); *ptr_tsr_cyclic_data = tsr_cyclic_data; *ptr_tsr_data = tsr_data; CTF_free(real_edge_len); CTF_free(old_phys_dim); CTF_free(new_phys_dim); CTF_free(hsym); CTF_free(idx); CTF_free(idx_offs); CTF_free(old_virt_lda); CTF_free(new_virt_lda); CTF_free(buf_lda); CTF_free(recv_counts); CTF_free(send_counts); CTF_free(rvirt_displs); CTF_free(svirt_displs); CTF_free(send_displs); CTF_free(recv_displs); CTF_free(old_sub_edge_len); CTF_free(new_sub_edge_len); CTF_free(new_virt_edge_len); 
CTF_free(old_virt_edge_len); CTF_free(new_phys_edge_len); CTF_free(old_phys_edge_len); TAU_FSTOP(cyclic_reshuffle); return DIST_TENSOR_SUCCESS; } /** * \brief Reshuffle elements given the global phases stay the same * * \param[in] ndim number of tensor dimensions * \param[in] phase physical*virtual phase * \param[in] old_size number of elements in the subtensor each proc owned * \param[in] old_virt_dim current virtualization dimensions on each process * \param[in] old_rank current physical rank * \param[in] old_pe_lda old lda of processor grid * \param[in] new_size number of elements in the subtensor each proc owns * \param[in] new_virt_dim new virtualization dimensions on each process * \param[in] new_rank new physical rank * \param[in] new_pe_lda new lda of processor grid * \param[in] tsr_data current tensor data * \param[out] ptr_tsr_cyclic_data pointer to a new tensor of data that will be filled * \param[in] glb_comm the global communicator */ template<typename dtype> void block_reshuffle(int const ndim, int const * phase, long_int const old_size, int const * old_virt_dim, int const * old_rank, int const * old_pe_lda, long_int const new_size, int const * new_virt_dim, int const * new_rank, int const * new_pe_lda, dtype * tsr_data, dtype *& tsr_cyclic_data, CommData_t glb_comm){ int i, idx_lyr_new, idx_lyr_old, blk_idx, prc_idx, loc_idx; int num_old_virt, num_new_virt; int * idx, * old_loc_lda, * new_loc_lda, * phase_lda; long_int blk_sz; MPI_Request * reqs; if (ndim == 0){ CTF_alloc_ptr(sizeof(dtype)*new_size, (void**)&tsr_cyclic_data); if (glb_comm.rank == 0) tsr_cyclic_data[0] = tsr_data[0]; else tsr_cyclic_data[0] = 0.0; return; } TAU_FSTART(block_reshuffle); CTF_mst_alloc_ptr(sizeof(dtype)*new_size, (void**)&tsr_cyclic_data); CTF_alloc_ptr(sizeof(int)*ndim, (void**)&idx); CTF_alloc_ptr(sizeof(int)*ndim, (void**)&old_loc_lda); CTF_alloc_ptr(sizeof(int)*ndim, (void**)&new_loc_lda); CTF_alloc_ptr(sizeof(int)*ndim, (void**)&phase_lda); blk_sz = old_size; old_loc_lda[0] = 1; new_loc_lda[0] = 1; phase_lda[0] = 1; num_old_virt = 1; num_new_virt = 1; idx_lyr_old = glb_comm.rank; idx_lyr_new = glb_comm.rank; for (i=0; i<ndim; i++){ num_old_virt *= old_virt_dim[i]; num_new_virt *= new_virt_dim[i]; blk_sz = blk_sz/old_virt_dim[i]; idx_lyr_old -= old_rank[i]*old_pe_lda[i]; idx_lyr_new -= new_rank[i]*new_pe_lda[i]; if (i>0){ old_loc_lda[i] = old_loc_lda[i-1]*old_virt_dim[i-1]; new_loc_lda[i] = new_loc_lda[i-1]*new_virt_dim[i-1]; phase_lda[i] = phase_lda[i-1]*phase[i-1]; } } CTF_alloc_ptr(sizeof(MPI_Request)*(num_old_virt+num_new_virt), (void**)&reqs); if (idx_lyr_new == 0){ memset(idx, 0, sizeof(int)*ndim); for (;;){ loc_idx = 0; blk_idx = 0; prc_idx = 0; for (i=0; i<ndim; i++){ loc_idx += idx[i]*new_loc_lda[i]; blk_idx += ( idx[i] + new_rank[i]*new_virt_dim[i]) *phase_lda[i]; prc_idx += ((idx[i] + new_rank[i]*new_virt_dim[i])/old_virt_dim[i])*old_pe_lda[i]; } DPRINTF(4,"proc %d receiving blk %d (loc %d, size "PRId64") from proc %d\n", glb_comm.rank, blk_idx, loc_idx, blk_sz, prc_idx); MPI_Irecv(tsr_cyclic_data+loc_idx*blk_sz, blk_sz*sizeof(dtype), MPI_CHAR, prc_idx, blk_idx, glb_comm.cm, reqs+loc_idx); for (i=0; i<ndim; i++){ idx[i]++; if (idx[i] >= new_virt_dim[i]) idx[i] = 0; else break; } if (i==ndim) break; } } if (idx_lyr_old == 0){ memset(idx, 0, sizeof(int)*ndim); for (;;){ loc_idx = 0; blk_idx = 0; prc_idx = 0; for (i=0; i<ndim; i++){ loc_idx += idx[i]*old_loc_lda[i]; blk_idx += ( idx[i] + old_rank[i]*old_virt_dim[i]) *phase_lda[i]; prc_idx += ((idx[i] + 
old_rank[i]*old_virt_dim[i])/new_virt_dim[i])*new_pe_lda[i]; } DPRINTF(4,"proc %d sending blk %d (loc %d) to proc %d\n", glb_comm.rank, blk_idx, loc_idx, prc_idx); MPI_Isend(tsr_data+loc_idx*blk_sz, blk_sz*sizeof(dtype), MPI_CHAR, prc_idx, blk_idx, glb_comm.cm, reqs+num_new_virt+loc_idx); for (i=0; i<ndim; i++){ idx[i]++; if (idx[i] >= old_virt_dim[i]) idx[i] = 0; else break; } if (i==ndim) break; } } if (idx_lyr_new == 0 && idx_lyr_old == 0){ MPI_Waitall(num_new_virt+num_old_virt, reqs, MPI_STATUSES_IGNORE); } else if (idx_lyr_new == 0){ MPI_Waitall(num_new_virt, reqs, MPI_STATUSES_IGNORE); } else if (idx_lyr_old == 0){ MPI_Waitall(num_old_virt, reqs+num_new_virt, MPI_STATUSES_IGNORE); std::fill(tsr_cyclic_data, tsr_cyclic_data+new_size, get_zero<dtype>()); } else { std::fill(tsr_cyclic_data, tsr_cyclic_data+new_size, get_zero<dtype>()); } CTF_free(idx); CTF_free(old_loc_lda); CTF_free(new_loc_lda); CTF_free(phase_lda); CTF_free(reqs); TAU_FSTOP(block_reshuffle); } #endif
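
/* Usage sketch for block_reshuffle (illustrative only, not part of the library):
   it applies when the overall phase of each dimension is unchanged and only the
   physical/virtual factorization differs, so whole virtual blocks can be exchanged
   directly with point-to-point messages.  The values below are hypothetical
   placeholders; in practice they come from the old and new tensor mappings.

     int phase[2]        = {4, 4};   // physical*virtual phase per dim (same before and after)
     int old_virt_dim[2] = {4, 1};   // old virtualization (implies a 1x4 physical grid)
     int new_virt_dim[2] = {2, 2};   // new virtualization (implies a 2x2 physical grid)
     // old_rank, old_pe_lda, new_rank, new_pe_lda, old_size, new_size, tsr_data and
     // glb_comm are assumed to be filled in from the old and new distributions.
     double * tsr_cyclic_data;       // allocated inside block_reshuffle
     block_reshuffle<double>(2, phase, old_size, old_virt_dim, old_rank, old_pe_lda,
                             new_size, new_virt_dim, new_rank, new_pe_lda,
                             tsr_data, tsr_cyclic_data, glb_comm);
*/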