Commit fea513c7 authored by Edgar Solomonik
Browse files

Got rid of compiler warnings, added performance models for upload/download/offload_gemm

parent df4a209f
......@@ -1843,7 +1843,7 @@ namespace CTF_int {
// printf("new_order[%d/%d] = %d, new_lens[%d] = %d\n", i, topo->order, new_order[i], new_order[i], new_lens[new_order[i]]);
}
topology * new_topo = NULL;
for (int i=0; i<A->wrld->topovec.size(); i++){
for (int i=0; i<(int)A->wrld->topovec.size(); i++){
if (A->wrld->topovec[i]->order == topo->order){
bool has_same_len = true;
for (int j=0; j<topo->order; j++){
......@@ -2586,7 +2586,7 @@ namespace CTF_int {
&num_tot, &idx_arr);
cdealloc(idx_arr);
int64_t tot_num_choices = 0;
for (int i=0; i<wrld->topovec.size(); i++){
for (int i=0; i<(int)wrld->topovec.size(); i++){
// tot_num_choices += pow(num_choices,(int)wrld->topovec[i]->order);
tot_num_choices += get_num_map_variants(wrld->topovec[i]);
}
......@@ -2594,7 +2594,7 @@ namespace CTF_int {
int64_t choice_offset = 0;
int64_t max_memuse = proc_bytes_available();
TAU_FSTOP(init_select_ctr_map);
for (int i=0; i<wrld->topovec.size(); i++){
for (int i=0; i<(int)wrld->topovec.size(); i++){
// int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
int tnum_choices = get_num_map_variants(wrld->topovec[i]);
......@@ -2767,10 +2767,6 @@ namespace CTF_int {
int contraction::map(ctr ** ctrf, bool do_remap){
int ret, j, need_remap, d;
int64_t memuse;//, bmemuse;
double best_time;
int btopo;
//int * idx_arr, * idx_ctr, * idx_no_ctr, * idx_extra, * idx_weigh;
int * old_phase_A, * old_phase_B, * old_phase_C;
topology * old_topo_A, * old_topo_B, * old_topo_C;
distribution * dA, * dB, * dC;
......@@ -2831,12 +2827,9 @@ namespace CTF_int {
old_phase_C[j] = C->edge_map[j].calc_phase();
}
//}
btopo = -1;
best_time = DBL_MAX;
//bmemuse = UINT64_MAX;
int ttopo, ttopo_sel, ttopo_exh;
double gbest_time, gbest_time_sel, gbest_time_exh;
double gbest_time_sel, gbest_time_exh;
TAU_FSTART(get_best_sel_map);
get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_sel, gbest_time_sel);
......@@ -2850,10 +2843,8 @@ namespace CTF_int {
TAU_FSTOP(get_best_exh_map);
}
if (gbest_time_sel <= gbest_time_exh){
gbest_time = gbest_time_sel;
ttopo = ttopo_sel;
} else {
gbest_time = gbest_time_exh;
ttopo = ttopo_exh;
}
......@@ -2914,7 +2905,7 @@ namespace CTF_int {
int64_t choice_offset = 0;
int i=0;
int64_t old_off;
for (i=0; i<wrld->topovec.size(); i++){
for (i=0; i<(int)wrld->topovec.size(); i++){
//int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
int tnum_choices = get_num_map_variants(wrld->topovec[i]);
old_off = choice_offset;
......@@ -2962,10 +2953,10 @@ namespace CTF_int {
#endif
//FIXME: adhoc?
memuse = MAX((int64_t)(*ctrf)->mem_rec(), (int64_t)(A->size*A->sr->el_size+B->size*B->sr->el_size+C->size*C->sr->el_size)*3);
/*memuse = MAX((int64_t)(*ctrf)->mem_rec(), (int64_t)(A->size*A->sr->el_size+B->size*B->sr->el_size+C->size*C->sr->el_size)*3);
if (global_comm.rank == 0)
VPRINTF(1,"Contraction will use %E bytes per processor out of %E available memory and take an estimated of %lf sec\n",
(double)memuse,(double)proc_bytes_available(),gbest_time);
(double)memuse,(double)proc_bytes_available(),gbest_time);*/
if (A->is_cyclic == 0 &&
B->is_cyclic == 0 &&
......@@ -4039,7 +4030,9 @@ namespace CTF_int {
#if DEBUG >=2
ctrf->print();
#endif
#if VERBOSE >= 1
double dtt = MPI_Wtime();
#endif
#ifdef DEBUG
if (global_comm.rank == 0){
//DPRINTF(1,"[%d] performing contraction\n",
......@@ -4823,9 +4816,8 @@ namespace CTF_int {
}
void contraction::print(){
// int j,ex_A, ex_B,ex_C;
int i,max;
max = A->order+B->order+C->order;
int i;
//max = A->order+B->order+C->order;
CommData global_comm = A->wrld->cdt;
MPI_Barrier(global_comm.cm);
if (global_comm.rank == 0){
......
......@@ -267,7 +267,7 @@ namespace CTF_int {
void ctr_2d_general::run(char * A, char * B, char * C){
int owner_A, owner_B, owner_C, ret;
int64_t ib;
char * buf_A, * buf_B, * buf_C, * buf_aux;
char * buf_A, * buf_B, * buf_C;
char * op_A, * op_B, * op_C;
int rank_A, rank_B, rank_C;
int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
......
......@@ -64,9 +64,9 @@ namespace CTF_int {
/**
 * \brief accumulates an estimate of the time spent moving A, B and C between
 *        host and offload device for this contraction stage
 *
 * Each tensor contributes its transfer cost multiplied by
 * total_iter divided by that tensor's upload/download phase.
 *
 * \param[in] nlyr not read anywhere in this function body
 * \return the accumulated estimate (presumably seconds, since the transfer
 *         models are trained on MPI_Wtime differences -- TODO confirm)
 */
double ctr_offload::est_time_fp(int nlyr){
double tot_time = 0.0;
// NOTE(review): this listing comes from a diff rendering. The three
// COST_OFFLOADBW lines directly below look like the pre-change versions of
// the three estimate_*_time lines that follow them; confirm only one set is
// present in the real file, otherwise every transfer is counted twice.
tot_time += size_A*sr_A->el_size*(total_iter/upload_phase_A)*COST_OFFLOADBW;
tot_time += size_B*sr_B->el_size*(total_iter/upload_phase_B)*COST_OFFLOADBW;
tot_time += size_C*sr_C->el_size*(total_iter/download_phase_C)*COST_OFFLOADBW;
// model-based cost: byte count (elements * el_size) goes to the transfer
// model, result is scaled by the iteration/phase ratio
// NOTE(review): total_iter/phase truncates if both operands are integers --
// confirm the member types
tot_time += estimate_upload_time(size_A*sr_A->el_size)*(total_iter/upload_phase_A);
tot_time += estimate_upload_time(size_B*sr_B->el_size)*(total_iter/upload_phase_B);
tot_time += estimate_download_time(size_C*sr_C->el_size)*(total_iter/download_phase_C);
return tot_time;
}
......
......@@ -386,6 +386,7 @@ namespace CTF_int {
LinModel<3> seq_tsr_ctr_mdl_cst(seq_tsr_ctr_mdl_cst_init,"seq_tsr_ctr_mdl_cst");
LinModel<3> seq_tsr_ctr_mdl_ref(seq_tsr_ctr_mdl_ref_init,"seq_tsr_ctr_mdl_ref");
LinModel<3> seq_tsr_ctr_mdl_inr(seq_tsr_ctr_mdl_inr_init,"seq_tsr_ctr_mdl_inr");
LinModel<3> seq_tsr_ctr_mdl_off(seq_tsr_ctr_mdl_off_init,"seq_tsr_ctr_mdl_off");
uint64_t seq_tsr_ctr::est_membw(){
uint64_t size_A = sy_packed_size(order_A, edge_len_A, sym_A)*sr_A->el_size;
......@@ -430,9 +431,12 @@ namespace CTF_int {
// printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps));
if (is_custom)
return seq_tsr_ctr_mdl_cst.est_time(ps);
else if (is_inner)
return seq_tsr_ctr_mdl_inr.est_time(ps);
else
else if (is_inner){
if (inner_params.offload)
return seq_tsr_ctr_mdl_off.est_time(ps);
else
return seq_tsr_ctr_mdl_inr.est_time(ps);
} else
return seq_tsr_ctr_mdl_ref.est_time(ps);
}
......@@ -497,7 +501,10 @@ namespace CTF_int {
double exe_time = MPI_Wtime()-st_time;
// printf("exe_time = %E est_time = %E abs_err = %e rel_err = %lf\n", exe_time,est_time,fabs(exe_time-est_time),fabs(exe_time-est_time)/exe_time);
double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()};
seq_tsr_ctr_mdl_inr.observe(tps);
if (inner_params.offload)
seq_tsr_ctr_mdl_off.observe(tps);
else
seq_tsr_ctr_mdl_inr.observe(tps);
// seq_tsr_ctr_mdl_inr.print_param_guess();
} else {
double st_time = MPI_Wtime();
......
......@@ -2,6 +2,7 @@ namespace CTF_int{
double seq_tsr_ctr_mdl_cst_init[] = {3.8587E-13, 5.0935E-09, 4.2446E-10};
double seq_tsr_ctr_mdl_ref_init[] = {1.5382E-16, 9.2293E-10, 7.6911E-11};
double seq_tsr_ctr_mdl_inr_init[] = {3.2489E-04, 1.2692E-10, 1.0686E-10};
double seq_tsr_ctr_mdl_off_init[] = {2.5413E-04, 1.5889E-10, 9.6735E-12};
double long_contig_transp_mdl_init[] = {-1.8639E-03, 1.8467E-08};
double shrt_contig_transp_mdl_init[] = {5.5768E-03, 1.7220E-08};
double non_contig_transp_mdl_init[] = {-4.8961E-04, 1.0225E-08};
......@@ -11,5 +12,7 @@ namespace CTF_int{
double bcast_mdl_init[] = {2.0139E-04, -4.1326E-05, 2.8453E-10, 2.4672E-10};
double dgtog_res_mdl_init[] = {1.1994E-04, 3.1005E-04, 9.6259E-10};
double blres_mdl_init[] = {1.4728E-05, 1.7673E-10};
double upload_mdl_init[] = {1.4604E-04, 7.3783E-10};
double download_mdl_init[] = {5.7942E-04, 6.0787E-10};
}
......@@ -13,6 +13,9 @@ namespace CTF_int{
extern double seq_tsr_ctr_mdl_cst_init[];
extern double seq_tsr_ctr_mdl_ref_init[];
extern double seq_tsr_ctr_mdl_inr_init[];
extern double seq_tsr_ctr_mdl_off_init[];
extern double upload_mdl_init[];
extern double download_mdl_init[];
}
#endif
......@@ -13,6 +13,7 @@
#include "offload.h"
#include "../tensor/algstrct.h"
#include "../interface/timer.h"
namespace CTF_int{
volatile static int64_t int64_t_max = INT64_MAX;
......@@ -65,20 +66,42 @@ namespace CTF_int{
cudaError_t err = cudaFree(dev_ptr);
assert(err == cudaSuccess);
}
// Two-parameter models of copy time, seeded from upload_mdl_init and
// download_mdl_init. offload_ptr::upload / offload_ptr::download feed them
// measured times via observe(); estimate_upload_time / estimate_download_time
// query them via est_time() with the parameter vector {1.0, size}.
LinModel<2> upload_mdl(upload_mdl_init,"upload_mdl");
LinModel<2> download_mdl(download_mdl_init,"download_mdl");
/**
 * \brief queries download_mdl for the predicted duration of a download
 * \param[in] size transfer size, passed to the model as its second parameter
 * \return download_mdl.est_time evaluated at {1.0, size}
 */
double estimate_download_time(int64_t size){
  double mdl_params[2];
  mdl_params[0] = 1.0;          // constant term of the model
  mdl_params[1] = (double)size; // size-dependent term
  double predicted = download_mdl.est_time(mdl_params);
  return predicted;
}
/**
 * \brief queries upload_mdl for the predicted duration of an upload
 * \param[in] size transfer size, passed to the model as its second parameter
 * \return upload_mdl.est_time evaluated at {1.0, size}
 */
double estimate_upload_time(int64_t size){
  double mdl_params[2];
  mdl_params[0] = 1.0;          // constant term of the model
  mdl_params[1] = (double)size; // size-dependent term
  double predicted = upload_mdl.est_time(mdl_params);
  return predicted;
}
/**
 * \brief copies size*sr->el_size bytes from dev_ptr into host memory and
 *        records the measured copy time in download_mdl
 * \param[out] host_ptr destination buffer on the host
 */
void offload_ptr::download(char * host_ptr){
  assert(initialized);
  TAU_FSTART(cuda_download);
  double st_time = MPI_Wtime();
  cudaError_t err = cudaMemcpy(host_ptr, dev_ptr, size*sr->el_size,
                               cudaMemcpyDeviceToHost);
  double exe_time = MPI_Wtime()-st_time;
  // Train the model only on copies that succeeded: when asserts are compiled
  // out (NDEBUG) a failed cudaMemcpy would otherwise feed a meaningless
  // timing into download_mdl. Testing err here also keeps it from being an
  // unused variable in such builds.
  if (err == cudaSuccess){
    double tps[] = {exe_time, 1.0, (double)size*sr->el_size};
    download_mdl.observe(tps);
  }
  TAU_FSTOP(cuda_download);
  assert(err == cudaSuccess);
}
/**
 * \brief copies size*sr->el_size bytes from host memory into dev_ptr and
 *        records the measured copy time in upload_mdl
 * \param[in] host_ptr source buffer on the host
 */
void offload_ptr::upload(char const * host_ptr){
  TAU_FSTART(cuda_upload);
  double st_time = MPI_Wtime();
  cudaError_t err = cudaMemcpy(dev_ptr, host_ptr, size*sr->el_size,
                               cudaMemcpyHostToDevice);
  double exe_time = MPI_Wtime()-st_time;
  // Train the model only on copies that succeeded: when asserts are compiled
  // out (NDEBUG) a failed cudaMemcpy would otherwise feed a meaningless
  // timing into upload_mdl. Testing err here also keeps it from being an
  // unused variable in such builds.
  if (err == cudaSuccess){
    double tps[] = {exe_time, 1.0, (double)size*sr->el_size};
    upload_mdl.observe(tps);
  }
  TAU_FSTOP(cuda_upload);
  assert(err == cudaSuccess);
}
......
......@@ -12,6 +12,12 @@ namespace CTF_int{
void offload_init();
/** \brief exit offloading, e.g. destroy cublas */
void offload_exit();
/**
 * \brief estimate time it takes to download
 * \param[in] size transfer size handed to the download model (the visible
 *                 caller passes a byte count: elements times el_size)
 */
double estimate_download_time(int64_t size);
/**
 * \brief estimate time it takes to upload
 * \param[in] size transfer size handed to the upload model (the visible
 *                 caller passes a byte count: elements times el_size)
 */
double estimate_upload_time(int64_t size);
class offload_ptr {
public:
......
......@@ -2263,8 +2263,8 @@ namespace CTF_int {
}
void summation::print(){
int i,j,max,ex_A, ex_B;
max = A->order+B->order;
int i;
//max = A->order+B->order;
CommData global_comm = A->wrld->cdt;
MPI_Barrier(global_comm.cm);
......
......@@ -639,7 +639,7 @@ namespace CTF_int {
swap_buffer+ptr_pairs[i].idx*(sizeof(int64_t)+sr->el_size),
sizeof(int64_t)+sr->el_size);
}
break;
break; //compiler warning here seems to be gcc bug
}
}
......
......@@ -20,7 +20,7 @@ using namespace CTF;
namespace CTF_int {
static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};
// static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};
Idx_Tensor tensor::operator[](const char * idx_map_){
Idx_Tensor idxtsr(this, idx_map_);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment