Commit 3f8f9b0c authored by Edgar Solomonik's avatar Edgar Solomonik
Browse files

Merge branch 'scheduler' into slice_opt

parents e168c213 f02f5eb8
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346" moduleId="org.eclipse.cdt.core.settings" name="Build (GNU)">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.Cygwin_PE" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="ctf" buildProperties="" description="" id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346" name="Build (GNU)" parent="org.eclipse.cdt.build.core.emptycfg">
<folderInfo id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188" name="/" resourcePath="">
<toolChain id="cdt.managedbuild.toolchain.gnu.cygwin.base.114500116" name="Cygwin GCC" superClass="cdt.managedbuild.toolchain.gnu.cygwin.base">
<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.Cygwin_PE" id="cdt.managedbuild.target.gnu.platform.cygwin.base.1974190108" name="Debug Platform" osList="win32" superClass="cdt.managedbuild.target.gnu.platform.cygwin.base"/>
<builder buildPath="${workspace_loc:/ctf}/Build (GNU)" id="cdt.managedbuild.target.gnu.builder.cygwin.base.1867890232" name="Gnu Make Builder.Build (GNU)" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.1018853843" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1866661508" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.1555343600" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1674540487" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
<option id="gnu.cpp.compiler.option.include.paths.1449537127" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;C:\cygwin64\usr\include&quot;"/>
<listOptionValue builtIn="false" value="&quot;C:\cygwin64\lib\gcc\x86_64-pc-cygwin\4.8.2\include\c++&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.2056994875" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1942961242" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
<option id="gnu.c.compiler.option.include.paths.1543177086" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;C:\cygwin64\usr\include&quot;"/>
<listOptionValue builtIn="false" value="&quot;C:\cygwin64\lib\gcc\x86_64-pc-cygwin\4.8.2\include\c++&quot;"/>
</option>
<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1102012624" superClass="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin"/>
</tool>
<tool id="cdt.managedbuild.tool.gnu.c.linker.cygwin.base.920934820" name="Cygwin C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.cygwin.base"/>
<tool id="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base.2034171734" name="Cygwin C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base">
<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1371501430" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="ctf.null.2097328945" name="ctf"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Build (GNU)">
<resource resourceType="PROJECT" workspacePath="/ctf"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1674540487;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.2056994875">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1942961242;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1102012624">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;org.eclipse.linuxtools.cdt.autotools.core.toolchain.tool.gcc.686440635;cdt.managedbuild.tool.gnu.c.compiler.input.1917476232">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1552707334;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1663905346">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.283180442;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1114020496">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.360486812;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.890507584">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1266459141;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.605308406">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;org.eclipse.linuxtools.cdt.autotools.core.toolchain.tool.gpp.2043906437;cdt.managedbuild.tool.gnu.cpp.compiler.input.1884109157">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
</scannerConfigBuildInfo>
</storageModule>
</cproject>
*.o
.*
# Build directories
*.o
.deps
......
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>ctf</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>
......@@ -24,26 +24,31 @@ all $(MAKECMDGOALS):
echo 'Machine recognized as a MAC'; \
cp mkfiles/config.mk.linux config.mk; \
else \
if [ $(shell hostname | grep 'edison\|hopper' ) ] ; then \
echo 'Hostname recognized as Edison or Hopper, using pre-made config.mk file'; \
cp mkfiles/config.mk.hopper config.mk; \
if [ $(shell hostname | grep 'edison' ) ] ; then \
echo 'Hostname recognized as Edison, using pre-made config.mk file'; \
cp mkfiles/config.mk.edison config.mk; \
else \
if [ $(shell hostname | grep 'cvrsvc' ) ] ; then \
echo 'Hostname recognized as Carver, using pre-made config.mk file'; \
cp mkfiles/config.mk.carver config.mk; \
if [ $(shell hostname | grep 'hopper' ) ] ; then \
echo 'Hostname recognized as Hopper, using pre-made config.mk file'; \
cp mkfiles/config.mk.hopper config.mk; \
else \
if [ $(shell hostname | grep 'surveyor\|intrepid\|challenger\|udawn' ) ] ; then \
echo 'Hostname recognized as a BG/P machine, using pre-made config.mk file'; \
cp mkfiles/config.mk.bgp config.mk; \
if [ $(shell hostname | grep 'cvrsvc' ) ] ; then \
echo 'Hostname recognized as Carver, using pre-made config.mk file'; \
cp mkfiles/config.mk.carver config.mk; \
else \
if [ $(shell hostname | grep 'ls[0-9]*.tacc.utexas.edu' ) ] ; then \
cp mkfiles/config.mk.lonestar config.mk; \
if [ $(shell hostname | grep 'surveyor\|intrepid\|challenger\|udawn' ) ] ; then \
echo 'Hostname recognized as a BG/P machine, using pre-made config.mk file'; \
cp mkfiles/config.mk.bgp config.mk; \
else \
if [ $(shell hostname | grep 'vesta\|mira\|cetus\|seq' ) ] ; then \
cp mkfiles/config.mk.bgq config.mk; \
if [ $(shell hostname | grep 'ls[0-9]*.tacc.utexas.edu' ) ] ; then \
cp mkfiles/config.mk.lonestar config.mk; \
else \
echo 'Hostname not recognized: assuming linux, specialize config.mk if necessary'; \
cp mkfiles/config.mk.linux config.mk; \
if [ $(shell hostname | grep 'vesta\|mira\|cetus\|seq' ) ] ; then \
cp mkfiles/config.mk.bgq config.mk; \
else \
echo 'Hostname not recognized: assuming linux, specialize config.mk if necessary'; \
cp mkfiles/config.mk.linux config.mk; \
fi; \
fi; \
fi; \
fi; \
......
......@@ -5,7 +5,7 @@ include ../src/make/rules.mk
examples: dft dft_3D gemm gemm_4D scalar trace diag_sym fast_diagram \
fast_3mm sym3 fast_sym fast_sym_4D ccsdt_t3_to_t2 weight_4D \
test_suite strassen slice_gemm ccsd readwrite_test \
test_suite strassen slice_gemm ccsd readwrite_test subworld_gemm \
permute_multiworld sparse_permuted_slice
sparse_permuted_slice: ${bindir}/sparse_permuted_slice
......@@ -56,6 +56,9 @@ ${bindir}/gemm_4D: gemm_4D.o ${libdir}/libctf.a
gemm: ${bindir}/gemm
${bindir}/gemm: gemm.o ${libdir}/libctf.a
subworld_gemm: ${bindir}/subworld_gemm
${bindir}/subworld_gemm: subworld_gemm.o ${libdir}/libctf.a
weight_4D: ${bindir}/weight_4D
${bindir}/weight_4D: weight_4D.o ${libdir}/libctf.a
......
......@@ -181,7 +181,13 @@ class Amplitudes {
};
void ccsd(Integrals &V,
Amplitudes &T){
Amplitudes &T,
int sched_nparts = 0){
int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
double timer = MPI_Wtime();
tCTF_Schedule<double> sched(V.dw);
sched.set_max_partitions(sched_nparts);
sched.record();
CTF_Tensor T21 = CTF_Tensor(T.abij);
T21["abij"] += .5*T["ai"]*T["bj"];
......@@ -251,6 +257,14 @@ void ccsd(Integrals &V,
Zabij += .5*V["abef"]*T21["efij"];
Zabij += .5*Wmnij*T21["abmn"];
if (rank == 0) {
printf("Record: %lf\n",
MPI_Wtime()-timer);
}
timer = MPI_Wtime();
tCTF_ScheduleTimer schedule_time = sched.execute();
CTF_fctr fctr;
fctr.func_ptr = &divide;
......@@ -265,8 +279,21 @@ void ccsd(Integrals &V,
Dabij["abij"] -= V["a"];
Dabij["abij"] -= V["b"];
T.ai.contract(1.0, *(Zai.parent), "ai", Dai, "ai", 0.0, "ai", fctr);
T.abij.contract(1.0, *(Zabij.parent), "abij", Dabij, "abij", 0.0, "abij", fctr);
if (rank == 0) {
printf("Schedule comm down: %lf\n", schedule_time.comm_down_time);
printf("Schedule execute: %lf\n", schedule_time.exec_time);
printf("Schedule imbalance, wall: %lf\n", schedule_time.imbalance_wall_time);
printf("Schedule imbalance, accum: %lf\n", schedule_time.imbalance_acuum_time);
printf("Schedule comm up: %lf\n", schedule_time.comm_up_time);
printf("Schedule total: %lf\n", schedule_time.total_time);
printf("All execute: %lf\n",
MPI_Wtime()-timer);
}
}
#ifndef TEST_SUITE
......@@ -283,7 +310,7 @@ char* getCmdOption(char ** begin,
int main(int argc, char ** argv){
int rank, np, niter, no, nv, i;
int rank, np, niter, no, nv, sched_nparts, i;
int const in_num = argc;
char ** input_str = argv;
......@@ -303,6 +330,10 @@ int main(int argc, char ** argv){
niter = atoi(getCmdOption(input_str, input_str+in_num, "-niter"));
if (niter < 0) niter = 1;
} else niter = 1;
if (getCmdOption(input_str, input_str+in_num, "-nparts")){
sched_nparts = atoi(getCmdOption(input_str, input_str+in_num, "-nparts"));
if (sched_nparts < 0) sched_nparts = 0;
} else sched_nparts = 0;
{
CTF_World dw(argc, argv);
......@@ -313,10 +344,10 @@ int main(int argc, char ** argv){
for (i=0; i<niter; i++){
T.fill_rand();
double d = MPI_Wtime();
ccsd(V,T);
ccsd(V,T,sched_nparts);
if (rank == 0)
printf("Completed %dth CCSD iteration in time = %lf, |T| is %lf\n",
i, MPI_Wtime()-d, T.ai.norm2()+T.abij.norm2());
printf("(%d nodes) Completed %dth CCSD iteration in time = %lf, |T| is %lf\n",
np, i, MPI_Wtime()-d, T.ai.norm2()+T.abij.norm2());
else {
T.ai.norm2();
T.abij.norm2();
......
......@@ -262,7 +262,7 @@ char* getCmdOption(char ** begin,
}
int main(int argc, char ** argv){
int rank, np, niter, n, m, k, pass;
int rank, np, n, pass;
int const in_num = argc;
char ** input_str = argv;
......
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
/** \addtogroup examples
* @{
* \defgroup subworld_gemm
* @{
* \brief Multiplies matrices on subworlds (sub-communicators split from the parent world) and accumulates the results back into the parent world
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <math.h>
#include <assert.h>
#include <stdint.h>
#include <algorithm>
#include <ctf.hpp>
/**
 * \brief tests matrix multiplication carried out on subworlds of dw
 *
 * A (m-by-k) and B (k-by-n) are defined on dw. The communicator of dw is
 * split by color, A and B are passed to add_to_subworld with matrices living
 * on the subworld, the product is formed on the subworld, and the subworld
 * results are accumulated back into C on dw with add_from_subworld.
 * The accumulated C is compared against div*A*B computed directly on dw.
 *
 * \param[in] n number of columns of B and C
 * \param[in] m number of rows of A and C
 * \param[in] k number of columns of A and rows of B
 * \param[in] div_ requested number of subworlds, clamped to
 *                 [1, number of processes in dw]
 * \param[in] dw parent world
 * \return nonzero if the norm of the difference is below 1.E-9, zero otherwise
 */
int test_subworld_gemm(int        n,
                       int        m,
                       int        k,
                       int        div_,
                       CTF_World &dw){
  int rank, num_pes;
  int64_t i, np;
  double * pairs, err;
  int64_t * indices;

  CTF_Matrix C(m, n, NS, dw);
  CTF_Matrix C_ans(m, n, NS, dw);
  CTF_Matrix A(m, k, NS, dw);
  CTF_Matrix B(k, n, NS, dw);

  MPI_Comm pcomm = dw.comm;
  MPI_Comm_rank(pcomm, &rank);
  MPI_Comm_size(pcomm, &num_pes);

  int div = div_;
  if (div > num_pes) div = num_pes;
  // a non-positive div would make num_pes / div below divide by zero
  if (div < 1) div = 1;

  srand48(13*rank);

  // overwrite the values returned by read_local with random numbers in
  // [-.5,.5) and write them back
  A.read_local(&np, &indices, &pairs);
  for (i=0; i<np; i++ ) pairs[i] = drand48()-.5;
  A.write(np, indices, pairs);
  free(pairs);
  free(indices);
  B.read_local(&np, &indices, &pairs);
  for (i=0; i<np; i++ ) pairs[i] = drand48()-.5;
  B.write(np, indices, pairs);
  free(pairs);
  free(indices);

  // cnum_pes consecutive ranks share a color and therefore a subworld
  int cnum_pes = num_pes / div;
  int color = rank/cnum_pes;
  int crank = rank%cnum_pes;

  MPI_Comm ccomm;
  MPI_Comm_split(pcomm, color, crank, &ccomm);
  CTF_World sworld(ccomm);

  // reference answer computed on the parent world
  C_ans["ij"] = ((double)div)*A["ik"]*B["kj"];

  CTF_Matrix subA(m, k, NS, sworld);
  CTF_Matrix subB(k, n, NS, sworld);
  CTF_Matrix subC(m, n, NS, sworld);

  // every rank makes one call per color; only the call matching its own
  // color passes the subworld tensor, all others pass NULL
  for (int c=0; c<num_pes/cnum_pes; c++){
    if (c==color){
      A.add_to_subworld(&subA,1.0,0.0);
      B.add_to_subworld(&subB,1.0,0.0);
    } else {
      A.add_to_subworld(NULL,1.0,0.0);
      B.add_to_subworld(NULL,1.0,0.0);
    }
  }

  // rank < cnum_pes*div is equivalent to color < div: ranks left over when
  // num_pes is not a multiple of div skip the multiplication
  if (rank < cnum_pes*div)
    subC["ij"] = subA["ik"]*subB["kj"];

  for (int c=0; c<num_pes/cnum_pes; c++){
    if (c==color){
      C.add_from_subworld(&subC, 1.0, 1.0);
    } else {
      C.add_from_subworld(NULL, 1.0, 1.0);
    }
  }

  C_ans["ij"] -= C["ij"];

  err = C_ans.norm2();
  if (rank == 0){
    if (err<1.E-9)
      printf("{ GEMM on subworlds } passed\n");
    else
      printf("{ GEMM on subworlds } FAILED, error norm = %E\n",err);
  }
  return err<1.E-9;
}
#ifndef TEST_SUITE
/**
 * \brief looks up the value that follows a command line flag
 * \param[in] begin first argument to inspect
 * \param[in] end one past the last argument to inspect
 * \param[in] option flag to search for, e.g. "-n"
 * \return the argument directly after the first occurrence of option,
 *         or 0 if option is absent or is the last argument
 */
char* getCmdOption(char ** begin,
                   char ** end,
                   const std::string & option){
  for (char ** cur = begin; cur != end; ++cur){
    if (option == *cur){
      // only the first match counts; its value is the next argument, if any
      char ** value = cur + 1;
      if (value == end) return 0;
      return *value;
    }
  }
  return 0;
}
/**
 * \brief driver for test_subworld_gemm
 *
 * Recognized options (each followed by an integer value):
 *   -n    matrix dimension n (default 23, also used if a negative value is given)
 *   -m    matrix dimension m (default 17, also used if a negative value is given)
 *   -k    matrix dimension k (default 31, also used if a negative value is given)
 *   -div  number of subworlds (default 2, also used if the value is below 1)
 */
int main(int argc, char ** argv){
  int rank, n, m, k, div;
  // keep the original argument list; MPI_Init is handed argc/argv by address
  int const in_num = argc;
  char ** input_str = argv;
  char * opt;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  opt = getCmdOption(input_str, input_str+in_num, "-n");
  n = (opt != NULL) ? atoi(opt) : 23;
  if (n < 0) n = 23;

  opt = getCmdOption(input_str, input_str+in_num, "-m");
  m = (opt != NULL) ? atoi(opt) : 17;
  if (m < 0) m = 17;

  opt = getCmdOption(input_str, input_str+in_num, "-k");
  k = (opt != NULL) ? atoi(opt) : 31;
  if (k < 0) k = 31;

  opt = getCmdOption(input_str, input_str+in_num, "-div");
  div = (opt != NULL) ? atoi(opt) : 2;
  // div is used as a divisor by the test, so zero must be rejected as well
  if (div < 1) div = 2;

  {
    // scoped so that dw is destroyed before MPI_Finalize
    CTF_World dw(MPI_COMM_WORLD, argc, argv);
    if (rank == 0){
      printf("Non-symmetric: NS = NS*NS test_subworld_gemm:\n");
    }
    int pass = test_subworld_gemm(n, m, k, div, dw);
    assert(pass);
    (void)pass; // silences the unused-variable warning when NDEBUG removes the assert
  }

  MPI_Finalize();
  return 0;
}
#endif
......@@ -7,6 +7,9 @@
#include <stdio.h>
#include <stdint.h>
#include <vector>
#include <deque>
#include <set>
#include <map>
#include "../src/dist_tensor/cyclopstf.hpp"
/**
......@@ -139,14 +142,15 @@ class tCTF_Tensor {
tCTF_World<dtype> & world_,
char const * name_ = NULL,
int profile_ = 0);
/**
* \brief creates a copy of the tensor, in a different world if specified
* \param[in] oworld pointer to another world (NULL oworld = this->world)
* \return new tensor object on oworld
* \brief creates a zeroed out copy (data not copied) of a tensor in a different world
* \param[in] A tensor whose characteristics to copy
* \param[in] world_ a world for the tensor we are creating to live in, can be different from A
*/
tCTF_Tensor<dtype> clone(tCTF_World<dtype> * oworld = NULL) const;
tCTF_Tensor(tCTF_Tensor const & A,
tCTF_World<dtype> & world_);
/**
* \brief gives the values associated with any set of indices
* The sparse data is defined in coordinate format. The tensor index (i,j,k,l) of a tensor with edge lengths
......@@ -262,6 +266,34 @@ class tCTF_Tensor {
dtype beta,
char const * idx_C,
tCTF_fctr<dtype> fseq = tCTF_fctr<dtype>());
/**
* \brief estimate the cost of a contraction C[idx_C] = A[idx_A]*B[idx_B]
* \param[in] A first operand tensor
* \param[in] idx_A indices of A in contraction, e.g. "ik" -> A_{ik}
* \param[in] B second operand tensor
* \param[in] idx_B indices of B in contraction, e.g. "kj" -> B_{kj}
* \param[in] idx_C indices of C (this tensor), e.g. "ij" -> C_{ij}
* \return cost as an int64_t, currently a rough estimate of flops/processor
*/
int64_t estimate_cost(const tCTF_Tensor & A,
char const * idx_A,
const tCTF_Tensor & B,
char const * idx_B,
char const * idx_C);
/**
* \brief estimate the cost of a sum B[idx_B] = A[idx_A]
* \param[in] A operand tensor
* \param[in] idx_A indices of A in sum, e.g. "ik" -> A_{ik}
* \param[in] idx_B indices of B in sum, e.g. "ki" -> B_{ki}
* \return cost as an int64_t, currently a rough estimate of flops/processor
*/
int64_t estimate_cost(const tCTF_Tensor & A,
char const * idx_A,
char const * idx_B);
/**
* \brief sums B[idx_B] = beta*B[idx_B] + alpha*A[idx_A]
......@@ -346,7 +378,7 @@ class tCTF_Tensor {
dtype alpha) const;
/**
* \brief TODO: apply permutation to matrix, potentially extracting a slice
* \brief Apply permutation to matrix, potentially extracting a slice
* B[i,j,...]
* = beta*B[...] + alpha*A[perms_A[0][i],perms_A[1][j],...]
*
......@@ -365,7 +397,7 @@ class tCTF_Tensor {
dtype alpha);
/**
* \brief TODO: apply permutation to matrix, potentially extracting a slice
* \brief Apply permutation to matrix, potentially extracting a slice
* B[perms_B[0][i],perms_B[0][j],...]
* = beta*B[...] + alpha*A[i,j,...]
*
......@@ -382,6 +414,31 @@ class tCTF_Tensor {
dtype beta,
tCTF_Tensor & A,
dtype alpha);
/**
* \brief accumulates this tensor to a tensor object defined on a different world
* \param[in] tsr a tensor object of the same characteristic as this tensor,
* but on a different CTF_world/MPI_comm
* \param[in] alpha scaling factor for this tensor (default 1.0)
* \param[in] beta scaling factor for tensor tsr (default 1.0)
*/
void add_to_subworld(tCTF_Tensor<dtype> * tsr,
dtype alpha,
dtype beta) const;
void add_to_subworld(tCTF_Tensor<dtype> * tsr) const;
/**
* \brief accumulates this tensor from a tensor object defined on a different world
* \param[in] tsr a tensor object of the same characteristic as this tensor,
* but on a different CTF_world/MPI_comm
* \param[in] alpha scaling factor for tensor tsr (default 1.0)
* \param[in] beta scaling factor for this tensor (default 1.0)
*/
void add_from_subworld(tCTF_Tensor<dtype> * tsr,
dtype alpha,
dtype beta) const;
void add_from_subworld(tCTF_Tensor<dtype> * tsr) const;
/**
* \brief aligns data mapping with tensor A
......@@ -526,6 +583,22 @@ class tCTF_Tensor {
~tCTF_Tensor();
};
/**
 * \brief comparison function for sets of tensor pointers
 * This ensures the set iteration order is consistent across nodes:
 * tensors are ordered by their tid rather than by pointer value, and a
 * NULL pointer sorts before every non-NULL pointer.
 */
template<typename dtype>
struct tensor_tid_less {
  /**
   * \param[in] A left operand, may be NULL
   * \param[in] B right operand, may be NULL
   * \return true if A is ordered strictly before B
   */
  // const-qualified: std::set invokes its comparator through a const object
  bool operator()(tCTF_Tensor<dtype>* A, tCTF_Tensor<dtype>* B) const {
    if (A == NULL && B != NULL) {
      return true;
    } else if (A == NULL || B == NULL) {
      // either both are NULL (equivalent) or only B is NULL (B comes first)
      return false;
    }
    return A->tid < B->tid;
  }
};
/**
* \brief Matrix class which encapsulates a 2D tensor
*/
......@@ -624,7 +697,7 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {
// derived clone calls copy constructor
tCTF_Term<dtype> * clone() const;
tCTF_Term<dtype> * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;
/**
* \brief constructor takes in a parent tensor and its indices
......@@ -642,8 +715,8 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {
* \param[in] copy if 1 then copy the parent tensor of B into a new tensor
*/
tCTF_Idx_Tensor(tCTF_Idx_Tensor<dtype> const & B,
int copy = 0);
int copy = 0,
std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);
tCTF_Idx_Tensor();
......@@ -664,6 +737,24 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {
*/
void execute(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief estimates the cost of a contraction
* \param[in] output tensor to write results into and its indices
*/
long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief estimates the cost of the expression to produce an intermediate with
* all expression indices remaining
* \param[in,out] cost the cost of the operation
* \return output tensor to write results into and its indices
*/
tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
/**
* \brief appends the tensors this depends on to the input set
*/
void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
/**
* \brief A = B, compute any operations on operand B and set
* \param[in] B tensor on the right hand side
......@@ -893,7 +984,7 @@ class tCTF_Term {
/**
* \brief base classes must implement this copy function to retrieve pointer
*/
virtual tCTF_Term * clone() const = 0;
virtual tCTF_Term * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const = 0;
/**
* \brief evaluates the expression, which just scales by default
......@@ -901,6 +992,21 @@ class tCTF_Term {
*/
virtual void execute(tCTF_Idx_Tensor<dtype> output) const = 0;
/**
* \brief estimates the cost of a contraction/sum/.. term
* \param[in] output tensor to write results into and its indices
*/
virtual long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const = 0;
/**
* \brief estimates the cost of the expression to produce an intermediate with
* all expression indices remaining
* \param[in,out] cost the cost of the operation
* \return output tensor to write results into and its indices
*/
virtual tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const = 0;
/**
* \brief evaluates the expression to produce an intermediate with
* all expression indices remaining
......@@ -908,6 +1014,11 @@ class tCTF_Term {
*/
virtual tCTF_Idx_Tensor<dtype> execute() const = 0;
/**
* \brief appends the tensors this depends on to the input set
*/
virtual void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const = 0;
/**
* \brief constructs a new term which multiplies by tensor A
* \param[in] A term to multiply by
......@@ -965,10 +1076,11 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
~tCTF_Sum_Term();
// copy constructor
tCTF_Sum_Term(tCTF_Sum_Term<dtype> const & other);
tCTF_Sum_Term(tCTF_Sum_Term<dtype> const & other,
std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);
// derived clone calls copy constructor
tCTF_Term<dtype> * clone() const;
tCTF_Term<dtype>* clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;
/**
* construct sum term corresponding to a single tensor
......@@ -981,7 +1093,8 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
* \param[in,out] output tensor to write results into and its indices
*/
void execute(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief evaluates the expression to produce an intermediate with
* all expression indices remaining
......@@ -989,6 +1102,26 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
*/
tCTF_Idx_Tensor<dtype> execute() const;
/**
* \brief estimates the cost of a sum term
* \param[in] output tensor to write results into and its indices
*/
long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief estimates the cost of the expression to produce an intermediate with
* all expression indices remaining
* \param[in,out] cost the cost of the operation
* \return output tensor to write results into and its indices
*/
tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
/**
* \brief appends the tensors this depends on to the input set
*/
void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
/**
* \brief constructs a new term by addition of two terms
* \param[in] A term to add to output
......@@ -1035,10 +1168,11 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
~tCTF_Contract_Term();
// \brief copy constructor
tCTF_Contract_Term(tCTF_Contract_Term<dtype> const & other);
tCTF_Contract_Term(tCTF_Contract_Term<dtype> const & other,
std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);
// \brief derived clone calls copy constructor
tCTF_Term<dtype> * clone() const;
tCTF_Term<dtype> * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;
/**
* \brief override execution to contract operands and add them to output
......@@ -1046,6 +1180,11 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
*/
void execute(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief appends the tensors this depends on to the input set
*/
void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
/**
* \brief evaluates the expression to produce an intermediate with
* all expression indices remaining
......@@ -1053,6 +1192,20 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
*/
tCTF_Idx_Tensor<dtype> execute() const;
/**
* \brief estimates the cost of a contract term
* \param[in] output tensor to write results into and its indices
*/
long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
/**
* \brief estimates the cost of the expression to produce an intermediate with
* all expression indices remaining
* \param[in,out] cost the cost of the operation
* \return output tensor to write results into and its indices
*/
tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
/**
* \brief override contraction to grow vector rather than create recursive terms
* \param[in] A term to multiply by
......@@ -1068,6 +1221,239 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
* @}
*/
/**
* \defgroup scheduler Dynamic scheduler.
* @{
*/
/**
 * \brief kinds of operation a recorded tensor operation can carry
 * NOTE(review): judging by the names, SET/SUM/SUBTRACT/MULTIPLY presumably
 * stand for lhs = rhs, lhs += rhs, lhs -= rhs and lhs *= rhs; confirm
 * against the implementation that executes them.
 */
enum tCTF_TensorOperationTypes {
  TENSOR_OP_NONE     = 0, // no operation
  TENSOR_OP_SET      = 1,
  TENSOR_OP_SUM      = 2,
  TENSOR_OP_SUBTRACT = 3,
  TENSOR_OP_MULTIPLY = 4
};
/**
 * \brief Provides an untemplated base class for tensor operations, so that
 * operations can be referred to without knowing their dtype.
 */
class tCTF_TensorOperationBase {
  public:
    // virtual, so that a derived operation is destroyed correctly when
    // deleted through a pointer to this base class
    virtual ~tCTF_TensorOperationBase() { }
};
/**
 * \brief A tensor operation, containing all the data (op, lhs, rhs) required
 * to run it. Also provides methods to get a list of inputs and outputs, as well
 * as successor and dependency information used in scheduling.
 */
template<typename dtype>
class tCTF_TensorOperation : public tCTF_TensorOperationBase {
  public:
    /**
     * \brief Constructor, create the tensor operation lhs op= rhs
     * \param[in] op type of the operation
     * \param[in] lhs indexed tensor on the left hand side
     * \param[in] rhs term on the right hand side
     */
    tCTF_TensorOperation(tCTF_TensorOperationTypes op,
                         tCTF_Idx_Tensor<dtype>* lhs,
                         const tCTF_Term<dtype>* rhs) :
      // initializers are listed in member declaration order, which is the
      // order in which they actually run
      dependency_count(0),
      dependency_left(0),
      op(op),
      lhs(lhs),
      rhs(rhs),
      cached_estimated_cost(0) {}

    /**
     * \brief appends the tensors this writes to to the output set
     */
    void get_outputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* outputs_set) const;

    /**
     * \brief appends the tensors this depends on (reads from, including the output
     * if a previous value is required) to the input set
     */
    void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;

    /**
     * \brief runs this operation, but does NOT handle dependency scheduling
     * optionally takes a remapping of tensors
     */
    void execute(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);

    /**
     * \brief provides an estimated runtime cost
     */
    long_int estimate_cost();

    /**
     * \brief whether this operation has type TENSOR_OP_NONE
     */
    bool is_dummy() const {
      return op == TENSOR_OP_NONE;
    }

    /**
     * Schedule Recording Variables
     */
    // Number of dependencies I have
    int dependency_count;
    // List of all successors - operations that depend on me
    std::vector<tCTF_TensorOperation<dtype>* > successors;
    // NOTE(review): undocumented in the original; presumably operations that
    // read a tensor this operation is involved with - confirm against the
    // scheduler implementation
    std::vector<tCTF_TensorOperation<dtype>* > reads;

    /**
     * Schedule Execution Variables
     */
    // Number of dependencies still outstanding during execution
    // (zero-initialized here so it never holds an indeterminate value)
    int dependency_left;

    /**
     * Debugging Helpers
     */
    // name of the tensor on the left hand side; requires lhs and its parent
    // to be non-NULL
    const char* name() const {
      return lhs->parent->name;
    }

  protected:
    tCTF_TensorOperationTypes op;
    tCTF_Idx_Tensor<dtype>* lhs;
    const tCTF_Term<dtype>* rhs;

    long_int cached_estimated_cost;
};
// untemplatized scheduler abstract base class to assist in global operations
class tCTF_ScheduleBase {
  public:
    // virtual destructor so that a concrete schedule can be destroyed safely
    // through a pointer to this base class
    virtual ~tCTF_ScheduleBase() {}

    /**
     * \brief adds an operation to the schedule
     * \param[in] op operation to add, passed as its untemplated base type
     */
    virtual void add_operation(tCTF_TensorOperationBase* op) = 0;
};
extern tCTF_ScheduleBase* global_schedule;
/**
 * \brief set of timing counters, all starting at zero, that can be
 * accumulated with operator+=
 */
struct tCTF_ScheduleTimer {
  double comm_down_time;
  double exec_time;
  double imbalance_wall_time;
  double imbalance_acuum_time; // spelling kept: the field name is used by callers
  double comm_up_time;
  double total_time;

  /**
   * \brief creates a timer with every counter set to zero
   */
  tCTF_ScheduleTimer() {
    total_time           = 0.0;
    comm_up_time         = 0.0;
    imbalance_acuum_time = 0.0;
    imbalance_wall_time  = 0.0;
    exec_time            = 0.0;
    comm_down_time       = 0.0;
  }

  /**
   * \brief adds every counter of B to the matching counter of this timer
   * \param[in] B timer whose counters are added
   */
  void operator+=(tCTF_ScheduleTimer const & B) {
    total_time           += B.total_time;
    comm_up_time         += B.comm_up_time;
    imbalance_acuum_time += B.imbalance_acuum_time;
    imbalance_wall_time  += B.imbalance_wall_time;
    exec_time            += B.exec_time;
    comm_down_time       += B.comm_down_time;
  }
};
/**
* \brief typed schedule: records tensor operations of element type dtype and
* executes them later, honoring the dependencies between them
*/
template<typename dtype>
class tCTF_Schedule : public tCTF_ScheduleBase {
public:
/**
* \brief Constructor, optionally specifying a world to restrict processor
* allocations to
* \param[in] world world pointer stored in this schedule (NULL by default);
* the partition limit is initialized to 0
*/
tCTF_Schedule(tCTF_World<dtype>* world = NULL) :
world(world),
partitions(0) {}
/**
* \brief Starts recording all tensor operations to this schedule
* (instead of executing them immediately)
*/
void record();
/**
* \brief Executes the schedule and implicitly terminates recording
* \return timing counters for the execution
*/
tCTF_ScheduleTimer execute();
/**
* \brief Executes a slice of the ready queue, partitioning it among the
* processors in the grid
* \return timing counters for the executed slice
*/
inline tCTF_ScheduleTimer partition_and_execute();
/**
* \brief Call when a tensor op finishes; this adds newly enabled ops to the
* ready queue
* \param[in] op the operation that has just finished
*/
inline void schedule_op_successors(tCTF_TensorOperation<dtype>* op);
/**
* \brief Adds a tensor operation to this schedule.
* THIS IS CALL ORDER DEPENDENT - operations will *appear* to execute
* sequentially in the order they were added.
* \param[in] op operation to add
*/
void add_operation_typed(tCTF_TensorOperation<dtype>* op);
// Implements the pure virtual tCTF_ScheduleBase::add_operation.
// NOTE(review): presumably forwards to add_operation_typed after a cast -
// the definition is not visible here, confirm.
void add_operation(tCTF_TensorOperationBase* op);
/**
* Testing functionality
* \brief overwrites the stored partition limit with in_partitions
*/
void set_max_partitions(int in_partitions) {
partitions = in_partitions;
}
protected:
// World passed to the constructor (may be NULL)
tCTF_World<dtype>* world;
/**
* Internal scheduling operation overview:
* DAG Structure:
* Each task maintains:
* dependency_count: the number of dependencies that the task has
* dependency_left: the number of dependencies left before this task can
* execute
* successors: a vector of tasks which have this task as a dependency
* On completing a task, it decrements the dependency_left of all
* successors. Once the count reaches zero, the task is added to the ready
* queue and can be scheduled for execution.
* To allow one schedule to be executed many times, dependency_count is
* only modified by recording tasks, and is copied to dependency_left when
* the schedule starts executing.
*
* DAG Construction:
* A map from tensor pointers to operations is maintained, which contains
* the latest operation that writes to a tensor.
* When a new operation is added, it checks this map for all dependencies.
* If a dependency has no entry yet, then it is considered satisfied.
* Otherwise, it depends on the current entry - and the latest write
* operation adds this task as a successor.
* Then, the latest_write for this operation is updated.
*/
/**
* Schedule Recording Variables
*/
// Tasks with no dependencies, which can be executed at the start
std::deque<tCTF_TensorOperation<dtype>*> root_tasks;
// For debugging purposes - the steps in the original input order
std::deque<tCTF_TensorOperation<dtype>*> steps_original;
// Last operation writing to the key tensor
std::map<tCTF_Tensor<dtype>*, tCTF_TensorOperation<dtype>*> latest_write;
/**
* Schedule Execution Variables
*/
// Ready queue of tasks with all dependencies satisfied
std::deque<tCTF_TensorOperation<dtype>*> ready_tasks;
/**
* Testing variables
*/
// Partition limit set through set_max_partitions(); 0 after construction.
// NOTE(review): 0 presumably means "no limit" - confirm in the implementation.
int partitions;
};
/**
* @}
*/
/**
* \defgroup timer Timing and cost measurement
......
DEFAULT_COMPONENTS = ctf
BLAS =
LIBS =
CXX = CC
WARN_FLAGS = #-Drestrict = -Wall
OPT_FLAGS = -g -O3
CXXFLAGS = -openmp $(OPT_FLAGS) $(WARN_FLAGS)
DEFS = -DEDISON -D__STDC_LIMIT_MACROS
LDFLAGS =
INCLUDES =
AR = ar -crs
DEPFLAGS = -MT $@ -MD -MP -MF $(DEPDIR)/$(notdir $*).Po
#defining production removes memory tracking, defining CTF_COMPLEX instantiates
#CTF for the complex<double> type
DEFS := $(DEFS) -DPRODUCTION -DCTF_COMPLEX
#uncomment below to enable performance profiling
#DEFS := $(DEFS) -DPROFILE -DPMPI
#uncomment below to enable CTF debugging and status output
#DEFS := $(DEFS) -DVERBOSE=1 -DDEBUG=1
#SCALAPACK only necessary for pgemm tests and benchmarks
#LIBS := $(LIBS) -L$(HOME)/work/scalapack-2.0.2/lib -lscalapack -lgfortran
#DEFS := $(DEFS) -DUSE_SCALAPACK
......@@ -4,7 +4,7 @@ BLAS = -llapack -lblas
LIBS = $(BLAS)
CXX = mpicxx
WARN_FLAGS = #-Drestrict = -Wall
OPT_FLAGS = -g -O3
OPT_FLAGS = -g -O2
CXXFLAGS = -fopenmp $(OPT_FLAGS) $(WARN_FLAGS)
DEFS = -D__STDC_LIMIT_MACROS
LDFLAGS =
......
......@@ -10,8 +10,7 @@ include make/make.in
include make/rules.mk
test_suite_SUBDIRS = test
test_SUBDIRS = test ctr_comm unit_test
test_model_SUBDIRS = unit_test
test_SUBDIRS = test ctr_comm
pgemm_test_SUBDIRS = test
nonsq_pgemm_test_SUBDIRS = test
......@@ -26,18 +25,18 @@ ${libdir}/libctf.a: interface/ctf_world.o \
interface/ctf_tensor.o \
interface/ctf_matrix.o \
interface/ctf_scalar.o \
interface/ctf_schedule.o \
interface/ctf_vector.o \
interface/ctf_term.o \
interface/ctf_idx_tensor.o \
interface/ctf_sparse_tensor.o \
interface/ctf_flop_counter.o \
shared/comm.o \
shared/util.o \
shared/timer.o \
shared/memcontrol.o \
shared/unit_util.o \
dist_tensor/cyclopstf.o \
unit_test/unit_test.o
dist_tensor/dist_tensor_internal.o \
dist_tensor/distribution.o \
dist_tensor/cyclopstf.o
#INCLUDES += -I${top_dir}/src/ctr_comm -I${top_dir}/src/ctr_seq -I${top_dir}/src/dist_tensor -I${top_dir}/src/util -I${top_dir}/src/interface
......
......@@ -2,11 +2,10 @@ include ../../config.mk
include ../make/make.in
include ../make/rules.mk
nonsq_pgemm_bench: ${bindir}/bench/nonsq_pgemm_bench
${bindir}/bench/nonsq_pgemm_bench: nonsq_pgemm_bench.o \
nonsq_pgemm_bench: ${bindir}/nonsq_pgemm_bench
${bindir}/nonsq_pgemm_bench: nonsq_pgemm_bench.o \
${libdir}/libctf.a
unit_bench.o: FORCE
INCLUDES += -I${top_dir}/src/dist_tensor
LIBS := -lctf $(LIBS)
......@@ -303,7 +303,7 @@ int main(int argc, char **argv) {
myctf->def_scala_mat(desc_a, mat_A, &tid_A);
myctf->def_scala_mat(desc_b, mat_B, &tid_B);
myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C);
myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
myctf->read_scala_mat(tid_C, mat_C_CTF);
#if 0
......@@ -353,7 +353,7 @@ int main(int argc, char **argv) {
myctf->def_scala_mat(desc_a, mat_A, &tid_A);
myctf->def_scala_mat(desc_b, mat_B, &tid_B);
myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C);
myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
myctf->read_scala_mat(tid_C, mat_C_CTF);
......@@ -406,7 +406,7 @@ int main(int argc, char **argv) {
mat_A, 1, 1, desc_a,
mat_B, 1, 1, desc_b, BETA,
mat_C, 1, 1, desc_c); */
myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
if (iter == 0)
ans_verify = mat_C[2];
}
......
......@@ -23,12 +23,15 @@ ctr_2d_general<dtype>::ctr_2d_general(ctr<dtype> * other) : ctr<dtype>(other) {
ctr_lda_A = o->ctr_lda_A;
ctr_sub_lda_A = o->ctr_sub_lda_A;
cdt_A = o->cdt_A;
move_A = o->move_A;
ctr_lda_B = o->ctr_lda_B;
ctr_sub_lda_B = o->ctr_sub_lda_B;
cdt_B = o->cdt_B;
move_B = o->move_B;
ctr_lda_C = o->ctr_lda_C;
ctr_sub_lda_C = o->ctr_sub_lda_C;
cdt_C = o->cdt_C;
move_C = o->move_C;
}
/**
......@@ -37,15 +40,15 @@ ctr_2d_general<dtype>::ctr_2d_general(ctr<dtype> * other) : ctr<dtype>(other) {
template<typename dtype>
void ctr_2d_general<dtype>::print() {
printf("ctr_2d_general: edge_len = %d\n", edge_len);
printf("cdt_A = %p, ctr_lda_A = "PRId64", ctr_sub_lda_A = "PRId64"\n",
cdt_A, ctr_lda_A, ctr_sub_lda_A);
if (cdt_A != NULL) printf("cdt_A length = %d\n",cdt_A->np);
printf("cdt_B = %p, ctr_lda_B = "PRId64", ctr_sub_lda_B = "PRId64"\n",
cdt_B, ctr_lda_B, ctr_sub_lda_B);
if (cdt_B != NULL) printf("cdt_B length = %d\n",cdt_B->np);
printf("cdt_C = %p, ctr_lda_C = "PRId64", ctr_sub_lda_C = "PRId64"\n",
cdt_C, ctr_lda_C, ctr_sub_lda_C);
if (cdt_C != NULL) printf("cdt_C length = %d\n",cdt_C->np);
printf("move_A = %d, ctr_lda_A = "PRId64", ctr_sub_lda_A = "PRId64"\n",
move_A, ctr_lda_A, ctr_sub_lda_A);
if (move_A) printf("cdt_A length = %d\n",cdt_A.np);
printf("move_B = %d, ctr_lda_B = "PRId64", ctr_sub_lda_B = "PRId64"\n",
move_B, ctr_lda_B, ctr_sub_lda_B);
if (move_B) printf("cdt_B length = %d\n",cdt_B.np);
printf("move_C = %d, ctr_lda_C = "PRId64", ctr_sub_lda_C = "PRId64"\n",
move_C, ctr_lda_C, ctr_sub_lda_C);
if (move_C) printf("cdt_C length = %d\n",cdt_C.np);
rec_ctr->print();
}
......@@ -70,22 +73,22 @@ uint64_t ctr_2d_general<dtype>::comm_fp(int nlyr) {
long_int s_A, s_B, s_C;
db = long_int_max;
s_A = 0, s_B = 0, s_C = 0;
if (cdt_A != NULL){
np_A = cdt_A->np;
if (move_A){
np_A = cdt_A.np;
b_A = edge_len/np_A;
s_A = ctr_lda_A*ctr_sub_lda_A*(long_int)log(cdt_A->np);
s_A = ctr_lda_A*ctr_sub_lda_A*(long_int)log(cdt_A.np);
db = MIN(b_A, db);
}
if (cdt_B != NULL){
np_B = cdt_B->np;
if (move_B){
np_B = cdt_B.np;
b_B = edge_len/np_B;
s_B = ctr_lda_B*ctr_sub_lda_B*(long_int)log(cdt_B->np);
s_B = ctr_lda_B*ctr_sub_lda_B*(long_int)log(cdt_B.np);
db = MIN(b_B, db);
}
if (cdt_C != NULL){
np_C = cdt_C->np;
if (move_C){
np_C = cdt_C.np;
b_C = edge_len/np_C;
s_C = ctr_lda_C*ctr_sub_lda_C*(long_int)log(cdt_C->np);
s_C = ctr_lda_C*ctr_sub_lda_C*(long_int)log(cdt_C.np);
db = MIN(b_C, db);
}
return ((s_A+s_B+s_C)*(uint64_t)db*sizeof(dtype)*edge_len/db)/MIN(nlyr,edge_len);
......@@ -98,12 +101,12 @@ template<typename dtype>
uint64_t ctr_2d_general<dtype>::comm_rec(int nlyr) {
long_int db;
db = long_int_max;
if (cdt_A != NULL)
db = MIN(db,edge_len/cdt_A->np);
if (cdt_B != NULL)
db = MIN(db,edge_len/cdt_B->np);
if (cdt_C != NULL)
db = MIN(db,edge_len/cdt_C->np);
if (move_A)
db = MIN(db,edge_len/cdt_A.np);
if (move_B)
db = MIN(db,edge_len/cdt_B.np);
if (move_C)
db = MIN(db,edge_len/cdt_C.np);
return (edge_len/db)*rec_ctr->comm_rec(1) + comm_fp(nlyr);
}
......@@ -127,15 +130,15 @@ long_int ctr_2d_general<dtype>::mem_fp() {
if (ctr_sub_lda_C != 0)
s_C = ctr_sub_lda_C*ctr_lda_C;
aux_size = 0;
if (cdt_A != NULL){
np_A = cdt_A->np;
if (move_A){
np_A = cdt_A.np;
LIBT_ASSERT(np_A!=0);
b_A = edge_len/np_A;
s_A = ctr_lda_A*ctr_sub_lda_A;
db = MIN(b_A, db);
}
if (cdt_B != NULL){
np_B = cdt_B->np;
if (move_B){
np_B = cdt_B.np;
LIBT_ASSERT(np_B!=0);
b_B = edge_len/np_B;
s_B = ctr_lda_B*ctr_sub_lda_B;
......@@ -144,8 +147,8 @@ long_int ctr_2d_general<dtype>::mem_fp() {
}
db = MIN(b_B, db);
}
if (cdt_C != NULL){
np_C = cdt_C->np;
if (move_C){
np_C = cdt_C.np;
LIBT_ASSERT(np_C!=0);
b_C = edge_len/np_C;
s_C = ctr_lda_C*ctr_sub_lda_C;
......@@ -184,10 +187,8 @@ void ctr_2d_general<dtype>::run() {
TAU_FSTART(ctr_2d_general);
/* Must move at least one tensor */
LIBT_ASSERT(!(cdt_A == NULL && cdt_B == NULL && cdt_C == NULL));
/* Must move at most two tensors */
LIBT_ASSERT(!(cdt_A != NULL && cdt_B != NULL && cdt_C != NULL));
LIBT_ASSERT(!(move_A && move_B && move_C));
rec_ctr->beta = this->beta;
rec_ctr->num_lyr = 1;
......@@ -213,25 +214,25 @@ void ctr_2d_general<dtype>::run() {
s_B = ctr_sub_lda_B*ctr_lda_B;
if (ctr_sub_lda_C != 0)
s_C = ctr_sub_lda_C*ctr_lda_C;
if (cdt_A != NULL){
rank_A = cdt_A->rank;
np_A = cdt_A->np;
if (move_A){
rank_A = cdt_A.rank;
np_A = cdt_A.np;
b_A = edge_len/np_A;
s_A = ctr_lda_A*ctr_sub_lda_A;
db = MIN(b_A, db);
LIBT_ASSERT(edge_len%np_A == 0);
}
if (cdt_B != NULL){
rank_B = cdt_B->rank;
np_B = cdt_B->np;
if (move_B){
rank_B = cdt_B.rank;
np_B = cdt_B.np;
b_B = edge_len/np_B;
s_B = ctr_lda_B*ctr_sub_lda_B;
db = MIN(b_B, db);
LIBT_ASSERT(edge_len%np_B == 0);
}
if (cdt_C != NULL){
rank_C = cdt_C->rank;
np_C = cdt_C->np;
if (move_C){
rank_C = cdt_C.rank;
np_C = cdt_C.np;
b_C = edge_len/np_C;
s_C = ctr_lda_C*ctr_sub_lda_C;
db = MIN(b_C, db);
......@@ -246,7 +247,7 @@ void ctr_2d_general<dtype>::run() {
for (ib=this->idx_lyr*db; ib<edge_len; ib+=db*this->num_lyr){
if (cdt_A != NULL){
if (move_A){
owner_A = ib / b_A;
c_A = MIN(((owner_A+1)*b_A-ib), db);
if (rank_A == owner_A){
......@@ -288,7 +289,7 @@ void ctr_2d_general<dtype>::run() {
}
}
}
if (cdt_B != NULL){
if (move_B){
owner_B = ib / b_B;
c_B = MIN(((owner_B+1)*b_B-ib), db);
if (rank_B == owner_B){
......@@ -330,7 +331,7 @@ void ctr_2d_general<dtype>::run() {
}
}
}
if (cdt_C != NULL){
if (move_C){
op_C = buf_C;
rec_ctr->beta = get_zero<dtype>();
} else {
......@@ -353,7 +354,7 @@ void ctr_2d_general<dtype>::run() {
rec_ctr->run();
if (cdt_C != NULL){
if (move_C){
/* FIXME: Wont work for single precsion */
ALLREDUCE(MPI_IN_PLACE, op_C, db*s_C*(sizeof(dtype)/sizeof(double)), COMM_DOUBLE_T, COMM_OP_SUM, cdt_C);
owner_C = ib / b_C;
......
......@@ -36,30 +36,6 @@ class ctr {
ctr(){ buffer = NULL; }
};
template<typename dtype>
class ctr_1d_sqr_bcast : public ctr<dtype> {
public:
/* Class to be called on sub-blocks */
ctr<dtype> * rec_ctr;
int k;
int ctr_lda; /* local lda_A of contraction dimension 'k' */
int ctr_sub_lda; /* elements per local lda_A
of contraction dimension 'k' */
int sz;
CommData_t * cdt;
int cdt_dir;
void run();
void print() {};
long_int mem_fp();
long_int mem_rec();
ctr<dtype> * clone();
ctr_1d_sqr_bcast(ctr<dtype> * other);
~ctr_1d_sqr_bcast();
ctr_1d_sqr_bcast(){}
};
template<typename dtype>
class ctr_replicate : public ctr<dtype> {
public:
......@@ -70,9 +46,9 @@ class ctr_replicate : public ctr<dtype> {
long_int size_B; /* size of B blocks */
long_int size_C; /* size of C blocks */
CommData_t ** cdt_A;
CommData_t ** cdt_B;
CommData_t ** cdt_C;
CommData_t * cdt_A;
CommData_t * cdt_B;
CommData_t * cdt_C;
/* Class to be called on sub-blocks */
ctr<dtype> * rec_ctr;
......@@ -103,9 +79,14 @@ class ctr_2d_general : public ctr<dtype> {
long_int ctr_lda_C; /* local lda_C of contraction dimension 'k' */
long_int ctr_sub_lda_C; /* elements per local lda_C
of contraction dimension 'k' */
CommData_t * cdt_A;
CommData_t * cdt_B;
CommData_t * cdt_C;
bool move_A;
bool move_B;
bool move_C;
CommData_t cdt_A;
CommData_t cdt_B;
CommData_t cdt_C;
/* Class to be called on sub-blocks */
ctr<dtype> * rec_ctr;
......@@ -148,28 +129,6 @@ class ctr_2d_rect_bcast : public ctr<dtype> {
ctr_2d_rect_bcast(){}
};
template<typename dtype>
class ctr_2d_sqr_bcast : public ctr<dtype> {
public:
/* Class to be called on sub-blocks */
ctr<dtype> * rec_ctr;
int k;
long_int sz_A; /* number of elements in a block of A */
long_int sz_B; /* number of elements in a block of A */
CommData_t * cdt_x;
CommData_t * cdt_y;
void run();
long_int mem_fp();
long_int mem_rec();
ctr<dtype> * clone();
ctr_2d_sqr_bcast(ctr<dtype> * other);
~ctr_2d_sqr_bcast();
ctr_2d_sqr_bcast(){}
};
/* Assume LDA equal to dim */
template<typename dtype>
class ctr_dgemm : public ctr<dtype> {
......@@ -200,7 +159,7 @@ class ctr_lyr : public ctr<dtype> {
/* Class to be called on sub-blocks */
ctr<dtype> * rec_ctr;
int k;
CommData_t * cdt;
CommData_t cdt;
long_int sz_C;
void print() {};
......
......@@ -170,9 +170,9 @@ void ctr_lyr<dtype>::run(){
rec_ctr->A = this->A;
rec_ctr->B = this->B;
rec_ctr->C = this->C;
rec_ctr->beta = cdt->rank > 0 ? 0.0 : this->beta;
rec_ctr->num_lyr = cdt->np;
rec_ctr->idx_lyr = cdt->rank;
rec_ctr->beta = cdt.rank > 0 ? 0.0 : this->beta;
rec_ctr->num_lyr = cdt.np;
rec_ctr->idx_lyr = cdt.rank;
rec_ctr->run();
......@@ -231,17 +231,17 @@ void ctr_replicate<dtype>::print() {
printf("cdt_A = %p, size_A = "PRId64", ncdt_A = %d\n",
cdt_A, size_A, ncdt_A);
for (i=0; i<ncdt_A; i++){
printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
printf("cdt_A[%d] length = %d\n",i,cdt_A[i].np);
}
printf("cdt_B = %p, size_B = "PRId64", ncdt_B = %d\n",
cdt_B, size_B, ncdt_B);
for (i=0; i<ncdt_B; i++){
printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
printf("cdt_B[%d] length = %d\n",i,cdt_B[i].np);
}
printf("cdt_C = %p, size_C = "PRId64", ncdt_C = %d\n",
cdt_C, size_C, ncdt_C);
for (i=0; i<ncdt_C; i++){
printf("cdt_C[%d] length = %d\n",i,cdt_C[i]->np);
printf("cdt_C[%d] length = %d\n",i,cdt_C[i].np);
}
rec_ctr->print();
}
......@@ -256,16 +256,16 @@ uint64_t ctr_replicate<dtype>::comm_fp(int nlyr){
long_int tot_sz;
tot_sz = 0;
for (i=0; i<ncdt_A; i++){
LIBT_ASSERT(cdt_A[i]->np > 0);
tot_sz += size_A*log(cdt_A[i]->np);
LIBT_ASSERT(cdt_A[i].np > 0);
tot_sz += size_A*log(cdt_A[i].np);
}
for (i=0; i<ncdt_B; i++){
LIBT_ASSERT(cdt_B[i]->np > 0);
tot_sz += size_B*log(cdt_B[i]->np);
LIBT_ASSERT(cdt_B[i].np > 0);
tot_sz += size_B*log(cdt_B[i].np);
}
for (i=0; i<ncdt_C; i++){
LIBT_ASSERT(cdt_C[i]->np > 0);
tot_sz += size_C*log(cdt_C[i]->np);
LIBT_ASSERT(cdt_C[i].np > 0);
tot_sz += size_C*log(cdt_C[i].np);
}
return ((uint64_t)tot_sz)*sizeof(dtype);
}
......@@ -308,15 +308,15 @@ void ctr_replicate<dtype>::run(){
arank = 0, brank = 0, crank = 0;
for (i=0; i<ncdt_A; i++){
arank += cdt_A[i]->rank;
arank += cdt_A[i].rank;
POST_BCAST(this->A, size_A*sizeof(dtype), COMM_CHAR_T, 0, cdt_A[i], 0);
}
for (i=0; i<ncdt_B; i++){
brank += cdt_B[i]->rank;
brank += cdt_B[i].rank;
POST_BCAST(this->B, size_B*sizeof(dtype), COMM_CHAR_T, 0, cdt_B[i], 0);
}
for (i=0; i<ncdt_C; i++){
crank += cdt_C[i]->rank;
crank += cdt_C[i].rank;
}
if (crank != 0) std::fill(this->C, this->C+size_C, get_zero<dtype>());
else {
......
......@@ -200,7 +200,7 @@ void tsum_replicate<dtype>::run(){
}*/
brank = 0;
for (i=0; i<ncdt_B; i++){
brank += cdt_B[i]->rank;
brank += cdt_B[i].rank;
}
if (brank != 0) std::fill(this->B, this->B+size_B, 0.0);
......
......@@ -55,8 +55,8 @@ class tsum_replicate : public tsum<dtype> {
int ncdt_A; /* number of processor dimensions to replicate A along */
int ncdt_B; /* number of processor dimensions to replicate B along */
CommData_t ** cdt_A;
CommData_t ** cdt_B;
CommData_t * cdt_A;
CommData_t * cdt_B;
/* Class to be called on sub-blocks */
tsum<dtype> * rec_tsum;
......
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
#include "dist_tensor.h"
#include "dist_tensor_internal.h"
#include "../shared/util.h"
#include "../shared/unit_util.h"
#include "unit_bench.h"
#include "bench_sym_contract.hxx"
/**
 * \brief benchmarks model symmetric contractions
 *
 * Defines three tensors A, B, C with identical dimension, edge lengths and
 * symmetry, fills them via sym_readwrite() and runs bench_sym_contract() on
 * two contraction descriptors.
 *
 * \param[in] argc number of command-line arguments, must be 3 or 4
 * \param[in] argv argv[1] is the tensor dimension (ndim), argv[2] the edge
 *                 length (n), optional argv[3] the inner blocking size
 *                 (DEF_INNER_SIZE when omitted)
 */
void bench_model(int argc, char ** argv){
  int seed, i, tid_A, tid_B, tid_C, stat;
  int nctr, myRank, numPes, iter, ndim, n, inner_sz;
  int * edge_len, * sym;

  CommData_t *cdt_glb = (CommData_t*)malloc(sizeof(CommData_t));
  RINIT_COMM(numPes, myRank, 4, 4, cdt_glb);

  assert(argc == 3 || argc == 4);

  seed = 100;
  nctr = 2;
  iter = 3;
  ndim = atoi(argv[1]);
  n = atoi(argv[2]);
  if (argc > 3)
    inner_sz = atoi(argv[3]);
  else
    inner_sz = DEF_INNER_SIZE;

  if (myRank == 0) {
    printf("Executing model contraction of tensor with dimension %d and edges of length %d\n",ndim,n);
    printf("Using inner blocking size of %d\n",inner_sz);
  }

  edge_len = (int*)malloc(sizeof(int)*ndim);
  sym = (int*)malloc(sizeof(int)*ndim);

  CTF_ctr_type_t * ctypes = (CTF_ctr_type_t*)malloc(sizeof(CTF_ctr_type_t)*nctr);
  ctypes[0].idx_map_A = (int*)malloc(ndim*sizeof(int));
  ctypes[0].idx_map_B = (int*)malloc(ndim*sizeof(int));
  ctypes[0].idx_map_C = (int*)malloc(ndim*sizeof(int));
  ctypes[1].idx_map_A = (int*)malloc(ndim*sizeof(int));
  ctypes[1].idx_map_B = (int*)malloc(ndim*sizeof(int));
  ctypes[1].idx_map_C = (int*)malloc(ndim*sizeof(int));

  std::fill(edge_len, edge_len+ndim, n);

  for (i=0; i<ndim; i++){
    /* the last index of each half is NS, every other index is SY */
    if (i == ndim/2 - 1 || i == ndim-1) {
      sym[i] = NS;
    } else {
      sym[i] = SY;
    }
    ctypes[0].idx_map_A[i] = i;
    if (i>=ndim/2)
      ctypes[0].idx_map_B[i] = i + ndim/2;
    else
      ctypes[0].idx_map_B[i] = i;
    ctypes[0].idx_map_C[i] = i + ndim/2;

    ctypes[1].idx_map_B[i] = i;
    if (i>=ndim/2) {
      ctypes[1].idx_map_A[i] = i + ndim/2;
    } else {
      ctypes[1].idx_map_A[i] = ndim/2-i-1;
    }
    ctypes[1].idx_map_C[i] = i + ndim/2;
  }

  stat = CTF_init(MPI_COMM_WORLD, MACHINE_BGQ, myRank, numPes, inner_sz);
  assert(stat == DIST_TENSOR_SUCCESS);

  /* check every definition, not only the initialization, so that a failed
   * tensor definition is not silently benchmarked */
  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_A);
  assert(stat == DIST_TENSOR_SUCCESS);
  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_B);
  assert(stat == DIST_TENSOR_SUCCESS);
  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_C);
  assert(stat == DIST_TENSOR_SUCCESS);

  ctypes[0].tid_A = tid_A;
  ctypes[0].tid_B = tid_B;
  ctypes[0].tid_C = tid_C;
  ctypes[1].tid_A = tid_A;
  ctypes[1].tid_B = tid_B;
  ctypes[1].tid_C = tid_C;

  sym_readwrite(seed, tid_A, myRank, numPes);
  sym_readwrite(seed, tid_B, myRank, numPes);
  sym_readwrite(seed, tid_C, myRank, numPes);

  GLOBAL_BARRIER(cdt_glb);
#ifdef TAU
  TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER);
  TAU_PROFILE_START(timer);
  TAU_PROFILE_INIT(argc, argv);
  TAU_PROFILE_SET_NODE(myRank);
  TAU_PROFILE_SET_CONTEXT(0);
#endif
  GLOBAL_BARRIER(cdt_glb);

  bench_sym_contract(ctypes, myRank, numPes, iter, nctr);

  GLOBAL_BARRIER(cdt_glb);
  CTF_exit();

  for (i=0; i<nctr; i++){
    free(ctypes[i].idx_map_A);
    free(ctypes[i].idx_map_B);
    free(ctypes[i].idx_map_C);
  }
  free(ctypes);
  /* released only after CTF_exit(), in case the library kept the pointers */
  free(edge_len);
  free(sym);

  TAU_PROFILE_STOP(timer);
  if (myRank==0) printf("Model symmetry benchmark completed\n");
  GLOBAL_BARRIER(cdt_glb);
  FREE_CDT(cdt_glb);
  free(cdt_glb);
  COMM_EXIT;
  return;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment