Merge branch 'scheduler' into slice_opt

3f8f9b0c · Edgar Solomonik · e168c213 · f02f5eb8 · 3f8f9b0c · 3f8f9b0c
Commit 3f8f9b0c authored 11 years ago by Edgar Solomonik
20 changed files
--- a/.cproject
+++ b/.cproject
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346" moduleId="org.eclipse.cdt.core.settings" name="Build (GNU)">
+				<externalSettings/>
+				<extensions>
+					<extension id="org.eclipse.cdt.core.Cygwin_PE" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration artifactName="ctf" buildProperties="" description="" id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346" name="Build (GNU)" parent="org.eclipse.cdt.build.core.emptycfg">
+					<folderInfo id="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188" name="/" resourcePath="">
+						<toolChain id="cdt.managedbuild.toolchain.gnu.cygwin.base.114500116" name="Cygwin GCC" superClass="cdt.managedbuild.toolchain.gnu.cygwin.base">
+							<targetPlatform archList="all" binaryParser="org.eclipse.cdt.core.Cygwin_PE" id="cdt.managedbuild.target.gnu.platform.cygwin.base.1974190108" name="Debug Platform" osList="win32" superClass="cdt.managedbuild.target.gnu.platform.cygwin.base"/>
+							<builder buildPath="${workspace_loc:/ctf}/Build (GNU)" id="cdt.managedbuild.target.gnu.builder.cygwin.base.1867890232" name="Gnu Make Builder.Build (GNU)" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.1018853843" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
+								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1866661508" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.1555343600" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1674540487" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
+								<option id="gnu.cpp.compiler.option.include.paths.1449537127" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;C:\cygwin64\usr\include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;C:\cygwin64\lib\gcc\x86_64-pc-cygwin\4.8.2\include\c++&quot;"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.2056994875" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1942961242" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
+								<option id="gnu.c.compiler.option.include.paths.1543177086" superClass="gnu.c.compiler.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;C:\cygwin64\usr\include&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;C:\cygwin64\lib\gcc\x86_64-pc-cygwin\4.8.2\include\c++&quot;"/>
+								</option>
+								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1102012624" superClass="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin"/>
+							</tool>
+							<tool id="cdt.managedbuild.tool.gnu.c.linker.cygwin.base.920934820" name="Cygwin C Linker" superClass="cdt.managedbuild.tool.gnu.c.linker.cygwin.base"/>
+							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base.2034171734" name="Cygwin C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.cygwin.base">
+								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.1371501430" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
+									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
+									<additionalInput kind="additionalinput" paths="$(LIBS)"/>
+								</inputType>
+							</tool>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="ctf.null.2097328945" name="ctf"/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Build (GNU)">
+			<resource resourceType="PROJECT" workspacePath="/ctf"/>
+		</configuration>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1674540487;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.2056994875">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1942961242;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1102012624">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;org.eclipse.linuxtools.cdt.autotools.core.toolchain.tool.gcc.686440635;cdt.managedbuild.tool.gnu.c.compiler.input.1917476232">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1552707334;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.1663905346">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.283180442;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1114020496">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.360486812;cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.890507584">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1266459141;cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.605308406">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+		<scannerConfigBuildInfo instanceId="org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346;org.eclipse.linuxtools.cdt.autotools.core.toolChain.453801346.68812188;org.eclipse.linuxtools.cdt.autotools.core.toolchain.tool.gpp.2043906437;cdt.managedbuild.tool.gnu.cpp.compiler.input.1884109157">
+			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+		</scannerConfigBuildInfo>
+	</storageModule>
+</cproject>
--- a/.gitignore
+++ b/.gitignore
+*.o
+.*
 # Build directories
 *.o
 .deps

--- a/.project
+++ b/.project
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ctf</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
--- a/Makefile
+++ b/Makefile
@@ -24,26 +24,31 @@ all $(MAKECMDGOALS):
      echo 'Machine recognized as a MAC'; \
      cp mkfiles/config.mk.linux config.mk; \
    else \
-      if [ $(shell hostname | grep 'edison\|hopper' ) ] ;  then \
-        echo 'Hostname recognized as Edison or Hopper, using pre-made config.mk file'; \
-        cp mkfiles/config.mk.hopper config.mk;   \
+      if [ $(shell hostname | grep 'edison' ) ] ;  then \
+        echo 'Hostname recognized as Edison, using pre-made config.mk file'; \
+        cp mkfiles/config.mk.edison config.mk;   \
      else \
-	if [ $(shell hostname | grep 'cvrsvc' ) ] ;  then \
-	  echo 'Hostname recognized as Carver, using pre-made config.mk file'; \
-	  cp mkfiles/config.mk.carver config.mk;   \
+        if [ $(shell hostname | grep 'hopper' ) ] ;  then \
+          echo 'Hostname recognized as Hopper, using pre-made config.mk file'; \
+          cp mkfiles/config.mk.hopper config.mk;   \
 	else \
-	  if [ $(shell hostname | grep 'surveyor\|intrepid\|challenger\|udawn' ) ] ;  then \
-	    echo 'Hostname recognized as a BG/P machine, using pre-made config.mk file'; \
-	      cp mkfiles/config.mk.bgp config.mk;   \
+	  if [ $(shell hostname | grep 'cvrsvc' ) ] ;  then \
+	    echo 'Hostname recognized as Carver, using pre-made config.mk file'; \
+	    cp mkfiles/config.mk.carver config.mk;   \
 	  else \
-	    if [ $(shell hostname | grep 'ls[0-9]*.tacc.utexas.edu' ) ] ;  then \
-	      cp mkfiles/config.mk.lonestar config.mk;   \
+	    if [ $(shell hostname | grep 'surveyor\|intrepid\|challenger\|udawn' ) ] ;  then \
+	      echo 'Hostname recognized as a BG/P machine, using pre-made config.mk file'; \
+	      cp mkfiles/config.mk.bgp config.mk;   \
 	    else \
-	      if [ $(shell hostname | grep 'vesta\|mira\|cetus\|seq' ) ] ;  then \
-		cp mkfiles/config.mk.bgq config.mk;   \
+	      if [ $(shell hostname | grep 'ls[0-9]*.tacc.utexas.edu' ) ] ;  then \
+		cp mkfiles/config.mk.lonestar config.mk;   \
 	      else \
-		echo 'Hostname not recognized: assuming linux, specialize config.mk if necessary'; \
-		cp mkfiles/config.mk.linux config.mk;   \
+		if [ $(shell hostname | grep 'vesta\|mira\|cetus\|seq' ) ] ;  then \
+		  cp mkfiles/config.mk.bgq config.mk;   \
+		else \
+		  echo 'Hostname not recognized: assuming linux, specialize config.mk if necessary'; \
+		  cp mkfiles/config.mk.linux config.mk;   \
+		fi; \
 	      fi; \
 	    fi; \
 	  fi; \

--- a/examples/Makefile
+++ b/examples/Makefile
@@ -5,7 +5,7 @@ include ../src/make/rules.mk

 examples: dft dft_3D gemm gemm_4D scalar trace diag_sym fast_diagram \
          fast_3mm sym3 fast_sym fast_sym_4D ccsdt_t3_to_t2 weight_4D \
-          test_suite strassen slice_gemm ccsd readwrite_test \
+          test_suite strassen slice_gemm ccsd readwrite_test subworld_gemm \
          permute_multiworld sparse_permuted_slice

 sparse_permuted_slice: ${bindir}/sparse_permuted_slice
@@ -56,6 +56,9 @@ ${bindir}/gemm_4D: gemm_4D.o ${libdir}/libctf.a
 gemm: ${bindir}/gemm
 ${bindir}/gemm: gemm.o ${libdir}/libctf.a

+subworld_gemm: ${bindir}/subworld_gemm
+${bindir}/subworld_gemm: subworld_gemm.o ${libdir}/libctf.a
+
 weight_4D: ${bindir}/weight_4D
 ${bindir}/weight_4D: weight_4D.o ${libdir}/libctf.a


--- a/examples/ccsd.cxx
+++ b/examples/ccsd.cxx
@@ -181,7 +181,13 @@ class Amplitudes {
 };

 void ccsd(Integrals   &V,
-          Amplitudes  &T){
+          Amplitudes  &T,
+          int sched_nparts = 0){
+  int rank;   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  double timer = MPI_Wtime();
+  tCTF_Schedule<double> sched(V.dw);
+  sched.set_max_partitions(sched_nparts);
+  sched.record();

  CTF_Tensor T21 = CTF_Tensor(T.abij);
  T21["abij"] += .5*T["ai"]*T["bj"];
@@ -251,6 +257,14 @@ void ccsd(Integrals   &V,
  Zabij += .5*V["abef"]*T21["efij"];
  Zabij += .5*Wmnij*T21["abmn"];
  
+  if (rank == 0) {
+    printf("Record: %lf\n",
+            MPI_Wtime()-timer);
+  }
+
+  timer = MPI_Wtime();
+  tCTF_ScheduleTimer schedule_time = sched.execute();
+
  CTF_fctr fctr;
  fctr.func_ptr = &divide;

@@ -265,8 +279,21 @@ void ccsd(Integrals   &V,
  Dabij["abij"] -= V["a"];
  Dabij["abij"] -= V["b"];

+
+
  T.ai.contract(1.0, *(Zai.parent), "ai", Dai, "ai", 0.0, "ai", fctr);
  T.abij.contract(1.0, *(Zabij.parent), "abij", Dabij, "abij", 0.0, "abij", fctr);
+
+  if (rank == 0) {
+    printf("Schedule comm down: %lf\n", schedule_time.comm_down_time);
+    printf("Schedule execute: %lf\n", schedule_time.exec_time);
+    printf("Schedule imbalance, wall: %lf\n", schedule_time.imbalance_wall_time);
+    printf("Schedule imbalance, accum: %lf\n", schedule_time.imbalance_acuum_time);
+    printf("Schedule comm up: %lf\n", schedule_time.comm_up_time);
+    printf("Schedule total: %lf\n", schedule_time.total_time);
+    printf("All execute: %lf\n",
+            MPI_Wtime()-timer);
+  }
 } 

 #ifndef TEST_SUITE
@@ -283,7 +310,7 @@ char* getCmdOption(char ** begin,


 int main(int argc, char ** argv){
-  int rank, np, niter, no, nv, i;
+  int rank, np, niter, no, nv, sched_nparts, i;
  int const in_num = argc;
  char ** input_str = argv;

@@ -303,6 +330,10 @@ int main(int argc, char ** argv){
    niter = atoi(getCmdOption(input_str, input_str+in_num, "-niter"));
    if (niter < 0) niter = 1;
  } else niter = 1;
+  if (getCmdOption(input_str, input_str+in_num, "-nparts")){
+    sched_nparts = atoi(getCmdOption(input_str, input_str+in_num, "-nparts"));
+    if (sched_nparts < 0) sched_nparts = 0;
+  } else sched_nparts = 0;

  {
    CTF_World dw(argc, argv);
@@ -313,10 +344,10 @@ int main(int argc, char ** argv){
      for (i=0; i<niter; i++){
        T.fill_rand();
        double d = MPI_Wtime();
-        ccsd(V,T);
+        ccsd(V,T,sched_nparts);
        if (rank == 0)
-          printf("Completed %dth CCSD iteration in time = %lf, |T| is %lf\n",
-                  i, MPI_Wtime()-d, T.ai.norm2()+T.abij.norm2());
+          printf("(%d nodes) Completed %dth CCSD iteration in time = %lf, |T| is %lf\n",
+              np, i, MPI_Wtime()-d, T.ai.norm2()+T.abij.norm2());
        else {
          T.ai.norm2();
          T.abij.norm2();

--- a/examples/strassen.cxx
+++ b/examples/strassen.cxx
@@ -262,7 +262,7 @@ char* getCmdOption(char ** begin,
 }

 int main(int argc, char ** argv){
-  int rank, np, niter, n, m, k, pass;
+  int rank, np, n,  pass;
  int const in_num = argc;
  char ** input_str = argv;


--- a/examples/subworld_gemm.cxx
+++ b/examples/subworld_gemm.cxx
+/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
+/** \addtogroup examples 
+  * @{ 
+  * \defgroup subworld_gemm
+  * @{ 
+  * \brief Performs matrix multiplication redundantly on subworlds (sub-communicators of the parent world) and accumulates the results back to the parent world
+  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <math.h>
+#include <assert.h>
+#include <stdint.h>
+#include <algorithm>
+#include <ctf.hpp>
+
+
+/**
+ * \brief multiplies A*B on subworlds of dw and checks the accumulated result
+ *
+ * Creates A (m,k), B (k,n) and C (m,n) on dw, fills A and B with pseudo-random
+ * values, splits dw.comm into sub-communicators, adds A and B into matrices
+ * defined on each subworld, multiplies there, and accumulates the subworld
+ * products back into C on dw. C is then compared against div*A*B computed
+ * directly on dw.
+ *
+ * \param[in] n second dimension of B and C
+ * \param[in] m first dimension of A and C
+ * \param[in] k second dimension of A and first dimension of B
+ * \param[in] div_ requested number of subworlds, clamped to [1, size of dw.comm]
+ * \param[in] dw world on which the full matrices live
+ * \return nonzero if the error norm is below 1.E-9, zero otherwise
+ */
+int test_subworld_gemm(int n,
+                       int m,
+                       int k,
+                       int div_,
+                       CTF_World &dw){
+  int rank, num_pes;
+  int64_t i, np;
+  double * pairs, err;
+  int64_t * indices;
+  
+  
+  CTF_Matrix C(m, n, NS, dw);
+  CTF_Matrix C_ans(m, n, NS, dw);
+  CTF_Matrix A(m, k, NS, dw);
+  CTF_Matrix B(k, n, NS, dw);
+  
+  MPI_Comm pcomm = dw.comm;
+  MPI_Comm_rank(pcomm, &rank);
+  MPI_Comm_size(pcomm, &num_pes);
+  
+  int div = div_;
+  // a non-positive div_ would divide by zero when computing cnum_pes below
+  if (div < 1) div = 1;
+  if (div > num_pes) div = num_pes;
+
+  
+  // seed per rank so that each process writes different local values
+  srand48(13*rank);
+  A.read_local(&np, &indices, &pairs);
+  for (i=0; i<np; i++ ) pairs[i] = drand48()-.5; 
+  A.write(np, indices, pairs);
+  free(pairs);
+  free(indices);
+  B.read_local(&np, &indices, &pairs);
+  for (i=0; i<np; i++ ) pairs[i] = drand48()-.5; 
+  B.write(np, indices, pairs);
+  free(pairs);
+  free(indices);
+
+  
+  // consecutive blocks of cnum_pes ranks share a color, i.e. a subworld
+  int cnum_pes = num_pes / div;
+  int color = rank/cnum_pes;
+  int crank = rank%cnum_pes;
+   
+  // NOTE(review): ccomm is never passed to MPI_Comm_free here -- confirm
+  // whether CTF_World takes ownership of the communicator
+  MPI_Comm ccomm; 
+  MPI_Comm_split(pcomm, color, crank, &ccomm);
+  CTF_World sworld(ccomm);
+  
+  // reference answer: the product is accumulated into C once per subworld
+  C_ans["ij"] = ((double)div)*A["ik"]*B["kj"];
+
+  CTF_Matrix subA(m, k, NS, sworld);
+  CTF_Matrix subB(k, n, NS, sworld);
+  CTF_Matrix subC(m, n, NS, sworld);
+
+  // every process makes the same sequence of calls; only the members of
+  // subworld c pass their subworld tensor, all others pass NULL
+  for (int c=0; c<num_pes/cnum_pes; c++){
+    if (c==color){
+      A.add_to_subworld(&subA,1.0,0.0);
+      B.add_to_subworld(&subB,1.0,0.0);
+    } else {
+      A.add_to_subworld(NULL,1.0,0.0);
+      B.add_to_subworld(NULL,1.0,0.0);
+    }    
+  }
+
+  // ranks left over when num_pes is not a multiple of div skip the multiply
+  if (rank < cnum_pes*div)
+    subC["ij"] = subA["ik"]*subB["kj"];
+
+  // accumulate (both scaling factors 1.0) each subworld's product into C
+  for (int c=0; c<num_pes/cnum_pes; c++){
+    if (c==color){
+      C.add_from_subworld(&subC, 1.0, 1.0);
+    } else {
+      C.add_from_subworld(NULL, 1.0, 1.0);
+    }    
+  }
+  
+
+  C_ans["ij"] -= C["ij"];
+
+  err = C_ans.norm2();
+
+  if (rank == 0){
+    if (err<1.E-9)
+      printf("{ GEMM on subworlds } passed\n");
+    else
+      printf("{ GEMM on subworlds } FAILED, error norm = %E\n",err);
+  }
+  return err<1.E-9;
+} 
+
+
+#ifndef TEST_SUITE
+/**
+ * \brief looks up the argument that follows a command-line flag
+ * \param[in] begin first element of the argument array
+ * \param[in] end one past the last element of the argument array
+ * \param[in] option flag to search for, e.g. "-n"
+ * \return the argument right after the flag, or 0 if the flag is absent or last
+ */
+char* getCmdOption(char ** begin,
+                   char ** end,
+                   const   std::string & option){
+  char ** flag = std::find(begin, end, option);
+  if (flag == end) return 0;
+  char ** value = flag + 1;
+  if (value == end) return 0;
+  return *value;
+}
+
+/**
+ * \brief reads -n, -m, -k and -div from the command line (defaults 23, 17, 31, 2)
+ *        and runs test_subworld_gemm on MPI_COMM_WORLD
+ */
+int main(int argc, char ** argv){
+  int rank, np, n, m, k, div;
+  int const in_num = argc;
+  char ** input_str = argv;
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &np);
+
+  if (getCmdOption(input_str, input_str+in_num, "-n")){
+    n = atoi(getCmdOption(input_str, input_str+in_num, "-n"));
+    if (n < 0) n = 23;
+  } else n = 23;
+  if (getCmdOption(input_str, input_str+in_num, "-m")){
+    m = atoi(getCmdOption(input_str, input_str+in_num, "-m"));
+    if (m < 0) m = 17;
+  } else m = 17;
+  if (getCmdOption(input_str, input_str+in_num, "-k")){
+    k = atoi(getCmdOption(input_str, input_str+in_num, "-k"));
+    if (k < 0) k = 31;
+  } else k = 31;
+  if (getCmdOption(input_str, input_str+in_num, "-div")){
+    div = atoi(getCmdOption(input_str, input_str+in_num, "-div"));
+    // a subworld count below 1 is not meaningful; fall back to the default
+    if (div < 1) div = 2;
+  } else div = 2;
+
+  {
+    // scoped so that dw is destroyed before MPI_Finalize is called
+    CTF_World dw(MPI_COMM_WORLD, argc, argv);
+    int pass;    
+    if (rank == 0){
+      printf("Non-symmetric: NS = NS*NS test_subworld_gemm:\n");
+    }
+    pass = test_subworld_gemm(n, m, k, div, dw);
+    assert(pass);
+  }
+
+  MPI_Finalize();
+  return 0;
+}
+#endif
+
--- a/include/ctf.hpp
+++ b/include/ctf.hpp
@@ -7,6 +7,9 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <vector>
+#include <deque>
+#include <set>
+#include <map>
 #include "../src/dist_tensor/cyclopstf.hpp"

 /**
@@ -139,14 +142,15 @@ class tCTF_Tensor {
                tCTF_World<dtype> &  world_,
                char const *         name_ = NULL,
                int                  profile_ = 0);
-
+    
    /**
-     * \brief creates a copy of the tensor, in a different world if specified
-     * \param[in] oworld pointer to another world (NULL oworld = this->world)
-     * \return new tensor object on oworld
+     * \brief creates a zeroed out copy (data not copied) of a tensor in a different world
+     * \param[in] A tensor whose characteristics to copy
+     * \param[in] world_ a world for the tensor we are creating to live in, can be different from A
     */
-    tCTF_Tensor<dtype> clone(tCTF_World<dtype> * oworld = NULL) const;
-    
+    tCTF_Tensor(tCTF_Tensor const & A,
+                tCTF_World<dtype> & world_);
+
    /**
     * \brief gives the values associated with any set of indices
     * The sparse data is defined in coordinate format. The tensor index (i,j,k,l) of a tensor with edge lengths
@@ -262,6 +266,34 @@ class tCTF_Tensor {
                  dtype                    beta,
                  char const *             idx_C,
                  tCTF_fctr<dtype>         fseq = tCTF_fctr<dtype>());
+
+    /**
+     * \brief estimate the cost of a contraction C[idx_C] = A[idx_A]*B[idx_B]
+     * \param[in] A first operand tensor
+     * \param[in] idx_A indices of A in contraction, e.g. "ik" -> A_{ik}
+     * \param[in] B second operand tensor
+     * \param[in] idx_B indices of B in contraction, e.g. "kj" -> B_{kj}
+     * \param[in] idx_C indices of C (this tensor),  e.g. "ij" -> C_{ij}
+     * \return cost as an int64_t, currently a rough estimate of flops/processor
+     */
+    int64_t estimate_cost(const tCTF_Tensor & A,
+                          char const *        idx_A,
+                          const tCTF_Tensor & B,
+                          char const *        idx_B,
+                          char const *        idx_C);
+    
+    /**
+     * \brief estimate the cost of a sum B[idx_B] = A[idx_A]
+     * \param[in] A first operand tensor
+     * \param[in] idx_A indices of A in sum, e.g. "ik" -> A_{ik}
+     * \param[in] idx_B indices of B (this tensor) in sum, e.g. "kj" -> B_{kj}
+     * \return cost as an int64_t, currently a rough estimate of flops/processor
+     */
+    int64_t estimate_cost(const tCTF_Tensor & A,
+                          char const *        idx_A,
+                          char const *        idx_B);
+
+
    
    /**
     * \brief sums B[idx_B] = beta*B[idx_B] + alpha*A[idx_A]
@@ -346,7 +378,7 @@ class tCTF_Tensor {
               dtype          alpha) const;

    /**
-     * \brief TODO: apply permutation to matrix, potentially extracting a slice
+     * \brief Apply permutation to matrix, potentially extracting a slice
     *              B[i,j,...] 
     *                = beta*B[...] + alpha*A[perms_A[0][i],perms_A[1][j],...]
     *
@@ -365,7 +397,7 @@ class tCTF_Tensor {
                 dtype          alpha);

    /**
-     * \brief TODO: apply permutation to matrix, potentially extracting a slice
+     * \brief Apply permutation to matrix, potentially extracting a slice
     *              B[perms_B[0][i],perms_B[0][j],...] 
     *                = beta*B[...] + alpha*A[i,j,...]
     *
@@ -382,6 +414,31 @@ class tCTF_Tensor {
                 dtype          beta,
                 tCTF_Tensor &  A,
                 dtype          alpha);
+    
+   /**
+     * \brief accumulates this tensor to a tensor object defined on a different world
+     * \param[in] tsr a tensor object of the same characteristic as this tensor, 
+     *             but on a different CTF_world/MPI_comm
+     * \param[in] alpha scaling factor for this tensor (default 1.0)
+     * \param[in] beta scaling factor for tensor tsr (default 1.0)
+     */
+    void add_to_subworld(tCTF_Tensor<dtype> * tsr,
+                         dtype alpha,
+                         dtype beta) const;
+    void add_to_subworld(tCTF_Tensor<dtype> * tsr) const;
+    
+   /**
+     * \brief accumulates this tensor from a tensor object defined on a different world
+     * \param[in] tsr a tensor object of the same characteristic as this tensor, 
+     *             but on a different CTF_world/MPI_comm
+     * \param[in] alpha scaling factor for tensor tsr (default 1.0)
+     * \param[in] beta scaling factor for this tensor (default 1.0)
+     */
+    void add_from_subworld(tCTF_Tensor<dtype> * tsr,
+                           dtype alpha,
+                           dtype beta) const;
+    void add_from_subworld(tCTF_Tensor<dtype> * tsr) const;
+    

    /**
     * \brief aligns data mapping with tensor A
@@ -526,6 +583,22 @@ class tCTF_Tensor {
    ~tCTF_Tensor();
 };

+/**
+ * \brief comparison function for sets of tensor pointers
+ * This ensures the set iteration order is consistent across nodes
+ *
+ * NULL pointers order before every non-NULL pointer; two non-NULL pointers
+ * are ordered by the tid of the tensors they point to.
+ */
+template<typename dtype>
+struct tensor_tid_less {
+  /**
+   * \brief strict ordering of tensor pointers by tid
+   * \param[in] A left operand (may be NULL)
+   * \param[in] B right operand (may be NULL)
+   * \return true if A orders strictly before B
+   *
+   * const-qualified so the functor can be invoked on a const comparator
+   * object, which ordered containers such as std::set may require.
+   */
+  bool operator()(tCTF_Tensor<dtype>* A, tCTF_Tensor<dtype>* B) const {
+    if (A == NULL) {
+      // NULL sorts first, but is not less than another NULL
+      return B != NULL;
+    }
+    if (B == NULL) {
+      return false;
+    }
+    return A->tid < B->tid;
+  }
+};
+
 /**
 * \brief Matrix class which encapsulates a 2D tensor 
 */
@@ -624,7 +697,7 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {

  
    // dervied clone calls copy constructor
-    tCTF_Term<dtype> * clone() const;
+    tCTF_Term<dtype> * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;

    /**
     * \brief constructor takes in a parent tensor and its indices 
@@ -642,8 +715,8 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {
     * \param[in] copy if 1 then copy the parent tensor of B into a new tensor
     */
    tCTF_Idx_Tensor(tCTF_Idx_Tensor<dtype> const & B,
-                    int copy = 0);
-    
+                    int copy = 0,
+                    std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);

    tCTF_Idx_Tensor();
    
@@ -664,6 +737,24 @@ class tCTF_Idx_Tensor : public tCTF_Term<dtype> {
     */
    void execute(tCTF_Idx_Tensor<dtype> output) const;
    
+    /**
+     * \brief estimates the cost of a contraction
+     * \param[in] output tensor to write results into and its indices
+     */
+    long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
+    
+    /**
+     * \brief estimates the cost of the expression to produce an intermediate with 
+     *        all expression indices remaining
+     * \param[in,out] cost the cost of the operation
+     * \return output tensor to write results into and its indices
+     */
+    tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
+    
+    /**
+    * \brief appends the tensors this depends on to the input set
+    */
+    void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
+
    /**
     * \brief A = B, compute any operations on operand B and set
     * \param[in] B tensor on the right hand side
@@ -893,7 +984,7 @@ class tCTF_Term {
    /**
     * \brief base classes must implement this copy function to retrieve pointer
     */ 
-    virtual tCTF_Term * clone() const = 0;
+    virtual tCTF_Term * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const = 0;
    
    /**
     * \brief evalues the expression, which just scales by default
@@ -901,6 +992,21 @@ class tCTF_Term {
     */
    virtual void execute(tCTF_Idx_Tensor<dtype> output) const = 0;
    
+    /**
+     * \brief estimates the cost of a contraction/sum/.. term
+     * \param[in] output tensor to write results into and its indices
+     */
+    virtual long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const = 0;
+    
+    /**
+     * \brief estimates the cost of the expression to produce an intermediate with 
+     *        all expression indices remaining
+     * \param[in,out] cost the cost of the operation
+     * \return output tensor to write results into and its indices
+     */
+    virtual tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const = 0;
+    
+    
    /**
     * \brief evalues the expression to produce an intermediate with 
     *        all expression indices remaining
@@ -908,6 +1014,11 @@ class tCTF_Term {
     */
    virtual tCTF_Idx_Tensor<dtype> execute() const = 0;
    
+    /**
+    * \brief appends the tensors this depends on to the input set
+    */
+    virtual void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const = 0;
+
    /**
     * \brief constructs a new term which multiplies by tensor A
     * \param[in] A term to multiply by
@@ -965,10 +1076,11 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
    ~tCTF_Sum_Term();
  
    // copy constructor
-    tCTF_Sum_Term(tCTF_Sum_Term<dtype> const & other);
+    tCTF_Sum_Term(tCTF_Sum_Term<dtype> const & other,
+        std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);

    // dervied clone calls copy constructor
-    tCTF_Term<dtype> * clone() const;
+    tCTF_Term<dtype>* clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;

    /**
     * construct sum term corresponding to a single tensor
@@ -981,7 +1093,8 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
     * \param[in,out] output tensor to write results into and its indices
     */
    void execute(tCTF_Idx_Tensor<dtype> output) const;
-    
+
+  
    /**
     * \brief evalues the expression to produce an intermediate with 
     *        all expression indices remaining
@@ -989,6 +1102,26 @@ class tCTF_Sum_Term : public tCTF_Term<dtype> {
     */
    tCTF_Idx_Tensor<dtype> execute() const;
    
+    /**
+     * \brief estimates the cost of a sum term
+     * \param[in] output tensor to write results into and its indices
+     */
+    long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
+    
+    /**
+     * \brief estimates the cost of the expression to produce an intermediate with 
+     *        all expression indices remaining
+     * \param[in,out] cost the cost of the operation
+     * \return output tensor to write results into and its indices
+     */
+    tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
+    
+    
+    
+    /**
+    * \brief appends the tensors this depends on to the input set
+    */
+    void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
+
    /**
     * \brief constructs a new term by addition of two terms
     * \param[in] A term to add to output
@@ -1035,10 +1168,11 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
    ~tCTF_Contract_Term();
  
    // \brief copy constructor
-    tCTF_Contract_Term(tCTF_Contract_Term<dtype> const & other);
+    tCTF_Contract_Term(tCTF_Contract_Term<dtype> const & other,
+        std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);

    // \brief dervied clone calls copy constructor
-    tCTF_Term<dtype> * clone() const;
+    tCTF_Term<dtype> * clone(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL) const;

    /**
     * \brief override execution to  to contract operands and add them to output
@@ -1046,6 +1180,11 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
     */
    void execute(tCTF_Idx_Tensor<dtype> output) const;
    
+    /**
+    * \brief appends the tensors this depends on to the input set
+    */
+    void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
+
    /**
     * \brief evalues the expression to produce an intermediate with 
     *        all expression indices remaining
@@ -1053,6 +1192,20 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
     */
    tCTF_Idx_Tensor<dtype> execute() const;
    
+    /**
+     * \brief estimates the cost of a contract term
+     * \param[in] output tensor to write results into and its indices
+     */
+    long_int estimate_cost(tCTF_Idx_Tensor<dtype> output) const;
+    
+    /**
+     * \brief estimates the cost of evaluating the expression to produce an
+     *        intermediate with all expression indices remaining
+     * \param[in,out] cost estimated cost of the evaluation
+     *        (NOTE(review): confirm whether it is overwritten or accumulated into)
+     */
+    tCTF_Idx_Tensor<dtype> estimate_cost(long_int & cost) const;
+    
+    
    /**
     * \brief override contraction to grow vector rather than create recursive terms
     * \param[in] A term to multiply by
@@ -1068,6 +1221,239 @@ class tCTF_Contract_Term : public tCTF_Term<dtype> {
 * @}
 */

+/**
+ * \defgroup scheduler Dynamic scheduler.
+ * @{
+ */
+/**
+ * \brief Kinds of operation a tCTF_TensorOperation can carry.
+ * TENSOR_OP_NONE is the value tCTF_TensorOperation::is_dummy() tests for.
+ * NOTE(review): the remaining values presumably mirror the tensor assignment
+ * operators (=, +=, -=, *=) -- confirm against the code that records them.
+ */
+enum tCTF_TensorOperationTypes {
+  TENSOR_OP_NONE,
+  TENSOR_OP_SET,
+  TENSOR_OP_SUM,
+  TENSOR_OP_SUBTRACT,
+  TENSOR_OP_MULTIPLY };
+
+/**
+ * \brief Provides an untemplated base class for tensor operations, so that an
+ * operation of any dtype can be handed to an untemplated interface such as
+ * tCTF_ScheduleBase::add_operation.
+ */
+class tCTF_TensorOperationBase {
+public:
+  // virtual, so deleting a derived operation through a base pointer is well defined
+  virtual ~tCTF_TensorOperationBase() {}
+};
+
+/**
+ * \brief A tensor operation, containing all the data (op, lhs, rhs) required
+ * to run it. Also provides methods to get a list of inputs and outputs, as well
+ * as successor and dependency information used in scheduling.
+ */
+template<typename dtype>
+class tCTF_TensorOperation : public tCTF_TensorOperationBase {
+public:
+	/**
+	 * \brief Constructor, create the tensor operation lhs op= rhs
+	 * \param[in] op kind of operation
+	 * \param[in] lhs left-hand side; only the pointer is stored
+	 * \param[in] rhs right-hand side; only the pointer is stored
+	 *
+	 * The initializers are listed in member declaration order, and
+	 * dependency_left is zeroed so that it never holds an indeterminate value
+	 * before a schedule sets it.
+	 */
+	tCTF_TensorOperation(tCTF_TensorOperationTypes op,
+			tCTF_Idx_Tensor<dtype>* lhs,
+			const tCTF_Term<dtype>* rhs) :
+			  dependency_count(0),
+			  dependency_left(0),
+			  op(op),
+			  lhs(lhs),
+			  rhs(rhs),
+			  cached_estimated_cost(0) {}
+
+  /**
+   * \brief appends the tensors this writes to to the output set
+   * \param[in,out] outputs_set set to append to
+   */
+  void get_outputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* outputs_set) const;
+
+	/**
+	 * \brief appends the tensors this depends on (reads from, including the output
+	 * if a previous value is required) to the input set
+	 * \param[in,out] inputs_set set to append to
+	 */
+	void get_inputs(std::set<tCTF_Tensor<dtype>*, tensor_tid_less<dtype> >* inputs_set) const;
+
+	/**
+	 * \brief runs this operation, but does NOT handle dependency scheduling
+	 * optionally takes a remapping of tensors
+	 * \param[in] remap optional tensor remapping; NULL (the default) passes none
+	 */
+	void execute(std::map<tCTF_Tensor<dtype>*, tCTF_Tensor<dtype>*>* remap = NULL);
+
+	/**
+	 *\brief provides an estimated runtime cost
+	 */
+	long_int estimate_cost();
+
+	/**
+	 * \brief true when this operation was created with TENSOR_OP_NONE
+	 */
+	bool is_dummy() {
+	  return op == TENSOR_OP_NONE;
+	}
+
+  /**
+   * Schedule Recording Variables
+   */
+	// Number of dependencies I have
+  int dependency_count;
+  // List of all successors - operations that depend on me
+  std::vector<tCTF_TensorOperation<dtype>* > successors;
+  // NOTE(review): undocumented in the original; presumably operations related
+  // to this one through tensor reads -- confirm in the scheduler implementation
+  std::vector<tCTF_TensorOperation<dtype>* > reads;
+
+  /**
+   * Schedule Execution Variables
+   */
+  // Number of dependencies still outstanding; zero after construction
+  int dependency_left;
+
+  /**
+   * Debugging Helpers
+   */
+  // Name of the tensor on the left-hand side; dereferences lhs and lhs->parent
+  // without a null check
+  const char* name() {
+    return lhs->parent->name;
+  }
+
+protected:
+	tCTF_TensorOperationTypes op;
+	tCTF_Idx_Tensor<dtype>* lhs;
+	const tCTF_Term<dtype>* rhs;
+
+	// zero after construction; NOTE(review): presumably caches the result of
+	// estimate_cost() -- confirm in the implementation
+	long_int cached_estimated_cost;
+};
+
+// untemplatized scheduler abstract base class to assist in global operations
+class tCTF_ScheduleBase {
+public:
+	// virtual destructor: schedules are reachable through base pointers (such as
+	// global_schedule), and deleting a derived schedule through one would
+	// otherwise be undefined behaviour
+	virtual ~tCTF_ScheduleBase() {}
+
+	// adds an operation to the schedule; implemented by the typed schedule
+	virtual void add_operation(tCTF_TensorOperationBase* op) = 0;
+};
+
+extern tCTF_ScheduleBase* global_schedule;
+
+/**
+ * \brief Timing breakdown for executing a schedule. This is the type returned
+ * by tCTF_Schedule::execute() and tCTF_Schedule::partition_and_execute().
+ *
+ * Every field starts at zero, and operator+= adds another breakdown to this
+ * one field by field.
+ * NOTE(review): the units and the exact interval each field measures are not
+ * visible in this header -- confirm against the scheduler implementation.
+ */
+struct tCTF_ScheduleTimer {
+  double comm_down_time;
+  double exec_time;
+  double imbalance_wall_time;
+  double imbalance_acuum_time;
+  double comm_up_time;
+  double total_time;
+
+  // all timers start at zero
+  tCTF_ScheduleTimer():
+    comm_down_time(0),
+    exec_time(0),
+    imbalance_wall_time(0),
+    imbalance_acuum_time(0),
+    comm_up_time(0),
+    total_time(0) {}
+
+  // accumulates B into this object, field by field; returns nothing, so the
+  // operator cannot be chained
+  void operator+=(tCTF_ScheduleTimer const & B) {
+    comm_down_time += B.comm_down_time;
+    exec_time += B.exec_time;
+    imbalance_wall_time += B.imbalance_wall_time;
+    imbalance_acuum_time += B.imbalance_acuum_time;
+    comm_up_time += B.comm_up_time;
+    total_time += B.total_time;
+  }
+};
+
+template<typename dtype>
+class tCTF_Schedule : public tCTF_ScheduleBase {
+public:
+  /**
+   * \brief Constructor, optionally specifying a world to restrict processor
+   * allocations to
+   * \param[in] world world to restrict to, NULL by default; partitions starts at 0
+   */
+  tCTF_Schedule(tCTF_World<dtype>* world = NULL) :
+    world(world),
+    partitions(0) {}
+
+	/**
+	 * \brief Starts recording all tensor operations to this schedule
+	 * (instead of executing them immediately)
+	 */
+	void record();
+
+	/**
+	 * \brief Executes the schedule and implicitly terminates recording
+	 * \return timing breakdown of the execution
+	 */
+	tCTF_ScheduleTimer execute();
+
+  /**
+   * \brief Executes a slice of the ready_queue, partitioning it among the
+   * processors in the grid
+   * \return timing breakdown of this slice
+   */
+  inline tCTF_ScheduleTimer partition_and_execute();
+
+	/**
+	 * \brief Call when a tensor op finishes, this adds newly enabled ops to the ready queue
+	 * \param[in] op the operation that has just finished
+	 */
+	inline void schedule_op_successors(tCTF_TensorOperation<dtype>* op);
+
+	/**
+	 * \brief Adds a tensor operation to this schedule.
+	 * THIS IS CALL-ORDER DEPENDENT - operations will *appear* to execute
+	 * sequentially in the order they were added.
+	 * add_operation is the untemplated entry point declared by tCTF_ScheduleBase.
+	 * NOTE(review): it presumably downcasts op and forwards to
+	 * add_operation_typed -- confirm in the implementation.
+	 */
+	void add_operation_typed(tCTF_TensorOperation<dtype>* op);
+	void add_operation(tCTF_TensorOperationBase* op);
+
+	/**
+	 * Testing functionality: overwrites the partitions member (0 after
+	 * construction).
+	 * NOTE(review): how partitions limits execution is not visible in this header.
+	 */
+	void set_max_partitions(int in_partitions) {
+	  partitions = in_partitions;
+	}
+
+protected:
+	tCTF_World<dtype>* world;
+
+	/**
+	 * Internal scheduling operation overview:
+	 * DAG Structure:
+	 *  Each task maintains:
+	 *    dependency_count: the number of dependencies that the task has
+	 *    dependency_left: the number of dependencies left before this task can
+	 *      execute
+	 *    successors: a vector of tasks which have this task as a dependency
+	 *  On completing a task, it decrements the dependency_left of all
+	 *  successors. Once the count reaches zero, the task is added to the ready
+	 *  queue and can be scheduled for execution.
+	 *  To allow one schedule to be executed many times, dependency_count is
+	 *  only modified by recording tasks, and is copied to dependency_left when
+	 *  the schedule starts executing.
+	 *
+	 * DAG Construction:
+	 *  A map from tensor pointers to operations is maintained, which contains
+	 *  the latest operation that writes to a tensor.
+	 *  When a new operation is added, it checks this map for all dependencies.
+	 *  If a dependency has no entry yet, then it is considered satisfied.
+	 *  Otherwise, it depends on the current entry - and the latest write
+	 *  operation adds this task as a successor.
+	 *  Then, the latest_write for this operation is updated.
+	 */
+
+	/**
+	 * Schedule Recording Variables
+	 */
+	// Tasks with no dependencies, which can be executed at the start
+	std::deque<tCTF_TensorOperation<dtype>*> root_tasks;
+
+  // For debugging purposes - the steps in the original input order
+  std::deque<tCTF_TensorOperation<dtype>*> steps_original;
+
+  // Last operation writing to the key tensor
+  std::map<tCTF_Tensor<dtype>*, tCTF_TensorOperation<dtype>*> latest_write;
+
+  /**
+   * Schedule Execution Variables
+   */
+  // Ready queue of tasks with all dependencies satisfied
+  std::deque<tCTF_TensorOperation<dtype>*> ready_tasks;
+
+  /**
+   * Testing variables
+   */
+  // Set by set_max_partitions(); 0 after construction
+  int partitions;
+
+};
+/**
+ * @}
+ */
+
+

 /**
 * \defgroup timer Timing and cost measurement

--- a/mkfiles/config.mk.edison
+++ b/mkfiles/config.mk.edison
+DEFAULT_COMPONENTS = ctf
+
+BLAS        = 
+LIBS        =  
+CXX         = CC
+WARN_FLAGS  = #-Drestrict = -Wall 
+OPT_FLAGS   = -g -O3
+CXXFLAGS    = -openmp $(OPT_FLAGS) $(WARN_FLAGS) 
+DEFS        = -DEDISON -D__STDC_LIMIT_MACROS
+LDFLAGS     = 
+INCLUDES    = 
+AR          = ar -crs
+DEPFLAGS    = -MT $@ -MD -MP -MF $(DEPDIR)/$(notdir $*).Po
+
+#defining PRODUCTION removes memory tracking, defining CTF_COMPLEX instantiates
+#CTF for the complex<double> type
+DEFS       := $(DEFS) -DPRODUCTION -DCTF_COMPLEX
+
+#uncomment below to enable performance profiling
+#DEFS      := $(DEFS) -DPROFILE -DPMPI
+
+#uncomment below to enable CTF debugging and status output
+#DEFS      := $(DEFS) -DVERBOSE=1 -DDEBUG=1
+
+#SCALAPACK only necessary for pgemm tests and benchmarks 
+#LIBS      := $(LIBS) -L$(HOME)/work/scalapack-2.0.2/lib -lscalapack -lgfortran
+#DEFS      := $(DEFS) -DUSE_SCALAPACK
+
--- a/mkfiles/config.mk.linux
+++ b/mkfiles/config.mk.linux
@@ -4,7 +4,7 @@ BLAS        = -llapack -lblas
 LIBS        = $(BLAS) 
 CXX         = mpicxx
 WARN_FLAGS  = #-Drestrict = -Wall 
-OPT_FLAGS   = -g -O3
+OPT_FLAGS   = -g -O2
 CXXFLAGS    = -fopenmp $(OPT_FLAGS) $(WARN_FLAGS) 
 DEFS        = -D__STDC_LIMIT_MACROS
 LDFLAGS     = 

--- a/src/Makefile
+++ b/src/Makefile
@@ -10,8 +10,7 @@ include make/make.in
 include make/rules.mk

 test_suite_SUBDIRS = test 
-test_SUBDIRS = test ctr_comm unit_test
-test_model_SUBDIRS = unit_test
+test_SUBDIRS = test ctr_comm 
 pgemm_test_SUBDIRS = test
 nonsq_pgemm_test_SUBDIRS = test

@@ -26,18 +25,18 @@ ${libdir}/libctf.a: interface/ctf_world.o \
                    interface/ctf_tensor.o \
                    interface/ctf_matrix.o \
                    interface/ctf_scalar.o \
+                    interface/ctf_schedule.o \
                    interface/ctf_vector.o \
                    interface/ctf_term.o \
                    interface/ctf_idx_tensor.o \
                    interface/ctf_sparse_tensor.o \
                    interface/ctf_flop_counter.o \
-                    shared/comm.o \
                    shared/util.o \
                    shared/timer.o \
                    shared/memcontrol.o \
-                    shared/unit_util.o \
-                    dist_tensor/cyclopstf.o \
-                    unit_test/unit_test.o 
+                    dist_tensor/dist_tensor_internal.o \
+                    dist_tensor/distribution.o \
+                    dist_tensor/cyclopstf.o 


 #INCLUDES += -I${top_dir}/src/ctr_comm -I${top_dir}/src/ctr_seq -I${top_dir}/src/dist_tensor -I${top_dir}/src/util -I${top_dir}/src/interface

--- a/src/bench/Makefile
+++ b/src/bench/Makefile
@@ -2,11 +2,10 @@ include ../../config.mk
 include ../make/make.in
 include ../make/rules.mk

-nonsq_pgemm_bench: ${bindir}/bench/nonsq_pgemm_bench
-${bindir}/bench/nonsq_pgemm_bench: nonsq_pgemm_bench.o \
+nonsq_pgemm_bench: ${bindir}/nonsq_pgemm_bench
+${bindir}/nonsq_pgemm_bench: nonsq_pgemm_bench.o \
                                   ${libdir}/libctf.a

-unit_bench.o: FORCE

 INCLUDES += -I${top_dir}/src/dist_tensor
 LIBS := -lctf $(LIBS)
--- a/src/bench/nonsq_pgemm_bench.cxx
+++ b/src/bench/nonsq_pgemm_bench.cxx
@@ -303,7 +303,7 @@ int main(int argc, char **argv) {
  myctf->def_scala_mat(desc_a, mat_A, &tid_A);
  myctf->def_scala_mat(desc_b, mat_B, &tid_B);
  myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C);
-  myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
+  myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
  myctf->read_scala_mat(tid_C, mat_C_CTF);

 #if 0
@@ -353,7 +353,7 @@ int main(int argc, char **argv) {
  myctf->def_scala_mat(desc_a, mat_A, &tid_A);
  myctf->def_scala_mat(desc_b, mat_B, &tid_B);
  myctf->def_scala_mat(desc_c, mat_C_CTF, &tid_C);
-  myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
+  myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
  myctf->read_scala_mat(tid_C, mat_C_CTF);


@@ -406,7 +406,7 @@ int main(int argc, char **argv) {
 	    mat_A, 1, 1, desc_a,
 	    mat_B, 1, 1, desc_b, BETA,
 	    mat_C, 1, 1, desc_c); */
-    myctf->pgemm('T', 'N', m, n, k, ALPHA, tid_A, tid_B, BETA, tid_C);
+    myctf->pgemm('T', 'N', ALPHA, tid_A, tid_B, BETA, tid_C);
    if (iter == 0)
      ans_verify = mat_C[2];
  }

--- a/src/ctr_comm/ctr_2d_general.cxx
+++ b/src/ctr_comm/ctr_2d_general.cxx
@@ -23,12 +23,15 @@ ctr_2d_general<dtype>::ctr_2d_general(ctr<dtype> * other) : ctr<dtype>(other) {
  ctr_lda_A     = o->ctr_lda_A;
  ctr_sub_lda_A = o->ctr_sub_lda_A;
  cdt_A         = o->cdt_A;
+  move_A        = o->move_A;
  ctr_lda_B     = o->ctr_lda_B;
  ctr_sub_lda_B = o->ctr_sub_lda_B;
  cdt_B         = o->cdt_B;
+  move_B        = o->move_B;
  ctr_lda_C     = o->ctr_lda_C;
  ctr_sub_lda_C = o->ctr_sub_lda_C;
  cdt_C         = o->cdt_C;
+  move_C        = o->move_C;
 }

 /**
@@ -37,15 +40,15 @@ ctr_2d_general<dtype>::ctr_2d_general(ctr<dtype> * other) : ctr<dtype>(other) {
 template<typename dtype>
 void ctr_2d_general<dtype>::print() {
   printf("ctr_2d_general: edge_len = %d\n", edge_len);
-  printf("cdt_A = %p, ctr_lda_A = "PRId64", ctr_sub_lda_A = "PRId64"\n",
-          cdt_A, ctr_lda_A, ctr_sub_lda_A);
-  if (cdt_A != NULL) printf("cdt_A length = %d\n",cdt_A->np);
-  printf("cdt_B = %p, ctr_lda_B = "PRId64", ctr_sub_lda_B = "PRId64"\n",
-          cdt_B, ctr_lda_B, ctr_sub_lda_B);
-  if (cdt_B != NULL) printf("cdt_B length = %d\n",cdt_B->np);
-  printf("cdt_C = %p, ctr_lda_C = "PRId64", ctr_sub_lda_C = "PRId64"\n",
-          cdt_C, ctr_lda_C, ctr_sub_lda_C);
-  if (cdt_C != NULL) printf("cdt_C length = %d\n",cdt_C->np);
+  /* PRId64 expands to the bare length/conversion characters, so the "%" has to
+   * be written explicitly; the spaces around the macro keep C++11 from parsing
+   * it as a user-defined literal suffix. The casts make the argument type match
+   * PRId64 whatever long_int is defined as. */
+  printf("move_A = %d, ctr_lda_A = %" PRId64 ", ctr_sub_lda_A = %" PRId64 "\n",
+          move_A, (int64_t)ctr_lda_A, (int64_t)ctr_sub_lda_A);
+  /* the communicator length is only printed for tensors flagged as moved */
+  if (move_A) printf("cdt_A length = %d\n",cdt_A.np);
+  printf("move_B = %d, ctr_lda_B = %" PRId64 ", ctr_sub_lda_B = %" PRId64 "\n",
+          move_B, (int64_t)ctr_lda_B, (int64_t)ctr_sub_lda_B);
+  if (move_B) printf("cdt_B length = %d\n",cdt_B.np);
+  printf("move_C = %d, ctr_lda_C = %" PRId64 ", ctr_sub_lda_C = %" PRId64 "\n",
+          move_C, (int64_t)ctr_lda_C, (int64_t)ctr_sub_lda_C);
+  if (move_C) printf("cdt_C length = %d\n",cdt_C.np);
   rec_ctr->print();
 }

@@ -70,22 +73,22 @@ uint64_t ctr_2d_general<dtype>::comm_fp(int nlyr) {
  long_int s_A,         s_B,    s_C;
  db = long_int_max;
  s_A = 0, s_B = 0, s_C = 0;
-  if (cdt_A != NULL){
-    np_A        = cdt_A->np;
+  if (move_A){
+    np_A        = cdt_A.np;
    b_A         = edge_len/np_A;
-    s_A         = ctr_lda_A*ctr_sub_lda_A*(long_int)log(cdt_A->np);
+    s_A         = ctr_lda_A*ctr_sub_lda_A*(long_int)log(cdt_A.np);
    db          = MIN(b_A, db);
  } 
-  if (cdt_B != NULL){
-    np_B        = cdt_B->np;
+  if (move_B){
+    np_B        = cdt_B.np;
    b_B         = edge_len/np_B;
-    s_B         = ctr_lda_B*ctr_sub_lda_B*(long_int)log(cdt_B->np);
+    s_B         = ctr_lda_B*ctr_sub_lda_B*(long_int)log(cdt_B.np);
    db          = MIN(b_B, db);
  }
-  if (cdt_C != NULL){
-    np_C        = cdt_C->np;
+  if (move_C){
+    np_C        = cdt_C.np;
    b_C         = edge_len/np_C;
-    s_C         = ctr_lda_C*ctr_sub_lda_C*(long_int)log(cdt_C->np);
+    s_C         = ctr_lda_C*ctr_sub_lda_C*(long_int)log(cdt_C.np);
    db          = MIN(b_C, db);
  }
  return ((s_A+s_B+s_C)*(uint64_t)db*sizeof(dtype)*edge_len/db)/MIN(nlyr,edge_len);
@@ -98,12 +101,12 @@ template<typename dtype>
 uint64_t ctr_2d_general<dtype>::comm_rec(int nlyr) {
  long_int db;
  db = long_int_max;
-  if (cdt_A != NULL)
-    db          = MIN(db,edge_len/cdt_A->np);
-  if (cdt_B != NULL)
-    db          = MIN(db,edge_len/cdt_B->np);
-  if (cdt_C != NULL)
-    db          = MIN(db,edge_len/cdt_C->np);
+  if (move_A)
+    db          = MIN(db,edge_len/cdt_A.np);
+  if (move_B)
+    db          = MIN(db,edge_len/cdt_B.np);
+  if (move_C)
+    db          = MIN(db,edge_len/cdt_C.np);
  return (edge_len/db)*rec_ctr->comm_rec(1) + comm_fp(nlyr);
 }

@@ -127,15 +130,15 @@ long_int ctr_2d_general<dtype>::mem_fp() {
  if (ctr_sub_lda_C != 0)
    s_C = ctr_sub_lda_C*ctr_lda_C;
  aux_size = 0;
-  if (cdt_A != NULL){
-    np_A        = cdt_A->np;
+  if (move_A){
+    np_A        = cdt_A.np;
    LIBT_ASSERT(np_A!=0);
    b_A         = edge_len/np_A;
    s_A         = ctr_lda_A*ctr_sub_lda_A;
    db          = MIN(b_A, db);
  } 
-  if (cdt_B != NULL){
-    np_B        = cdt_B->np;
+  if (move_B){
+    np_B        = cdt_B.np;
    LIBT_ASSERT(np_B!=0);
    b_B         = edge_len/np_B;
    s_B         = ctr_lda_B*ctr_sub_lda_B;
@@ -144,8 +147,8 @@ long_int ctr_2d_general<dtype>::mem_fp() {
    }
    db          = MIN(b_B, db);
  }
-  if (cdt_C != NULL){
-    np_C        = cdt_C->np;
+  if (move_C){
+    np_C        = cdt_C.np;
    LIBT_ASSERT(np_C!=0);
    b_C         = edge_len/np_C;
    s_C         = ctr_lda_C*ctr_sub_lda_C;
@@ -184,10 +187,8 @@ void ctr_2d_general<dtype>::run() {
  
  TAU_FSTART(ctr_2d_general);

-  /* Must move at least one tensor */
-  LIBT_ASSERT(!(cdt_A == NULL && cdt_B == NULL && cdt_C == NULL));
  /* Must move at most two tensors */
-  LIBT_ASSERT(!(cdt_A != NULL && cdt_B != NULL && cdt_C != NULL));
+  LIBT_ASSERT(!(move_A && move_B && move_C));
  
  rec_ctr->beta         = this->beta;
  rec_ctr->num_lyr      = 1;
@@ -213,25 +214,25 @@ void ctr_2d_general<dtype>::run() {
    s_B = ctr_sub_lda_B*ctr_lda_B;
  if (ctr_sub_lda_C != 0)
    s_C = ctr_sub_lda_C*ctr_lda_C;
-  if (cdt_A != NULL){
-    rank_A      = cdt_A->rank;
-    np_A        = cdt_A->np;
+  if (move_A){
+    rank_A      = cdt_A.rank;
+    np_A        = cdt_A.np;
    b_A         = edge_len/np_A;
    s_A         = ctr_lda_A*ctr_sub_lda_A;
    db          = MIN(b_A, db);
    LIBT_ASSERT(edge_len%np_A == 0);
  } 
-  if (cdt_B != NULL){
-    rank_B      = cdt_B->rank;
-    np_B        = cdt_B->np;
+  if (move_B){
+    rank_B      = cdt_B.rank;
+    np_B        = cdt_B.np;
    b_B         = edge_len/np_B;
    s_B         = ctr_lda_B*ctr_sub_lda_B;
    db          = MIN(b_B, db);
    LIBT_ASSERT(edge_len%np_B == 0);
  }
-  if (cdt_C != NULL){
-    rank_C      = cdt_C->rank;
-    np_C        = cdt_C->np;
+  if (move_C){
+    rank_C      = cdt_C.rank;
+    np_C        = cdt_C.np;
    b_C         = edge_len/np_C;
    s_C         = ctr_lda_C*ctr_sub_lda_C;
    db          = MIN(b_C, db);
@@ -246,7 +247,7 @@ void ctr_2d_general<dtype>::run() {


  for (ib=this->idx_lyr*db; ib<edge_len; ib+=db*this->num_lyr){
-    if (cdt_A != NULL){
+    if (move_A){
      owner_A   = ib / b_A;
      c_A       = MIN(((owner_A+1)*b_A-ib), db);
      if (rank_A == owner_A){
@@ -288,7 +289,7 @@ void ctr_2d_general<dtype>::run() {
        }      
      }
    }
-    if (cdt_B != NULL){
+    if (move_B){
      owner_B   = ib / b_B;
      c_B       = MIN(((owner_B+1)*b_B-ib), db);
      if (rank_B == owner_B){
@@ -330,7 +331,7 @@ void ctr_2d_general<dtype>::run() {
        }      
      }
    }
-    if (cdt_C != NULL){
+    if (move_C){
      op_C = buf_C;
      rec_ctr->beta = get_zero<dtype>();
    } else {
@@ -353,7 +354,7 @@ void ctr_2d_general<dtype>::run() {

    rec_ctr->run();

-    if (cdt_C != NULL){
+    if (move_C){
      /* FIXME: Wont work for single precsion */
      ALLREDUCE(MPI_IN_PLACE, op_C, db*s_C*(sizeof(dtype)/sizeof(double)), COMM_DOUBLE_T, COMM_OP_SUM, cdt_C);
      owner_C   = ib / b_C;

--- a/src/ctr_comm/ctr_comm.h
+++ b/src/ctr_comm/ctr_comm.h
@@ -36,30 +36,6 @@ class ctr {
    ctr(){ buffer = NULL; }
 };

-template<typename dtype>
-class ctr_1d_sqr_bcast : public ctr<dtype> {
-  public: 
-    /* Class to be called on sub-blocks */
-    ctr<dtype> * rec_ctr;
-    int k;
-    int ctr_lda; /* local lda_A of contraction dimension 'k' */
-    int ctr_sub_lda; /* elements per local lda_A 
-                        of contraction dimension 'k' */
-    int sz;
-    CommData_t * cdt;
-    int cdt_dir;
-    
-    void run();
-    void print() {};
-    long_int mem_fp();
-    long_int mem_rec();
-    ctr<dtype> * clone();
-    
-    ctr_1d_sqr_bcast(ctr<dtype> * other);
-    ~ctr_1d_sqr_bcast();
-    ctr_1d_sqr_bcast(){}
-};
-
 template<typename dtype>
 class ctr_replicate : public ctr<dtype> {
  public: 
@@ -70,9 +46,9 @@ class ctr_replicate : public ctr<dtype> {
    long_int size_B; /* size of B blocks */
    long_int size_C; /* size of C blocks */

-    CommData_t ** cdt_A;
-    CommData_t ** cdt_B;
-    CommData_t ** cdt_C;
+    CommData_t * cdt_A;
+    CommData_t * cdt_B;
+    CommData_t * cdt_C;
    /* Class to be called on sub-blocks */
    ctr<dtype> * rec_ctr;
    
@@ -103,9 +79,14 @@ class ctr_2d_general : public ctr<dtype> {
    long_int ctr_lda_C; /* local lda_C of contraction dimension 'k' */
    long_int ctr_sub_lda_C; /* elements per local lda_C 
                          of contraction dimension 'k' */
-    CommData_t * cdt_A;
-    CommData_t * cdt_B;
-    CommData_t * cdt_C;
+    
+    bool move_A;
+    bool move_B;
+    bool move_C;
+
+    CommData_t cdt_A;
+    CommData_t cdt_B;
+    CommData_t cdt_C;
    /* Class to be called on sub-blocks */
    ctr<dtype> * rec_ctr;
    
@@ -148,28 +129,6 @@ class ctr_2d_rect_bcast : public ctr<dtype> {
    ctr_2d_rect_bcast(){}
 };

-
-template<typename dtype>
-class ctr_2d_sqr_bcast : public ctr<dtype> {
-  public: 
-    /* Class to be called on sub-blocks */
-    ctr<dtype> * rec_ctr;
-    int k;
-    long_int sz_A; /* number of elements in a block of A */
-    long_int sz_B; /* number of elements in a block of A */
-    CommData_t * cdt_x;
-    CommData_t * cdt_y;
-    
-    void run();
-    long_int mem_fp();
-    long_int mem_rec();
-    ctr<dtype> * clone();
-
-    ctr_2d_sqr_bcast(ctr<dtype> * other);
-    ~ctr_2d_sqr_bcast();
-    ctr_2d_sqr_bcast(){}
-};
-
 /* Assume LDA equal to dim */
 template<typename dtype>
 class ctr_dgemm : public ctr<dtype> {
@@ -200,7 +159,7 @@ class ctr_lyr : public ctr<dtype> {
    /* Class to be called on sub-blocks */
    ctr<dtype> * rec_ctr;
    int k;
-    CommData_t * cdt;
+    CommData_t cdt;
    long_int sz_C;
    
    void print() {};

--- a/src/ctr_comm/ctr_simple.cxx
+++ b/src/ctr_comm/ctr_simple.cxx
@@ -170,9 +170,9 @@ void ctr_lyr<dtype>::run(){
  rec_ctr->A            = this->A;
  rec_ctr->B            = this->B;
  rec_ctr->C            = this->C;
-  rec_ctr->beta         = cdt->rank > 0 ? 0.0 : this->beta;
-  rec_ctr->num_lyr      = cdt->np;
-  rec_ctr->idx_lyr      = cdt->rank;
+  rec_ctr->beta         = cdt.rank > 0 ? 0.0 : this->beta;
+  rec_ctr->num_lyr      = cdt.np;
+  rec_ctr->idx_lyr      = cdt.rank;

  rec_ctr->run();
  
@@ -231,17 +231,17 @@ void ctr_replicate<dtype>::print() {
  printf("cdt_A = %p, size_A = "PRId64", ncdt_A = %d\n",
          cdt_A, size_A, ncdt_A);
  for (i=0; i<ncdt_A; i++){
-    printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
+    printf("cdt_A[%d] length = %d\n",i,cdt_A[i].np);
  }
  printf("cdt_B = %p, size_B = "PRId64", ncdt_B = %d\n",
          cdt_B, size_B, ncdt_B);
  for (i=0; i<ncdt_B; i++){
-    printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
+    printf("cdt_B[%d] length = %d\n",i,cdt_B[i].np);
  }
  printf("cdt_C = %p, size_C = "PRId64", ncdt_C = %d\n",
          cdt_C, size_C, ncdt_C);
  for (i=0; i<ncdt_C; i++){
-    printf("cdt_C[%d] length = %d\n",i,cdt_C[i]->np);
+    printf("cdt_C[%d] length = %d\n",i,cdt_C[i].np);
  }
  rec_ctr->print();
 }
@@ -256,16 +256,16 @@ uint64_t ctr_replicate<dtype>::comm_fp(int nlyr){
  long_int tot_sz;
  tot_sz = 0;
  for (i=0; i<ncdt_A; i++){
-    LIBT_ASSERT(cdt_A[i]->np > 0);
-    tot_sz += size_A*log(cdt_A[i]->np);
+    LIBT_ASSERT(cdt_A[i].np > 0);
+    tot_sz += size_A*log(cdt_A[i].np);
  }
  for (i=0; i<ncdt_B; i++){
-    LIBT_ASSERT(cdt_B[i]->np > 0);
-    tot_sz += size_B*log(cdt_B[i]->np);
+    LIBT_ASSERT(cdt_B[i].np > 0);
+    tot_sz += size_B*log(cdt_B[i].np);
  }
  for (i=0; i<ncdt_C; i++){
-    LIBT_ASSERT(cdt_C[i]->np > 0);
-    tot_sz += size_C*log(cdt_C[i]->np);
+    LIBT_ASSERT(cdt_C[i].np > 0);
+    tot_sz += size_C*log(cdt_C[i].np);
  }
  return ((uint64_t)tot_sz)*sizeof(dtype);
 }
@@ -308,15 +308,15 @@ void ctr_replicate<dtype>::run(){

  arank = 0, brank = 0, crank = 0;
  for (i=0; i<ncdt_A; i++){
-    arank += cdt_A[i]->rank;
+    arank += cdt_A[i].rank;
    POST_BCAST(this->A, size_A*sizeof(dtype), COMM_CHAR_T, 0, cdt_A[i], 0);
  }
  for (i=0; i<ncdt_B; i++){
-    brank += cdt_B[i]->rank;
+    brank += cdt_B[i].rank;
    POST_BCAST(this->B, size_B*sizeof(dtype), COMM_CHAR_T, 0, cdt_B[i], 0);
  }
  for (i=0; i<ncdt_C; i++){
-    crank += cdt_C[i]->rank;
+    crank += cdt_C[i].rank;
  }
  if (crank != 0) std::fill(this->C, this->C+size_C, get_zero<dtype>());
  else {

--- a/src/ctr_comm/sum_tsr.cxx
+++ b/src/ctr_comm/sum_tsr.cxx
@@ -200,7 +200,7 @@ void tsum_replicate<dtype>::run(){
  }*/
  brank = 0;
  for (i=0; i<ncdt_B; i++){
-    brank += cdt_B[i]->rank;
+    brank += cdt_B[i].rank;
  }
  if (brank != 0) std::fill(this->B, this->B+size_B, 0.0);


--- a/src/ctr_comm/sum_tsr.h
+++ b/src/ctr_comm/sum_tsr.h
@@ -55,8 +55,8 @@ class tsum_replicate : public tsum<dtype> {
    int ncdt_A; /* number of processor dimensions to replicate A along */
    int ncdt_B; /* number of processor dimensions to replicate B along */

-    CommData_t ** cdt_A;
-    CommData_t ** cdt_B;
+    CommData_t * cdt_A;
+    CommData_t * cdt_B;
    /* Class to be called on sub-blocks */
    tsum<dtype> * rec_tsum;
    

--- a/src/deprecated/bench/bench_model.cxx
+++ b/src/deprecated/bench/bench_model.cxx
-/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
-
-#include "dist_tensor.h"
-#include "dist_tensor_internal.h"
-#include "../shared/util.h"
-#include "../shared/unit_util.h"
-#include "unit_bench.h"
-#include "bench_sym_contract.hxx"
-
-
-/** 
- * \brief benchmarks model symmetric contractions 
- */
-void bench_model(int argc, char ** argv){
-  int seed, i, tid_A, tid_B, tid_C, stat;
-  int nctr, myRank, numPes, iter, ndim, n, inner_sz;
-  int * edge_len, * sym;
-  CommData_t *cdt_glb = (CommData_t*)malloc(sizeof(CommData_t));
-  RINIT_COMM(numPes, myRank, 4, 4, cdt_glb);
-
-  assert(argc == 3 || argc == 4);
-
-  seed = 100;
-  nctr = 2;
-  iter = 3;
-
-  ndim = atoi(argv[1]);
-  n = atoi(argv[2]);
-  if (argc > 3)
-    inner_sz = atoi(argv[3]);
-  else
-    inner_sz = DEF_INNER_SIZE;
-
-  if (myRank == 0) {
-    printf("Executing model contraction of tensor with dimension %d and edges of length %d\n",ndim,n);
-    printf("Using inner blocking size of %d\n",inner_sz);
-  }
-
-  edge_len 	= (int*)malloc(sizeof(int)*ndim);
-  sym 		= (int*)malloc(sizeof(int)*ndim);
-
-  CTF_ctr_type_t * ctypes = (CTF_ctr_type_t*)malloc(sizeof(CTF_ctr_type_t)*nctr);;
-
-  ctypes[0].idx_map_A = (int*)malloc(ndim*sizeof(int));
-  ctypes[0].idx_map_B = (int*)malloc(ndim*sizeof(int));
-  ctypes[0].idx_map_C = (int*)malloc(ndim*sizeof(int));
-  ctypes[1].idx_map_A = (int*)malloc(ndim*sizeof(int));
-  ctypes[1].idx_map_B = (int*)malloc(ndim*sizeof(int));
-  ctypes[1].idx_map_C = (int*)malloc(ndim*sizeof(int));
-
-  std::fill(edge_len, edge_len+ndim, n);
-  for (i=0; i<ndim; i++){
-    if (i == ndim/2 - 1 || i == ndim-1) {
-      sym[i] = NS;
-    } else {
-      sym[i] = SY;
-    }
-    ctypes[0].idx_map_A[i] = i;
-    if (i>=ndim/2)
-      ctypes[0].idx_map_B[i] = i + ndim/2;
-    else
-      ctypes[0].idx_map_B[i] = i;
-    ctypes[0].idx_map_C[i] = i + ndim/2; 
-    
-    ctypes[1].idx_map_B[i] = i;
-    if (i>=ndim/2) {
-      ctypes[1].idx_map_A[i] = i + ndim/2;
-    } else {
-      ctypes[1].idx_map_A[i] = ndim/2-i-1;
-    }
-    ctypes[1].idx_map_C[i] = i + ndim/2; 
-  }
-      
-  stat = CTF_init(MPI_COMM_WORLD, MACHINE_BGQ, myRank, numPes, inner_sz);
-  assert(stat == DIST_TENSOR_SUCCESS); 
-  
-  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_A); 
-  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_B); 
-  stat = CTF_define_tensor(ndim, edge_len, sym, &tid_C); 
-
-  ctypes[0].tid_A = tid_A;
-  ctypes[0].tid_B = tid_B;
-  ctypes[0].tid_C = tid_C;
-  ctypes[1].tid_A = tid_A;
-  ctypes[1].tid_B = tid_B;
-  ctypes[1].tid_C = tid_C;
-  
-  sym_readwrite(seed, tid_A, myRank, numPes);
-  sym_readwrite(seed, tid_B, myRank, numPes);
-  sym_readwrite(seed, tid_C, myRank, numPes);
-
-  
-  GLOBAL_BARRIER(cdt_glb);
-#ifdef TAU
-  TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER);
-  TAU_PROFILE_START(timer);
-  TAU_PROFILE_INIT(argc, argv);
-  TAU_PROFILE_SET_NODE(myRank);
-  TAU_PROFILE_SET_CONTEXT(0);
-#endif
-  GLOBAL_BARRIER(cdt_glb);
-  bench_sym_contract(ctypes, myRank, numPes, iter, nctr);
-
-  GLOBAL_BARRIER(cdt_glb);
-  CTF_exit();
-  for (i=0; i<nctr; i++){
-    free(ctypes[i].idx_map_A);
-    free(ctypes[i].idx_map_B);
-    free(ctypes[i].idx_map_C);
-  }
-  free(ctypes);
-  TAU_PROFILE_STOP(timer);
-  if (myRank==0) printf("Model symmetry benchmark completed\n");
-  GLOBAL_BARRIER(cdt_glb);
-  FREE_CDT(cdt_glb);
-  free(cdt_glb);
-  COMM_EXIT;
-  return;
-}