/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#ifndef __CYCLOPSTF_HPP__
#define __CYCLOPSTF_HPP__

#include "mpi.h"
#include <stdint.h>
#include <stdio.h>
#include <complex>

/* READ ME!
 * ALL BELOW FUNCTIONS MUST BE CALLED BY ALL MEMBERS OF MPI_COMM_WORLD
 * all functions return DIST_TENSOR_SUCCESS if successfully completed
 *
 * Usage:
 * call init_dist_tensor_lib() to initialize library
 * call define_tensor() to specify an empty tensor
 * call write_*_tensor() to input tensor data
 * call contract() to execute contraction (data may get remapped)
 * call read_*_tensor() to read tensor data
 * call clean_tensors() to clean up internal handles
 */
/**
 * \addtogroup CTF CTF: main C++ interface
 * @{
 */
27

Edgar Solomonik's avatar
Edgar Solomonik committed
28 29 30
/**
 * \brief reduction types for tensor data
 */
Edgar Solomonik's avatar
Edgar Solomonik committed
31 32
enum CTF_OP { CTF_OP_SUM, CTF_OP_SUMABS,
              CTF_OP_NORM1, CTF_OP_NORM2, CTF_OP_NORM_INFTY,
33
              CTF_OP_MAX, CTF_OP_MIN, CTF_OP_MAXABS, CTF_OP_MINABS};
34 35 36 37 38 39 40 41 42 43 44 45 46
/**
 * labels corresponding to symmetry of each tensor dimension
 * NS = 0 - nonsymmetric
 * SY = 1 - symmetric
 * AS = 2 - antisymmetric
 * SH = 3 - symmetric hollow
 */
#if (!defined NS && !defined SY && !defined SH)
#define NS 0
#define SY 1
#define AS 2
#define SH 3
#endif
solomon's avatar
solomon committed
47
typedef int64_t long_int;
solomon's avatar
solomon committed
48
typedef long_int key;
49

50
static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};
51 52 53 54 55 56 57 58 59 60

template<typename dtype>
struct tkv_pair {
  key k;
  dtype d;
  tkv_pair() {}
  tkv_pair(key k, dtype d) : k(k), d(d) {}
  bool operator< (const tkv_pair<dtype>& other) const{
    return k < other.k;
  }
61 62 63 64 65 66
  bool operator==(const tkv_pair<dtype>& other) const{
    return (k == other.k && d == other.d);
  }
  bool operator!=(const tkv_pair<dtype>& other) const{
    return !(*this == other);
  }
67 68 69 70 71 72 73 74
};

typedef tkv_pair<double> kv_pair;

template<typename dtype>
inline bool comp_tkv_pair(tkv_pair<dtype> i,tkv_pair<dtype> j) {
  return (i.k<j.k);
}
75 76 77 78 79 80 81 82 83 84
/**
 * @}
 */

/**
 * \defgroup internal Tensor mapping and redistribution internals
 * @{
 */
/* Force redistributions always by setting to 1 (use 2.5D algorithms) */
#define REDIST 0
85
//#define VERIFY 0
solomon's avatar
solomon committed
86
#define VERIFY_REMAP 0
87 88
#define FOLD_TSR 1
#define PERFORM_DESYM 1
89
#define ALLOW_NVIRT 1024
90
#define DIAG_RESCALE
91
#define USE_SYM_SUM
92
#define HOME_CONTRACT
93
#define USE_BLOCK_RESHUFFLE
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120

typedef struct CTF_ctr_type {
  int   tid_A;
  int   tid_B;
  int   tid_C;
  int * idx_map_A; /* map indices of tensor A to contraction */
  int * idx_map_B; /* map indices of tensor B to contraction */
  int * idx_map_C; /* map indices of tensor C to contraction */
} CTF_ctr_type_t;

typedef struct CTF_sum_type {
  int   tid_A;
  int   tid_B;
  int * idx_map_A; /* map indices of tensor A to sum */
  int * idx_map_B; /* map indices of tensor B to sum */
} CTF_sum_type_t;

enum { DIST_TENSOR_SUCCESS, DIST_TENSOR_ERROR, DIST_TENSOR_NEGATIVE };

enum CTF_MACHINE { MACHINE_GENERIC, MACHINE_BGP, MACHINE_BGQ,
                   MACHINE_8D, NO_TOPOLOGY };


/* These now have to live in a struct due to being templated, since one
   cannot typedef. */
template<typename dtype>
struct fseq_tsr_scl {
121
  /* Function signature for sub-tensor scale recrusive call */
122 123 124 125 126 127 128 129
  int  (*func_ptr) ( dtype const        alpha,
                     dtype *            A,
                     int const          ndim_A,
                     int const *        edge_len_A,
                     int const *        lda_A,
                     int const *        sym_A,
                     int const *        idx_map_A);
};
130 131 132 133

/* custom element-wise function for tensor scale */
template<typename dtype>
struct fseq_elm_scl {
134
  void  (*func_ptr)(dtype const alpha,
135 136 137 138
                    dtype &     a);
};

/* Function signature for sub-tensor summation recrusive call */
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
template<typename dtype>
struct fseq_tsr_sum {
  int  (*func_ptr) ( dtype const          alpha,
                     dtype const *        A,
                     int const            ndim_A,
                     int const *          edge_len_A,
                     int const *          lda_A,
                     int const *          sym_A,
                     int const *          idx_map_A,
                     dtype const          beta,
                     dtype *              B,
                     int const            ndim_B,
                     int const *          edge_len_B,
                     int const *          lda_B,
                     int const *          sym_B,
                     int const *          idx_map_B);
};

157 158 159
/* custom element-wise function for tensor sum */
template<typename dtype>
struct fseq_elm_sum {
160
  void  (*func_ptr)(dtype const alpha,
161 162 163 164
                    dtype const a,
                    dtype &     b);
};

165 166 167

template<typename dtype>
struct fseq_tsr_ctr {
168 169 170
#ifdef OFFLOAD
    int is_offloadable;
#endif
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
    /* Function signature for sub-tensor contraction recrusive call */
    int  (*func_ptr) ( dtype const      alpha,
                       dtype const *    A,
                       int const        ndim_A,
                       int const *      edge_len_A,
                       int const *      lda_A,
                       int const *      sym_A,
                       int const *      idx_map_A,
                       dtype const *    B,
                       int const        ndim_B,
                       int const *      edge_len_B,
                       int const *      lda_B,
                       int const *      sym_B,
                       int const *      idx_map_B,
                       dtype const      beta,
                       dtype *          C,
                       int const        ndim_C,
                       int const *      edge_len_C,
                       int const *      lda_C,
                       int const *      sym_C,
                       int const *      idx_map_C);
};

194 195 196
/* custom element-wise function for tensor sum */
template<typename dtype>
struct fseq_elm_ctr {
197 198
  void  (*func_ptr)(dtype const alpha,
                    dtype const a,
199 200 201 202
                    dtype const b,
                    dtype &     c);
};

203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
template<typename dtype>
class dist_tensor;

template<typename dtype>
class tCTF{
  private:
    dist_tensor<dtype> * dt;
    int initialized;

  public:


    tCTF();
    ~tCTF();

    /* initializes library. Sets topology to be a torus of edge lengths equal to the
219
       factorization of np. Main args can be sset for profiler output. */
220 221 222
    int init(MPI_Comm const global_context,
             int const      rank,
             int const      np,
223 224
             CTF_MACHINE    mach = MACHINE_GENERIC,
             int const      argc = 0,
Devin Matthews's avatar
Devin Matthews committed
225
             const char * const * argv = NULL);
226 227 228 229 230 231 232 233 234


    /* initializes library. Sets topology to be a mesh of dimension ndim with
       edge lengths dim_len. */
    int init(MPI_Comm const global_context,
             int const      rank,
             int const      np,
             int const      ndim,
             int const *    dim_len,
235
             int const      argc = 0,
Devin Matthews's avatar
Devin Matthews committed
236
             const char * const * argv = NULL);
237 238


239 240
    /* return MPI_Comm global_context */
    MPI_Comm get_MPI_Comm();
241

Edgar Solomonik's avatar
Edgar Solomonik committed
242 243
    /* return MPI processor rank */
    int get_rank();
244

Edgar Solomonik's avatar
Edgar Solomonik committed
245 246
    /* return number of MPI processes in the defined global context */
    int get_num_pes();
247 248

    /* define a tensor and retrive handle */
249 250 251 252
    int define_tensor(int const     ndim,
                      int const *   edge_len,
                      int const *   sym,
                      int *         tensor_id,
253
#if DEBUG < 3
254
                      char const *  name = NULL,
255 256 257 258 259 260
                      int           profile = 0
#else
                      char const *  name = "X",
                      int           profile = 1
#endif
                      );
261 262 263 264 265 266 267 268 269 270 271 272

    /* Create identical tensor with identical data if copy_data=1 */
    int clone_tensor(int const  tensor_id,
                     int const  copy_data,
                     int *      new_tensor_id);

    /* get information about a tensor */
    int info_tensor(int const tensor_id,
                    int *     ndim,
                    int **    edge_len,
                    int **    sym) const;

273 274
    /* set the tensor name */
    int set_name(int const tensor_id, char const * name);
275

276
    /* get the tensor name */
277
    int get_name(int const tensor_id, char const ** name);
278 279 280

    /* turn on profiling */
    int profile_on(int const tensor_id);
281

282 283 284
    /* turn off profiling */
    int profile_off(int const tensor_id);

285 286 287 288 289 290 291 292 293 294
    /* get dimension of a tensor */
    int get_dimension(int const tensor_id, int *ndim) const;

    /* get lengths of a tensor */
    int get_lengths(int const tensor_id, int **edge_len) const;

    /* get symmetry of a tensor */
    int get_symmetry(int const tensor_id, int **sym) const;

    /* get raw data pointer WARNING: includes padding */
solomon's avatar
solomon committed
295
    int get_raw_data(int const tensor_id, dtype ** data, long_int * size);
296 297 298 299

    /* Input tensor data with <key, value> pairs where key is the
       global index for the value. */
    int write_tensor(int const                tensor_id,
300
                     long_int const           num_pair,
Devin Matthews's avatar
Devin Matthews committed
301
                     tkv_pair<dtype> const *  mapped_data);
302

303
    /* Add tensor data new=alpha*new+beta*old
304
       with <key, value> pairs where key is the
305 306
       global index for the value. */
    int write_tensor(int const                tensor_id,
307
                     long_int const           num_pair,
308 309
                     dtype const              alpha,
                     dtype const              beta,
Devin Matthews's avatar
Devin Matthews committed
310
                     tkv_pair<dtype> const *  mapped_data);
311

312 313 314 315 316 317 318 319 320 321 322 323 324
    /**
     * Permutes a tensor along each dimension skips if perm set to -1, generalizes slice.
     *        one of permutation_A or permutation_B has to be set to NULL, if multiworld read, then
     *        the parent world tensor should not be being permuted
     */
    int permute_tensor(int const              tid_A,
                       int * const *          permutation_A,
                       dtype const            alpha,
                       tCTF<dtype> *          tC_A,
                       int const              tid_B,
                       int * const *          permutation_B,
                       dtype const            beta,
                       tCTF<dtype> *          tC_B);
325 326 327 328 329 330 331 332 333 334 335 336

   /**
     * \brief accumulates this tensor to a tensor object defined on a different world
     * \param[in] tid id of tensor on this CTF instance
     * \param[in] tid_sub id of tensor on a subcomm of this CTF inst
     * \param[in] tC_sub CTF instance on a mpi subcomm
     * \param[in] alpha scaling factor for this tensor
     * \param[in] beta scaling factor for tensor tsr
     */
    int  add_to_subworld(int          tid,
                         int          tid_sub,
                         tCTF<dtype> *tC_sub,
337 338
                         dtype       alpha,
                         dtype       beta);
339 340 341 342 343 344 345 346 347 348 349 350
   /**
     * \brief accumulates this tensor from a tensor object defined on a different world
     * \param[in] tsr a tensor object of the same characteristic as this tensor, 
     * \param[in] tid id of tensor on this CTF instance
     * \param[in] tid_sub id of tensor on a subcomm of this CTF inst
     * \param[in] tC_sub CTF instance on a mpi subcomm
     * \param[in] alpha scaling factor for this tensor
     * \param[in] beta scaling factor for tensor tsr
     */
    int  add_from_subworld(int          tid,
                           int          tid_sub,
                           tCTF<dtype> *tC_sub,
351 352
                           dtype       alpha,
                           dtype       beta);
353
    
354
    /* Add tensor data from A to a block of B, 
355 356
       B[offsets_B:ends_B] = beta*B[offsets_B:ends_B] 
                          + alpha*A[offsets_A:ends_A] */
357 358 359
    int slice_tensor(int const    tid_A,
                     int const *  offsets_A,
                     int const *  ends_A,
360
                     dtype const  alpha,
361 362 363
                     int const    tid_B,
                     int const *  offsets_B,
                     int const *  ends_B,
364
                     dtype const  beta);
365

366 367 368 369
    /* Same as above, except tid_A lives on dt_other_A */
    int slice_tensor(int const      tid_A,
                     int const *    offsets_A,
                     int const *    ends_A,
370
                     dtype const    alpha,
371 372 373 374
                     tCTF<dtype> *  dt_other_A,
                     int const      tid_B,
                     int const *    offsets_B,
                     int const *    ends_B,
375
                     dtype const    beta);
376

377 378 379 380
    /* Same as above, except tid_B lives on dt_other_B */
    int slice_tensor(int const      tid_A,
                     int const *    offsets_A,
                     int const *    ends_A,
381
                     dtype const    alpha,
382 383 384
                     int const      tid_B,
                     int const *    offsets_B,
                     int const *    ends_B,
385
                     dtype const    beta,
386 387
                     tCTF<dtype> *  dt_other_B);

388 389

    /* read a block from tensor_id,
390 391 392 393 394 395
       new_tensor_id = tensor_id[offsets:ends] */
/*    int read_block_tensor(int const   tensor_id,
                          int const * offsets,
                          int const * ends,
                          int *       new_tensor_id);*/

396

397
    /* read tensor data with <key, value> pairs where key is the
398
       global index for the value, which gets filled in with
399 400 401 402 403 404
       beta times the old values plus alpha times the values read from the tensor. */
    int read_tensor(int const               tensor_id,
                    long_int const          num_pair,
                    dtype const             alpha,
                    dtype const             beta,
                    tkv_pair<dtype> * const mapped_data);
405

406 407 408
    /* read tensor data with <key, value> pairs where key is the
       global index for the value, which gets filled in. */
    int read_tensor(int const               tensor_id,
409
                    long_int const          num_pair,
410 411
                    tkv_pair<dtype> * const mapped_data);

412

413 414 415
    /* read entire tensor with each processor (in packed layout).
       WARNING: will use a lot of memory. */
    int allread_tensor(int const  tensor_id,
416
                       long_int * num_pair,
417 418
                       dtype **   all_data);

419
    /* read entire tensor with each processor to preallocated buffer
420 421 422 423 424 425
       (in packed layout).
       WARNING: will use a lot of memory. */
    int allread_tensor(int const  tensor_id,
                       long_int * num_pair,
                       dtype *    all_data);

426 427 428 429 430 431

    /* map input tensor local data to zero. */
    int set_zero_tensor(int tensor_id);

    /* read tensor data pairs local to processor. */
    int read_local_tensor(int const           tensor_id,
432
                          long_int *          num_pair,
433 434 435 436 437 438 439 440 441 442 443 444 445
                          tkv_pair<dtype> **  mapped_data);

    /* contracts tensors alpha*A*B + beta*C -> C,
       uses standard symmetric contraction sequential kernel */
    int contract(CTF_ctr_type_t const * type,
                 dtype const            alpha,
                 dtype const            beta);

    /* contracts tensors alpha*A*B + beta*C -> C,
       seq_func used to perform sequential op */
    int contract(CTF_ctr_type_t const *     type,
                 fseq_tsr_ctr<dtype> const  func_ptr,
                 dtype const                alpha,
446
                 dtype const                beta);
447

448 449 450 451 452 453
    /* contracts tensors alpha*A*B + beta*C -> C,
       seq_func used to perform element-wise sequential op */
    int contract(CTF_ctr_type_t const *     type,
                 fseq_elm_ctr<dtype> const  felm,
                 dtype const                alpha,
                 dtype const                beta);
454 455 456 457 458 459 460 461 462 463 464 465 466 467

    /* DAXPY: a*A + B -> B. */
    int sum_tensors(dtype const  alpha,
                    int const    tid_A,
                    int const    tid_B);

    /* DAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). */
    int sum_tensors(CTF_sum_type_t const *  type,
                    dtype const             alpha,
                    dtype const             beta);

    /* DAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). */
    int sum_tensors(dtype const               alpha,
                    dtype const               beta,
468
                    int const                 tid_A,
469 470 471 472 473 474 475 476 477 478
                    int const                 tid_B,
                    int const *               idx_map_A,
                    int const *               idx_map_B,
                    fseq_tsr_sum<dtype> const func_ptr);

    /* DAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). */
    int sum_tensors(CTF_sum_type_t const *    type,
                    dtype const               alpha,
                    dtype const               beta,
                    fseq_tsr_sum<dtype> const func_ptr);
479

480 481 482 483 484 485 486 487
    /* DAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). */
    int sum_tensors(dtype const               alpha,
                    dtype const               beta,
                    int const                 tid_A,
                    int const                 tid_B,
                    int const *               idx_map_A,
                    int const *               idx_map_B,
                    fseq_elm_sum<dtype> const felm);
488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504

    /* copy A into B. Realloc if necessary */
    int copy_tensor(int const tid_A, int const tid_B);

    /* scale tensor by alpha. A <- a*A */
    int scale_tensor(dtype const alpha, int const tid);

    /* scale tensor by alpha. A <- a*A */
    int scale_tensor(dtype const                alpha,
                     int const                  tid,
                     int const *                idx_map_A);

    /* scale tensor by alpha. A <- a*A */
    int scale_tensor(dtype const                alpha,
                     int const                  tid,
                     int const *                idx_map_A,
                     fseq_tsr_scl<dtype> const  func_ptr);
505

506 507 508 509 510
    /* scale tensor by alpha. A <- a*A */
    int scale_tensor(dtype const                alpha,
                     int const                  tid,
                     int const *                idx_map_A,
                     fseq_elm_scl<dtype> const  felm);
Edgar Solomonik's avatar
Edgar Solomonik committed
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542

    /**
     * \brief estimate the cost of a contraction C[idx_C] = A[idx_A]*B[idx_B]
     * \param[in] A first operand tensor
     * \param[in] idx_A indices of A in contraction, e.g. "ik" -> A_{ik}
     * \param[in] B second operand tensor
     * \param[in] idx_B indices of B in contraction, e.g. "kj" -> B_{kj}
     * \param[in] beta C scaling factor
     * \param[in] idx_C indices of C (this tensor),  e.g. "ij" -> C_{ij}
     * \return cost as a int64_t type, currently a rought estimate of flops/processor
     */
    int64_t estimate_cost(int tid_A,
                          int const *        idx_A,
                          int tid_B,
                          int const *        idx_B,
                          int tid_C,
                          int const *        idx_C);
    
    /**
     * \brief estimate the cost of a sum B[idx_B] = A[idx_A]
     * \param[in] A first operand tensor
     * \param[in] idx_A indices of A in contraction, e.g. "ik" -> A_{ik}
     * \param[in] B second operand tensor
     * \param[in] idx_B indices of B in contraction, e.g. "kj" -> B_{kj}
     * \return cost as a int64_t type, currently a rought estimate of flops/processor
     */
    int64_t estimate_cost(int tid_A,
                          int const *        idx_A,
                          int tid_B,
                          int const *        idx_B);


Edgar Solomonik's avatar
Edgar Solomonik committed
543
    /* aligns tensor mapping of tid_A to that of tid_B */
544
    int align(int const    tid_A,
Edgar Solomonik's avatar
Edgar Solomonik committed
545
              int const    tid_B);
546 547 548 549 550 551 552 553 554 555

    /* product will contain the dot prodiuct if tsr_A and tsr_B */
    int dot_tensor(int const tid_A, int const tid_B, dtype *product);

    /* reduce data of tid_A with the given OP */
    int reduce_tensor(int const tid, CTF_OP op, dtype * result);

    /* map data of tid_A with the given function */
    int map_tensor(int const tid,
                   dtype (*map_func)(int const ndim, int const * indices,
556
                                     dtype const elem));
557

558 559 560
    /* obtains the largest n elements (in absolute value) of the tensor */
    int get_max_abs(int const tid, int const n, dtype * data);

561
    /* Prints a tensor on one processor. */
562
    int print_tensor(FILE * stream, int const tid, double cutoff = -1.0);
563

564
    /* Compares two tensors on one processor. */
565
    int compare_tensor(FILE * stream, int const tid_A, int const tid_B, double cutoff = -1.0);
566

567
    /* Prints contraction type. */
568 569 570
    int print_ctr(CTF_ctr_type_t const * ctype,
                  dtype const            alpha,
                  dtype const            beta) const;
571 572

    /* Prints sum type. */
573 574 575
    int print_sum(CTF_sum_type_t const * stype,
                  dtype const            alpha,
                  dtype const            beta) const;
576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627

    /* Deletes all tensor handles. Invalidates all tensor ids. */
    int clean_tensors();

    /* Deletes a tensor handle. Invalidates all tensor ids. */
    int clean_tensor(int const tid);

    /* Exits library and deletes all internal data */
    int exit();

    /* ScaLAPACK back-end */
    int pgemm( char const         TRANSA,
               char const         TRANSB,
               int const          M,
               int const          N,
               int const          K,
               dtype const        ALPHA,
               dtype *            A,
               int const          IA,
               int const          JA,
               int const *        DESCA,
               dtype *            B,
               int const          IB,
               int const          JB,
               int const *        DESCB,
               dtype const        BETA,
               dtype *            C,
               int const          IC,
               int const          JC,
               int const *        DESCC);

    /* define matrix from ScaLAPACK descriptor */
    int def_scala_mat(int const * DESCA, dtype const * data, int * tid);

    /* reads a ScaLAPACK matrix to the original data pointer */
    int read_scala_mat(int const tid, dtype * data);

    int pgemm( char const         TRANSA,
               char const         TRANSB,
               dtype const        ALPHA,
               int const          tid_A,
               int const          tid_B,
               dtype const        BETA,
               int const          tid_C);

};

//template class tCTF<double>;
//template class tCTF< std::complex<double> >;

typedef tCTF<double> CTF;

/**
 * @}
 */


//#include "cyclopstf.cxx"

#endif //__CYCLOPSTF_HPP__