matmul_check - OptiTrust Trace

⌵Trace for matmul_check✔

⌵Preprocessing loop contracts✔
- ⌵Resources.make_strict_loop_contracts✔
›Function.inline_def [cFunDef "mm"]; ~index:("b" ^ loop_id) ~bound:TileDivides [cFor loop_id] in✔
›List.iter tile [("i", 32); ("j", 32); ("k", 4)];✔
›Loop.reorder_at ~order:["bi"; "bj"; "bk"; "i"; "k"; "j"] [cPlusEq ~lhs:[cVar "sum"] ()];✔
›Loop.hoist_expr ~dest:[tBefore; cFor "bi"] "pB" ~indep:["bi"; "i"] [cArrayRead "B"];✔
›Matrix.stack_copy ~var:"sum" ~copy_var:"s" ~copy_dims:1 [cFor ~body:[cPlusEq ~lhs:[cVar "sum"] ()] "k"];✔
›Omp.simd [nbMulti; cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "j"];✔
›Omp.parallel_for [nbMulti; cFunBody ""; cStrict; cFor ""];✔
›Loop.unroll ~simpl:Arith.do_nothing [cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "k"];✔
›postprocessing (); )✔

tmp_before.cpp → tmp_after.cpp RENAMED Viewed

@@ -1,244 +1,64 @@
 #include <optitrust.h>
 #include "omp.h"
 // NOTE: using pretty matrix notation
 void mm1024(float* C, float* A, float* B) {
- __modifies("C ~> Matrix2(1024, 1024)");
- __reads("A ~> Matrix2(1024, 1024)");
- __reads("B ~> Matrix2(1024, 1024)");
- __ghost(tile_divides,
-     "tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
-     "for j in 0..1024 -> &C[i][j] ~> Cell");
  float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
  #pragma omp parallel for
  for (int bj = 0; bj < 32; bj++) {
-  __strict();
-  __sreads("B ~> Matrix2(1024, 1024)");
-  __xwrites(
-    "for _v9 in 0..256 -> for _v10 in 0..4 -> for _v11 in 0..32 -> "
-    "&pB[bj][_v9][_v10][_v11] ~> Cell");
   for (int bk = 0; bk < 256; bk++) {
-   __strict();
-   __sreads("B ~> Matrix2(1024, 1024)");
-   __xwrites(
-     "for _v7 in 0..4 -> for _v8 in 0..32 -> &pB[bj][bk][_v7][_v8] ~> "
-     "Cell");
    for (int k = 0; k < 4; k++) {
-    __strict();
-    __sreads("B ~> Matrix2(1024, 1024)");
-    __xwrites("for _v6 in 0..32 -> &pB[bj][bk][k][_v6] ~> Cell");
     for (int j = 0; j < 32; j++) {
-     __strict();
-     __sreads("B ~> Matrix2(1024, 1024)");
-     __xwrites("&pB[bj][bk][k][j] ~> Cell");
-     __ghost(tiled_index_in_range,
-         "tile_index := bj, index := j, tile_count := 32, tile_size "
-         ":= 32, size := 1024");
-     __ghost(tiled_index_in_range,
-         "tile_index := bk, index := k, tile_count := 256, tile_size "
-         ":= 4, size := 1024");
-     const __ghost_fn __ghost_pair_3 = __ghost_begin(
-       matrix2_ro_focus, "M := B, i := bk * 4 + k, j := bj * 32 + j");
-     pB[bj][bk][k][j] = B[bk * 4 + k][bj * 32 + j];
-     __ghost_end(__ghost_pair_3);
     }
    }
   }
  }
  #pragma omp parallel for
  for (int bi = 0; bi < 32; bi++) {
-  __strict();
-  __sreads("pB ~> Matrix4(32, 256, 4, 32)");
-  __sreads("A ~> Matrix2(1024, 1024)");
-  __xmodifies(
-    "for i in 0..32 -> for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
-  for (int i = 0; i < 32; i++) {
-   __strict();
-   __xconsumes("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
-   __xproduces(
-     "for bi1 in 0..32 -> for i2 in 0..32 -> &C[bi * 32 + i][bi1 * 32 + "
-     "i2] ~> Cell");
-   __ghost(tile_divides,
-       "tile_count := 32, tile_size := 32, size := 1024, items := fun j "
-       "-> &C[bi * 32 + i][j] ~> Cell");
-  }
-  __ghost(swap_groups,
-      "outer_range := 0..32, inner_range := 0..32, items := fun i, bj -> "
-      "for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
   for (int bj = 0; bj < 32; bj++) {
-   __strict();
-   __sreads("A ~> Matrix2(1024, 1024)");
-   __xwrites(
-     "for i in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> "
-     "Cell");
-   __xreads(
-     "for bk in 0..256 -> for k in 0..4 -> for j in 0..32 -> "
-     "&pB[bj][bk][k][j] ~> Cell");
    float* const sum = (float* const)malloc(sizeof(float[32][32]));
    for (int i = 0; i < 32; i++) {
-    __strict();
-    __xwrites("for _v5 in 0..32 -> &sum[i][_v5] ~> Cell");
     for (int j = 0; j < 32; j++) {
-     __strict();
-     __xwrites("&sum[i][j] ~> Cell");
-     sum[i][j] = 0.f;
     }
    }
    for (int bk = 0; bk < 256; bk++) {
-    __strict();
-    __smodifies("sum ~> Matrix2(32, 32)");
-    __sreads("A ~> Matrix2(1024, 1024)");
-    __xreads(
-      "for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
     for (int i = 0; i < 32; i++) {
-     __strict();
-     __sreads(
-       "for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
-     __sreads("A ~> Matrix2(1024, 1024)");
-     __xmodifies("for j in 0..32 -> &sum[i][j] ~> Cell");
-     __ghost(tiled_index_in_range,
-         "tile_index := bi, index := i, tile_count := 32, tile_size "
-         ":= 32, size := 1024");
      float* const s = new float(32)();
-     const __ghost_fn __ghost_pair_6 =
-       __ghost_begin(mindex2_contiguous_ro, "M := sum");
-     MMEMCPY(s, 0, sum, i * 32 + 0, 32, sizeof(float));
-     __ghost_end(__ghost_pair_6);
-     const __ghost_fn __ghost_pair_7 =
-       __ghost_begin(group_ro_focus,
-              "i := 0, items := fun k -> for j in 0..32 -> "
-              "&pB[bj][bk][k][j] ~> Cell");
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
-      __strict();
-      __sreads("A ~> Matrix2(1024, 1024)");
-      __xmodifies("&s[j] ~> Cell");
-      __xreads("&pB[bj][bk][0][j] ~> Cell");
-      __ghost(tiled_index_in_range,
-          "tile_index := bj, index := j, tile_count := 32, tile_size "
-          ":= 32, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 0, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 0, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      const __ghost_fn __ghost_pair_2 = __ghost_begin(
-        matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 0");
-      s[j] += A[bi * 32 + i][bk * 4 + 0] * pB[bj][bk][0][j];
-      __ghost_end(__ghost_pair_2);
-     }
-     __ghost_end(__ghost_pair_7);
-     const __ghost_fn __ghost_pair_712 =
-       __ghost_begin(group_ro_focus,
-              "i := 1, items := fun k -> for j in 0..32 -> "
-              "&pB[bj][bk][k][j] ~> Cell");
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
-      __strict();
-      __sreads("A ~> Matrix2(1024, 1024)");
-      __xmodifies("&s[j] ~> Cell");
-      __xreads("&pB[bj][bk][1][j] ~> Cell");
-      __ghost(tiled_index_in_range,
-          "tile_index := bj, index := j, tile_count := 32, tile_size "
-          ":= 32, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 1, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 1, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      const __ghost_fn __ghost_pair_2 = __ghost_begin(
-        matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 1");
-      s[j] += A[bi * 32 + i][bk * 4 + 1] * pB[bj][bk][1][j];
-      __ghost_end(__ghost_pair_2);
-     }
-     __ghost_end(__ghost_pair_712);
-     const __ghost_fn __ghost_pair_713 =
-       __ghost_begin(group_ro_focus,
-              "i := 2, items := fun k -> for j in 0..32 -> "
-              "&pB[bj][bk][k][j] ~> Cell");
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
-      __strict();
-      __sreads("A ~> Matrix2(1024, 1024)");
-      __xmodifies("&s[j] ~> Cell");
-      __xreads("&pB[bj][bk][2][j] ~> Cell");
-      __ghost(tiled_index_in_range,
-          "tile_index := bj, index := j, tile_count := 32, tile_size "
-          ":= 32, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 2, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 2, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      const __ghost_fn __ghost_pair_2 = __ghost_begin(
-        matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 2");
-      s[j] += A[bi * 32 + i][bk * 4 + 2] * pB[bj][bk][2][j];
-      __ghost_end(__ghost_pair_2);
-     }
-     __ghost_end(__ghost_pair_713);
-     const __ghost_fn __ghost_pair_714 =
-       __ghost_begin(group_ro_focus,
-              "i := 3, items := fun k -> for j in 0..32 -> "
-              "&pB[bj][bk][k][j] ~> Cell");
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
-      __strict();
-      __sreads("A ~> Matrix2(1024, 1024)");
-      __xmodifies("&s[j] ~> Cell");
-      __xreads("&pB[bj][bk][3][j] ~> Cell");
-      __ghost(tiled_index_in_range,
-          "tile_index := bj, index := j, tile_count := 32, tile_size "
-          ":= 32, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 3, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      __ghost(tiled_index_in_range,
-          "tile_index := bk, index := 3, tile_count := 256, "
-          "tile_size := 4, size := 1024");
-      const __ghost_fn __ghost_pair_2 = __ghost_begin(
-        matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 3");
-      s[j] += A[bi * 32 + i][bk * 4 + 3] * pB[bj][bk][3][j];
-      __ghost_end(__ghost_pair_2);
-     }
-     __ghost_end(__ghost_pair_714);
-     __ghost(mindex2_contiguous_uninit, "M := sum");
-     MMEMCPY(sum, i * 32 + 0, s, 0, 32, sizeof(float));
-     __ghost(mindex2_contiguous_rev, "M := sum");
     }
    }
    for (int i = 0; i < 32; i++) {
-    __strict();
-    __xwrites("for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
-    __xreads("for j in 0..32 -> &sum[i][j] ~> Cell");
     for (int j = 0; j < 32; j++) {
-     __strict();
-     __xwrites("&C[bi * 32 + i][bj * 32 + j] ~> Cell");
-     __xreads("&sum[i][j] ~> Cell");
-     C[bi * 32 + i][bj * 32 + j] = sum[i][j];
     }
    }
    free(sum);
   }
-  __ghost(swap_groups,
-      "outer_range := 0..32, inner_range := 0..32, items := fun bj, i -> "
-      "for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
-  for (int i = 0; i < 32; i++) {
-   __strict();
-   __xconsumes(
-     "for bj in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] "
-     "~> Cell");
-   __xproduces("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
-   __ghost(untile_divides,
-       "tile_count := 32, tile_size := 32, size := 1024, items := fun j "
-       "-> &C[bi * 32 + i][j] ~> Cell");
-  }
  }
  free(pB);
- __ghost(untile_divides,
-     "tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
-     "for j in 0..1024 -> &C[i][j] ~> Cell");
 }

 #include <optitrust.h>
 #include "omp.h"
 // NOTE: using pretty matrix notation
 void mm1024(float* C, float* A, float* B) {
  float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
  #pragma omp parallel for
  for (int bj = 0; bj < 32; bj++) {
   for (int bk = 0; bk < 256; bk++) {
    for (int k = 0; k < 4; k++) {
     for (int j = 0; j < 32; j++) {
+     pB[32768 * bj + 128 * bk + 32 * k + j] =
+       B[1024 * (4 * bk + k) + 32 * bj + j];
     }
    }
   }
  }
  #pragma omp parallel for
  for (int bi = 0; bi < 32; bi++) {
   for (int bj = 0; bj < 32; bj++) {
    float* const sum = (float* const)malloc(sizeof(float[32][32]));
    for (int i = 0; i < 32; i++) {
     for (int j = 0; j < 32; j++) {
+     sum[32 * i + j] = 0.f;
     }
    }
    for (int bk = 0; bk < 256; bk++) {
     for (int i = 0; i < 32; i++) {
      float* const s = new float(32)();
+     MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float));
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
+      s[j] += A[1024 * (32 * bi + i) + 4 * bk] *
+          pB[32768 * bj + 128 * bk + j];
+     }
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
+      s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] *
+          pB[32 + 32768 * bj + 128 * bk + j];
+     }
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
+      s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] *
+          pB[64 + 32768 * bj + 128 * bk + j];
+     }
      #pragma omp simd
      for (int j = 0; j < 32; j++) {
+      s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] *
+          pB[96 + 32768 * bj + 128 * bk + j];
+     }
+     MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float));
     }
    }
    for (int i = 0; i < 32; i++) {
     for (int j = 0; j < 32; j++) {
+     C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j];
     }
    }
    free(sum);
   }
  }
  free(pB);
 }

​x
 
#include <optitrust.h>
​
#include "omp.h"
// NOTE: using pretty matrix notation
​
/*
 * mm1024 - write the product A * B into C.
 *
 * All three arguments are indexed as flat, row-major 1024 x 1024 float
 * arrays (element [r][c] lives at index 1024 * r + c), so each must point
 * to at least 1024 * 1024 floats. A and B are only read; every element of
 * C is overwritten.
 *
 * The computation is blocked into 32 x 32 tiles of C, with the reduction
 * dimension split into 256 chunks of 4. B is first repacked into a scratch
 * buffer so that the 4 x 32 slice needed by one (bj, bk) step is contiguous.
 *
 * The process is aborted if a scratch allocation fails.
 */
void mm1024(float* C, float* A, float* B) {
  // Repacked copy of B, laid out as [bj][bk][k][j] = [32][256][4][32].
  float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
  if (pB == NULL) {
    abort();  // no error channel in a void function; fail loudly instead of writing through NULL
  }
  #pragma omp parallel for
  for (int bj = 0; bj < 32; bj++) {
    for (int bk = 0; bk < 256; bk++) {
      for (int k = 0; k < 4; k++) {
        for (int j = 0; j < 32; j++) {
          // pB[bj][bk][k][j] = B[4 * bk + k][32 * bj + j]
          pB[32768 * bj + 128 * bk + 32 * k + j] =
              B[1024 * (4 * bk + k) + 32 * bj + j];
        }
      }
    }
  }
  // Each iteration of the outer loop owns a band of 32 rows of C.
  #pragma omp parallel for
  for (int bi = 0; bi < 32; bi++) {
    for (int bj = 0; bj < 32; bj++) {
      // Accumulator for the 32 x 32 tile of C at tile coordinates (bi, bj).
      float* const sum = (float* const)malloc(sizeof(float[32][32]));
      if (sum == NULL) {
        abort();
      }
      for (int i = 0; i < 32; i++) {
        for (int j = 0; j < 32; j++) {
          sum[32 * i + j] = 0.f;
        }
      }
      for (int bk = 0; bk < 256; bk++) {
        for (int i = 0; i < 32; i++) {
          // Local working copy of row i of the accumulator tile. An automatic
          // array replaces the former `new float(32)()`, which was not a valid
          // array allocation and was never released.
          float s[32];
          // NOTE(review): MMEMCPY comes from optitrust.h; the argument order
          // looks like (dst, dst_offset, src, src_offset, count, elem_size),
          // i.e. this loads sum[32 * i .. 32 * i + 31] into s - confirm.
          MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float));
          // The four loops below are the k = 0..3 steps of the reduction,
          // unrolled: s[j] += A[32 * bi + i][4 * bk + k] * pB[bj][bk][k][j].
          #pragma omp simd
          for (int j = 0; j < 32; j++) {
            s[j] += A[1024 * (32 * bi + i) + 4 * bk] *
                    pB[32768 * bj + 128 * bk + j];
          }
          #pragma omp simd
          for (int j = 0; j < 32; j++) {
            s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] *
                    pB[32 + 32768 * bj + 128 * bk + j];
          }
          #pragma omp simd
          for (int j = 0; j < 32; j++) {
            s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] *
                    pB[64 + 32768 * bj + 128 * bk + j];
          }
          #pragma omp simd
          for (int j = 0; j < 32; j++) {
            s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] *
                    pB[96 + 32768 * bj + 128 * bk + j];
          }
          // Presumably stores the updated row back into sum (same helper,
          // source and destination swapped).
          MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float));
        }
      }
      // Publish the finished tile: C[32 * bi + i][32 * bj + j] = sum[i][j].
      for (int i = 0; i < 32; i++) {
        for (int j = 0; j < 32; j++) {
          C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j];
        }
      }
      free(sum);
    }
  }
  free(pB);
}
​