Trace for matmul_check

    
tmp_before.cpp → tmp_after.cpp RENAMED
@@ -1,244 +1,64 @@
1
  #include <optitrust.h>
2
 
3
  #include "omp.h"
4
  // NOTE: using pretty matrix notation
5
 
6
  void mm1024(float* C, float* A, float* B) {
7
- __modifies("C ~> Matrix2(1024, 1024)");
8
- __reads("A ~> Matrix2(1024, 1024)");
9
- __reads("B ~> Matrix2(1024, 1024)");
10
- __ghost(tile_divides,
11
- "tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
12
- "for j in 0..1024 -> &C[i][j] ~> Cell");
13
  float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
14
  #pragma omp parallel for
15
  for (int bj = 0; bj < 32; bj++) {
16
- __strict();
17
- __sreads("B ~> Matrix2(1024, 1024)");
18
- __xwrites(
19
- "for _v9 in 0..256 -> for _v10 in 0..4 -> for _v11 in 0..32 -> "
20
- "&pB[bj][_v9][_v10][_v11] ~> Cell");
21
  for (int bk = 0; bk < 256; bk++) {
22
- __strict();
23
- __sreads("B ~> Matrix2(1024, 1024)");
24
- __xwrites(
25
- "for _v7 in 0..4 -> for _v8 in 0..32 -> &pB[bj][bk][_v7][_v8] ~> "
26
- "Cell");
27
  for (int k = 0; k < 4; k++) {
28
- __strict();
29
- __sreads("B ~> Matrix2(1024, 1024)");
30
- __xwrites("for _v6 in 0..32 -> &pB[bj][bk][k][_v6] ~> Cell");
31
  for (int j = 0; j < 32; j++) {
32
- __strict();
33
- __sreads("B ~> Matrix2(1024, 1024)");
34
- __xwrites("&pB[bj][bk][k][j] ~> Cell");
35
- __ghost(tiled_index_in_range,
36
- "tile_index := bj, index := j, tile_count := 32, tile_size "
37
- ":= 32, size := 1024");
38
- __ghost(tiled_index_in_range,
39
- "tile_index := bk, index := k, tile_count := 256, tile_size "
40
- ":= 4, size := 1024");
41
- const __ghost_fn __ghost_pair_3 = __ghost_begin(
42
- matrix2_ro_focus, "M := B, i := bk * 4 + k, j := bj * 32 + j");
43
- pB[bj][bk][k][j] = B[bk * 4 + k][bj * 32 + j];
44
- __ghost_end(__ghost_pair_3);
45
  }
46
  }
47
  }
48
  }
49
  #pragma omp parallel for
50
  for (int bi = 0; bi < 32; bi++) {
51
- __strict();
52
- __sreads("pB ~> Matrix4(32, 256, 4, 32)");
53
- __sreads("A ~> Matrix2(1024, 1024)");
54
- __xmodifies(
55
- "for i in 0..32 -> for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
56
- for (int i = 0; i < 32; i++) {
57
- __strict();
58
- __xconsumes("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
59
- __xproduces(
60
- "for bi1 in 0..32 -> for i2 in 0..32 -> &C[bi * 32 + i][bi1 * 32 + "
61
- "i2] ~> Cell");
62
- __ghost(tile_divides,
63
- "tile_count := 32, tile_size := 32, size := 1024, items := fun j "
64
- "-> &C[bi * 32 + i][j] ~> Cell");
65
- }
66
- __ghost(swap_groups,
67
- "outer_range := 0..32, inner_range := 0..32, items := fun i, bj -> "
68
- "for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
69
  for (int bj = 0; bj < 32; bj++) {
70
- __strict();
71
- __sreads("A ~> Matrix2(1024, 1024)");
72
- __xwrites(
73
- "for i in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> "
74
- "Cell");
75
- __xreads(
76
- "for bk in 0..256 -> for k in 0..4 -> for j in 0..32 -> "
77
- "&pB[bj][bk][k][j] ~> Cell");
78
  float* const sum = (float* const)malloc(sizeof(float[32][32]));
79
  for (int i = 0; i < 32; i++) {
80
- __strict();
81
- __xwrites("for _v5 in 0..32 -> &sum[i][_v5] ~> Cell");
82
  for (int j = 0; j < 32; j++) {
83
- __strict();
84
- __xwrites("&sum[i][j] ~> Cell");
85
- sum[i][j] = 0.f;
86
  }
87
  }
88
  for (int bk = 0; bk < 256; bk++) {
89
- __strict();
90
- __smodifies("sum ~> Matrix2(32, 32)");
91
- __sreads("A ~> Matrix2(1024, 1024)");
92
- __xreads(
93
- "for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
94
  for (int i = 0; i < 32; i++) {
95
- __strict();
96
- __sreads(
97
- "for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
98
- __sreads("A ~> Matrix2(1024, 1024)");
99
- __xmodifies("for j in 0..32 -> &sum[i][j] ~> Cell");
100
- __ghost(tiled_index_in_range,
101
- "tile_index := bi, index := i, tile_count := 32, tile_size "
102
- ":= 32, size := 1024");
103
  float* const s = new float(32)();
104
- const __ghost_fn __ghost_pair_6 =
105
- __ghost_begin(mindex2_contiguous_ro, "M := sum");
106
- MMEMCPY(s, 0, sum, i * 32 + 0, 32, sizeof(float));
107
- __ghost_end(__ghost_pair_6);
108
- const __ghost_fn __ghost_pair_7 =
109
- __ghost_begin(group_ro_focus,
110
- "i := 0, items := fun k -> for j in 0..32 -> "
111
- "&pB[bj][bk][k][j] ~> Cell");
112
  #pragma omp simd
113
  for (int j = 0; j < 32; j++) {
114
- __strict();
115
- __sreads("A ~> Matrix2(1024, 1024)");
116
- __xmodifies("&s[j] ~> Cell");
117
- __xreads("&pB[bj][bk][0][j] ~> Cell");
118
- __ghost(tiled_index_in_range,
119
- "tile_index := bj, index := j, tile_count := 32, tile_size "
120
- ":= 32, size := 1024");
121
- __ghost(tiled_index_in_range,
122
- "tile_index := bk, index := 0, tile_count := 256, "
123
- "tile_size := 4, size := 1024");
124
- __ghost(tiled_index_in_range,
125
- "tile_index := bk, index := 0, tile_count := 256, "
126
- "tile_size := 4, size := 1024");
127
- const __ghost_fn __ghost_pair_2 = __ghost_begin(
128
- matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 0");
129
- s[j] += A[bi * 32 + i][bk * 4 + 0] * pB[bj][bk][0][j];
130
- __ghost_end(__ghost_pair_2);
131
- }
132
- __ghost_end(__ghost_pair_7);
133
- const __ghost_fn __ghost_pair_712 =
134
- __ghost_begin(group_ro_focus,
135
- "i := 1, items := fun k -> for j in 0..32 -> "
136
- "&pB[bj][bk][k][j] ~> Cell");
137
  #pragma omp simd
138
  for (int j = 0; j < 32; j++) {
139
- __strict();
140
- __sreads("A ~> Matrix2(1024, 1024)");
141
- __xmodifies("&s[j] ~> Cell");
142
- __xreads("&pB[bj][bk][1][j] ~> Cell");
143
- __ghost(tiled_index_in_range,
144
- "tile_index := bj, index := j, tile_count := 32, tile_size "
145
- ":= 32, size := 1024");
146
- __ghost(tiled_index_in_range,
147
- "tile_index := bk, index := 1, tile_count := 256, "
148
- "tile_size := 4, size := 1024");
149
- __ghost(tiled_index_in_range,
150
- "tile_index := bk, index := 1, tile_count := 256, "
151
- "tile_size := 4, size := 1024");
152
- const __ghost_fn __ghost_pair_2 = __ghost_begin(
153
- matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 1");
154
- s[j] += A[bi * 32 + i][bk * 4 + 1] * pB[bj][bk][1][j];
155
- __ghost_end(__ghost_pair_2);
156
- }
157
- __ghost_end(__ghost_pair_712);
158
- const __ghost_fn __ghost_pair_713 =
159
- __ghost_begin(group_ro_focus,
160
- "i := 2, items := fun k -> for j in 0..32 -> "
161
- "&pB[bj][bk][k][j] ~> Cell");
162
  #pragma omp simd
163
  for (int j = 0; j < 32; j++) {
164
- __strict();
165
- __sreads("A ~> Matrix2(1024, 1024)");
166
- __xmodifies("&s[j] ~> Cell");
167
- __xreads("&pB[bj][bk][2][j] ~> Cell");
168
- __ghost(tiled_index_in_range,
169
- "tile_index := bj, index := j, tile_count := 32, tile_size "
170
- ":= 32, size := 1024");
171
- __ghost(tiled_index_in_range,
172
- "tile_index := bk, index := 2, tile_count := 256, "
173
- "tile_size := 4, size := 1024");
174
- __ghost(tiled_index_in_range,
175
- "tile_index := bk, index := 2, tile_count := 256, "
176
- "tile_size := 4, size := 1024");
177
- const __ghost_fn __ghost_pair_2 = __ghost_begin(
178
- matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 2");
179
- s[j] += A[bi * 32 + i][bk * 4 + 2] * pB[bj][bk][2][j];
180
- __ghost_end(__ghost_pair_2);
181
- }
182
- __ghost_end(__ghost_pair_713);
183
- const __ghost_fn __ghost_pair_714 =
184
- __ghost_begin(group_ro_focus,
185
- "i := 3, items := fun k -> for j in 0..32 -> "
186
- "&pB[bj][bk][k][j] ~> Cell");
187
  #pragma omp simd
188
  for (int j = 0; j < 32; j++) {
189
- __strict();
190
- __sreads("A ~> Matrix2(1024, 1024)");
191
- __xmodifies("&s[j] ~> Cell");
192
- __xreads("&pB[bj][bk][3][j] ~> Cell");
193
- __ghost(tiled_index_in_range,
194
- "tile_index := bj, index := j, tile_count := 32, tile_size "
195
- ":= 32, size := 1024");
196
- __ghost(tiled_index_in_range,
197
- "tile_index := bk, index := 3, tile_count := 256, "
198
- "tile_size := 4, size := 1024");
199
- __ghost(tiled_index_in_range,
200
- "tile_index := bk, index := 3, tile_count := 256, "
201
- "tile_size := 4, size := 1024");
202
- const __ghost_fn __ghost_pair_2 = __ghost_begin(
203
- matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 3");
204
- s[j] += A[bi * 32 + i][bk * 4 + 3] * pB[bj][bk][3][j];
205
- __ghost_end(__ghost_pair_2);
206
- }
207
- __ghost_end(__ghost_pair_714);
208
- __ghost(mindex2_contiguous_uninit, "M := sum");
209
- MMEMCPY(sum, i * 32 + 0, s, 0, 32, sizeof(float));
210
- __ghost(mindex2_contiguous_rev, "M := sum");
211
  }
212
  }
213
  for (int i = 0; i < 32; i++) {
214
- __strict();
215
- __xwrites("for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
216
- __xreads("for j in 0..32 -> &sum[i][j] ~> Cell");
217
  for (int j = 0; j < 32; j++) {
218
- __strict();
219
- __xwrites("&C[bi * 32 + i][bj * 32 + j] ~> Cell");
220
- __xreads("&sum[i][j] ~> Cell");
221
- C[bi * 32 + i][bj * 32 + j] = sum[i][j];
222
  }
223
  }
224
  free(sum);
225
  }
226
- __ghost(swap_groups,
227
- "outer_range := 0..32, inner_range := 0..32, items := fun bj, i -> "
228
- "for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
229
- for (int i = 0; i < 32; i++) {
230
- __strict();
231
- __xconsumes(
232
- "for bj in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] "
233
- "~> Cell");
234
- __xproduces("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
235
- __ghost(untile_divides,
236
- "tile_count := 32, tile_size := 32, size := 1024, items := fun j "
237
- "-> &C[bi * 32 + i][j] ~> Cell");
238
- }
239
  }
240
  free(pB);
241
- __ghost(untile_divides,
242
- "tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
243
- "for j in 0..1024 -> &C[i][j] ~> Cell");
244
  }
1
  #include <optitrust.h>
2
 
3
  #include "omp.h"
4
  // NOTE: using pretty matrix notation
5
 
6
  void mm1024(float* C, float* A, float* B) {
 
 
 
 
 
 
7
  float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
8
  #pragma omp parallel for
9
  for (int bj = 0; bj < 32; bj++) {
 
 
 
 
 
10
  for (int bk = 0; bk < 256; bk++) {
 
 
 
 
 
11
  for (int k = 0; k < 4; k++) {
 
 
 
12
  for (int j = 0; j < 32; j++) {
 
 
 
 
 
13
+ pB[32768 * bj + 128 * bk + 32 * k + j] =
 
 
 
 
 
14
+ B[1024 * (4 * bk + k) + 32 * bj + j];
 
15
  }
16
  }
17
  }
18
  }
19
  #pragma omp parallel for
20
  for (int bi = 0; bi < 32; bi++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  for (int bj = 0; bj < 32; bj++) {
 
 
 
 
 
 
 
 
22
  float* const sum = (float* const)malloc(sizeof(float[32][32]));
23
  for (int i = 0; i < 32; i++) {
 
 
24
  for (int j = 0; j < 32; j++) {
 
 
25
+ sum[32 * i + j] = 0.f;
26
  }
27
  }
28
  for (int bk = 0; bk < 256; bk++) {
 
 
 
 
 
29
  for (int i = 0; i < 32; i++) {
 
 
 
 
 
 
 
 
30
  float* const s = new float(32)();
 
 
31
+ MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float));
 
 
 
 
 
32
  #pragma omp simd
33
  for (int j = 0; j < 32; j++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
+ s[j] += A[1024 * (32 * bi + i) + 4 * bk] *
35
+ pB[32768 * bj + 128 * bk + j];
36
+ }
 
 
 
 
 
37
  #pragma omp simd
38
  for (int j = 0; j < 32; j++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
+ s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] *
40
+ pB[32 + 32768 * bj + 128 * bk + j];
41
+ }
 
 
 
 
 
42
  #pragma omp simd
43
  for (int j = 0; j < 32; j++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
+ s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] *
45
+ pB[64 + 32768 * bj + 128 * bk + j];
46
+ }
 
 
 
 
 
47
  #pragma omp simd
48
  for (int j = 0; j < 32; j++) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
+ s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] *
50
+ pB[96 + 32768 * bj + 128 * bk + j];
51
+ }
 
 
52
+ MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float));
 
53
  }
54
  }
55
  for (int i = 0; i < 32; i++) {
 
 
 
56
  for (int j = 0; j < 32; j++) {
 
 
 
57
+ C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j];
58
  }
59
  }
60
  free(sum);
61
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
  free(pB);
 
 
 
64
  }