@@ -1,244 +1,64 @@
|
|
1 |
|
2 |
|
3 |
|
4 |
// NOTE: using pretty matrix notation
|
5 |
|
6 |
void mm1024(float* C, float* A, float* B) {
|
7 |
-
__modifies("C ~> Matrix2(1024, 1024)");
|
8 |
-
__reads("A ~> Matrix2(1024, 1024)");
|
9 |
-
__reads("B ~> Matrix2(1024, 1024)");
|
10 |
-
__ghost(tile_divides,
|
11 |
-
"tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
|
12 |
-
"for j in 0..1024 -> &C[i][j] ~> Cell");
|
13 |
float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
|
14 |
|
15 |
for (int bj = 0; bj < 32; bj++) {
|
16 |
-
__strict();
|
17 |
-
__sreads("B ~> Matrix2(1024, 1024)");
|
18 |
-
__xwrites(
|
19 |
-
"for _v9 in 0..256 -> for _v10 in 0..4 -> for _v11 in 0..32 -> "
|
20 |
-
"&pB[bj][_v9][_v10][_v11] ~> Cell");
|
21 |
for (int bk = 0; bk < 256; bk++) {
|
22 |
-
__strict();
|
23 |
-
__sreads("B ~> Matrix2(1024, 1024)");
|
24 |
-
__xwrites(
|
25 |
-
"for _v7 in 0..4 -> for _v8 in 0..32 -> &pB[bj][bk][_v7][_v8] ~> "
|
26 |
-
"Cell");
|
27 |
for (int k = 0; k < 4; k++) {
|
28 |
-
__strict();
|
29 |
-
__sreads("B ~> Matrix2(1024, 1024)");
|
30 |
-
__xwrites("for _v6 in 0..32 -> &pB[bj][bk][k][_v6] ~> Cell");
|
31 |
for (int j = 0; j < 32; j++) {
|
32 |
-
__strict();
|
33 |
-
__sreads("B ~> Matrix2(1024, 1024)");
|
34 |
-
__xwrites("&pB[bj][bk][k][j] ~> Cell");
|
35 |
-
__ghost(tiled_index_in_range,
|
36 |
-
"tile_index := bj, index := j, tile_count := 32, tile_size "
|
37 |
-
|
38 |
-
__ghost(tiled_index_in_range,
|
39 |
-
"tile_index := bk, index := k, tile_count := 256, tile_size "
|
40 |
-
":= 4, size := 1024");
|
41 |
-
const __ghost_fn __ghost_pair_3 = __ghost_begin(
|
42 |
-
matrix2_ro_focus, "M := B, i := bk * 4 + k, j := bj * 32 + j");
|
43 |
-
|
44 |
-
__ghost_end(__ghost_pair_3);
|
45 |
}
|
46 |
}
|
47 |
}
|
48 |
}
|
49 |
|
50 |
for (int bi = 0; bi < 32; bi++) {
|
51 |
-
__strict();
|
52 |
-
__sreads("pB ~> Matrix4(32, 256, 4, 32)");
|
53 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
54 |
-
__xmodifies(
|
55 |
-
"for i in 0..32 -> for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
|
56 |
-
for (int i = 0; i < 32; i++) {
|
57 |
-
__strict();
|
58 |
-
__xconsumes("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
|
59 |
-
__xproduces(
|
60 |
-
"for bi1 in 0..32 -> for i2 in 0..32 -> &C[bi * 32 + i][bi1 * 32 + "
|
61 |
-
"i2] ~> Cell");
|
62 |
-
__ghost(tile_divides,
|
63 |
-
"tile_count := 32, tile_size := 32, size := 1024, items := fun j "
|
64 |
-
"-> &C[bi * 32 + i][j] ~> Cell");
|
65 |
-
}
|
66 |
-
__ghost(swap_groups,
|
67 |
-
"outer_range := 0..32, inner_range := 0..32, items := fun i, bj -> "
|
68 |
-
"for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
|
69 |
for (int bj = 0; bj < 32; bj++) {
|
70 |
-
__strict();
|
71 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
72 |
-
__xwrites(
|
73 |
-
"for i in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> "
|
74 |
-
"Cell");
|
75 |
-
__xreads(
|
76 |
-
"for bk in 0..256 -> for k in 0..4 -> for j in 0..32 -> "
|
77 |
-
"&pB[bj][bk][k][j] ~> Cell");
|
78 |
float* const sum = (float* const)malloc(sizeof(float[32][32]));
|
79 |
for (int i = 0; i < 32; i++) {
|
80 |
-
__strict();
|
81 |
-
__xwrites("for _v5 in 0..32 -> &sum[i][_v5] ~> Cell");
|
82 |
for (int j = 0; j < 32; j++) {
|
83 |
-
__strict();
|
84 |
-
__xwrites("&sum[i][j] ~> Cell");
|
85 |
-
sum[i
|
86 |
}
|
87 |
}
|
88 |
for (int bk = 0; bk < 256; bk++) {
|
89 |
-
__strict();
|
90 |
-
__smodifies("sum ~> Matrix2(32, 32)");
|
91 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
92 |
-
__xreads(
|
93 |
-
"for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
|
94 |
for (int i = 0; i < 32; i++) {
|
95 |
-
__strict();
|
96 |
-
__sreads(
|
97 |
-
"for k in 0..4 -> for j in 0..32 -> &pB[bj][bk][k][j] ~> Cell");
|
98 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
99 |
-
__xmodifies("for j in 0..32 -> &sum[i][j] ~> Cell");
|
100 |
-
__ghost(tiled_index_in_range,
|
101 |
-
"tile_index := bi, index := i, tile_count := 32, tile_size "
|
102 |
-
":= 32, size := 1024");
|
103 |
float* const s = new float(32)();
|
104 |
-
const __ghost_fn __ghost_pair_6 =
|
105 |
-
__ghost_begin(mindex2_contiguous_ro, "M := sum");
|
106 |
-
MMEMCPY(s, 0, sum,
|
107 |
-
__ghost_end(__ghost_pair_6);
|
108 |
-
const __ghost_fn __ghost_pair_7 =
|
109 |
-
__ghost_begin(group_ro_focus,
|
110 |
-
"i := 0, items := fun k -> for j in 0..32 -> "
|
111 |
-
"&pB[bj][bk][k][j] ~> Cell");
|
112 |
|
113 |
for (int j = 0; j < 32; j++) {
|
114 |
-
__strict();
|
115 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
116 |
-
__xmodifies("&s[j] ~> Cell");
|
117 |
-
__xreads("&pB[bj][bk][0][j] ~> Cell");
|
118 |
-
__ghost(tiled_index_in_range,
|
119 |
-
"tile_index := bj, index := j, tile_count := 32, tile_size "
|
120 |
-
":= 32, size := 1024");
|
121 |
-
__ghost(tiled_index_in_range,
|
122 |
-
"tile_index := bk, index := 0, tile_count := 256, "
|
123 |
-
"tile_size := 4, size := 1024");
|
124 |
-
__ghost(tiled_index_in_range,
|
125 |
-
"tile_index := bk, index := 0, tile_count := 256, "
|
126 |
-
"tile_size := 4, size := 1024");
|
127 |
-
const __ghost_fn __ghost_pair_2 = __ghost_begin(
|
128 |
-
matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 0");
|
129 |
-
s[j] += A[
|
130 |
-
|
131 |
-
}
|
132 |
-
__ghost_end(__ghost_pair_7);
|
133 |
-
const __ghost_fn __ghost_pair_712 =
|
134 |
-
__ghost_begin(group_ro_focus,
|
135 |
-
"i := 1, items := fun k -> for j in 0..32 -> "
|
136 |
-
"&pB[bj][bk][k][j] ~> Cell");
|
137 |
|
138 |
for (int j = 0; j < 32; j++) {
|
139 |
-
__strict();
|
140 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
141 |
-
__xmodifies("&s[j] ~> Cell");
|
142 |
-
__xreads("&pB[bj][bk][1][j] ~> Cell");
|
143 |
-
__ghost(tiled_index_in_range,
|
144 |
-
"tile_index := bj, index := j, tile_count := 32, tile_size "
|
145 |
-
":= 32, size := 1024");
|
146 |
-
__ghost(tiled_index_in_range,
|
147 |
-
"tile_index := bk, index := 1, tile_count := 256, "
|
148 |
-
"tile_size := 4, size := 1024");
|
149 |
-
__ghost(tiled_index_in_range,
|
150 |
-
"tile_index := bk, index := 1, tile_count := 256, "
|
151 |
-
"tile_size := 4, size := 1024");
|
152 |
-
const __ghost_fn __ghost_pair_2 = __ghost_begin(
|
153 |
-
matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 1");
|
154 |
-
s[j] += A[
|
155 |
-
|
156 |
-
}
|
157 |
-
__ghost_end(__ghost_pair_712);
|
158 |
-
const __ghost_fn __ghost_pair_713 =
|
159 |
-
__ghost_begin(group_ro_focus,
|
160 |
-
"i := 2, items := fun k -> for j in 0..32 -> "
|
161 |
-
"&pB[bj][bk][k][j] ~> Cell");
|
162 |
|
163 |
for (int j = 0; j < 32; j++) {
|
164 |
-
__strict();
|
165 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
166 |
-
__xmodifies("&s[j] ~> Cell");
|
167 |
-
__xreads("&pB[bj][bk][2][j] ~> Cell");
|
168 |
-
__ghost(tiled_index_in_range,
|
169 |
-
"tile_index := bj, index := j, tile_count := 32, tile_size "
|
170 |
-
":= 32, size := 1024");
|
171 |
-
__ghost(tiled_index_in_range,
|
172 |
-
"tile_index := bk, index := 2, tile_count := 256, "
|
173 |
-
"tile_size := 4, size := 1024");
|
174 |
-
__ghost(tiled_index_in_range,
|
175 |
-
"tile_index := bk, index := 2, tile_count := 256, "
|
176 |
-
"tile_size := 4, size := 1024");
|
177 |
-
const __ghost_fn __ghost_pair_2 = __ghost_begin(
|
178 |
-
matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 2");
|
179 |
-
s[j] += A[
|
180 |
-
|
181 |
-
}
|
182 |
-
__ghost_end(__ghost_pair_713);
|
183 |
-
const __ghost_fn __ghost_pair_714 =
|
184 |
-
__ghost_begin(group_ro_focus,
|
185 |
-
"i := 3, items := fun k -> for j in 0..32 -> "
|
186 |
-
"&pB[bj][bk][k][j] ~> Cell");
|
187 |
|
188 |
for (int j = 0; j < 32; j++) {
|
189 |
-
__strict();
|
190 |
-
__sreads("A ~> Matrix2(1024, 1024)");
|
191 |
-
__xmodifies("&s[j] ~> Cell");
|
192 |
-
__xreads("&pB[bj][bk][3][j] ~> Cell");
|
193 |
-
__ghost(tiled_index_in_range,
|
194 |
-
"tile_index := bj, index := j, tile_count := 32, tile_size "
|
195 |
-
":= 32, size := 1024");
|
196 |
-
__ghost(tiled_index_in_range,
|
197 |
-
"tile_index := bk, index := 3, tile_count := 256, "
|
198 |
-
"tile_size := 4, size := 1024");
|
199 |
-
__ghost(tiled_index_in_range,
|
200 |
-
"tile_index := bk, index := 3, tile_count := 256, "
|
201 |
-
"tile_size := 4, size := 1024");
|
202 |
-
const __ghost_fn __ghost_pair_2 = __ghost_begin(
|
203 |
-
matrix2_ro_focus, "M := A, i := bi * 32 + i, j := bk * 4 + 3");
|
204 |
-
s[j] += A[
|
205 |
-
|
206 |
-
}
|
207 |
-
__ghost_end(__ghost_pair_714);
|
208 |
-
__ghost(mindex2_contiguous_uninit, "M := sum");
|
209 |
-
MMEMCPY(sum,
|
210 |
-
__ghost(mindex2_contiguous_rev, "M := sum");
|
211 |
}
|
212 |
}
|
213 |
for (int i = 0; i < 32; i++) {
|
214 |
-
__strict();
|
215 |
-
__xwrites("for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
|
216 |
-
__xreads("for j in 0..32 -> &sum[i][j] ~> Cell");
|
217 |
for (int j = 0; j < 32; j++) {
|
218 |
-
__strict();
|
219 |
-
__xwrites("&C[bi * 32 + i][bj * 32 + j] ~> Cell");
|
220 |
-
__xreads("&sum[i][j] ~> Cell");
|
221 |
-
C[
|
222 |
}
|
223 |
}
|
224 |
free(sum);
|
225 |
}
|
226 |
-
__ghost(swap_groups,
|
227 |
-
"outer_range := 0..32, inner_range := 0..32, items := fun bj, i -> "
|
228 |
-
"for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] ~> Cell");
|
229 |
-
for (int i = 0; i < 32; i++) {
|
230 |
-
__strict();
|
231 |
-
__xconsumes(
|
232 |
-
"for bj in 0..32 -> for j in 0..32 -> &C[bi * 32 + i][bj * 32 + j] "
|
233 |
-
"~> Cell");
|
234 |
-
__xproduces("for j in 0..1024 -> &C[bi * 32 + i][j] ~> Cell");
|
235 |
-
__ghost(untile_divides,
|
236 |
-
"tile_count := 32, tile_size := 32, size := 1024, items := fun j "
|
237 |
-
"-> &C[bi * 32 + i][j] ~> Cell");
|
238 |
-
}
|
239 |
}
|
240 |
free(pB);
|
241 |
-
__ghost(untile_divides,
|
242 |
-
"tile_count := 32, tile_size := 32, size := 1024, items := fun i -> "
|
243 |
-
"for j in 0..1024 -> &C[i][j] ~> Cell");
|
244 |
}
|
1 |
|
2 |
|
3 |
|
4 |
// NOTE: using pretty matrix notation
|
5 |
|
6 |
void mm1024(float* C, float* A, float* B) {
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
|
8 |
|
9 |
for (int bj = 0; bj < 32; bj++) {
|
|
|
|
|
|
|
|
|
|
|
10 |
for (int bk = 0; bk < 256; bk++) {
|
|
|
|
|
|
|
|
|
|
|
11 |
for (int k = 0; k < 4; k++) {
|
|
|
|
|
|
|
12 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
|
|
|
|
13 |
+
pB[32768 * bj + 128 * bk + 32 * k + j] =
|
|
|
|
|
|
|
|
|
|
|
14 |
+
B[1024 * (4 * bk + k) + 32 * bj + j];
|
|
|
15 |
}
|
16 |
}
|
17 |
}
|
18 |
}
|
19 |
|
20 |
for (int bi = 0; bi < 32; bi++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
for (int bj = 0; bj < 32; bj++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
float* const sum = (float* const)malloc(sizeof(float[32][32]));
|
23 |
for (int i = 0; i < 32; i++) {
|
|
|
|
|
24 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
25 |
+
sum[32 * i + j] = 0.f;
|
26 |
}
|
27 |
}
|
28 |
for (int bk = 0; bk < 256; bk++) {
|
|
|
|
|
|
|
|
|
|
|
29 |
for (int i = 0; i < 32; i++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
float* const s = new float(32)();
|
|
|
|
|
31 |
+
MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float));
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
+
s[j] += A[1024 * (32 * bi + i) + 4 * bk] *
|
35 |
+
pB[32768 * bj + 128 * bk + j];
|
36 |
+
}
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
+
s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] *
|
40 |
+
pB[32 + 32768 * bj + 128 * bk + j];
|
41 |
+
}
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
+
s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] *
|
45 |
+
pB[64 + 32768 * bj + 128 * bk + j];
|
46 |
+
}
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
+
s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] *
|
50 |
+
pB[96 + 32768 * bj + 128 * bk + j];
|
51 |
+
}
|
|
|
|
|
52 |
+
MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float));
|
|
|
53 |
}
|
54 |
}
|
55 |
for (int i = 0; i < 32; i++) {
|
|
|
|
|
|
|
56 |
for (int j = 0; j < 32; j++) {
|
|
|
|
|
|
|
57 |
+
C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j];
|
58 |
}
|
59 |
}
|
60 |
free(sum);
|
61 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
}
|
63 |
free(pB);
|
|
|
|
|
|
|
64 |
}
|