bzr branch
http://suren.me/webbzr/ani/mrses
1
by Suren A. Chilingaryan
Initial import |
1 |
This file contains some ideas which was slower or not comed to the end due |
2 |
their nature or my inabilities or for time saving purposes. |
|
3 |
||
4 |
1. The general idea is to computer full 16x16 matrix until the end. We got |
|
5 |
several quadratic matrices along the diagonal with which are the actual |
|
6 |
C matrixes and we padding the rest of diagonal with 1. Everything else is 0. |
|
7 |
Then we can compute Cholesky and solve equations on all matrices together |
|
8 |
using SPE-optimized blas. However, this approach is twice slower compared |
|
9 |
to computing matrix by matrix using non-optimized blas. |
|
10 |
||
11 |
for (j = 1; j < at_once; j++) { |
|
12 |
int offset = j * width * real_width; |
|
13 |
MRSESDataType *C_cur = C + offset; |
|
14 |
MRSESDataType *Ca_cur = Ca + offset; |
|
15 |
MRSESDataType *Cb_cur = Cb + offset; |
|
16 |
||
17 |
for (i = 0; i < real_width; i++) { |
|
18 |
memset(C_cur + i * width, 0, j * real_width * sizeof(MRSESDataType)); |
|
19 |
} |
|
20 |
} |
|
21 |
for (j = at_once * real_width; j < width; j++) { |
|
22 |
memset(C + j * width, 0, width * sizeof(MRSESDataType)); |
|
23 |
C[j * width + j] = 1; |
|
24 |
} |
|
25 |
||
26 |
||
27 |
Solving equation can be done with following commands, but I have not managed to |
|
28 |
get correct results. Anyway the trsv uses neglectable amount of time. Most |
|
29 |
things are done in matrix multiplication and cholesky decomposition. |
|
30 |
||
31 |
transpose_matrix(width, width, C, width, D, width); |
|
32 |
MRSESDataType *D_cur = D + offset; |
|
33 |
solve_upper_1(real_width, D_cur, width, mean_cur); |
|
34 |
||
35 |
// multiple of 16 |
|
36 |
strsv_spu_lower(width, C, width, mean); |
|
37 |
||
38 |
||
39 |
2. Another idea was to compute [Ca]*[Cb] as [Ca*Cb]. For that we could |
|
40 |
multiply Ca*Cb and then multiply by transpose of it (Ca*Cb)' to get |
|
41 |
symmetric matrix. Finally we should get the square of determinat. |
|
42 |
But, the resulting symmetric matrix is not positive-definite (or for |
|
43 |
precision reasons), the cholesky decomposition have failed for it. |
|
44 |
||
45 |
for (i = 0; i < width; i++) { |
|
46 |
for (j = 0; j <= i; j++) { |
|
47 |
Ca[j * width + i] = Ca[i * width + j]; |
|
48 |
Cb[j * width + i] = Cb[i * width + j]; |
|
49 |
} |
|
50 |
} |
|
51 |
||
52 |
memset(Cab, 0, width*width*sizeof(MRSESDataType)); |
|
53 |
//sgemm_spu(width, width, width, Ca, Cb, Cab); |
|
54 |
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, width, width, width, 1, Ca, width, Cb, width, 0, Cab, width); |
|
55 |
||
56 |
memset(Cab2, 0, width*width*sizeof(MRSESDataType)); |
|
57 |
blas_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width); |
|
58 |
//spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width); |
|
59 |
for ... |
|
60 |
lapack_potrf(&hmode, &real_width, Cab_cur, &width, err); |
|
61 |
||
62 |
detC = C_cur[0]; detAB = Cab_cur[0]; |
|
63 |
for (i = width + 1; i < c_step; i+= (width+1)) { |
|
64 |
detAB *= Cab_cur[i]; |
|
65 |
detC *= C_cur[i]; |
|
66 |
} |
|
67 |
rcorr = logf(detC * detC * detC * detC / detAB); |
|
68 |
} |
|
69 |
||
70 |
||
71 |
3. Following code is matrix multiplocation using default (non-SPE) blas. Even |
|
72 |
if used with appropriate small matrices it is significantly slower than IBM |
|
73 |
code on full matrix. |
|
74 |
||
75 |
/* |
|
76 |
char hmode = 'L'; |
|
77 |
||
78 |
//original (do not store C) |
|
79 |
//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width); |
|
80 |
//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width); |
|
81 |
||
82 |
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 50, 1, A, nA, 0, Ca, width); |
|
83 |
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 40, 1, B, nB, 0, Cb, width); |
|
84 |
||
85 |
memcpy(C, Ca, width2 * sizeof(MRSESDataType)); |
|
86 |
blas_axpy(width2, 1, Cb, 1, C, 1); |
|
87 |
blas_scal(width2, 0.5, C, 1); |
|
88 |
*/ |
|
89 |
||
90 |
4. Other ways to solve equation system. Both works but a bit slower |
|
91 |
/* |
|
92 |
// Faster 82: |
|
93 |
blas_trsm( |
|
94 |
CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit, |
|
95 |
1, real_width, 1, C_cur, width, mean_cur, width //1? |
|
96 |
); |
|
97 |
||
98 |
// Slower 83-84 |
|
99 |
blas_trsm( |
|
100 |
CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, |
|
101 |
real_width, 1, 1, C_cur, width, mean_cur, 1 //width //1? |
|
102 |
); |
|
103 |
*/ |
|
104 |
||
105 |
5. Other ways to perform cholesky decomposition. The lapack version is surely |
|
106 |
working. Curretnly implemented seems to work, but who nows about special cases. |
|
107 |
||
108 |
atlas_spotrf2(real_width, C_cur, width); |
|
109 |
atlas_spotrf2(real_width, Ca_cur, width); |
|
110 |
atlas_spotrf2(real_width, Cb_cur, width); |
|
111 |
||
112 |
lapack_potrf(&hmode, &real_width, C_cur, &width, &err); |
|
113 |
if (err) return 1; |
|
114 |
lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err); |
|
115 |
if (err) return 1; |
|
116 |
lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err); |
|
117 |
if (err) return 1; |
|
118 |
||
119 |
||
120 |
6. This is fully working, but not vectorized version of code |
|
121 |
static inline int mrses_spe_real_run( |
|
122 |
MRSESContext mrses, MRSESDataType *result, |
|
123 |
int width, int width2, int nA, int nB, |
|
124 |
MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean, |
|
125 |
MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb |
|
126 |
) { |
|
127 |
int i, j, err; |
|
128 |
||
129 |
MRSESDataType detAB, detC; |
|
130 |
MRSESDataType rmahal, rcorr; |
|
131 |
||
132 |
short int walloc; |
|
133 |
||
134 |
char hmode = 'U'; |
|
135 |
||
136 |
int real_width = 5; |
|
137 |
// int iterate_size = mrses->iterate_size; |
|
138 |
int at_once = max(1, SPE_BLOCK / real_width); |
|
139 |
// int times = iterate_size / at_once; |
|
140 |
||
141 |
int rww = at_once * width * real_width; |
|
142 |
int rww_alloc = calc_alloc(rww, 64); |
|
143 |
||
144 |
memset(Ca, 0, rww * sizeof(MRSESDataType)); |
|
145 |
memset(Cb, 0, rww * sizeof(MRSESDataType)); |
|
146 |
||
147 |
//PRINT_MATRIX("% 6.4f ", A, 16, 16, 16) |
|
148 |
||
149 |
// that works on multiples of 16 |
|
150 |
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width); |
|
151 |
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width); |
|
152 |
||
153 |
// thats works on multiples of 64 (16x16 - fine) |
|
154 |
memcpy(C, Ca, rww * sizeof(MRSESDataType)); |
|
155 |
spe_axpy(rww_alloc, 1, Cb, 1, C, 1); |
|
156 |
spe_scal(rww_alloc, 0.5, C, 1); |
|
157 |
||
158 |
int c_step = real_width * (width + 1); |
|
159 |
||
160 |
for (j = 0; j < at_once; j++) { |
|
161 |
int offset = j * c_step; |
|
162 |
MRSESDataType *C_cur = C + offset; |
|
163 |
MRSESDataType *Ca_cur = Ca + offset; |
|
164 |
MRSESDataType *Cb_cur = Cb + offset; |
|
165 |
MRSESDataType *mean_cur = mean + j * real_width; |
|
166 |
MRSESDataType result; |
|
167 |
||
168 |
||
169 |
err = atlas_spotrf_u(real_width, C_cur, width); |
|
170 |
if (err) return 1; |
|
171 |
err = atlas_spotrf_u(real_width, Ca_cur, width); |
|
172 |
if (err) return 1; |
|
173 |
err = atlas_spotrf_u(real_width, Cb_cur, width); |
|
174 |
if (err) return 1; |
|
175 |
||
176 |
/* |
|
177 |
||
178 |
atlas_spotrf2(real_width, C_cur, width); |
|
179 |
atlas_spotrf2(real_width, Ca_cur, width); |
|
180 |
atlas_spotrf2(real_width, Cb_cur, width); |
|
181 |
||
182 |
lapack_potrf(&hmode, &real_width, C_cur, &width, &err); |
|
183 |
if (err) return 1; |
|
184 |
lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err); |
|
185 |
if (err) return 1; |
|
186 |
lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err); |
|
187 |
if (err) return 1; |
|
188 |
*/ |
|
189 |
||
190 |
detC = C_cur[0]; detAB = Ca_cur[0] * Cb_cur[0]; |
|
191 |
for (i = width + 1; i < c_step; i+= (width+1)) { |
|
192 |
detAB *= (Ca_cur[i] * Cb_cur[i]); |
|
193 |
detC *= C_cur[i]; |
|
194 |
} |
|
195 |
||
196 |
rcorr = 2 * logf(detC * detC / detAB); |
|
197 |
||
198 |
||
199 |
blas_trsv( |
|
200 |
CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit, |
|
201 |
real_width, C_cur, width, mean_cur, 1 |
|
202 |
); |
|
203 |
||
204 |
rmahal = blas_dot(real_width, mean_cur, 1, mean_cur, 1); |
|
205 |
||
206 |
switch (mrses->dist) { |
|
207 |
case BHATTACHARYYA: |
|
208 |
result = rmahal/8 + rcorr/4; |
|
209 |
break; |
|
210 |
case MAHALANOBIS: |
|
211 |
result = rmahal; |
|
212 |
break; |
|
213 |
case CORCOR: |
|
214 |
result = rcorr; |
|
215 |
break; |
|
216 |
default: |
|
217 |
result = 0; |
|
218 |
} |
|
219 |
||
220 |
// printf("SPU result: %e (mahal: %e, corcor: %e)\n", result, rmahal, rcorr); |
|
221 |
} |
|
222 |
||
223 |
return 0; |
|
224 |
} |
|
225 |
||
226 |
data->A = (MRSESDataType*)(allocation + pos); |
|
227 |
pos += walloc * alloc * sizeof(MRSESDataType); |
|
228 |
||
229 |
data->B = (MRSESDataType*)(allocation + pos); |
|
230 |
pos += walloc * alloc * sizeof(MRSESDataType); |
|
231 |
||
232 |
data->C = (MRSESDataType*)(allocation + pos); |
|
233 |
pos += walloc2 * sizeof(MRSESDataType); |
|
234 |
||
235 |
data->Ca = (MRSESDataType*)(allocation + pos); |
|
236 |
pos += walloc2 * sizeof(MRSESDataType); |
|
237 |
||
238 |
data->Cb = (MRSESDataType*)(allocation + pos); |
|
239 |
pos += walloc2 * sizeof(MRSESDataType); |
|
240 |
||
241 |
// memset(data->A + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType)); |
|
242 |
// memset(data->B + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType)); |
|
243 |
// memset(data->C + used_walloc * walloc, 0, (walloc - used_walloc) * walloc * sizeof(MRSESDataType)); |
|
244 |
||
245 |
// memset(data->mean + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType)); |
|
246 |
// memset(data->mean_copy + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType)); |
|
247 |
||
248 |
||
249 |
/* |
|
250 |
err = mrses_spe_real_run( |
|
251 |
mrses, cur_result, |
|
252 |
walign, walign2, aA, aB, //width, width2, nA, nB, alloc, |
|
253 |
A, B, mean_copy, C, Ca, Cb |
|
254 |
); |
|
255 |
*/ |
|
256 |
||
257 |
/* |
|
258 |
err = mrses_spe_real_run( |
|
259 |
mrses, result, |
|
260 |
walign, walign2, aA, aB, //width, width2, nA, nB, alloc, |
|
261 |
A, B, mean_copy, C, Ca, Cb |
|
262 |
); |
|
263 |
*/ |
|
264 |
||
265 |
||
266 |
7. Vectorization tests with various invalid operations |
|
267 |
/* |
|
268 |
vector float one = {1, -1, 1, 1}; |
|
269 |
vector float some_zero = {0, 1, 1, 0}; |
|
270 |
vector float zero = {0, 0, 0, 0}; |
|
271 |
||
272 |
vector float result; |
|
273 |
result = divf4(one, some_zero); |
|
274 |
||
275 |
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2)); |
|
276 |
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3)); |
|
277 |
||
278 |
result = sqrtf4(result); |
|
279 |
||
280 |
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2)); |
|
281 |
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3)); |
|
282 |
||
283 |
result = divf4_fast(one, some_zero); |
|
284 |
||
285 |
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2)); |
|
286 |
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3)); |
|
287 |
||
288 |
result = sqrtf4_fast(fmaxf4(result, zero)); |
|
289 |
||
290 |
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2)); |
|
291 |
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3)); |
|
292 |
||
293 |
return 0; |
|
294 |
*/ |
|
295 |
||
296 |
||
297 |
8. Debugging vectorized code |
|
298 |
/* |
|
299 |
int err = atlas_spotrf2(real_width, Ca, width); |
|
300 |
if (err) printf("Problem with spotrf\n"); |
|
301 |
else { |
|
302 |
if ((((unsigned int)D)%16)==0) { |
|
303 |
printf("Cholesky Ca\n"); |
|
304 |
PRINT_MATRIX("% 6.4f ", Ca, 16, 5, 5) |
|
305 |
} |
|
306 |
} |
|
307 |
||
308 |
||
309 |
float C[16*16]; |
|
310 |
memset(C, 0, 16*16 * sizeof(MRSESDataType)); |
|
311 |
memcpy(C, Ca, rww * sizeof(MRSESDataType)); |
|
312 |
spe_axpy(16*16, 1, Cb, 1, C, 1); |
|
313 |
spe_scal(16*16, 0.5, C, 1); |
|
314 |
||
315 |
||
316 |
puts("====================================="); |
|
317 |
int c_step = real_width * (width + 1); |
|
318 |
atlas_spotrf2(real_width, Ca, width); |
|
319 |
atlas_spotrf2(real_width, Cb, width); |
|
320 |
atlas_spotrf2(real_width, C, width); |
|
321 |
float detC = C[0]; float detAB = Ca[0] * Cb[0]; |
|
322 |
for (i = width + 1; i < c_step; i+= (width+1)) { |
|
323 |
detAB *= (Ca[i] * Cb[i]); |
|
324 |
detC *= C[i]; |
|
325 |
} |
|
326 |
||
327 |
float rcorr = 2 * logf(detC * detC / detAB); |
|
328 |
printf("Real det: %e = %e %e\n", rcorr, detC, detAB); |
|
329 |
*/ |
|
330 |
||
331 |
9. Non-vectorized D-recovery |
|
332 |
static inline int mrses_spe_real_multiply_recover_old( |
|
333 |
MRSESDataType *D, |
|
334 |
short int pack, short int at_once, |
|
335 |
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB, |
|
336 |
MRSESDataType *A, MRSESDataType *B, |
|
337 |
MRSESDataType *Ca, MRSESDataType *Cb, |
|
338 |
MRSESIntType *drp_gen, MRSESIntType *rst_gen, |
|
339 |
MRSESDataType *Ea, MRSESDataType *Eb |
|
340 |
||
341 |
) { |
|
342 |
short int i, j, k, l; |
|
343 |
||
344 |
/* |
|
345 |
if ((((int)D)%16)==0) { |
|
346 |
if (*rst_gen > width) { |
|
347 |
printf("Orig\n"); |
|
348 |
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5); |
|
349 |
} |
|
350 |
} |
|
351 |
if (*rst_gen > width) { |
|
352 |
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc); |
|
353 |
} |
|
354 |
if ((((int)D)%16)==0) { |
|
355 |
if (*rst_gen > width) { |
|
356 |
printf("Update: %i\n", *drp_gen); |
|
357 |
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5); |
|
358 |
}} |
|
359 |
||
360 |
if (*rst_gen > width) { |
|
361 |
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc); |
|
362 |
} |
|
363 |
*/ |
|
364 |
/* if (*rst_gen > width) { |
|
365 |
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc); |
|
366 |
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc); |
|
367 |
}*/ |
|
368 |
||
369 |
short int c_step = width * (walloc + 1); |
|
370 |
for (i = 0; i < at_once; ++i) { |
|
371 |
short int e_offset = i * walloc; |
|
372 |
short int c_offset = i * c_step; |
|
373 |
||
374 |
short int gen = drp_gen[i]; |
|
375 |
||
376 |
MRSESDataType *Ea_cur = Ea + e_offset; |
|
377 |
MRSESDataType *Eb_cur = Eb + e_offset; |
|
378 |
||
379 |
MRSESDataType *Ca_cur = Ca + c_offset; |
|
380 |
MRSESDataType *Cb_cur = Cb + c_offset; |
|
381 |
||
382 |
MRSESDataType *Da = D + (3 * i + 1) * d_step; |
|
383 |
MRSESDataType *Db = Da + d_step; |
|
384 |
||
385 |
// PRINT_MATRIX("% 6f ", Ea_cur, 0, 1, 5); |
|
386 |
||
387 |
if (rst_gen[i] > width) { |
|
388 |
for (j = 0; j < width; j++) { |
|
389 |
if (j < gen) { |
|
390 |
l = j; k = gen; |
|
391 |
} else { |
|
392 |
l = gen; k = j; |
|
393 |
} |
|
394 |
||
395 |
// if (fabs(Ca[k*walloc+l] - Ea_cur[j])>0.001) { |
|
396 |
// printf("Restoring (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Ca_cur[k*walloc + l], Ea_cur[j]); |
|
397 |
// } |
|
398 |
Ca_cur[k * walloc + l] = Ea_cur[j]; |
|
399 |
Cb_cur[k * walloc + l] = Eb_cur[j]; |
|
400 |
} |
|
401 |
||
402 |
} |
|
403 |
||
404 |
// Recovering to current stage |
|
405 |
for (k = 0; k < width; ++k) { |
|
406 |
for (l = 0; l < width; ++l) { |
|
407 |
Da[pack*(k * width + l)] = Ca_cur[k * walloc + l]; |
|
408 |
Db[pack*(k * width + l)] = Cb_cur[k * walloc + l]; |
|
409 |
} |
|
410 |
} |
|
411 |
} |
|
412 |
||
413 |
/* |
|
414 |
float Ta[16*16]; |
|
415 |
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ta, walloc); |
|
416 |
int prob = 0; |
|
417 |
for (i = 0; i < 5; i++) { |
|
418 |
for (j = 0; j < 5; j++) { |
|
419 |
if (fabs(Ta[i*walloc+j] - Ca[i*walloc+j])>0.00001) { |
|
420 |
prob = 1; |
|
421 |
break; |
|
422 |
} |
|
423 |
} |
|
424 |
} |
|
425 |
if (prob) { |
|
426 |
printf("Restoring: %i, gen: %i\n", *rst_gen, *drp_gen); |
|
427 |
printf("Computed\n"); |
|
428 |
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5); |
|
429 |
printf("Should be:\n"); |
|
430 |
PRINT_MATRIX("% 6.4f", Ta, walloc, 5, 5); |
|
431 |
exit(1); |
|
432 |
} else { |
|
433 |
// printf("pass\n"); |
|
434 |
} |
|
435 |
||
436 |
// vec_ssyrk_rln_11(ralloc, nB, B, nB, Tb, walloc); |
|
437 |
||
438 |
*/ |
|
439 |
return 0; |
|
440 |
} |
|
441 |