1
This file contains some ideas which was slower or not comed to the end due
2
their nature or my inabilities or for time saving purposes.
4
1. The general idea is to computer full 16x16 matrix until the end. We got
5
several quadratic matrices along the diagonal with which are the actual
6
C matrixes and we padding the rest of diagonal with 1. Everything else is 0.
7
Then we can compute Cholesky and solve equations on all matrices together
8
using SPE-optimized blas. However, this approach is twice slower compared
9
to computing matrix by matrix using non-optimized blas.
11
for (j = 1; j < at_once; j++) {
12
int offset = j * width * real_width;
13
MRSESDataType *C_cur = C + offset;
14
MRSESDataType *Ca_cur = Ca + offset;
15
MRSESDataType *Cb_cur = Cb + offset;
17
for (i = 0; i < real_width; i++) {
18
memset(C_cur + i * width, 0, j * real_width * sizeof(MRSESDataType));
21
for (j = at_once * real_width; j < width; j++) {
22
memset(C + j * width, 0, width * sizeof(MRSESDataType));
27
Solving equation can be done with following commands, but I have not managed to
28
get correct results. Anyway the trsv uses neglectable amount of time. Most
29
things are done in matrix multiplication and cholesky decomposition.
31
transpose_matrix(width, width, C, width, D, width);
32
MRSESDataType *D_cur = D + offset;
33
solve_upper_1(real_width, D_cur, width, mean_cur);
36
strsv_spu_lower(width, C, width, mean);
39
2. Another idea was to compute [Ca]*[Cb] as [Ca*Cb]. For that we could
40
multiply Ca*Cb and then multiply by transpose of it (Ca*Cb)' to get
41
symmetric matrix. Finally we should get the square of determinat.
42
But, the resulting symmetric matrix is not positive-definite (or for
43
precision reasons), the cholesky decomposition have failed for it.
45
for (i = 0; i < width; i++) {
46
for (j = 0; j <= i; j++) {
47
Ca[j * width + i] = Ca[i * width + j];
48
Cb[j * width + i] = Cb[i * width + j];
52
memset(Cab, 0, width*width*sizeof(MRSESDataType));
53
//sgemm_spu(width, width, width, Ca, Cb, Cab);
54
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, width, width, width, 1, Ca, width, Cb, width, 0, Cab, width);
56
memset(Cab2, 0, width*width*sizeof(MRSESDataType));
57
blas_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
58
//spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, width, 1, Cab, width, 0, Cab2, width);
60
lapack_potrf(&hmode, &real_width, Cab_cur, &width, err);
62
detC = C_cur[0]; detAB = Cab_cur[0];
63
for (i = width + 1; i < c_step; i+= (width+1)) {
67
rcorr = logf(detC * detC * detC * detC / detAB);
71
3. Following code is matrix multiplocation using default (non-SPE) blas. Even
72
if used with appropriate small matrices it is significantly slower than IBM
78
//original (do not store C)
79
//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
80
//blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);
82
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 50, 1, A, nA, 0, Ca, width);
83
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, 5, 40, 1, B, nB, 0, Cb, width);
85
memcpy(C, Ca, width2 * sizeof(MRSESDataType));
86
blas_axpy(width2, 1, Cb, 1, C, 1);
87
blas_scal(width2, 0.5, C, 1);
90
4. Other ways to solve equation system. Both works but a bit slower
94
CblasRowMajor, CblasRight, CblasLower, CblasTrans, CblasNonUnit,
95
1, real_width, 1, C_cur, width, mean_cur, width //1?
100
CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit,
101
real_width, 1, 1, C_cur, width, mean_cur, 1 //width //1?
105
5. Other ways to perform cholesky decomposition. The lapack version is surely
106
working. Curretnly implemented seems to work, but who nows about special cases.
108
atlas_spotrf2(real_width, C_cur, width);
109
atlas_spotrf2(real_width, Ca_cur, width);
110
atlas_spotrf2(real_width, Cb_cur, width);
112
lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
114
lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
116
lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
120
6. This is fully working, but not vectorized version of code
121
static inline int mrses_spe_real_run(
122
MRSESContext mrses, MRSESDataType *result,
123
int width, int width2, int nA, int nB,
124
MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean,
125
MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb
129
MRSESDataType detAB, detC;
130
MRSESDataType rmahal, rcorr;
137
// int iterate_size = mrses->iterate_size;
138
int at_once = max(1, SPE_BLOCK / real_width);
139
// int times = iterate_size / at_once;
141
int rww = at_once * width * real_width;
142
int rww_alloc = calc_alloc(rww, 64);
144
memset(Ca, 0, rww * sizeof(MRSESDataType));
145
memset(Cb, 0, rww * sizeof(MRSESDataType));
147
//PRINT_MATRIX("% 6.4f ", A, 16, 16, 16)
149
// that works on multiples of 16
150
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nA, 1, A, nA, 0, Ca, width);
151
spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, width, nB, 1, B, nB, 0, Cb, width);
153
// thats works on multiples of 64 (16x16 - fine)
154
memcpy(C, Ca, rww * sizeof(MRSESDataType));
155
spe_axpy(rww_alloc, 1, Cb, 1, C, 1);
156
spe_scal(rww_alloc, 0.5, C, 1);
158
int c_step = real_width * (width + 1);
160
for (j = 0; j < at_once; j++) {
161
int offset = j * c_step;
162
MRSESDataType *C_cur = C + offset;
163
MRSESDataType *Ca_cur = Ca + offset;
164
MRSESDataType *Cb_cur = Cb + offset;
165
MRSESDataType *mean_cur = mean + j * real_width;
166
MRSESDataType result;
169
err = atlas_spotrf_u(real_width, C_cur, width);
171
err = atlas_spotrf_u(real_width, Ca_cur, width);
173
err = atlas_spotrf_u(real_width, Cb_cur, width);
178
atlas_spotrf2(real_width, C_cur, width);
179
atlas_spotrf2(real_width, Ca_cur, width);
180
atlas_spotrf2(real_width, Cb_cur, width);
182
lapack_potrf(&hmode, &real_width, C_cur, &width, &err);
184
lapack_potrf(&hmode, &real_width, Ca_cur, &width, &err);
186
lapack_potrf(&hmode, &real_width, Cb_cur, &width, &err);
190
detC = C_cur[0]; detAB = Ca_cur[0] * Cb_cur[0];
191
for (i = width + 1; i < c_step; i+= (width+1)) {
192
detAB *= (Ca_cur[i] * Cb_cur[i]);
196
rcorr = 2 * logf(detC * detC / detAB);
200
CblasRowMajor, CblasLower, CblasNoTrans, CblasNonUnit,
201
real_width, C_cur, width, mean_cur, 1
204
rmahal = blas_dot(real_width, mean_cur, 1, mean_cur, 1);
206
switch (mrses->dist) {
208
result = rmahal/8 + rcorr/4;
220
// printf("SPU result: %e (mahal: %e, corcor: %e)\n", result, rmahal, rcorr);
226
data->A = (MRSESDataType*)(allocation + pos);
227
pos += walloc * alloc * sizeof(MRSESDataType);
229
data->B = (MRSESDataType*)(allocation + pos);
230
pos += walloc * alloc * sizeof(MRSESDataType);
232
data->C = (MRSESDataType*)(allocation + pos);
233
pos += walloc2 * sizeof(MRSESDataType);
235
data->Ca = (MRSESDataType*)(allocation + pos);
236
pos += walloc2 * sizeof(MRSESDataType);
238
data->Cb = (MRSESDataType*)(allocation + pos);
239
pos += walloc2 * sizeof(MRSESDataType);
241
// memset(data->A + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
242
// memset(data->B + used_walloc * alloc, 0, (walloc - used_walloc) * alloc * sizeof(MRSESDataType));
243
// memset(data->C + used_walloc * walloc, 0, (walloc - used_walloc) * walloc * sizeof(MRSESDataType));
245
// memset(data->mean + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));
246
// memset(data->mean_copy + used_walloc, 0, (walloc - used_walloc) * sizeof(MRSESDataType));
250
err = mrses_spe_real_run(
252
walign, walign2, aA, aB, //width, width2, nA, nB, alloc,
253
A, B, mean_copy, C, Ca, Cb
258
err = mrses_spe_real_run(
260
walign, walign2, aA, aB, //width, width2, nA, nB, alloc,
261
A, B, mean_copy, C, Ca, Cb
266
7. Vectorization tests with various invalid operations
268
vector float one = {1, -1, 1, 1};
269
vector float some_zero = {0, 1, 1, 0};
270
vector float zero = {0, 0, 0, 0};
273
result = divf4(one, some_zero);
275
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
276
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
278
result = sqrtf4(result);
280
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
281
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
283
result = divf4_fast(one, some_zero);
285
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
286
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
288
result = sqrtf4_fast(fmaxf4(result, zero));
290
printf("%f %f\n", *(((float*)&result)+1), *(((float*)&result)+2));
291
printf("%f %f\n", *(((float*)&result)+0), *(((float*)&result)+3));
297
8. Debugging vectorized code
299
int err = atlas_spotrf2(real_width, Ca, width);
300
if (err) printf("Problem with spotrf\n");
302
if ((((unsigned int)D)%16)==0) {
303
printf("Cholesky Ca\n");
304
PRINT_MATRIX("% 6.4f ", Ca, 16, 5, 5)
310
memset(C, 0, 16*16 * sizeof(MRSESDataType));
311
memcpy(C, Ca, rww * sizeof(MRSESDataType));
312
spe_axpy(16*16, 1, Cb, 1, C, 1);
313
spe_scal(16*16, 0.5, C, 1);
316
puts("=====================================");
317
int c_step = real_width * (width + 1);
318
atlas_spotrf2(real_width, Ca, width);
319
atlas_spotrf2(real_width, Cb, width);
320
atlas_spotrf2(real_width, C, width);
321
float detC = C[0]; float detAB = Ca[0] * Cb[0];
322
for (i = width + 1; i < c_step; i+= (width+1)) {
323
detAB *= (Ca[i] * Cb[i]);
327
float rcorr = 2 * logf(detC * detC / detAB);
328
printf("Real det: %e = %e %e\n", rcorr, detC, detAB);
331
9. Non-vectorized D-recovery
332
static inline int mrses_spe_real_multiply_recover_old(
334
short int pack, short int at_once,
335
short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
336
MRSESDataType *A, MRSESDataType *B,
337
MRSESDataType *Ca, MRSESDataType *Cb,
338
MRSESIntType *drp_gen, MRSESIntType *rst_gen,
339
MRSESDataType *Ea, MRSESDataType *Eb
342
short int i, j, k, l;
345
if ((((int)D)%16)==0) {
346
if (*rst_gen > width) {
348
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
351
if (*rst_gen > width) {
352
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
354
if ((((int)D)%16)==0) {
355
if (*rst_gen > width) {
356
printf("Update: %i\n", *drp_gen);
357
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
360
if (*rst_gen > width) {
361
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
364
/* if (*rst_gen > width) {
365
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
366
vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
369
short int c_step = width * (walloc + 1);
370
for (i = 0; i < at_once; ++i) {
371
short int e_offset = i * walloc;
372
short int c_offset = i * c_step;
374
short int gen = drp_gen[i];
376
MRSESDataType *Ea_cur = Ea + e_offset;
377
MRSESDataType *Eb_cur = Eb + e_offset;
379
MRSESDataType *Ca_cur = Ca + c_offset;
380
MRSESDataType *Cb_cur = Cb + c_offset;
382
MRSESDataType *Da = D + (3 * i + 1) * d_step;
383
MRSESDataType *Db = Da + d_step;
385
// PRINT_MATRIX("% 6f ", Ea_cur, 0, 1, 5);
387
if (rst_gen[i] > width) {
388
for (j = 0; j < width; j++) {
395
// if (fabs(Ca[k*walloc+l] - Ea_cur[j])>0.001) {
396
// printf("Restoring (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Ca_cur[k*walloc + l], Ea_cur[j]);
398
Ca_cur[k * walloc + l] = Ea_cur[j];
399
Cb_cur[k * walloc + l] = Eb_cur[j];
404
// Recovering to current stage
405
for (k = 0; k < width; ++k) {
406
for (l = 0; l < width; ++l) {
407
Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
408
Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
415
vec_ssyrk_rln_11(ralloc, nA, A, nA, Ta, walloc);
417
for (i = 0; i < 5; i++) {
418
for (j = 0; j < 5; j++) {
419
if (fabs(Ta[i*walloc+j] - Ca[i*walloc+j])>0.00001) {
426
printf("Restoring: %i, gen: %i\n", *rst_gen, *drp_gen);
427
printf("Computed\n");
428
PRINT_MATRIX("% 6.4f", Ca, walloc, 5, 5);
429
printf("Should be:\n");
430
PRINT_MATRIX("% 6.4f", Ta, walloc, 5, 5);
436
// vec_ssyrk_rln_11(ralloc, nB, B, nB, Tb, walloc);