#include <simdmath/sqrtf4.h>
#include <simdmath/fmaxf4.h>
#include <simdmath/divf4.h>
#include <simdmath/divf4_fast.h>
#include <simdmath/rsqrtf4.h>
#include <simdmath/recipf4.h>
#include <simdmath/recipf4_fast.h>
#include <simdmath/logf4.h>

#include <spu_intrinsics.h>
#else /* not __SPU__ */

static vector float zero  = {0, 0, 0, 0};
static vector float two   = {2, 2, 2, 2};
static vector float four  = {4, 4, 4, 4};
static vector float eight = {8, 8, 8, 8};

// the fast versions do not bring significant performance benefits
// fmaxf between NaN and zero is zero
#define spu_max _fmaxf4

// divide by zero: NaN, NaN; in fast mode: -0, -0
#define spu_div _divf4
//#define spu_div _divf4_fast

// sqrt of a negative number: 0; in fast mode: undefined; sqrt of NaN: a large number(?)
#define spu_sqrt(a) _sqrtf4(spu_max(a, zero))
//#define spu_sqrt(a) sqrtf4_fast(spu_max(a, zero))

#define spu_recip _recipf4
//#define spu_recip _recipf4_fast

#define spu_rsqrt _rsqrtf4

#define spu_log _logf4
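
// Illustrative note (not used by the solver): the clamp inside spu_sqrt
// relies on the _fmaxf4 behaviour described above -- both negative lanes and
// NaN lanes are mapped to zero before the square root, so spu_sqrt never
// sees an out-of-domain input. For example:
//
//   vector float bad  = {-1.0f, 4.0f, 0.0f, 9.0f};
//   vector float safe = spu_sqrt(bad);   // -> approximately {0, 2, 0, 3}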

#define VECTOR_PRINT(var, val) \
    printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define iVECTOR_PRINT(var, val) \
    printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define VECTOR_PRINT_MATRIX(num, fmt, MATRIX, width, n, m) \
    for (i = 0; i < n; i++) { \
        for (j = 0; j < m; j++) \
            printf(fmt, spu_extract(MATRIX[i * width + j], num)); \

inline static vector unsigned int vec_potrfU_4(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda, *pA2 = pA1 + lda, *pA3 = pA2 + lda;
    vector float L11 = *A, L21 = *pA1, L31 = *pA2, L41 = *pA3;
    vector float L22 = pA1[1], L32 = pA2[1], L42 = pA3[1];
    vector float L33 = pA2[2], L43 = pA3[2];
    vector float L44 = pA3[3];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);
    L41 = spu_mul(L41, L11);

    L22 = spu_nmsub(L21, L21, L22); // L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    pA1[1] = spu_recip(L22);

    L32 = spu_mul(L22, spu_nmsub(L31, L21, L32)); // (L32 - L31*L21) * L22;
    L42 = spu_mul(L22, spu_nmsub(L41, L21, L42)); // (L42 - L41*L21) * L22;
    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33)); // (L33 - L31*L31) - L32*L32;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    L33 = spu_rsqrt(L33);
    pA2[2] = spu_recip(L33);

    L43 = spu_mul(L33, spu_nmsub(L42, L32, spu_nmsub(L41, L31, L43))); // ((L43 - L41*L31) - L42*L32) * L33;
    L44 = spu_nmsub(L43, L43, spu_nmsub(L42, L42, spu_nmsub(L41, L41, L44))); // (((L44 - L41*L41) - L42*L42) - L43*L43);

    errors = spu_and(spu_cmpgt(L44, zero), errors);
    pA3[3] = spu_sqrt(L44);
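
/*
 * For reference, a scalar sketch of the recurrence the unrolled routine above
 * follows: each SIMD lane factors an independent 4x4 matrix, and the "errors"
 * mask records, per lane, whether every pivot stayed positive. Illustration
 * only (assumes <math.h>); it is not part of the SPU code path.
 */
#if 0
static int cholesky4_scalar(float a[4][4]) /* lower triangle, row-major */
{
    int i, j, k;
    for (j = 0; j < 4; j++) {
        float d = a[j][j];
        for (k = 0; k < j; k++)
            d -= a[j][k] * a[j][k];        /* pivot: A[j][j] - sum_k L[j][k]^2 */
        if (d <= 0.0f)
            return 0;                      /* not positive definite */
        a[j][j] = sqrtf(d);
        for (i = j + 1; i < 4; i++) {
            float s = a[i][j];
            for (k = 0; k < j; k++)
                s -= a[i][k] * a[j][k];
            a[i][j] = s / a[j][j];         /* L[i][j] = (A[i][j] - sum) / L[j][j] */
        }
    }
    return 1;
}
#endif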

inline static vector unsigned int vec_potrfU_3(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda, *pA2 = pA1 + lda;
    register vector float L11 = *A, L21 = *pA1, L31 = *pA2;
    register vector float L22 = pA1[1], L32 = pA2[1];
    register vector float L33 = pA2[2];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);

    L22 = spu_nmsub(L21, L21, L22);

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    L22 = spu_rsqrt(L22);
    L32 = spu_mul(spu_nmsub(L31, L21, L32), L22); // (L32 - L31*L21) / sqrt(L22);

    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33)); // (L33 - L31*L31) - L32*L32;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    pA1[1] = spu_recip(L22);

    pA2[2] = spu_sqrt(L33);

inline static vector unsigned int vec_potrfU_2(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda;
    register vector float L11 = *A, L21 = *pA1, L22 = pA1[1];

    register vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);

    *pA1 = L21 = spu_mul(L21, L11); // multiply by rsqrt(L11); a zero pivot behaves like division by zero

    L22 = spu_nmsub(L21, L21, L22); // L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);
    pA1[1] = spu_sqrt(L22);

inline static void vec_strsv_rlnn (short int N, const vector float *A, const int lda, vector float *X) {
    register short int i, j;

    for (i = 0; i < N; i++) {
        vector float tmp = X[i];

        for (j = 0; j < i; j++) {
            tmp = spu_nmsub(A[lda * i + j], X[j], tmp); // tmp -= A[lda * i + j] * X[j];
        }
        X[i] = spu_div(tmp, A[(lda + 1) * i]); // X[i] = tmp / A[lda * i + i];
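
/*
 * Scalar sketch of the same forward substitution: solve L*y = x in place,
 * with L lower triangular and its diagonal at A[i*lda + i]. The SIMD version
 * above does this element-wise, i.e. four independent systems at once, one
 * per lane. Illustration only, not part of the SPU code path.
 */
#if 0
static void strsv_rlnn_scalar(short int N, const float *A, int lda, float *X)
{
    short int i, j;
    for (i = 0; i < N; i++) {
        float tmp = X[i];
        for (j = 0; j < i; j++)
            tmp -= A[lda * i + j] * X[j];  /* subtract already-solved terms */
        X[i] = tmp / A[(lda + 1) * i];     /* divide by the diagonal */
    }
}
#endif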

#include <sum_across_float4.h>
#include "vec_potrf_mtxmul.h"

#define HSUM(r1,r2,r3,r4) \
    tmp1 = spu_rlqwbyte(r1, 8); \
    tmp2 = spu_rlqwbyte(r3, 8); \
    tmp3 = spu_sel(r1, r3, mask1); \
    tmp4 = spu_sel(tmp1, tmp2, mask1); \
    tmp5 = spu_add(tmp3, tmp4); \
    tmp1 = spu_rlqwbyte(r2, 8); \
    tmp2 = spu_rlqwbyte(r4, 8); \
    tmp3 = spu_sel(r2, r4, mask1); \
    tmp4 = spu_sel(tmp1, tmp2, mask1); \
    tmp6 = spu_add(tmp3, tmp4); \
    tmp1 = spu_rlqwbyte(tmp5, 4); \
    tmp2 = spu_rlqwbyte(tmp6, -4); \
    tmp3 = spu_sel(tmp5, tmp6, mask2); \
    tmp4 = spu_sel(tmp1, tmp2, mask2); \
    tmp5 = spu_add(tmp3, tmp4);
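
/*
 * HSUM reduces four accumulator vectors to one: lane k of the result (tmp5)
 * ends up holding the horizontal sum of r<k+1>, computed with rotates and
 * selects only (no scalar extraction). A scalar sketch of the intent,
 * illustration only:
 */
#if 0
static void hsum_scalar(const float r[4][4], float out[4])
{
    int k;
    for (k = 0; k < 4; k++)
        out[k] = r[k][0] + r[k][1] + r[k][2] + r[k][3];
}
#endif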

#define DO_MATRIX(size) \
    for (k = 0; k < K; k += BLOCK_X) { \
        const vector float *B = A + k; \

void vec_ssyrk_rln_11 (short int N, short int K, const vector float *A, short int lda, float *C, short int ldc)
{
    register short int k;

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};

    // assert((N % 16) == 0);
    // ldc is walloc; N = ralloc may be smaller as a memory optimization, but walloc must be used here so the size stays a multiple of 16
    memset(C, 0, N * ldc * sizeof(float));
    ssyrk_spu((float*)A, C, 1, /*N*/ldc, K * 4, lda * 4, ldc);
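
    /*
     * The intended result, spelled out by the reference loop below, is the
     * lower-triangle rank-K update
     *     C[i*ldc + j] = sum_k A[i*lda + k] . A[j*lda + k],   0 <= j <= i < N,
     * where each "." is an element-wise product of 4-float vectors whose
     * lanes still have to be summed into the scalar C entry.
     */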

    for (i = 0; i < N; i++) {
        for (j = 0; j <= i; j++) {
            register vector float temp = zero;
            for (k = 0; k < K; k++) {
                temp = spu_madd(A[i * lda + k], A[j * lda + k], temp);
            }
            C[i * ldc + j] = spu_extract(temp, 0) + spu_extract(temp, 1) + spu_extract(temp, 2) + spu_extract(temp, 3);

#define DO_ROW(size) \
    for (k = 0; k < K; k += BLOCK_X) { \
        const vector float *B = A + row * lda + k; \
        COMPUTE_V##size((A + l * lda + k)) \
    SUM_V##size(((vector float*)(C+l)));

void vec_update_row (short int row, short int N, short int K, const vector float *A, short int lda, float *C) {

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
    // vector float temp;

    for (l = 0; l < N; l += 16) {
        short int rem = min(16, N - l);

        for (k = 0; k < K; k += BLOCK_X) {
            const vector float *B = A + row * lda + k;

            COMPUTE_V5((A + l * lda + k))

        SUM_V5(((vector float*)(C+l)));

    vector float temp[N];
    memset(temp, 0, sizeof(vector float) * N);
    for (k = 0; k < K; k++) {

        register vector float Arow = A[row * lda + k];
        for (l = 0; l < N; l++) {
            temp[l] = spu_madd(Arow, A[l * lda + k], temp[l]);
            C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);

        printf("Step: %i\n", k);
        PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);

    for (l = 0; l < N; l++) {
        C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);

    PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);

    for (l = 0; l < N; l++) {

        register vector float tmp = zero;
        for (k = 0; k < K; k++) {
            tmp = spu_madd(A[i * lda + k], A[j * lda + k], tmp);

        C[l] = spu_extract(tmp, 0) + spu_extract(tmp, 1) + spu_extract(tmp, 2) + spu_extract(tmp, 3);

    PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);
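
/*
 * What vec_update_row computes, as a scalar sketch: C[l] is the dot product
 * of row "row" of A with row l of A, taken over all 4*K floats (the K vector
 * elements multiplied element-wise and then summed across the four lanes).
 * Illustration only; lda4 below is a hypothetical stride in floats (4 * lda
 * of the vector version).
 */
#if 0
static void update_row_scalar(short int row, short int N, short int K,
                              const float *A, short int lda4, float *C)
{
    short int l, k;
    for (l = 0; l < N; l++) {
        float acc = 0.0f;
        for (k = 0; k < 4 * K; k++)
            acc += A[row * lda4 + k] * A[l * lda4 + k];
        C[l] = acc;
    }
}
#endif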

inline static void vec_spotrf_step(const short int M, const short int N, vector float *A, const short int lda) {
    register short int i, j, k;

    vector float *B = A + M * lda;
    vector float *C = B + M;

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            vector float sum = {0, 0, 0, 0};

            for (k = 0; k < j; k++) {
                sum = spu_madd(A[j * lda + k], B[i * lda + k], sum);
            }
            B[i * lda + j] = spu_div(spu_sub(B[i * lda + j], sum), A[j * lda + j]);
        }

        for (j = 0; j <= i; j++) {
            vector float temp = {0, 0, 0, 0};

            for (k = 0; k < M; k++) {
                temp = spu_madd(B[i * lda + k], B[j * lda + k], temp);
            }
            C[i * lda + j] = spu_sub(C[i * lda + j], temp);
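
    /*
     * This is the standard blocked trailing update: with the leading M x M
     * block already factored (its Cholesky factor L stored in A), each row
     * of the panel B is solved by forward substitution,
     *     B[i][j] = (B[i][j] - sum_{k<j} A[j][k]*B[i][k]) / A[j][j],
     * i.e. B <- B * L^{-T}, and the trailing block is then updated as
     *     C[i][j] = C[i][j] - sum_{k<M} B[i][k]*B[j][k],   j <= i,
     * i.e. C <- C - B*B^T (lower triangle only). Each SIMD lane carries an
     * independent matrix.
     */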

// errors are ignored if they would lead to a 0 determinant
vector unsigned int vec_spotrf_u(short int N, vector float *A, short int lda) {
    short int Nleft, Nright;

    vector unsigned int errors;

    //PRINT_MULTIMATRIX(0, "% 7.4f ", ((float*)A), 5, 5, 5);

        // printf("passed matrix\n");
        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        errors = vec_spotrf_u(Nleft, A, lda);

        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        vec_spotrf_step(Nleft, Nright, A, lda);

        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

        errors = spu_and(vec_spotrf_u(Nright, A + (lda + 1) * Nleft, lda), errors);
        // printf("right\n");
        // VECTOR_PRINT_MATRIX(0, "% 8.4f ", A, 5, 5, 5);

    else if (N == 4) errors = vec_potrfU_4(A, lda);
    else if (N == 3) errors = vec_potrfU_3(A, lda);
    else if (N == 2) errors = vec_potrfU_2(A, lda);

        errors = spu_cmpgt(*A, zero);

    } else return spu_cmpgt(zero, zero); // anything
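
/*
 * Shape of the recursion above, sketched (illustration only; the exact split
 * of N into Nleft/Nright is not shown here): factor the leading Nleft x Nleft
 * block, apply vec_spotrf_step to the panel and the trailing block, then
 * recurse on the Nright x Nright block at A + (lda + 1) * Nleft. Small N is
 * handled by the unrolled vec_potrfU_* leaves, and the per-lane error masks
 * of both halves are AND-ed together.
 */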

vector float vec_rcorr(short int N, vector float *C, vector float *Ca, vector float *Cb) {
    register short int i;
    short int step = N + 1;
    short int end = N * N;
    vector float detC, detAB;

    // VECTOR_PRINT_MATRIX(1, "%6.4f", C, 5, 5, 5);

    detAB = spu_mul(Ca[0], Cb[0]);
    detC = C[0];
    for (i = step; i < end; i += step) {
        detAB = spu_mul(detAB, spu_mul(Ca[i], Cb[i]));
        detC = spu_mul(detC, C[i]);
    }

    VECTOR_PRINT("detAB", detAB);
    VECTOR_PRINT("detC", detC);
    VECTOR_PRINT("detC^2", spu_mul(detC, detC));
    VECTOR_PRINT("C/AB", spu_div(spu_mul(detC, detC), detAB));
    VECTOR_PRINT("log", spu_log(spu_div(spu_mul(detC, detC), detAB)));

    return spu_mul(two, spu_log(spu_div(spu_mul(detC, detC), detAB)));
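
/*
 * Note on the determinants above (assuming C, Ca and Cb hold Cholesky
 * factors, with the factor diagonals at indices 0, step, 2*step, ...): the
 * determinant of a factored matrix is the squared product of its factor's
 * diagonal, so detC*detC is the determinant of the matrix C factors, while
 * detAB is the square root of the product of the determinants of the
 * matrices Ca and Cb factor; the value returned is therefore
 *     2 * log( detC^2 / detAB ) per lane.
 */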

vector float vec_rmahal(short int N, vector float *C, vector float *X) {
    register short int i;
    vector float sum = zero;

    vec_strsv_rlnn(N, C, N, X);
    for (i = 0; i < N; i++) {
        vector float val = X[i];
        sum = spu_madd(val, val, sum);
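
/*
 * With C holding a Cholesky factor L of a covariance matrix (see vec_rcorr),
 * the loop above evaluates the Mahalanobis-style quadratic form
 *     x^T * Sigma^{-1} * x = | L^{-1} x |^2,
 * using the forward solve y = L^{-1} x from vec_strsv_rlnn and then summing
 * y[i]^2. Each SIMD lane carries an independent (Sigma, x) pair.
 */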

vector float vec_bhata(vector float rcorr, vector float rmahal) {
    return spu_add(spu_div(rmahal, eight), spu_div(rcorr, four));
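
/*
 * For reference, the Bhattacharyya distance between two Gaussians is
 *     D_B = (1/8) * (mu1 - mu2)^T Sigma^{-1} (mu1 - mu2)
 *         + (1/2) * ln( det(Sigma) / sqrt(det(Sigma1) * det(Sigma2)) ),
 * with Sigma the combined covariance. Assuming rmahal and rcorr come from
 * vec_rmahal and vec_rcorr above, rmahal/8 supplies the first term and
 * rcorr/4 = (1/2) * log(detC^2 / detAB) supplies the second.
 */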