/ani/mrses

To get this branch, use:
bzr branch http://suren.me/webbzr/ani/mrses

Revision 1 by Suren A. Chilingaryan: initial import
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <blas_s.h>
#include <simdmath.h>
#include <libvector.h>

#include <simdmath/sqrtf4.h>
#include <simdmath/fmaxf4.h>
#include <simdmath/divf4.h>
#include <simdmath/divf4_fast.h>
#include <simdmath/rsqrtf4.h>
#include <simdmath/recipf4.h>
#include <simdmath/recipf4_fast.h>
#include <simdmath/logf4.h>

#ifdef __SPU__
#include <spu_intrinsics.h>
#else /* not __SPU__ */
#include <altivec.h>
#endif


static vector float zero = {0, 0, 0, 0};
static vector float two = {2, 2, 2, 2};
static vector float four = {4, 4, 4, 4};
static vector float eight = {8, 8, 8, 8};


// the fast versions do not bring significant performance benefits

// fmaxf4 of NaN and zero is zero, so the max also sanitizes NaNs
#define spu_max _fmaxf4

// division by zero yields {NaN, NaN}; in fast mode {-0, -0}
#define spu_div _divf4
//#define spu_div _divf4_fast

// sqrt of a negative number: 0; in fast mode: undefined; sqrt of NaN: a big number
// clamping the argument against zero avoids both cases
#define spu_sqrt(a) _sqrtf4(spu_max(a, zero))
//#define spu_sqrt(a) sqrtf4_fast(spu_max(a, zero))

#define spu_recip _recipf4
//#define spu_recip _recipf4_fast

#define spu_rsqrt _rsqrtf4

#define spu_log _logf4
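
// For reference, a scalar sketch of the guarded sqrt above (hypothetical
// helper, illustrative only; would need <math.h> if enabled). C's fmaxf
// returns the non-NaN operand, so clamping against zero maps both negative
// and NaN inputs to 0 before the square root:
/*
    static float safe_sqrtf(float a) {
        return sqrtf(fmaxf(a, 0.0f));	// sqrt(max(a, 0)): never NaN, never undefined
    }
*/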

#define VECTOR_PRINT(var, val) \
    printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define iVECTOR_PRINT(var, val) \
    printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));

#define VECTOR_PRINT_MATRIX(num, fmt, MATRIX, width, n, m) \
{ \
    int i, j; \
    for (i = 0; i < n; i++) { \
	for (j = 0; j < m; j++) \
	    printf(fmt, spu_extract(MATRIX[i * width + j], num)); \
	printf("\n"); \
    } \
    printf("\n"); \
}


// Cholesky factorization of four independent 4x4 matrices, one per SIMD lane.
// Pivots are computed via rsqrt; the diagonal entry is stored back as
// recip(rsqrt(x)), i.e. approximately sqrt(x). Returns a per-lane mask that
// is all-ones in lanes where every pivot was positive.
inline static vector unsigned int vec_potrfU_4(vector float *A, const short int lda)
{
    vector float *pA1=A+lda, *pA2=pA1+lda, *pA3=pA2+lda;
    vector float L11 = *A, L21 = *pA1, L31 = *pA2, L41 = *pA3;
    vector float L22 = pA1[1], L32 = pA2[1], L42 = pA3[1];
    vector float L33 = pA2[2], L43 = pA3[2];
    vector float L44 = pA3[3];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);
    *A = spu_recip(L11);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);
    L41 = spu_mul(L41, L11);

    *pA1 = L21;
    *pA2 = L31;
    *pA3 = L41;

    L22 = spu_nmsub(L21, L21, L22);					// L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    L22 = spu_rsqrt(L22);
    pA1[1] = spu_recip(L22);

    L32 = spu_mul(L22, spu_nmsub(L31, L21, L32));			// (L32 - L31*L21) * L22;
    L42 = spu_mul(L22, spu_nmsub(L41, L21, L42));			// (L42 - L41*L21) * L22;
    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33));		// (L33 - L31*L31) - L32*L32;

    pA2[1] = L32;
    pA3[1] = L42;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    L33 = spu_rsqrt(L33);
    pA2[2] = spu_recip(L33);

    L43 = spu_mul(L33, spu_nmsub(L42, L32, spu_nmsub(L41, L31, L43)));		// ((L43 - L41*L31) - L42*L32) * L33;
    L44 = spu_nmsub(L43, L43, spu_nmsub(L42, L42, spu_nmsub(L41, L41, L44)));	// (((L44 - L41*L41) - L42*L42) - L43*L43);
    pA3[2] = L43;

    errors = spu_and(spu_cmpgt(L44, zero), errors);
    pA3[3] = spu_sqrt(L44);

    return errors;
}
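
// The unrolled kernels above and below implement the textbook Cholesky
// recurrence; a scalar sketch for one matrix (hypothetical helper,
// illustrative only, needs <math.h> if enabled):
/*
    // factor an n x n SPD matrix A (row-major, stride lda) in place: A = L*L^T
    static int potrf_ref(int n, float *A, int lda)
    {
        int i, j, k;
        for (j = 0; j < n; j++) {
            float d = A[j * lda + j];
            for (k = 0; k < j; k++)
                d -= A[j * lda + k] * A[j * lda + k];
            if (d <= 0.0f) return -1;			// pivot not positive
            A[j * lda + j] = sqrtf(d);
            for (i = j + 1; i < n; i++) {
                float s = A[i * lda + j];
                for (k = 0; k < j; k++)
                    s -= A[i * lda + k] * A[j * lda + k];
                A[i * lda + j] = s / A[j * lda + j];
            }
        }
        return 0;
    }
*/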


// 3x3 case of the same per-lane factorization
inline static vector unsigned int vec_potrfU_3(vector float *A, const short int lda)
{
    vector float *pA1=A+lda, *pA2=pA1+lda;
    register vector float L11 = *A, L21 = *pA1, L31 = *pA2;
    register vector float L22=pA1[1], L32=pA2[1];
    register vector float L33=pA2[2];

    vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);
    *A = spu_recip(L11);

    L21 = spu_mul(L21, L11);
    L31 = spu_mul(L31, L11);

    *pA1 = L21;
    *pA2 = L31;

    L22 = spu_nmsub(L21, L21, L22);

    errors = spu_and(spu_cmpgt(L22, zero), errors);

    L22 = spu_rsqrt(L22);
    L32 = spu_mul( spu_nmsub(L31, L21, L32), L22);  		// (L32 - L31*L21) / sqrt(L22);

    L33 = spu_nmsub(L32, L32, spu_nmsub(L31, L31, L33)); 	// (L33 - L31*L31) - L32 * L32;

    errors = spu_and(spu_cmpgt(L33, zero), errors);

    pA1[1] = spu_recip(L22);
    pA2[1] = L32;
    pA2[2] = spu_sqrt(L33);

    return errors;
}


// 2x2 case of the same per-lane factorization
inline static vector unsigned int vec_potrfU_2(vector float *A, const short int lda)
{
    vector float *pA1 = A + lda;
    register vector float L11 = *A, L21 = *pA1, L22 = pA1[1];

    register vector unsigned int errors;

    errors = spu_cmpgt(L11, zero);

    L11 = spu_rsqrt(L11);
    *A = spu_recip(L11);
    *pA1 = L21 = spu_mul(L21, L11);	// L21 / sqrt(L11); division by zero if L11 == 0

    L22 = spu_nmsub(L21, L21, L22);	// L22 -= L21*L21;

    errors = spu_and(spu_cmpgt(L22, zero), errors);
    pA1[1] = spu_sqrt(L22);

    return errors;
}

// forward substitution: solves L*x = b per lane for the lower-triangular
// factor stored in A (row stride lda); X holds b on entry and x on exit
inline static void vec_strsv_rlnn (short int N, const vector float *A, const int lda, vector float *X) {
    register short int i, j;

    for (i = 0; i < N; i++) {
        vector float tmp = X[i];

        for (j = 0; j < i; j++) {
            tmp = spu_nmsub(A[lda * i + j], X[j], tmp);			// tmp -= A[lda * i + j] * X[j];
        }
        X[i] = spu_div(tmp, A[(lda + 1) * i]);				// X[i] = tmp / A[lda * i + i];
    }
}
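
// Scalar equivalent, for reference (hypothetical helper, illustrative only):
/*
    static void strsv_ref(int n, const float *L, int lda, float *x)
    {
        int i, j;
        for (i = 0; i < n; i++) {
            float t = x[i];
            for (j = 0; j < i; j++)
                t -= L[i * lda + j] * x[j];		// eliminate the already-solved components
            x[i] = t / L[i * lda + i];			// divide by the diagonal entry
        }
    }
*/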


#define BLOCK_X 4
#define BLOCK_Y 8

#include <sum_across_float4.h>
#include "vec_potrf_mtxmul.h"

// HSUM(r1..r4): horizontal reduction via rotates and selects; afterwards
// tmp5 holds {sum(r1), sum(r2), sum(r3), sum(r4)}
#define HSUM(r1,r2,r3,r4) \
	tmp1 = spu_rlqwbyte(r1, 8); \
	tmp2 = spu_rlqwbyte(r3, 8); \
 \
	tmp3 = spu_sel(r1, r3, mask1); \
	tmp4 = spu_sel(tmp1, tmp2, mask1); \
	tmp5 = spu_add(tmp3, tmp4); \
 \
	tmp1 = spu_rlqwbyte(r2, 8); \
	tmp2 = spu_rlqwbyte(r4, 8); \
 \
	tmp3 = spu_sel(r2, r4, mask1); \
	tmp4 = spu_sel(tmp1, tmp2, mask1); \
	tmp6 = spu_add(tmp3, tmp4); \
 \
	tmp1 = spu_rlqwbyte(tmp5, 4); \
	tmp2 = spu_rlqwbyte(tmp6, -4); \
 \
	tmp3 = spu_sel(tmp5, tmp6, mask2); \
	tmp4 = spu_sel(tmp1, tmp2, mask2); \
 \
	tmp5 = spu_add(tmp3, tmp4);


#define DO_MATRIX(size) \
    DECLARE_R##size \
    for (k = 0; k < K; k += BLOCK_X) { \
	const vector float *B = A + k; \
 \
	DECLARE_T##size(B) \
	COMPUTE_T##size(C) \
    } \
    SUM_T##size(C)


#include "tools.h"
// symmetric rank-K update C = A * A^T (lower triangle), dispatching to a
// fully unrolled kernel for small N and to ssyrk_spu() for larger matrices
void vec_ssyrk_rln_11 (short int N, short int K, const vector float *A, short int lda, float *C, short int ldc)
{
    register short int k;

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};

    assert((K%16)==0);

    K >>= 2;
    lda >>= 2;

    if (N < 5) {
        if (N < 3) {
            if (N < 2) {
                DO_MATRIX(1)
            } else {
                DO_MATRIX(2)
            }
        } else {
            if (N < 4) {
                DO_MATRIX(3)
            } else {
                DO_MATRIX(4)
            }
        }
    } else if (N < 9) {
        if (N < 7) {
            if (N < 6) {
                DO_MATRIX(5)
            } else {
                DO_MATRIX(6)
            }
        } else {
            if (N < 8) {
                DO_MATRIX(7)
            } else {
                DO_MATRIX(8)
            }
        }
    } else if (N < 13) {
        if (N < 11) {
            if (N < 10) {
                DO_MATRIX(9)
            } else {
                DO_MATRIX(10)
            }
        } else {
            if (N < 12) {
                DO_MATRIX(11)
            } else {
                DO_MATRIX(12)
            }
        }
    } else {
//        assert((N%16)==0);
        // ldc is walloc; N (= ralloc) may be smaller as a memory optimization,
        // but walloc must be used here so the stride stays a multiple of 16
        memset(C, 0, N * ldc * sizeof(float));
        ssyrk_spu((float*)A, C, 1, /*N*/ldc, K * 4, lda * 4, ldc);
    }

    /*
        // scalar reference implementation
        int i,j;
        for (i = 0; i < N; i++) {
            for (j = 0; j <= i; j++) {
                register vector float temp = zero;
                for (k = 0; k < K; k++) {
                    temp = spu_madd(A[i * lda + k], A[j * lda + k], temp);
                }
                C[i * ldc + j] = spu_extract(temp, 0) + spu_extract(temp, 1) + spu_extract(temp, 2) + spu_extract(temp, 3);
            }
        }
    */
}
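
// A typical call might look as follows (hypothetical buffers, illustrative
// sketch only; an 8 x 16 row-major matrix of floats reinterpreted as vectors,
// K = 16 satisfying the assert, lda = ldc = 16 floats):
/*
    float a[8 * 16] __attribute__((aligned(16)));
    float c[8 * 16] __attribute__((aligned(16)));
    vec_ssyrk_rln_11(8, 16, (const vector float*)a, 16, c, 16);
*/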


#define DO_ROW(size) \
    DECLARE_VR##size \
    for (k = 0; k < K; k += BLOCK_X) { \
	const vector float *B = A + row * lda + k; \
    \
	DECLARE_VX(B) \
	COMPUTE_V##size((A + l * lda + k)) \
    } \
    SUM_V##size(((vector float*)(C+l)));

// computes one row of C = A * A^T: C[l] = dot(A[row], A[l]) for l = 0..N-1,
// handling up to 16 output elements per iteration
void vec_update_row (short int row, short int N, short int K, const vector float *A, short int lda, float *C) {
    int k, l;

    register vector float tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    register vector unsigned int mask1 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
    register vector unsigned int mask2 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
//    vector float temp;

    assert((K%16)==0);

    K >>= 2;
    lda >>= 2;

    for (l = 0; l < N; l+=16) {
        short int rem = min(16, N - l);

        if (rem < 9) {
            if (rem < 5) {
                if (rem < 3) {
                    if (rem < 2) {
                        DO_ROW(1)
                    } else {
                        DO_ROW(2)
                    }
                } else {
                    if (rem < 4) {
                        DO_ROW(3)
                    } else {
                        DO_ROW(4)
                    }
                }
            } else {
                if (rem < 7) {
                    if (rem < 6) {
                        DO_ROW(5)
                    } else {
                        DO_ROW(6)
                    }
                } else {
                    if (rem < 8) {
                        DO_ROW(7)
                    } else {
                        DO_ROW(8)
                    }
                }
            }
        } else {
            if (rem < 13) {
                if (rem < 11) {
                    if (rem < 10) {
                        DO_ROW(9)
                    } else {
                        DO_ROW(10)
                    }
                } else {
                    if (rem < 12) {
                        DO_ROW(11)
                    } else {
                        DO_ROW(12)
                    }
                }
            } else {
                if (rem < 15) {
                    if (rem < 14) {
                        DO_ROW(13)
                    } else {
                        DO_ROW(14)
                    }
                } else {
                    if (rem < 16) {
                        DO_ROW(15)
                    } else {
                        DO_ROW(16)
                    }
                }
            }
        }
    }


    /*
        // single-size variant kept for reference
        DECLARE_VR5
        for (k = 0; k < K; k += BLOCK_X) {
            const vector float *B = A + row * lda + k;

            DECLARE_VX(B)
            COMPUTE_V5((A + l * lda + k))
        }
        SUM_V5(((vector float*)(C+l)));
    */


    /*
        // scalar reference with debug printing
        vector float temp[N];
        memset(temp, 0, sizeof(vector float)*N);
        for (k = 0; k < K; k++) {
            register vector float Arow = A[row * lda + k];
            for (l = 0; l < N; l++) {
                temp[l] = spu_madd(Arow, A[l * lda + k], temp[l]);
                C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);
            }
            if ((k%4)==3) {
                printf("Step: %i\n", k);
                PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);
            }
        }
        for (l = 0; l < N; l++) {
            C[l] = spu_extract(temp[l], 0) + spu_extract(temp[l], 1) + spu_extract(temp[l], 2) + spu_extract(temp[l], 3);
        }
        PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);
    */

    /*
        // symmetric scalar reference
        for (l = 0; l < N; l++) {
            if (l < row) {
                i = l; j = row;
            } else {
                i = row; j = l;
            }

            register vector float tmp = zero;
            for (k = 0; k < K; k++) {
                tmp = spu_madd(A[i * lda + k], A[j * lda + k], tmp);
            }
            C[l] = spu_extract(tmp, 0) + spu_extract(tmp, 1) + spu_extract(tmp, 2) + spu_extract(tmp, 3);
        }

        PRINT_MATRIX("% 6.4f ", C, 1, 1, 16);
    */
}


// one step of the blocked factorization: given the already-factored M x M
// leading block in A, triangular-solves the M x N panel B against it and
// subtracts B * B^T from the trailing N x N block C (all per SIMD lane)
inline static void vec_spotrf_step(const short int M, const short int N, vector float *A, const short int lda) {
    register short int i, j, k;

    vector float *B = A + M * lda;
    vector float *C = B + M;


    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            vector float sum = {0, 0, 0, 0};

            for (k = 0; k < j; k++) {
                sum = spu_madd(A[j * lda + k],  B[i * lda + k], sum);
            }

            B[i * lda + j] =  spu_div(spu_sub(B[i * lda + j], sum), A[j * lda + j]);
        }

        for (j = 0; j <= i; j++) {
            vector float temp = {0, 0, 0, 0};

            for (k = 0; k < M; k++) {
                temp = spu_madd(B[i * lda + k],  B[j * lda + k], temp);
            }

            C[i * lda + j] = spu_sub(C[i * lda + j], temp);
        }
    }
}
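
// In matrix terms the step above is the standard blocked Cholesky update
// (a sketch; block names are only for illustration):
/*
    [ A11  .  ]      A11 = L11 * L11^T   (already factored)
    [ B    C  ]  =>  B  := B * L11^-T    (triangular solve, first j-loop)
                     C  := C - B * B^T   (rank-M update, second j-loop)
*/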


// recursive per-lane Cholesky; errors that would only lead to a zero
// determinant are ignored
vector unsigned int vec_spotrf_u(short int N, vector float *A, short int lda) {
    short int Nleft, Nright;

    vector unsigned int errors;

    //PRINT_MULTIMATRIX(0, "% 7.4f  ", ((float*)A), 5, 5, 5);

    if (N > 4) {
        Nleft = N >> 1;
        Nright = N - Nleft;

//    printf("passed matrix\n");
//    VECTOR_PRINT_MATRIX(0, "% 8.4f  ", A, 5, 5, 5);

        errors = vec_spotrf_u(Nleft, A, lda);

//    printf("left\n");
//    VECTOR_PRINT_MATRIX(0, "% 8.4f  ", A, 5, 5, 5);

        vec_spotrf_step(Nleft, Nright, A, lda);
//    printf("step\n");
//    VECTOR_PRINT_MATRIX(0, "% 8.4f  ", A, 5, 5, 5);

        errors = spu_and(vec_spotrf_u(Nright, A + (lda + 1) * Nleft, lda), errors);
//    printf("right\n");
//    VECTOR_PRINT_MATRIX(0, "% 8.4f  ", A, 5, 5, 5);

    }
    else if (N==4) errors = vec_potrfU_4(A, lda);
    else if (N==3) errors = vec_potrfU_3(A, lda);
    else if (N==2) errors = vec_potrfU_2(A, lda);
    else if (N==1) {
        errors = spu_cmpgt(*A, zero);
        *A = spu_sqrt(*A);
    } else return spu_cmpgt(zero, zero); // anything

    return errors;
}

// correlation term of the distance: returns 2 * log(detC^2 / (detCa * detCb)),
// where detX is the product of the diagonal of the Cholesky factor X
// (i.e. the square root of the corresponding determinant)
vector float vec_rcorr(short int N, vector float *C, vector float *Ca, vector float *Cb) {
    short int i;
    short int step = N + 1;
    short int end = N * N;
    vector float detC, detAB;

//    VECTOR_PRINT_MATRIX(1, "%6.4f", C, 5, 5, 5);

    detC = C[0];
    detAB = spu_mul(Ca[0], Cb[0]);
    for (i = step; i < end; i += step) {
        detAB = spu_mul(detAB, spu_mul(Ca[i], Cb[i]));
        detC = spu_mul(detC, C[i]);
    }

    /*
        VECTOR_PRINT("detAB", detAB);
        VECTOR_PRINT("detC", detC);
        VECTOR_PRINT("detC^2", spu_mul(detC, detC));
        VECTOR_PRINT("C/AB", spu_div(spu_mul(detC, detC), detAB));
        VECTOR_PRINT("log", spu_log(spu_div(spu_mul(detC, detC), detAB)));
    */

    return spu_mul(two, spu_log(spu_div(spu_mul(detC, detC), detAB)));
}

// squared Mahalanobis distance per lane: solves C*y = X by forward
// substitution, then returns y^T * y
vector float vec_rmahal(short int N, vector float *C, vector float *X) {
    register short int i;
    vector float sum = zero;

    vec_strsv_rlnn(N, C, N, X);
    for (i = 0; i < N; i++) {
        vector float val = X[i];
        sum = spu_madd(val, val, sum);
    }

    return sum;
}

// Bhattacharyya distance assembled from the two partial terms
vector float vec_bhata(vector float rcorr, vector float rmahal) {
    return spu_add(spu_div(rmahal, eight), spu_div(rcorr, four));
}
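
// For reference (assuming C factors the pooled covariance S = (Sa + Sb)/2 and
// X holds the mean difference), the value returned by vec_bhata matches the
// textbook Bhattacharyya distance between two Gaussians:
/*
    D_B = (1/8) * (mu_a - mu_b)^T S^-1 (mu_a - mu_b)       -> rmahal / 8
        + (1/2) * log( det(S) / sqrt(det(Sa) * det(Sb)) )  -> rcorr  / 4

    (rcorr = 2 * log(det(S) / sqrt(det(Sa) * det(Sb))), since the diagonal
    products in vec_rcorr are square roots of the determinants.)
*/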