/ani/mrses

To get this branch, use:
bzr branch http://suren.me/webbzr/ani/mrses
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/*
 * Name builders: paste the given indices into the identifier of a local variable.
 *   T(1,2) -> T_1_2      R(1,2) -> R_1_2      V(3) -> V_3
 * The arguments are operands of ##, so they are pasted exactly as written;
 * pass literal indices, not expressions.
 */
#define T(row, col) T_##row##_##col
#define R(row, col) R_##row##_##col
#define V(idx) V_##idx
/*
 * Declaration helpers. Every macro expands to complete declarations (the
 * trailing ';' is part of the expansion), so invocations are simply written
 * back to back.
 *
 *   DECLARE_T(row, col, src)  T(row,col) loaded from *(src + row*lda + col);
 *                             `lda` must be visible where the macro expands.
 *   DECLARE_R(row, col)       R(row,col) initialised with spu_splats(0.0f).
 *   DECLARE_V(idx, src)       V(idx) loaded from *(src + idx).
 *   DECLARE_TX(row, src)      T(row,0) .. T(row,3).
 *   DECLARE_VX(src)           V(0) .. V(3).
 *
 * row/col/idx end up inside identifiers through T()/R()/V(), so they must be
 * literal indices.
 */
#define DECLARE_T(row, col, src) \
    register vector float T(row, col) = *((src) + row*lda + col);
#define DECLARE_R(row, col) \
    register vector float R(row, col) = spu_splats(0.0f);
#define DECLARE_V(idx, src) \
    register vector float V(idx) = *((src) + idx);
#define DECLARE_TX(row, src) \
    DECLARE_T(row, 0, src) \
    DECLARE_T(row, 1, src) \
    DECLARE_T(row, 2, src) \
    DECLARE_T(row, 3, src)
#define DECLARE_VX(src) \
    DECLARE_V(0, src) \
    DECLARE_V(1, src) \
    DECLARE_V(2, src) \
    DECLARE_V(3, src)
/*
 * Unrolled declaration tables for sizes n = 1..16. Level n expands level n-1
 * and appends the declarations for index n-1, so the three families must be
 * kept in step when edited.
 *
 *   DECLARE_Tn(A)  DECLARE_TX(i, A) for i = 0..n-1, i.e. T(i,0)..T(i,3) per row.
 *   DECLARE_Rn     DECLARE_R(i, j) for every pair i <= j < n.
 *   DECLARE_VRn    DECLARE_R(i, 0) for i = 0..n-1.
 *
 * DECLARE_Rn and DECLARE_VRn both declare R(0,0), so only one of the two can
 * be expanded in a given scope.
 */
#define DECLARE_T1(A) DECLARE_TX(0, A)
#define DECLARE_R1 DECLARE_R(0,0)
#define DECLARE_VR1 DECLARE_R(0,0)
#define DECLARE_T2(A) DECLARE_T1(A) DECLARE_TX(1, A)
#define DECLARE_VR2 DECLARE_VR1 DECLARE_R(1,0)
#define DECLARE_R2 DECLARE_R1 DECLARE_R(0, 1) DECLARE_R(1, 1)
#define DECLARE_T3(A) DECLARE_T2(A) DECLARE_TX(2, A)
#define DECLARE_VR3 DECLARE_VR2 DECLARE_R(2,0)
#define DECLARE_R3 DECLARE_R2 DECLARE_R(0, 2) DECLARE_R(1, 2) DECLARE_R(2, 2)
#define DECLARE_T4(A) DECLARE_T3(A) DECLARE_TX(3, A)
#define DECLARE_VR4 DECLARE_VR3 DECLARE_R(3,0)
#define DECLARE_R4 DECLARE_R3 DECLARE_R(0, 3) DECLARE_R(1, 3) DECLARE_R(2, 3) DECLARE_R(3, 3)
#define DECLARE_T5(A) DECLARE_T4(A) DECLARE_TX(4, A)
#define DECLARE_VR5 DECLARE_VR4 DECLARE_R(4,0)
#define DECLARE_R5 DECLARE_R4 DECLARE_R(0, 4) DECLARE_R(1, 4) DECLARE_R(2, 4) DECLARE_R(3, 4) DECLARE_R(4, 4)
#define DECLARE_T6(A) DECLARE_T5(A) DECLARE_TX(5, A)
#define DECLARE_VR6 DECLARE_VR5 DECLARE_R(5,0)
#define DECLARE_R6 DECLARE_R5 DECLARE_R(0, 5) DECLARE_R(1, 5) DECLARE_R(2, 5) DECLARE_R(3, 5) DECLARE_R(4, 5) DECLARE_R(5, 5)
#define DECLARE_T7(A) DECLARE_T6(A) DECLARE_TX(6, A)
#define DECLARE_VR7 DECLARE_VR6 DECLARE_R(6,0)
#define DECLARE_R7 DECLARE_R6 DECLARE_R(0, 6) DECLARE_R(1, 6) DECLARE_R(2, 6) DECLARE_R(3, 6) DECLARE_R(4, 6) DECLARE_R(5, 6) DECLARE_R(6, 6)
#define DECLARE_T8(A) DECLARE_T7(A) DECLARE_TX(7, A)
#define DECLARE_VR8 DECLARE_VR7 DECLARE_R(7,0)
#define DECLARE_R8 DECLARE_R7 DECLARE_R(0, 7) DECLARE_R(1, 7) DECLARE_R(2, 7) DECLARE_R(3, 7) DECLARE_R(4, 7) DECLARE_R(5, 7) DECLARE_R(6, 7) DECLARE_R(7, 7)
#define DECLARE_T9(A) DECLARE_T8(A) DECLARE_TX(8, A)
#define DECLARE_VR9 DECLARE_VR8 DECLARE_R(8,0)
#define DECLARE_R9 DECLARE_R8 DECLARE_R(0, 8) DECLARE_R(1, 8) DECLARE_R(2, 8) DECLARE_R(3, 8) DECLARE_R(4, 8) DECLARE_R(5, 8) DECLARE_R(6, 8) DECLARE_R(7, 8) DECLARE_R(8, 8)
#define DECLARE_T10(A) DECLARE_T9(A) DECLARE_TX(9, A)
#define DECLARE_VR10 DECLARE_VR9 DECLARE_R(9,0)
#define DECLARE_R10 DECLARE_R9 DECLARE_R(0, 9) DECLARE_R(1, 9) DECLARE_R(2, 9) DECLARE_R(3, 9) DECLARE_R(4, 9) DECLARE_R(5, 9) DECLARE_R(6, 9) DECLARE_R(7, 9) DECLARE_R(8, 9) DECLARE_R(9, 9)
#define DECLARE_T11(A) DECLARE_T10(A) DECLARE_TX(10, A)
#define DECLARE_VR11 DECLARE_VR10 DECLARE_R(10,0)
#define DECLARE_R11 DECLARE_R10 DECLARE_R(0, 10) DECLARE_R(1, 10) DECLARE_R(2, 10) DECLARE_R(3, 10) DECLARE_R(4, 10) DECLARE_R(5, 10) DECLARE_R(6, 10) DECLARE_R(7, 10) DECLARE_R(8, 10) DECLARE_R(9, 10) DECLARE_R(10, 10)
#define DECLARE_T12(A) DECLARE_T11(A) DECLARE_TX(11, A)
#define DECLARE_VR12 DECLARE_VR11 DECLARE_R(11,0)
#define DECLARE_R12 DECLARE_R11 DECLARE_R(0, 11) DECLARE_R(1, 11) DECLARE_R(2, 11) DECLARE_R(3, 11) DECLARE_R(4, 11) DECLARE_R(5, 11) DECLARE_R(6, 11) DECLARE_R(7, 11) DECLARE_R(8, 11) DECLARE_R(9, 11) DECLARE_R(10, 11) DECLARE_R(11, 11)
#define DECLARE_T13(A) DECLARE_T12(A) DECLARE_TX(12, A)
#define DECLARE_VR13 DECLARE_VR12 DECLARE_R(12,0)
#define DECLARE_R13 DECLARE_R12 DECLARE_R(0, 12) DECLARE_R(1, 12) DECLARE_R(2, 12) DECLARE_R(3, 12) DECLARE_R(4, 12) DECLARE_R(5, 12) DECLARE_R(6, 12) DECLARE_R(7, 12) DECLARE_R(8, 12) DECLARE_R(9, 12) DECLARE_R(10, 12) DECLARE_R(11, 12) DECLARE_R(12, 12)
#define DECLARE_T14(A) DECLARE_T13(A) DECLARE_TX(13, A)
#define DECLARE_VR14 DECLARE_VR13 DECLARE_R(13,0)
#define DECLARE_R14 DECLARE_R13 DECLARE_R(0, 13) DECLARE_R(1, 13) DECLARE_R(2, 13) DECLARE_R(3, 13) DECLARE_R(4, 13) DECLARE_R(5, 13) DECLARE_R(6, 13) DECLARE_R(7, 13) DECLARE_R(8, 13) DECLARE_R(9, 13) DECLARE_R(10, 13) DECLARE_R(11, 13) DECLARE_R(12, 13) DECLARE_R(13, 13)
#define DECLARE_T15(A) DECLARE_T14(A) DECLARE_TX(14, A)
#define DECLARE_VR15 DECLARE_VR14 DECLARE_R(14,0)
#define DECLARE_R15 DECLARE_R14 DECLARE_R(0, 14) DECLARE_R(1, 14) DECLARE_R(2, 14) DECLARE_R(3, 14) DECLARE_R(4, 14) DECLARE_R(5, 14) DECLARE_R(6, 14) DECLARE_R(7, 14) DECLARE_R(8, 14) DECLARE_R(9, 14) DECLARE_R(10, 14) DECLARE_R(11, 14) DECLARE_R(12, 14) DECLARE_R(13, 14) DECLARE_R(14, 14)
#define DECLARE_T16(A) DECLARE_T15(A) DECLARE_TX(15, A)
#define DECLARE_VR16 DECLARE_VR15 DECLARE_R(15,0)
#define DECLARE_R16 DECLARE_R15 DECLARE_R(0, 15) DECLARE_R(1, 15) DECLARE_R(2, 15) DECLARE_R(3, 15) DECLARE_R(4, 15) DECLARE_R(5, 15) DECLARE_R(6, 15) DECLARE_R(7, 15) DECLARE_R(8, 15) DECLARE_R(9, 15) DECLARE_R(10, 15) DECLARE_R(11, 15) DECLARE_R(12, 15) DECLARE_R(13, 15) DECLARE_R(14, 15) DECLARE_R(15, 15)

/*
 * Accumulation and store primitives. i, j, l and k are pasted into variable
 * names through T()/R()/V() and must be literal indices; `lda` / `ldc` must be
 * visible where the macros expand.
 *
 *   COMPUTE_T(i, j, l, var)  spu_madd(T(i,l), T(j,l), var)
 *   COMPUTE_V(A, l, k, var)  spu_madd(V(k), A[l*lda + k], var)
 *   COMPUTE_TX(i, j, C)      R(i,j) = madd chain over l = 0..3 of T(i,l), T(j,l),
 *                            seeded with the current R(i,j). C is accepted for
 *                            symmetry with SAVE_TX but is not used.
 *   COMPUTE_VX(A, l)         R(l,0) = madd chain over k = 0..3 of V(k), A[l*lda + k],
 *                            seeded with the current R(l,0).
 *   SAVE_TX(i, j, C)         C[i*ldc + j] = sum_across_float4(R(i,j))
 *
 * A is parenthesized before subscripting so that a pointer expression such as
 * `base + offset` indexes the intended element instead of binding the
 * subscript to `offset` only.
 */
#define COMPUTE_T(i, j, l, var) spu_madd(T(i,l), T(j,l), var)
#define COMPUTE_V(A, l, k, var) spu_madd(V(k), (A)[l*lda + k], var)
#define COMPUTE_TX(i, j, C) \
    R(i,j) = COMPUTE_T(i, j, 3, COMPUTE_T(i, j, 2, COMPUTE_T(i, j, 1, COMPUTE_T(i, j, 0, R(i,j)))));
#define COMPUTE_VX(A, l) \
    R(l,0) = COMPUTE_V(A, l, 3, COMPUTE_V(A, l, 2, COMPUTE_V(A, l, 1, COMPUTE_V(A, l, 0, R(l,0)))));
#define SAVE_TX(i, j, C) *((C) + i * ldc + j) = sum_across_float4(R(i,j));
/*
 * Unrolled kernels for sizes n = 1..16. Level n expands level n-1 and appends
 * the terms for index n-1 (the diagonal pair first, then pairs (0..n-2, n-1)).
 *
 *   COMPUTE_Tn(C)  COMPUTE_TX(i, j, C) for every pair i <= j < n. C is only
 *                  forwarded; COMPUTE_TX does not use it.
 *   COMPUTE_Vn(A)  COMPUTE_VX(A, l) for l = 0..n-1.
 *   SAVE_Tn(C)     SAVE_TX(i, j, C) for the same pairs, in the same order, as
 *                  COMPUTE_Tn.
 */
#define COMPUTE_T1(C) COMPUTE_TX(0, 0, C)
#define COMPUTE_V1(A) COMPUTE_VX(A, 0)
#define SAVE_T1(C) SAVE_TX(0, 0, C)
#define COMPUTE_T2(C) COMPUTE_T1(C) COMPUTE_TX(1,1,C) COMPUTE_TX(0,1,C)
#define COMPUTE_V2(A) COMPUTE_V1(A) COMPUTE_VX(A,1)
#define SAVE_T2(C) SAVE_T1(C) SAVE_TX(1,1,C) SAVE_TX(0,1,C)
#define COMPUTE_T3(C) COMPUTE_T2(C) COMPUTE_TX(2,2,C) COMPUTE_TX(0,2,C) COMPUTE_TX(1,2,C)
#define COMPUTE_V3(A) COMPUTE_V2(A) COMPUTE_VX(A,2)
#define SAVE_T3(C) SAVE_T2(C) SAVE_TX(2,2,C) SAVE_TX(0,2,C) SAVE_TX(1,2,C)
#define COMPUTE_T4(C) COMPUTE_T3(C) COMPUTE_TX(3,3,C) COMPUTE_TX(0,3,C) COMPUTE_TX(1,3,C) COMPUTE_TX(2,3,C)
#define COMPUTE_V4(A) COMPUTE_V3(A) COMPUTE_VX(A,3)
#define SAVE_T4(C) SAVE_T3(C) SAVE_TX(3,3,C) SAVE_TX(0,3,C) SAVE_TX(1,3,C) SAVE_TX(2,3,C)
#define COMPUTE_T5(C) COMPUTE_T4(C) COMPUTE_TX(4,4,C) COMPUTE_TX(0,4,C) COMPUTE_TX(1,4,C) COMPUTE_TX(2,4,C) COMPUTE_TX(3,4,C)
#define COMPUTE_V5(A) COMPUTE_V4(A) COMPUTE_VX(A,4)
#define SAVE_T5(C) SAVE_T4(C) SAVE_TX(4,4,C) SAVE_TX(0,4,C) SAVE_TX(1,4,C) SAVE_TX(2,4,C) SAVE_TX(3,4,C)
#define COMPUTE_T6(C) COMPUTE_T5(C) COMPUTE_TX(5,5,C) COMPUTE_TX(0,5,C) COMPUTE_TX(1,5,C) COMPUTE_TX(2,5,C) COMPUTE_TX(3,5,C) COMPUTE_TX(4,5,C)
#define COMPUTE_V6(A) COMPUTE_V5(A) COMPUTE_VX(A,5)
#define SAVE_T6(C) SAVE_T5(C) SAVE_TX(5,5,C) SAVE_TX(0,5,C) SAVE_TX(1,5,C) SAVE_TX(2,5,C) SAVE_TX(3,5,C) SAVE_TX(4,5,C)
#define COMPUTE_T7(C) COMPUTE_T6(C) COMPUTE_TX(6,6,C) COMPUTE_TX(0,6,C) COMPUTE_TX(1,6,C) COMPUTE_TX(2,6,C) COMPUTE_TX(3,6,C) COMPUTE_TX(4,6,C) COMPUTE_TX(5,6,C)
#define COMPUTE_V7(A) COMPUTE_V6(A) COMPUTE_VX(A,6)
#define SAVE_T7(C) SAVE_T6(C) SAVE_TX(6,6,C) SAVE_TX(0,6,C) SAVE_TX(1,6,C) SAVE_TX(2,6,C) SAVE_TX(3,6,C) SAVE_TX(4,6,C) SAVE_TX(5,6,C)
#define COMPUTE_T8(C) COMPUTE_T7(C) COMPUTE_TX(7,7,C) COMPUTE_TX(0,7,C) COMPUTE_TX(1,7,C) COMPUTE_TX(2,7,C) COMPUTE_TX(3,7,C) COMPUTE_TX(4,7,C) COMPUTE_TX(5,7,C) COMPUTE_TX(6,7,C)
#define COMPUTE_V8(A) COMPUTE_V7(A) COMPUTE_VX(A,7)
#define SAVE_T8(C) SAVE_T7(C) SAVE_TX(7,7,C) SAVE_TX(0,7,C) SAVE_TX(1,7,C) SAVE_TX(2,7,C) SAVE_TX(3,7,C) SAVE_TX(4,7,C) SAVE_TX(5,7,C) SAVE_TX(6,7,C)
#define COMPUTE_T9(C) COMPUTE_T8(C) COMPUTE_TX(8,8,C) COMPUTE_TX(0,8,C) COMPUTE_TX(1,8,C) COMPUTE_TX(2,8,C) COMPUTE_TX(3,8,C) COMPUTE_TX(4,8,C) COMPUTE_TX(5,8,C) COMPUTE_TX(6,8,C) COMPUTE_TX(7,8,C)
#define COMPUTE_V9(A) COMPUTE_V8(A) COMPUTE_VX(A,8)
#define SAVE_T9(C) SAVE_T8(C) SAVE_TX(8,8,C) SAVE_TX(0,8,C) SAVE_TX(1,8,C) SAVE_TX(2,8,C) SAVE_TX(3,8,C) SAVE_TX(4,8,C) SAVE_TX(5,8,C) SAVE_TX(6,8,C) SAVE_TX(7,8,C)
#define COMPUTE_T10(C) COMPUTE_T9(C) COMPUTE_TX(9,9,C) COMPUTE_TX(0,9,C) COMPUTE_TX(1,9,C) COMPUTE_TX(2,9,C) COMPUTE_TX(3,9,C) COMPUTE_TX(4,9,C) COMPUTE_TX(5,9,C) COMPUTE_TX(6,9,C) COMPUTE_TX(7,9,C) COMPUTE_TX(8,9,C)
#define COMPUTE_V10(A) COMPUTE_V9(A) COMPUTE_VX(A,9)
#define SAVE_T10(C) SAVE_T9(C) SAVE_TX(9,9,C) SAVE_TX(0,9,C) SAVE_TX(1,9,C) SAVE_TX(2,9,C) SAVE_TX(3,9,C) SAVE_TX(4,9,C) SAVE_TX(5,9,C) SAVE_TX(6,9,C) SAVE_TX(7,9,C) SAVE_TX(8,9,C)
#define COMPUTE_T11(C) COMPUTE_T10(C) COMPUTE_TX(10,10,C) COMPUTE_TX(0,10,C) COMPUTE_TX(1,10,C) COMPUTE_TX(2,10,C) COMPUTE_TX(3,10,C) COMPUTE_TX(4,10,C) COMPUTE_TX(5,10,C) COMPUTE_TX(6,10,C) COMPUTE_TX(7,10,C) COMPUTE_TX(8,10,C) COMPUTE_TX(9,10,C)
#define COMPUTE_V11(A) COMPUTE_V10(A) COMPUTE_VX(A,10)
#define SAVE_T11(C) SAVE_T10(C) SAVE_TX(10,10,C) SAVE_TX(0,10,C) SAVE_TX(1,10,C) SAVE_TX(2,10,C) SAVE_TX(3,10,C) SAVE_TX(4,10,C) SAVE_TX(5,10,C) SAVE_TX(6,10,C) SAVE_TX(7,10,C) SAVE_TX(8,10,C) SAVE_TX(9,10,C)
#define COMPUTE_T12(C) COMPUTE_T11(C) COMPUTE_TX(11,11,C) COMPUTE_TX(0,11,C) COMPUTE_TX(1,11,C) COMPUTE_TX(2,11,C) COMPUTE_TX(3,11,C) COMPUTE_TX(4,11,C) COMPUTE_TX(5,11,C) COMPUTE_TX(6,11,C) COMPUTE_TX(7,11,C) COMPUTE_TX(8,11,C) COMPUTE_TX(9,11,C) COMPUTE_TX(10,11,C)
#define COMPUTE_V12(A) COMPUTE_V11(A) COMPUTE_VX(A,11)
#define SAVE_T12(C) SAVE_T11(C) SAVE_TX(11,11,C) SAVE_TX(0,11,C) SAVE_TX(1,11,C) SAVE_TX(2,11,C) SAVE_TX(3,11,C) SAVE_TX(4,11,C) SAVE_TX(5,11,C) SAVE_TX(6,11,C) SAVE_TX(7,11,C) SAVE_TX(8,11,C) SAVE_TX(9,11,C) SAVE_TX(10,11,C)
#define COMPUTE_T13(C) COMPUTE_T12(C) COMPUTE_TX(12,12,C) COMPUTE_TX(0,12,C) COMPUTE_TX(1,12,C) COMPUTE_TX(2,12,C) COMPUTE_TX(3,12,C) COMPUTE_TX(4,12,C) COMPUTE_TX(5,12,C) COMPUTE_TX(6,12,C) COMPUTE_TX(7,12,C) COMPUTE_TX(8,12,C) COMPUTE_TX(9,12,C) COMPUTE_TX(10,12,C) COMPUTE_TX(11,12,C)
#define COMPUTE_V13(A) COMPUTE_V12(A) COMPUTE_VX(A,12)
#define SAVE_T13(C) SAVE_T12(C) SAVE_TX(12,12,C) SAVE_TX(0,12,C) SAVE_TX(1,12,C) SAVE_TX(2,12,C) SAVE_TX(3,12,C) SAVE_TX(4,12,C) SAVE_TX(5,12,C) SAVE_TX(6,12,C) SAVE_TX(7,12,C) SAVE_TX(8,12,C) SAVE_TX(9,12,C) SAVE_TX(10,12,C) SAVE_TX(11,12,C)
#define COMPUTE_T14(C) COMPUTE_T13(C) COMPUTE_TX(13,13,C) COMPUTE_TX(0,13,C) COMPUTE_TX(1,13,C) COMPUTE_TX(2,13,C) COMPUTE_TX(3,13,C) COMPUTE_TX(4,13,C) COMPUTE_TX(5,13,C) COMPUTE_TX(6,13,C) COMPUTE_TX(7,13,C) COMPUTE_TX(8,13,C) COMPUTE_TX(9,13,C) COMPUTE_TX(10,13,C) COMPUTE_TX(11,13,C) COMPUTE_TX(12,13,C)
#define COMPUTE_V14(A) COMPUTE_V13(A) COMPUTE_VX(A,13)
#define SAVE_T14(C) SAVE_T13(C) SAVE_TX(13,13,C) SAVE_TX(0,13,C) SAVE_TX(1,13,C) SAVE_TX(2,13,C) SAVE_TX(3,13,C) SAVE_TX(4,13,C) SAVE_TX(5,13,C) SAVE_TX(6,13,C) SAVE_TX(7,13,C) SAVE_TX(8,13,C) SAVE_TX(9,13,C) SAVE_TX(10,13,C) SAVE_TX(11,13,C) SAVE_TX(12,13,C)
#define COMPUTE_T15(C) COMPUTE_T14(C) COMPUTE_TX(14,14,C) COMPUTE_TX(0,14,C) COMPUTE_TX(1,14,C) COMPUTE_TX(2,14,C) COMPUTE_TX(3,14,C) COMPUTE_TX(4,14,C) COMPUTE_TX(5,14,C) COMPUTE_TX(6,14,C) COMPUTE_TX(7,14,C) COMPUTE_TX(8,14,C) COMPUTE_TX(9,14,C) COMPUTE_TX(10,14,C) COMPUTE_TX(11,14,C) COMPUTE_TX(12,14,C) COMPUTE_TX(13,14,C)
#define COMPUTE_V15(A) COMPUTE_V14(A) COMPUTE_VX(A,14)
#define SAVE_T15(C) SAVE_T14(C) SAVE_TX(14,14,C) SAVE_TX(0,14,C) SAVE_TX(1,14,C) SAVE_TX(2,14,C) SAVE_TX(3,14,C) SAVE_TX(4,14,C) SAVE_TX(5,14,C) SAVE_TX(6,14,C) SAVE_TX(7,14,C) SAVE_TX(8,14,C) SAVE_TX(9,14,C) SAVE_TX(10,14,C) SAVE_TX(11,14,C) SAVE_TX(12,14,C) SAVE_TX(13,14,C)
#define COMPUTE_T16(C) COMPUTE_T15(C) COMPUTE_TX(15,15,C) COMPUTE_TX(0,15,C) COMPUTE_TX(1,15,C) COMPUTE_TX(2,15,C) COMPUTE_TX(3,15,C) COMPUTE_TX(4,15,C) COMPUTE_TX(5,15,C) COMPUTE_TX(6,15,C) COMPUTE_TX(7,15,C) COMPUTE_TX(8,15,C) COMPUTE_TX(9,15,C) COMPUTE_TX(10,15,C) COMPUTE_TX(11,15,C) COMPUTE_TX(12,15,C) COMPUTE_TX(13,15,C) COMPUTE_TX(14,15,C)
#define COMPUTE_V16(A) COMPUTE_V15(A) COMPUTE_VX(A,15)
#define SAVE_T16(C) SAVE_T15(C) SAVE_TX(15,15,C) SAVE_TX(0,15,C) SAVE_TX(1,15,C) SAVE_TX(2,15,C) SAVE_TX(3,15,C) SAVE_TX(4,15,C) SAVE_TX(5,15,C) SAVE_TX(6,15,C) SAVE_TX(7,15,C) SAVE_TX(8,15,C) SAVE_TX(9,15,C) SAVE_TX(10,15,C) SAVE_TX(11,15,C) SAVE_TX(12,15,C) SAVE_TX(13,15,C) SAVE_TX(14,15,C)

#define SUM_T1(C) HSUM(R(0, 0), zero, zero, zero);*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);
#define SUM_V1(C) HSUM(R(0, 0), zero, zero, zero); C[0] = tmp5;
#define SUM_T2(C) HSUM(R(0, 0), R(0, 1), R(1, 1), zero);*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);
#define SUM_V2(C) HSUM(R(0, 0), R(1, 0), zero, zero); C[0] = tmp5;
#define SUM_T3(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), zero, zero);*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);
#define SUM_V3(C) HSUM(R(0, 0), R(1, 0), R(2, 0), zero); C[0] = tmp5;
#define SUM_T4(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), zero, zero);*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);
#define SUM_V4(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5;
#define SUM_T5(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), zero);*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);
#define SUM_V5(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), zero, zero, zero); C[1] = tmp5;
#define SUM_T6(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), zero, zero, zero);*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);
#define SUM_V6(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), zero, zero); C[1] = tmp5;
#define SUM_T7(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3);
#define SUM_V7(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), R(6, 0), zero); C[1] = tmp5;
#define SUM_T8(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7));*((C) + 7 * ldc + 0) = spu_extract(tmp5,0);*((C) + 7 * ldc + 1) = spu_extract(tmp5,1);*((C) + 7 * ldc + 2) = spu_extract(tmp5,2);*((C) + 7 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7));*((C) + 7 * ldc + 4) = spu_extract(tmp5,0);*((C) + 7 * ldc + 5) = spu_extract(tmp5,1);*((C) + 7 * ldc + 6) = spu_extract(tmp5,2);*((C) + 7 * ldc + 7) = spu_extract(tmp5,3);
#define SUM_V8(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); C[1] = tmp5;
#define SUM_T9(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7));*((C) + 7 * ldc + 0) = spu_extract(tmp5,0);*((C) + 7 * ldc + 1) = spu_extract(tmp5,1);*((C) + 7 * ldc + 2) = spu_extract(tmp5,2);*((C) + 7 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7));*((C) + 7 * ldc + 4) = spu_extract(tmp5,0);*((C) + 7 * ldc + 5) = spu_extract(tmp5,1);*((C) + 7 * ldc + 6) = spu_extract(tmp5,2);*((C) + 7 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8));*((C) + 8 * 
ldc + 0) = spu_extract(tmp5,0);*((C) + 8 * ldc + 1) = spu_extract(tmp5,1);*((C) + 8 * ldc + 2) = spu_extract(tmp5,2);*((C) + 8 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8));*((C) + 8 * ldc + 4) = spu_extract(tmp5,0);*((C) + 8 * ldc + 5) = spu_extract(tmp5,1);*((C) + 8 * ldc + 6) = spu_extract(tmp5,2);*((C) + 8 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(8, 8), zero, zero, zero);*((C) + 8 * ldc + 8) = spu_extract(tmp5,0);
#define SUM_V9(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); C[1] = tmp5; HSUM(R(8, 0), zero, zero, zero); C[2] = tmp5;
#define SUM_T10(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7));*((C) + 7 * ldc + 0) = spu_extract(tmp5,0);*((C) + 7 * ldc + 1) = spu_extract(tmp5,1);*((C) + 7 * ldc + 2) = spu_extract(tmp5,2);*((C) + 7 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7));*((C) + 7 * ldc + 4) = spu_extract(tmp5,0);*((C) + 7 * ldc + 5) = spu_extract(tmp5,1);*((C) + 7 * ldc + 6) = spu_extract(tmp5,2);*((C) + 7 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8));*((C) + 8 * 
ldc + 0) = spu_extract(tmp5,0);*((C) + 8 * ldc + 1) = spu_extract(tmp5,1);*((C) + 8 * ldc + 2) = spu_extract(tmp5,2);*((C) + 8 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8));*((C) + 8 * ldc + 4) = spu_extract(tmp5,0);*((C) + 8 * ldc + 5) = spu_extract(tmp5,1);*((C) + 8 * ldc + 6) = spu_extract(tmp5,2);*((C) + 8 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9));*((C) + 8 * ldc + 8) = spu_extract(tmp5,0);*((C) + 9 * ldc + 0) = spu_extract(tmp5,1);*((C) + 9 * ldc + 1) = spu_extract(tmp5,2);*((C) + 9 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9));*((C) + 9 * ldc + 3) = spu_extract(tmp5,0);*((C) + 9 * ldc + 4) = spu_extract(tmp5,1);*((C) + 9 * ldc + 5) = spu_extract(tmp5,2);*((C) + 9 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(7, 9), R(8, 9), R(9, 9), zero);*((C) + 9 * ldc + 7) = spu_extract(tmp5,0);*((C) + 9 * ldc + 8) = spu_extract(tmp5,1);*((C) + 9 * ldc + 9) = spu_extract(tmp5,2);
#define SUM_V10(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); C[1] = tmp5; HSUM(R(8, 0), R(9, 0), zero, zero); C[2] = tmp5;
#define SUM_T11(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7));*((C) + 7 * ldc + 0) = spu_extract(tmp5,0);*((C) + 7 * ldc + 1) = spu_extract(tmp5,1);*((C) + 7 * ldc + 2) = spu_extract(tmp5,2);*((C) + 7 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7));*((C) + 7 * ldc + 4) = spu_extract(tmp5,0);*((C) + 7 * ldc + 5) = spu_extract(tmp5,1);*((C) + 7 * ldc + 6) = spu_extract(tmp5,2);*((C) + 7 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8));*((C) + 8 * 
ldc + 0) = spu_extract(tmp5,0);*((C) + 8 * ldc + 1) = spu_extract(tmp5,1);*((C) + 8 * ldc + 2) = spu_extract(tmp5,2);*((C) + 8 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8));*((C) + 8 * ldc + 4) = spu_extract(tmp5,0);*((C) + 8 * ldc + 5) = spu_extract(tmp5,1);*((C) + 8 * ldc + 6) = spu_extract(tmp5,2);*((C) + 8 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9));*((C) + 8 * ldc + 8) = spu_extract(tmp5,0);*((C) + 9 * ldc + 0) = spu_extract(tmp5,1);*((C) + 9 * ldc + 1) = spu_extract(tmp5,2);*((C) + 9 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9));*((C) + 9 * ldc + 3) = spu_extract(tmp5,0);*((C) + 9 * ldc + 4) = spu_extract(tmp5,1);*((C) + 9 * ldc + 5) = spu_extract(tmp5,2);*((C) + 9 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10));*((C) + 9 * ldc + 7) = spu_extract(tmp5,0);*((C) + 9 * ldc + 8) = spu_extract(tmp5,1);*((C) + 9 * ldc + 9) = spu_extract(tmp5,2);*((C) + 10 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10));*((C) + 10 * ldc + 1) = spu_extract(tmp5,0);*((C) + 10 * ldc + 2) = spu_extract(tmp5,1);*((C) + 10 * ldc + 3) = spu_extract(tmp5,2);*((C) + 10 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10));*((C) + 10 * ldc + 5) = spu_extract(tmp5,0);*((C) + 10 * ldc + 6) = spu_extract(tmp5,1);*((C) + 10 * ldc + 7) = spu_extract(tmp5,2);*((C) + 10 * ldc + 8) = spu_extract(tmp5,3); HSUM(R(9, 10), R(10, 10), zero, zero);*((C) + 10 * ldc + 9) = spu_extract(tmp5,0);*((C) + 10 * ldc + 10) = spu_extract(tmp5,1);
#define SUM_V11(C) HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0)); C[0] = tmp5; HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0)); C[1] = tmp5; HSUM(R(8, 0), R(9, 0), R(10, 0), zero); C[2] = tmp5;
#define SUM_T12(C) HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2));*((C) + 0 * ldc + 0) = spu_extract(tmp5,0);*((C) + 1 * ldc + 0) = spu_extract(tmp5,1);*((C) + 1 * ldc + 1) = spu_extract(tmp5,2);*((C) + 2 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3));*((C) + 2 * ldc + 1) = spu_extract(tmp5,0);*((C) + 2 * ldc + 2) = spu_extract(tmp5,1);*((C) + 3 * ldc + 0) = spu_extract(tmp5,2);*((C) + 3 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4));*((C) + 3 * ldc + 2) = spu_extract(tmp5,0);*((C) + 3 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 0) = spu_extract(tmp5,2);*((C) + 4 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5));*((C) + 4 * ldc + 2) = spu_extract(tmp5,0);*((C) + 4 * ldc + 3) = spu_extract(tmp5,1);*((C) + 4 * ldc + 4) = spu_extract(tmp5,2);*((C) + 5 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5));*((C) + 5 * ldc + 1) = spu_extract(tmp5,0);*((C) + 5 * ldc + 2) = spu_extract(tmp5,1);*((C) + 5 * ldc + 3) = spu_extract(tmp5,2);*((C) + 5 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6));*((C) + 5 * ldc + 5) = spu_extract(tmp5,0);*((C) + 6 * ldc + 0) = spu_extract(tmp5,1);*((C) + 6 * ldc + 1) = spu_extract(tmp5,2);*((C) + 6 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6));*((C) + 6 * ldc + 3) = spu_extract(tmp5,0);*((C) + 6 * ldc + 4) = spu_extract(tmp5,1);*((C) + 6 * ldc + 5) = spu_extract(tmp5,2);*((C) + 6 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7));*((C) + 7 * ldc + 0) = spu_extract(tmp5,0);*((C) + 7 * ldc + 1) = spu_extract(tmp5,1);*((C) + 7 * ldc + 2) = spu_extract(tmp5,2);*((C) + 7 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7));*((C) + 7 * ldc + 4) = spu_extract(tmp5,0);*((C) + 7 * ldc + 5) = spu_extract(tmp5,1);*((C) + 7 * ldc + 6) = spu_extract(tmp5,2);*((C) + 7 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8));*((C) + 8 * 
ldc + 0) = spu_extract(tmp5,0);*((C) + 8 * ldc + 1) = spu_extract(tmp5,1);*((C) + 8 * ldc + 2) = spu_extract(tmp5,2);*((C) + 8 * ldc + 3) = spu_extract(tmp5,3); HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8));*((C) + 8 * ldc + 4) = spu_extract(tmp5,0);*((C) + 8 * ldc + 5) = spu_extract(tmp5,1);*((C) + 8 * ldc + 6) = spu_extract(tmp5,2);*((C) + 8 * ldc + 7) = spu_extract(tmp5,3); HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9));*((C) + 8 * ldc + 8) = spu_extract(tmp5,0);*((C) + 9 * ldc + 0) = spu_extract(tmp5,1);*((C) + 9 * ldc + 1) = spu_extract(tmp5,2);*((C) + 9 * ldc + 2) = spu_extract(tmp5,3); HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9));*((C) + 9 * ldc + 3) = spu_extract(tmp5,0);*((C) + 9 * ldc + 4) = spu_extract(tmp5,1);*((C) + 9 * ldc + 5) = spu_extract(tmp5,2);*((C) + 9 * ldc + 6) = spu_extract(tmp5,3); HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10));*((C) + 9 * ldc + 7) = spu_extract(tmp5,0);*((C) + 9 * ldc + 8) = spu_extract(tmp5,1);*((C) + 9 * ldc + 9) = spu_extract(tmp5,2);*((C) + 10 * ldc + 0) = spu_extract(tmp5,3); HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10));*((C) + 10 * ldc + 1) = spu_extract(tmp5,0);*((C) + 10 * ldc + 2) = spu_extract(tmp5,1);*((C) + 10 * ldc + 3) = spu_extract(tmp5,2);*((C) + 10 * ldc + 4) = spu_extract(tmp5,3); HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10));*((C) + 10 * ldc + 5) = spu_extract(tmp5,0);*((C) + 10 * ldc + 6) = spu_extract(tmp5,1);*((C) + 10 * ldc + 7) = spu_extract(tmp5,2);*((C) + 10 * ldc + 8) = spu_extract(tmp5,3); HSUM(R(9, 10), R(10, 10), R(0, 11), R(1, 11));*((C) + 10 * ldc + 9) = spu_extract(tmp5,0);*((C) + 10 * ldc + 10) = spu_extract(tmp5,1);*((C) + 11 * ldc + 0) = spu_extract(tmp5,2);*((C) + 11 * ldc + 1) = spu_extract(tmp5,3); HSUM(R(2, 11), R(3, 11), R(4, 11), R(5, 11));*((C) + 11 * ldc + 2) = spu_extract(tmp5,0);*((C) + 11 * ldc + 3) = spu_extract(tmp5,1);*((C) + 11 * ldc + 4) = spu_extract(tmp5,2);*((C) + 11 * ldc + 5) = spu_extract(tmp5,3); HSUM(R(6, 11), R(7, 11), R(8, 11), R(9, 11));*((C) + 11 * ldc + 6) = 
spu_extract(tmp5,0);*((C) + 11 * ldc + 7) = spu_extract(tmp5,1);*((C) + 11 * ldc + 8) = spu_extract(tmp5,2);*((C) + 11 * ldc + 9) = spu_extract(tmp5,3); HSUM(R(10, 11), R(11, 11), zero, zero);*((C) + 11 * ldc + 10) = spu_extract(tmp5,0);*((C) + 11 * ldc + 11) = spu_extract(tmp5,1);
/*
 * SUM_V12(C): reduce the twelve column-0 accumulators R(0, 0) .. R(11, 0),
 * four at a time, and store each HSUM result (left in tmp5) to
 * (C)[0], (C)[1] and (C)[2].
 *
 * HSUM and tmp5 are defined elsewhere and must be in scope at the expansion
 * site, as must the R_<i>_0 variables named by R(i, 0).
 * The argument is parenthesized so pointer expressions such as `buf + 1`
 * index correctly. The expansion ends with ';', as before.
 */
#define SUM_V12(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0));   (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0));   (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5;
/*
 * SUM_T13(C): store the triangle of accumulators R(i, j), 0 <= i <= j <= 12.
 *
 * The 91 accumulators are visited in the order j = 0..12, i = 0..j and fed to
 * HSUM four at a time. After each HSUM, lane k of tmp5 (read with
 * spu_extract) is written to *((C) + j * ldc + i) for the k-th accumulator of
 * that group. The last group holds only three accumulators and is padded
 * with `zero`; the padded lane is not stored.
 *
 * HSUM, tmp5, zero and ldc are not defined here and must be in scope at the
 * expansion site, as must the R_<i>_<j> variables named by R(i, j).
 * NOTE(review): HSUM presumably leaves one horizontal sum per lane in tmp5;
 * confirm against its definition.
 *
 * The definition is a single logical line; every physical line but the last
 * must end with a backslash. The expansion ends with ';'.
 */
#define SUM_T13(C) \
    HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    *((C) + 0 * ldc + 0) = spu_extract(tmp5,0); *((C) + 1 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 1 * ldc + 1) = spu_extract(tmp5,2); *((C) + 2 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    *((C) + 2 * ldc + 1) = spu_extract(tmp5,0); *((C) + 2 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 3 * ldc + 0) = spu_extract(tmp5,2); *((C) + 3 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    *((C) + 3 * ldc + 2) = spu_extract(tmp5,0); *((C) + 3 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 0) = spu_extract(tmp5,2); *((C) + 4 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    *((C) + 4 * ldc + 2) = spu_extract(tmp5,0); *((C) + 4 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 4) = spu_extract(tmp5,2); *((C) + 5 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    *((C) + 5 * ldc + 1) = spu_extract(tmp5,0); *((C) + 5 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 5 * ldc + 3) = spu_extract(tmp5,2); *((C) + 5 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    *((C) + 5 * ldc + 5) = spu_extract(tmp5,0); *((C) + 6 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 1) = spu_extract(tmp5,2); *((C) + 6 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    *((C) + 6 * ldc + 3) = spu_extract(tmp5,0); *((C) + 6 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 5) = spu_extract(tmp5,2); *((C) + 6 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    *((C) + 7 * ldc + 0) = spu_extract(tmp5,0); *((C) + 7 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 2) = spu_extract(tmp5,2); *((C) + 7 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    *((C) + 7 * ldc + 4) = spu_extract(tmp5,0); *((C) + 7 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 6) = spu_extract(tmp5,2); *((C) + 7 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    *((C) + 8 * ldc + 0) = spu_extract(tmp5,0); *((C) + 8 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 2) = spu_extract(tmp5,2); *((C) + 8 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    *((C) + 8 * ldc + 4) = spu_extract(tmp5,0); *((C) + 8 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 6) = spu_extract(tmp5,2); *((C) + 8 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    *((C) + 8 * ldc + 8) = spu_extract(tmp5,0); *((C) + 9 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 1) = spu_extract(tmp5,2); *((C) + 9 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    *((C) + 9 * ldc + 3) = spu_extract(tmp5,0); *((C) + 9 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 5) = spu_extract(tmp5,2); *((C) + 9 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    *((C) + 9 * ldc + 7) = spu_extract(tmp5,0); *((C) + 9 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 9) = spu_extract(tmp5,2); *((C) + 10 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    *((C) + 10 * ldc + 1) = spu_extract(tmp5,0); *((C) + 10 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 3) = spu_extract(tmp5,2); *((C) + 10 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    *((C) + 10 * ldc + 5) = spu_extract(tmp5,0); *((C) + 10 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 7) = spu_extract(tmp5,2); *((C) + 10 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 10), R(10, 10), R(0, 11), R(1, 11)); \
    *((C) + 10 * ldc + 9) = spu_extract(tmp5,0); *((C) + 10 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 0) = spu_extract(tmp5,2); *((C) + 11 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 11), R(3, 11), R(4, 11), R(5, 11)); \
    *((C) + 11 * ldc + 2) = spu_extract(tmp5,0); *((C) + 11 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 4) = spu_extract(tmp5,2); *((C) + 11 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 11), R(7, 11), R(8, 11), R(9, 11)); \
    *((C) + 11 * ldc + 6) = spu_extract(tmp5,0); *((C) + 11 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 8) = spu_extract(tmp5,2); *((C) + 11 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 11), R(11, 11), R(0, 12), R(1, 12)); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); *((C) + 11 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 0) = spu_extract(tmp5,2); *((C) + 12 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 12), R(3, 12), R(4, 12), R(5, 12)); \
    *((C) + 12 * ldc + 2) = spu_extract(tmp5,0); *((C) + 12 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 4) = spu_extract(tmp5,2); *((C) + 12 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 12), R(7, 12), R(8, 12), R(9, 12)); \
    *((C) + 12 * ldc + 6) = spu_extract(tmp5,0); *((C) + 12 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 8) = spu_extract(tmp5,2); *((C) + 12 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 12), R(11, 12), R(12, 12), zero); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2);
/*
 * SUM_V13(C): reduce the thirteen column-0 accumulators R(0, 0) .. R(12, 0),
 * four at a time, and store each HSUM result (left in tmp5) to
 * (C)[0] .. (C)[3]. The last group holds a single accumulator and is padded
 * with `zero`.
 *
 * HSUM, tmp5 and zero are defined elsewhere and must be in scope at the
 * expansion site, as must the R_<i>_0 variables named by R(i, 0).
 * The argument is parenthesized so pointer expressions such as `buf + 1`
 * index correctly. The expansion ends with ';', as before.
 */
#define SUM_V13(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0));   (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0));   (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), zero, zero, zero);           (C)[3] = tmp5;
/*
 * SUM_T14(C): store the triangle of accumulators R(i, j), 0 <= i <= j <= 13.
 *
 * The 105 accumulators are visited in the order j = 0..13, i = 0..j and fed
 * to HSUM four at a time. After each HSUM, lane k of tmp5 (read with
 * spu_extract) is written to *((C) + j * ldc + i) for the k-th accumulator of
 * that group. The last group holds only R(13, 13) and is padded with `zero`;
 * the padded lanes are not stored.
 *
 * HSUM, tmp5, zero and ldc are not defined here and must be in scope at the
 * expansion site, as must the R_<i>_<j> variables named by R(i, j).
 * NOTE(review): HSUM presumably leaves one horizontal sum per lane in tmp5;
 * confirm against its definition.
 *
 * The definition is a single logical line; every physical line but the last
 * must end with a backslash. The expansion ends with ';'.
 */
#define SUM_T14(C) \
    HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    *((C) + 0 * ldc + 0) = spu_extract(tmp5,0); *((C) + 1 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 1 * ldc + 1) = spu_extract(tmp5,2); *((C) + 2 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    *((C) + 2 * ldc + 1) = spu_extract(tmp5,0); *((C) + 2 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 3 * ldc + 0) = spu_extract(tmp5,2); *((C) + 3 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    *((C) + 3 * ldc + 2) = spu_extract(tmp5,0); *((C) + 3 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 0) = spu_extract(tmp5,2); *((C) + 4 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    *((C) + 4 * ldc + 2) = spu_extract(tmp5,0); *((C) + 4 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 4) = spu_extract(tmp5,2); *((C) + 5 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    *((C) + 5 * ldc + 1) = spu_extract(tmp5,0); *((C) + 5 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 5 * ldc + 3) = spu_extract(tmp5,2); *((C) + 5 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    *((C) + 5 * ldc + 5) = spu_extract(tmp5,0); *((C) + 6 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 1) = spu_extract(tmp5,2); *((C) + 6 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    *((C) + 6 * ldc + 3) = spu_extract(tmp5,0); *((C) + 6 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 5) = spu_extract(tmp5,2); *((C) + 6 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    *((C) + 7 * ldc + 0) = spu_extract(tmp5,0); *((C) + 7 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 2) = spu_extract(tmp5,2); *((C) + 7 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    *((C) + 7 * ldc + 4) = spu_extract(tmp5,0); *((C) + 7 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 6) = spu_extract(tmp5,2); *((C) + 7 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    *((C) + 8 * ldc + 0) = spu_extract(tmp5,0); *((C) + 8 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 2) = spu_extract(tmp5,2); *((C) + 8 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    *((C) + 8 * ldc + 4) = spu_extract(tmp5,0); *((C) + 8 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 6) = spu_extract(tmp5,2); *((C) + 8 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    *((C) + 8 * ldc + 8) = spu_extract(tmp5,0); *((C) + 9 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 1) = spu_extract(tmp5,2); *((C) + 9 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    *((C) + 9 * ldc + 3) = spu_extract(tmp5,0); *((C) + 9 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 5) = spu_extract(tmp5,2); *((C) + 9 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    *((C) + 9 * ldc + 7) = spu_extract(tmp5,0); *((C) + 9 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 9) = spu_extract(tmp5,2); *((C) + 10 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    *((C) + 10 * ldc + 1) = spu_extract(tmp5,0); *((C) + 10 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 3) = spu_extract(tmp5,2); *((C) + 10 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    *((C) + 10 * ldc + 5) = spu_extract(tmp5,0); *((C) + 10 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 7) = spu_extract(tmp5,2); *((C) + 10 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 10), R(10, 10), R(0, 11), R(1, 11)); \
    *((C) + 10 * ldc + 9) = spu_extract(tmp5,0); *((C) + 10 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 0) = spu_extract(tmp5,2); *((C) + 11 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 11), R(3, 11), R(4, 11), R(5, 11)); \
    *((C) + 11 * ldc + 2) = spu_extract(tmp5,0); *((C) + 11 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 4) = spu_extract(tmp5,2); *((C) + 11 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 11), R(7, 11), R(8, 11), R(9, 11)); \
    *((C) + 11 * ldc + 6) = spu_extract(tmp5,0); *((C) + 11 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 8) = spu_extract(tmp5,2); *((C) + 11 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 11), R(11, 11), R(0, 12), R(1, 12)); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); *((C) + 11 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 0) = spu_extract(tmp5,2); *((C) + 12 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 12), R(3, 12), R(4, 12), R(5, 12)); \
    *((C) + 12 * ldc + 2) = spu_extract(tmp5,0); *((C) + 12 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 4) = spu_extract(tmp5,2); *((C) + 12 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 12), R(7, 12), R(8, 12), R(9, 12)); \
    *((C) + 12 * ldc + 6) = spu_extract(tmp5,0); *((C) + 12 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 8) = spu_extract(tmp5,2); *((C) + 12 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 12), R(11, 12), R(12, 12), R(0, 13)); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2); *((C) + 13 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 13), R(2, 13), R(3, 13), R(4, 13)); \
    *((C) + 13 * ldc + 1) = spu_extract(tmp5,0); *((C) + 13 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 3) = spu_extract(tmp5,2); *((C) + 13 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 13), R(6, 13), R(7, 13), R(8, 13)); \
    *((C) + 13 * ldc + 5) = spu_extract(tmp5,0); *((C) + 13 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 7) = spu_extract(tmp5,2); *((C) + 13 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 13), R(10, 13), R(11, 13), R(12, 13)); \
    *((C) + 13 * ldc + 9) = spu_extract(tmp5,0); *((C) + 13 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 11) = spu_extract(tmp5,2); *((C) + 13 * ldc + 12) = spu_extract(tmp5,3); \
    HSUM(R(13, 13), zero, zero, zero); \
    *((C) + 13 * ldc + 13) = spu_extract(tmp5,0);
/*
 * SUM_V14(C): reduce the fourteen column-0 accumulators R(0, 0) .. R(13, 0),
 * four at a time, and store each HSUM result (left in tmp5) to
 * (C)[0] .. (C)[3]. The last group holds two accumulators and is padded
 * with `zero`.
 *
 * HSUM, tmp5 and zero are defined elsewhere and must be in scope at the
 * expansion site, as must the R_<i>_0 variables named by R(i, 0).
 * The argument is parenthesized so pointer expressions such as `buf + 1`
 * index correctly. The expansion ends with ';', as before.
 */
#define SUM_V14(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0));   (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0));   (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0)); (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), zero, zero);       (C)[3] = tmp5;
/*
 * SUM_T15(C): store the triangle of accumulators R(i, j), 0 <= i <= j <= 14.
 *
 * The 120 accumulators are visited in the order j = 0..14, i = 0..j and fed
 * to HSUM four at a time (30 full groups, no padding). After each HSUM,
 * lane k of tmp5 (read with spu_extract) is written to
 * *((C) + j * ldc + i) for the k-th accumulator of that group.
 *
 * HSUM, tmp5 and ldc are not defined here and must be in scope at the
 * expansion site, as must the R_<i>_<j> variables named by R(i, j).
 * NOTE(review): HSUM presumably leaves one horizontal sum per lane in tmp5;
 * confirm against its definition.
 *
 * The definition is a single logical line; every physical line but the last
 * must end with a backslash. The expansion ends with ';'.
 */
#define SUM_T15(C) \
    HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    *((C) + 0 * ldc + 0) = spu_extract(tmp5,0); *((C) + 1 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 1 * ldc + 1) = spu_extract(tmp5,2); *((C) + 2 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    *((C) + 2 * ldc + 1) = spu_extract(tmp5,0); *((C) + 2 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 3 * ldc + 0) = spu_extract(tmp5,2); *((C) + 3 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    *((C) + 3 * ldc + 2) = spu_extract(tmp5,0); *((C) + 3 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 0) = spu_extract(tmp5,2); *((C) + 4 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    *((C) + 4 * ldc + 2) = spu_extract(tmp5,0); *((C) + 4 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 4) = spu_extract(tmp5,2); *((C) + 5 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    *((C) + 5 * ldc + 1) = spu_extract(tmp5,0); *((C) + 5 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 5 * ldc + 3) = spu_extract(tmp5,2); *((C) + 5 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    *((C) + 5 * ldc + 5) = spu_extract(tmp5,0); *((C) + 6 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 1) = spu_extract(tmp5,2); *((C) + 6 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    *((C) + 6 * ldc + 3) = spu_extract(tmp5,0); *((C) + 6 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 5) = spu_extract(tmp5,2); *((C) + 6 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    *((C) + 7 * ldc + 0) = spu_extract(tmp5,0); *((C) + 7 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 2) = spu_extract(tmp5,2); *((C) + 7 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    *((C) + 7 * ldc + 4) = spu_extract(tmp5,0); *((C) + 7 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 6) = spu_extract(tmp5,2); *((C) + 7 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    *((C) + 8 * ldc + 0) = spu_extract(tmp5,0); *((C) + 8 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 2) = spu_extract(tmp5,2); *((C) + 8 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    *((C) + 8 * ldc + 4) = spu_extract(tmp5,0); *((C) + 8 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 6) = spu_extract(tmp5,2); *((C) + 8 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    *((C) + 8 * ldc + 8) = spu_extract(tmp5,0); *((C) + 9 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 1) = spu_extract(tmp5,2); *((C) + 9 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    *((C) + 9 * ldc + 3) = spu_extract(tmp5,0); *((C) + 9 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 5) = spu_extract(tmp5,2); *((C) + 9 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    *((C) + 9 * ldc + 7) = spu_extract(tmp5,0); *((C) + 9 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 9) = spu_extract(tmp5,2); *((C) + 10 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    *((C) + 10 * ldc + 1) = spu_extract(tmp5,0); *((C) + 10 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 3) = spu_extract(tmp5,2); *((C) + 10 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    *((C) + 10 * ldc + 5) = spu_extract(tmp5,0); *((C) + 10 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 7) = spu_extract(tmp5,2); *((C) + 10 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 10), R(10, 10), R(0, 11), R(1, 11)); \
    *((C) + 10 * ldc + 9) = spu_extract(tmp5,0); *((C) + 10 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 0) = spu_extract(tmp5,2); *((C) + 11 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 11), R(3, 11), R(4, 11), R(5, 11)); \
    *((C) + 11 * ldc + 2) = spu_extract(tmp5,0); *((C) + 11 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 4) = spu_extract(tmp5,2); *((C) + 11 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 11), R(7, 11), R(8, 11), R(9, 11)); \
    *((C) + 11 * ldc + 6) = spu_extract(tmp5,0); *((C) + 11 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 8) = spu_extract(tmp5,2); *((C) + 11 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 11), R(11, 11), R(0, 12), R(1, 12)); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); *((C) + 11 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 0) = spu_extract(tmp5,2); *((C) + 12 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 12), R(3, 12), R(4, 12), R(5, 12)); \
    *((C) + 12 * ldc + 2) = spu_extract(tmp5,0); *((C) + 12 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 4) = spu_extract(tmp5,2); *((C) + 12 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 12), R(7, 12), R(8, 12), R(9, 12)); \
    *((C) + 12 * ldc + 6) = spu_extract(tmp5,0); *((C) + 12 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 8) = spu_extract(tmp5,2); *((C) + 12 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 12), R(11, 12), R(12, 12), R(0, 13)); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2); *((C) + 13 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 13), R(2, 13), R(3, 13), R(4, 13)); \
    *((C) + 13 * ldc + 1) = spu_extract(tmp5,0); *((C) + 13 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 3) = spu_extract(tmp5,2); *((C) + 13 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 13), R(6, 13), R(7, 13), R(8, 13)); \
    *((C) + 13 * ldc + 5) = spu_extract(tmp5,0); *((C) + 13 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 7) = spu_extract(tmp5,2); *((C) + 13 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 13), R(10, 13), R(11, 13), R(12, 13)); \
    *((C) + 13 * ldc + 9) = spu_extract(tmp5,0); *((C) + 13 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 11) = spu_extract(tmp5,2); *((C) + 13 * ldc + 12) = spu_extract(tmp5,3); \
    HSUM(R(13, 13), R(0, 14), R(1, 14), R(2, 14)); \
    *((C) + 13 * ldc + 13) = spu_extract(tmp5,0); *((C) + 14 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 1) = spu_extract(tmp5,2); *((C) + 14 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 14), R(4, 14), R(5, 14), R(6, 14)); \
    *((C) + 14 * ldc + 3) = spu_extract(tmp5,0); *((C) + 14 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 5) = spu_extract(tmp5,2); *((C) + 14 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 14), R(8, 14), R(9, 14), R(10, 14)); \
    *((C) + 14 * ldc + 7) = spu_extract(tmp5,0); *((C) + 14 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 9) = spu_extract(tmp5,2); *((C) + 14 * ldc + 10) = spu_extract(tmp5,3); \
    HSUM(R(11, 14), R(12, 14), R(13, 14), R(14, 14)); \
    *((C) + 14 * ldc + 11) = spu_extract(tmp5,0); *((C) + 14 * ldc + 12) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 13) = spu_extract(tmp5,2); *((C) + 14 * ldc + 14) = spu_extract(tmp5,3);
/*
 * SUM_V15(C): reduce the fifteen column-0 accumulators R(0, 0) .. R(14, 0),
 * four at a time, and store each HSUM result (left in tmp5) to
 * (C)[0] .. (C)[3]. The last group holds three accumulators and is padded
 * with `zero`.
 *
 * HSUM, tmp5 and zero are defined elsewhere and must be in scope at the
 * expansion site, as must the R_<i>_0 variables named by R(i, 0).
 * The argument is parenthesized so pointer expressions such as `buf + 1`
 * index correctly. The expansion ends with ';', as before.
 */
#define SUM_V15(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0));    (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0));    (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0));  (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), zero);    (C)[3] = tmp5;
/*
 * SUM_T16(C): store the triangle of accumulators R(i, j), 0 <= i <= j <= 15.
 *
 * The 136 accumulators are visited in the order j = 0..15, i = 0..j and fed
 * to HSUM four at a time (34 full groups, no padding). After each HSUM,
 * lane k of tmp5 (read with spu_extract) is written to
 * *((C) + j * ldc + i) for the k-th accumulator of that group.
 *
 * HSUM, tmp5 and ldc are not defined here and must be in scope at the
 * expansion site, as must the R_<i>_<j> variables named by R(i, j).
 * NOTE(review): HSUM presumably leaves one horizontal sum per lane in tmp5;
 * confirm against its definition.
 *
 * The definition is a single logical line; every physical line but the last
 * must end with a backslash. The expansion ends with ';'.
 */
#define SUM_T16(C) \
    HSUM(R(0, 0), R(0, 1), R(1, 1), R(0, 2)); \
    *((C) + 0 * ldc + 0) = spu_extract(tmp5,0); *((C) + 1 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 1 * ldc + 1) = spu_extract(tmp5,2); *((C) + 2 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 2), R(2, 2), R(0, 3), R(1, 3)); \
    *((C) + 2 * ldc + 1) = spu_extract(tmp5,0); *((C) + 2 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 3 * ldc + 0) = spu_extract(tmp5,2); *((C) + 3 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 3), R(3, 3), R(0, 4), R(1, 4)); \
    *((C) + 3 * ldc + 2) = spu_extract(tmp5,0); *((C) + 3 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 0) = spu_extract(tmp5,2); *((C) + 4 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 4), R(3, 4), R(4, 4), R(0, 5)); \
    *((C) + 4 * ldc + 2) = spu_extract(tmp5,0); *((C) + 4 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 4 * ldc + 4) = spu_extract(tmp5,2); *((C) + 5 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 5), R(2, 5), R(3, 5), R(4, 5)); \
    *((C) + 5 * ldc + 1) = spu_extract(tmp5,0); *((C) + 5 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 5 * ldc + 3) = spu_extract(tmp5,2); *((C) + 5 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 5), R(0, 6), R(1, 6), R(2, 6)); \
    *((C) + 5 * ldc + 5) = spu_extract(tmp5,0); *((C) + 6 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 1) = spu_extract(tmp5,2); *((C) + 6 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 6), R(4, 6), R(5, 6), R(6, 6)); \
    *((C) + 6 * ldc + 3) = spu_extract(tmp5,0); *((C) + 6 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 6 * ldc + 5) = spu_extract(tmp5,2); *((C) + 6 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(0, 7), R(1, 7), R(2, 7), R(3, 7)); \
    *((C) + 7 * ldc + 0) = spu_extract(tmp5,0); *((C) + 7 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 2) = spu_extract(tmp5,2); *((C) + 7 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 7), R(5, 7), R(6, 7), R(7, 7)); \
    *((C) + 7 * ldc + 4) = spu_extract(tmp5,0); *((C) + 7 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 7 * ldc + 6) = spu_extract(tmp5,2); *((C) + 7 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(0, 8), R(1, 8), R(2, 8), R(3, 8)); \
    *((C) + 8 * ldc + 0) = spu_extract(tmp5,0); *((C) + 8 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 2) = spu_extract(tmp5,2); *((C) + 8 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 8), R(5, 8), R(6, 8), R(7, 8)); \
    *((C) + 8 * ldc + 4) = spu_extract(tmp5,0); *((C) + 8 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 8 * ldc + 6) = spu_extract(tmp5,2); *((C) + 8 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(8, 8), R(0, 9), R(1, 9), R(2, 9)); \
    *((C) + 8 * ldc + 8) = spu_extract(tmp5,0); *((C) + 9 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 1) = spu_extract(tmp5,2); *((C) + 9 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 9), R(4, 9), R(5, 9), R(6, 9)); \
    *((C) + 9 * ldc + 3) = spu_extract(tmp5,0); *((C) + 9 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 5) = spu_extract(tmp5,2); *((C) + 9 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 9), R(8, 9), R(9, 9), R(0, 10)); \
    *((C) + 9 * ldc + 7) = spu_extract(tmp5,0); *((C) + 9 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 9 * ldc + 9) = spu_extract(tmp5,2); *((C) + 10 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 10), R(2, 10), R(3, 10), R(4, 10)); \
    *((C) + 10 * ldc + 1) = spu_extract(tmp5,0); *((C) + 10 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 3) = spu_extract(tmp5,2); *((C) + 10 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 10), R(6, 10), R(7, 10), R(8, 10)); \
    *((C) + 10 * ldc + 5) = spu_extract(tmp5,0); *((C) + 10 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 10 * ldc + 7) = spu_extract(tmp5,2); *((C) + 10 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 10), R(10, 10), R(0, 11), R(1, 11)); \
    *((C) + 10 * ldc + 9) = spu_extract(tmp5,0); *((C) + 10 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 0) = spu_extract(tmp5,2); *((C) + 11 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 11), R(3, 11), R(4, 11), R(5, 11)); \
    *((C) + 11 * ldc + 2) = spu_extract(tmp5,0); *((C) + 11 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 4) = spu_extract(tmp5,2); *((C) + 11 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 11), R(7, 11), R(8, 11), R(9, 11)); \
    *((C) + 11 * ldc + 6) = spu_extract(tmp5,0); *((C) + 11 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 11 * ldc + 8) = spu_extract(tmp5,2); *((C) + 11 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 11), R(11, 11), R(0, 12), R(1, 12)); \
    *((C) + 11 * ldc + 10) = spu_extract(tmp5,0); *((C) + 11 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 0) = spu_extract(tmp5,2); *((C) + 12 * ldc + 1) = spu_extract(tmp5,3); \
    HSUM(R(2, 12), R(3, 12), R(4, 12), R(5, 12)); \
    *((C) + 12 * ldc + 2) = spu_extract(tmp5,0); *((C) + 12 * ldc + 3) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 4) = spu_extract(tmp5,2); *((C) + 12 * ldc + 5) = spu_extract(tmp5,3); \
    HSUM(R(6, 12), R(7, 12), R(8, 12), R(9, 12)); \
    *((C) + 12 * ldc + 6) = spu_extract(tmp5,0); *((C) + 12 * ldc + 7) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 8) = spu_extract(tmp5,2); *((C) + 12 * ldc + 9) = spu_extract(tmp5,3); \
    HSUM(R(10, 12), R(11, 12), R(12, 12), R(0, 13)); \
    *((C) + 12 * ldc + 10) = spu_extract(tmp5,0); *((C) + 12 * ldc + 11) = spu_extract(tmp5,1); \
    *((C) + 12 * ldc + 12) = spu_extract(tmp5,2); *((C) + 13 * ldc + 0) = spu_extract(tmp5,3); \
    HSUM(R(1, 13), R(2, 13), R(3, 13), R(4, 13)); \
    *((C) + 13 * ldc + 1) = spu_extract(tmp5,0); *((C) + 13 * ldc + 2) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 3) = spu_extract(tmp5,2); *((C) + 13 * ldc + 4) = spu_extract(tmp5,3); \
    HSUM(R(5, 13), R(6, 13), R(7, 13), R(8, 13)); \
    *((C) + 13 * ldc + 5) = spu_extract(tmp5,0); *((C) + 13 * ldc + 6) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 7) = spu_extract(tmp5,2); *((C) + 13 * ldc + 8) = spu_extract(tmp5,3); \
    HSUM(R(9, 13), R(10, 13), R(11, 13), R(12, 13)); \
    *((C) + 13 * ldc + 9) = spu_extract(tmp5,0); *((C) + 13 * ldc + 10) = spu_extract(tmp5,1); \
    *((C) + 13 * ldc + 11) = spu_extract(tmp5,2); *((C) + 13 * ldc + 12) = spu_extract(tmp5,3); \
    HSUM(R(13, 13), R(0, 14), R(1, 14), R(2, 14)); \
    *((C) + 13 * ldc + 13) = spu_extract(tmp5,0); *((C) + 14 * ldc + 0) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 1) = spu_extract(tmp5,2); *((C) + 14 * ldc + 2) = spu_extract(tmp5,3); \
    HSUM(R(3, 14), R(4, 14), R(5, 14), R(6, 14)); \
    *((C) + 14 * ldc + 3) = spu_extract(tmp5,0); *((C) + 14 * ldc + 4) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 5) = spu_extract(tmp5,2); *((C) + 14 * ldc + 6) = spu_extract(tmp5,3); \
    HSUM(R(7, 14), R(8, 14), R(9, 14), R(10, 14)); \
    *((C) + 14 * ldc + 7) = spu_extract(tmp5,0); *((C) + 14 * ldc + 8) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 9) = spu_extract(tmp5,2); *((C) + 14 * ldc + 10) = spu_extract(tmp5,3); \
    HSUM(R(11, 14), R(12, 14), R(13, 14), R(14, 14)); \
    *((C) + 14 * ldc + 11) = spu_extract(tmp5,0); *((C) + 14 * ldc + 12) = spu_extract(tmp5,1); \
    *((C) + 14 * ldc + 13) = spu_extract(tmp5,2); *((C) + 14 * ldc + 14) = spu_extract(tmp5,3); \
    HSUM(R(0, 15), R(1, 15), R(2, 15), R(3, 15)); \
    *((C) + 15 * ldc + 0) = spu_extract(tmp5,0); *((C) + 15 * ldc + 1) = spu_extract(tmp5,1); \
    *((C) + 15 * ldc + 2) = spu_extract(tmp5,2); *((C) + 15 * ldc + 3) = spu_extract(tmp5,3); \
    HSUM(R(4, 15), R(5, 15), R(6, 15), R(7, 15)); \
    *((C) + 15 * ldc + 4) = spu_extract(tmp5,0); *((C) + 15 * ldc + 5) = spu_extract(tmp5,1); \
    *((C) + 15 * ldc + 6) = spu_extract(tmp5,2); *((C) + 15 * ldc + 7) = spu_extract(tmp5,3); \
    HSUM(R(8, 15), R(9, 15), R(10, 15), R(11, 15)); \
    *((C) + 15 * ldc + 8) = spu_extract(tmp5,0); *((C) + 15 * ldc + 9) = spu_extract(tmp5,1); \
    *((C) + 15 * ldc + 10) = spu_extract(tmp5,2); *((C) + 15 * ldc + 11) = spu_extract(tmp5,3); \
    HSUM(R(12, 15), R(13, 15), R(14, 15), R(15, 15)); \
    *((C) + 15 * ldc + 12) = spu_extract(tmp5,0); *((C) + 15 * ldc + 13) = spu_extract(tmp5,1); \
    *((C) + 15 * ldc + 14) = spu_extract(tmp5,2); *((C) + 15 * ldc + 15) = spu_extract(tmp5,3);
/*
 * SUM_V16(C): reduce the sixteen column-0 accumulators R(0, 0) .. R(15, 0),
 * four at a time, and store each HSUM result (left in tmp5) to
 * (C)[0] .. (C)[3].
 *
 * HSUM and tmp5 are defined elsewhere and must be in scope at the expansion
 * site, as must the R_<i>_0 variables named by R(i, 0).
 * The argument is parenthesized so pointer expressions such as `buf + 1`
 * index correctly. The expansion ends with ';', as before.
 */
#define SUM_V16(C) \
    HSUM(R(0, 0), R(1, 0), R(2, 0), R(3, 0));       (C)[0] = tmp5; \
    HSUM(R(4, 0), R(5, 0), R(6, 0), R(7, 0));       (C)[1] = tmp5; \
    HSUM(R(8, 0), R(9, 0), R(10, 0), R(11, 0));     (C)[2] = tmp5; \
    HSUM(R(12, 0), R(13, 0), R(14, 0), R(15, 0));   (C)[3] = tmp5;