/ani/mrses

To get this branch, use:
bzr branch http://suren.me/webbzr/ani/mrses
1 by Suren A. Chilingaryan
Initial import
1
#include <stdio.h>
2
#include <stdlib.h>
3
#include <malloc.h>
4
#include <string.h>
5
#include <sys/time.h>
6
7
#include <blas_s.h>
8
#include <cblas.h>
9
#include <lapack.h>
10
11
#include <spu_intrinsics.h>
12
#include <spu_mfcio.h>
13
14
    // we should also consider HW_ALIGN (when change this values)
15
#define SPE_BLOCK 16		// Thats for BLAS/Lapack in items
16
17
#define HW_HIDE_DETAILS
18
#include "mrses_spe.h"
19
20
#define spe_scal(N,a,X,incX) sscal_spu(X, a, N)
21
#define spe_axpy(N, a, X, incX, Y, incY) saxpy_spu(X, Y, a, N)
22
#define spe_syrk(Order,Uplo,Trans, N, K, a, A, lda, b, C, ldc) \
23
    ssyrk_spu(A, C, a, N, K, lda, ldc)
24
25
26
#undef rnd
27
#define rnd(i) ((int)((i) * (rand() / (RAND_MAX + 1.0))))
28
29
#ifdef HW_USE_BLOCKED_MULTIPLY
30
# define calc_ralloc(width, walloc) walloc
31
#else
32
//# define calc_ralloc(width, walloc) ((width<13)?width:walloc)
33
# define calc_ralloc(width, walloc) width
34
#endif /* HW_USE_BLOCKED_MULTIPLY */
35
36
37
//#include "atlas_potrf.h"
38
#include "vec_potrf.h"
39
40
static inline int mrses_spe_real_multiply(
41
    MRSESDataType *D,
42
    short int pack, short int at_once,
43
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
44
    MRSESDataType *A, MRSESDataType *B,
45
    MRSESDataType *Ca, MRSESDataType *Cb
46
) {
47
    short int i, k, l;
48
49
//    PRINT_MATRIX("% 6.4f ", A, nA, 16, 16)
50
51
/*
52
    int rww = at_once * walloc * width;
53
    memset(Ca, 0, rww * sizeof(MRSESDataType));
54
    memset(Cb, 0, rww * sizeof(MRSESDataType));
55
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nA, 1, A, nA, 0, Ca, walloc);
56
    spe_syrk(CblasRowMajor, CblasLower, CblasNoTrans, walloc, nB, 1, B, nB, 0, Cb, walloc);
57
*/
58
    
59
60
    vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
61
    vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
62
63
    short int c_step = width * (walloc + 1);
64
    for (i = 0; i < at_once; ++i) {
65
	short int c_offset = i * c_step;
66
	MRSESDataType *Ca_cur = Ca + c_offset;
67
	MRSESDataType *Cb_cur = Cb + c_offset;
68
69
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
70
	MRSESDataType *Db = Da + d_step;
71
	
72
	for (k = 0; k < width; ++k) {
73
	    for (l = 0; l < width; ++l) {
74
		Da[pack*(k * width + l)] = Ca_cur[k * walloc + l];
75
		Db[pack*(k * width + l)] = Cb_cur[k * walloc + l];
76
	    }
77
	}
78
    }
79
80
81
    return 0;
82
}
83
84
static vector unsigned int mask1 = {0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
85
static vector unsigned int mask2 = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF};
86
static vector unsigned int mask3 = {0xFFFFFFFF, 0, 0, 0xFFFFFFFF};
87
88
#define VECTOR_PRINT(var, val) \
89
    printf("Vector %10s is: {% 6.4e, % 6.4e, % 6.4e, % 6.4e}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
90
91
#define iVECTOR_PRINT(var, val) \
92
    printf("Vector %10s is: {%i, %i, %i, %i}\n", var, spu_extract(val, 0), spu_extract(val, 1), spu_extract(val, 2), spu_extract(val, 3));
93
94
95
#define REPACK(Da, Ca_cur) \
96
		r1 =              *(vector float*)(Ca_cur +             k * walloc + l); \
97
		r2 = spu_rlqwbyte(*(vector float*)(Ca_cur +     c_big + k * walloc + l), -4); \
98
		r3 = spu_rlqwbyte(*(vector float*)(Ca_cur + 2 * c_big + k * walloc + l), -8); \
99
		r4 = spu_rlqwbyte(*(vector float*)(Ca_cur + 3 * c_big + k * walloc + l), -12); \
100
		\
101
		r5 = spu_sel(r1, r2, mask1); \
102
		r6 = spu_sel(r3, r4, mask1); \
103
		\
104
		Da[k*width + l    ] = spu_sel(r5, r6, mask2); \
105
	        Da[k*width + l + 2] = spu_rlqwbyte(spu_sel(r6, r5, mask2), 8); \
106
		\
107
		r5 = spu_sel(r2, r1, mask1); \
108
		r6 = spu_sel(r4, r3, mask1); \
109
		\
110
		Da[k*width + l + 1] = spu_rlqwbyte(spu_sel(r5, r6, mask3), 4); \
111
		Da[k*width + l + 3] = spu_rlqwbyte(spu_sel(r6, r5, mask3), 12);
112
113
114
115
static inline int mrses_spe_real_multiply_recover(
116
    MRSESDataType *D,
117
    short int pack, short int at_once,
118
    short int d_step, short int width, short int ralloc, short int walloc,
119
    MRSESDataType *Ca, MRSESDataType *Cb,
120
    MRSESIntType *drp_gen, MRSESIntType *rst_gen,
121
    MRSESDataType *Ea, MRSESDataType *Eb
122
) {
123
    short int i, j, k, l, m;
124
    short int c_step = width * (walloc + 1);
125
    short int c_big = ralloc * walloc;
126
    short int e_big = at_once * walloc;
127
  
128
    for (i = 0; i < at_once; ++i) {
129
	short int e_offset = i * walloc;
130
	short int c_offset = i * c_step;
131
132
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
133
	MRSESDataType *Db = Da + d_step;
134
135
	for (m = 0; m < pack; ++m, c_offset += c_big, e_offset += e_big) {
136
	    short int gen = drp_gen[m * at_once + i];
137
138
	    if (rst_gen[m * at_once + i] > width) {
139
		for (j = 0; j < width; j++) {
140
		    if (j < gen) {
141
			l = j; k = gen;
142
		    } else {
143
			l = gen; k = j;
144
		    }
145
		
146
		    Ca[c_offset + k * walloc + l] = Ea[e_offset + j];
147
		    Cb[c_offset + k * walloc + l] = Eb[e_offset + j];
148
		}
149
	    }
150
	}
151
152
	    // Recovering to current stage
153
    	for (k = 0; k < width; ++k) {
154
	    for (l = 0; l < width; l += 4) {
155
		register vector float r1, r2, r3, r4, r5, r6;
156
		
157
		REPACK(((vector float*)Da), ((float*)Ca));
158
		REPACK(((vector float*)Db), ((float*)Cb));
159
		
160
	    }
161
	}
162
    }
163
    
164
    return 0;
165
}
166
167
168
169
static inline int mrses_spe_real_multiply_update(
170
    MRSESDataType *D,
171
    short int pack, short int at_once,
172
    short int d_step, short int width, short int ralloc, short int walloc, short int nA, short int nB,
173
    MRSESDataType *A, MRSESDataType *B,
174
    MRSESIntType *drp_gen,
175
    MRSESDataType *Ea, MRSESDataType *Eb
176
) {
177
    short int i, j, k, l;
178
/*
179
    if (*rst_gen > width) {
180
	vec_ssyrk_rln_11(ralloc, nA, A, nA, Ca, walloc);
181
	vec_ssyrk_rln_11(ralloc, nB, B, nB, Cb, walloc);
182
    }
183
*/
184
185
    for (i = 0; i < at_once; ++i) {
186
	short int e_offset = i * walloc;
187
	short int gen = drp_gen[i];
188
189
	MRSESDataType *Ea_cur = Ea + e_offset;
190
	MRSESDataType *Eb_cur = Eb + e_offset;
191
192
	MRSESDataType *Da = D + (3 * i + 1) * d_step;
193
	MRSESDataType *Db = Da + d_step;
194
	
195
        vec_update_row(gen, ralloc, nA, A, nA, Ea_cur);
196
        vec_update_row(gen, ralloc, nB, B, nB, Eb_cur);
197
	
198
	for (j = 0; j < width; j++) {
199
	    if (j < gen) {
200
		l = j; k = gen;
201
	    } else {
202
		l = gen; k = j;
203
	    }
204
205
//    if ((((int)D)%16)==0) {
206
//	    if (fabs(Da[pack * (k*width + l)] - Ea_cur[j])>0.001) {
207
//		printf("Updating (%i): %i %i: % 6.4f to % 6.4f\n", gen, k, l, Da[pack * (k*width + l)], Ea_cur[j]);
208
//	    }
209
//    }
210
	    Da[pack*(k * width + l)] = Ea_cur[j];
211
	    Db[pack*(k * width + l)] = Eb_cur[j];
212
	}
213
    }
214
215
    return 0;
216
}
217
218
static inline int mrses_spe_real_decompose(
219
    MRSESDistance dist, MRSESDataType *res,
220
    short int pack, short int at_once,
221
    short int step, short int width,
222
    MRSESDataType *D, MRSESDataType * mean
223
) {
224
    short int i, j;
225
    short int mstep = pack * width;
226
227
    vector unsigned int err;
228
    vector unsigned int error;
229
    vector float rcorr;
230
    vector float rmahal;
231
    vector float result;
232
    vector float zero = {0, 0, 0, 0};
233
    
234
    for (i = 0; i < at_once; ++i) {
235
    	MRSESDataType *C = D + 3 * i * step;
236
	MRSESDataType *Ca = C + step;
237
    	MRSESDataType *Cb = Ca + step;
238
	MRSESDataType *m = mean + i * mstep;
239
240
	memcpy(C, Ca, step * sizeof(MRSESDataType));
241
	spe_axpy(step, 1, Cb, 1, C, 1);
242
	spe_scal(step, 0.5, C, 1);
243
244
	error = vec_spotrf_u(width, C, width);
245
246
	err = vec_spotrf_u(width, Ca, width);
247
	error = spu_and(err, error);
248
249
	err = vec_spotrf_u(width, Cb, width);
250
	error = spu_and(err, error);
251
	
252
	rcorr = vec_rcorr(width, C, Ca, Cb);
253
	rmahal = vec_rmahal(width, C, m);
254
255
	switch (dist) {
256
	    case BHATTACHARYYA:
257
		result = vec_bhata(rcorr, rmahal);
258
	    break;
259
	    case MAHALANOBIS:
260
		result = rmahal;
261
	    break;
262
	    case CORCOR:
263
		result = rcorr;
264
	    break;
265
	    default:
266
		result = zero;
267
	}
268
269
	for (j = 0; j < pack; j++) {
270
	    if (*(((unsigned int*)&error)+j)) {
271
		res[j * at_once + i] = *(((float*)&result)+j);
272
//		printf("SPU result: %e (mahal: %e, corcor: %e)\n", *(((float*)&result)+j), *(((float*)&rmahal)+j), *(((float*)&rcorr)+j));
273
	    } else {
274
		res [j * at_once + i] = 0;
275
//		printf("SPU result: singular matrix, assuming 0 distance\n");
276
	    }
277
	}
278
    }
279
    
280
    return 0;
281
}
282
283
struct MRSESTemporaryDataT {
284
    MRSESDataType *A;
285
    MRSESDataType *B;
286
    MRSESDataType *Ca;
287
    MRSESDataType *Cb;
288
    MRSESDataType *Ea;
289
    MRSESDataType *Eb;
290
    MRSESDataType *D;
291
    MRSESDataType *mean;
292
    MRSESDataType *mean_copy;
293
    MRSESIntType *index;
294
    MRSESIntType *index_storage;
295
    MRSESDataType *Mfull;
296
};
297
typedef struct MRSESTemporaryDataT MRSESTemporaryDataS;
298
typedef struct MRSESTemporaryDataT *MRSESTemporaryData;
299
300
static unsigned char *local_data = NULL;
301
302
static inline MRSESTemporaryData mrses_spe_malloc(MRSESContext mrses, unsigned int rdch) {
303
    unsigned char *allocation;
304
    short int aA, aB;
305
    MRSESIntType properties;
306
    short int at_once, used_ralloc;
307
    short int pack;
308
    short int width;
309
    short int walloc, walloc2, dalloc, ralloc;
310
    short int index_segment_size; 
311
    int pos = 0, size;
312
    int extra_rows = 0;
313
314
    MRSESTemporaryData data;
315
    
316
    properties = mrses->properties;
317
318
//    pos = calc_alloc(properties * sizeof(uint32_t), HW_ALIGN);
319
    if (local_data) return (MRSESTemporaryData)(local_data + pos);
320
321
    width = mrses->width;
322
    aA = calc_alloc(mrses->nA, SPE_BLOCK);
323
    aB = calc_alloc(mrses->nB, SPE_BLOCK);
324
    walloc = calc_alloc(width, SPE_BLOCK);
325
    ralloc = calc_ralloc(width, walloc);
326
    walloc2 = walloc * walloc;
327
    at_once = max(1, ralloc / width);
328
    pack = SIMD_BLOCK / sizeof(MRSESDataType);
329
    dalloc = calc_alloc(pack * width * width, 128);//64);
330
    index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
331
332
    used_ralloc = at_once * width;
333
    
334
    if ((ralloc < walloc)&&(ralloc > 12)) {
335
	 extra_rows = walloc - ralloc;
336
    }
337
338
    size = (
339
	//calc_alloc(properties * sizeof(uint32_t), HW_ALIGN) +							// result histogram
340
	calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN) +							// structure
341
	(pack * ralloc + extra_rows) * (aA + aB) * sizeof(MRSESDataType) +					// A, B		- source matrices
342
	2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) +					// Ca, Cb	- variances (temp)
343
	2 * pack * at_once * walloc * sizeof(MRSESDataType) +							// Ea, Eb	- variance updates
344
	    3 * at_once * dalloc * sizeof(MRSESDataType) +							// D		- variances (aligned)
345
	2 * pack * ralloc * sizeof(MRSESDataType) +								// mean, mean_copy
346
//	calc_alloc(mrses->iterate_size * properties * sizeof(MRSESIntType), HW_ALIGN) +
347
	(walloc + index_segment_size) * at_once * pack * sizeof(MRSESIntType) +					// index + exchange area
348
	calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN)						// Mfull, complete mean
349
    );
350
/*    
351
    printf("Allocating data: %i KB (A+B: %li, C: %li, D: %li, mean: %li, Mfull: %li)\n", size / 1024,
352
	pack * ralloc * (aA + aB) * sizeof(MRSESDataType) / 1024,
353
	2 * (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType) / 1024,
354
	3 * at_once * dalloc * sizeof(MRSESDataType) / 1024,
355
	2 * (pack * ralloc + extra_rows) * sizeof(MRSESDataType) / 1024,
356
	calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / 1024
357
    );
358
*/
359
    allocation = (unsigned char*)memalign(HW_ALIGN, size);
360
    if (!allocation) return NULL;
361
362
    memset(allocation, 0, properties * sizeof(uint32_t));
363
    
364
    data = (MRSESTemporaryData)(allocation + pos);
365
    pos += calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN);
366
367
    data->A = (MRSESDataType*)(allocation + pos);
368
    pos += (pack * ralloc + extra_rows) * aA * sizeof(MRSESDataType);
369
370
    data->B = (MRSESDataType*)(allocation + pos);
371
    pos += (pack * ralloc + extra_rows) * aB * sizeof(MRSESDataType);
372
373
    data->Ca = (MRSESDataType*)(allocation + pos);
374
    pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
375
376
    data->Cb = (MRSESDataType*)(allocation + pos);
377
    pos += (pack * ralloc + extra_rows) * walloc * sizeof(MRSESDataType);
378
379
    data->Ea = (MRSESDataType*)(allocation + pos);
380
    pos += pack * at_once * walloc * sizeof(MRSESDataType);
381
382
    data->Eb = (MRSESDataType*)(allocation + pos);
383
    pos += pack * at_once * walloc * sizeof(MRSESDataType);
384
    
385
    data->D = (MRSESDataType*)(allocation + pos);
386
    pos += 3 * at_once * dalloc * sizeof(MRSESDataType);
387
388
    data->mean = (MRSESDataType*)(allocation + pos);
389
    pos += pack * ralloc * sizeof(MRSESDataType);
390
391
    data->mean_copy = (MRSESDataType*)(allocation + pos);
392
    pos += pack * ralloc * sizeof(MRSESDataType);
393
394
    data->index = (MRSESIntType*)(allocation + pos);
395
    pos += walloc * at_once * pack * sizeof(MRSESIntType);
396
397
    data->index_storage = (MRSESIntType*)(allocation + pos);
398
    pos += index_segment_size * at_once * pack * sizeof(MRSESIntType);
399
400
    data->Mfull = (MRSESDataType*)(allocation + pos);
401
402
    spu_mfcdma32((void *)(data->Mfull), (unsigned int)(mrses->mean), calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN), rdch, MFC_GET_CMD);
403
404
    local_data = (void*)allocation;
405
406
    if (used_ralloc < ralloc) {
407
	    // we need to do it, to clean the end parts for non-even case
408
	memset(data->A, 0, pack * (ralloc) * aA * sizeof(MRSESDataType));
409
	memset(data->B, 0, pack * (ralloc) * aB * sizeof(MRSESDataType));
410
411
	memset(data->mean, 0, pack * ralloc * sizeof(MRSESDataType));
412
	memset(data->mean_copy, 0, pack * ralloc * sizeof(MRSESDataType));
413
414
    }
415
416
    return data;
417
}
418
419
420
int mrses_spe_iterate(int block_group, MRSESContext mrses, unsigned int rdch) {
421
    int idx;
422
    short int j, k, l, m;
423
424
    short int width = mrses->width;
425
    short int walloc = calc_alloc(width, SPE_BLOCK);
426
    short int alloc = mrses->alloc;
427
    short int ralloc = calc_ralloc(width, walloc);
428
    short int nA = mrses->nA;
429
    short int cA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN);
430
    short int aA = calc_alloc(nA, SPE_BLOCK);	// aligned size
431
    short int nB = mrses->nB;
432
    short int cB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN);
433
    short int aB = calc_alloc(nB, SPE_BLOCK);
434
    MRSESDistance dist = mrses->dist;
435
436
    MRSESIntType properties = mrses->properties;
437
    int i, iterations = mrses->iterations;
438
439
    short int iterate_size = mrses->iterate_size;
440
    short int at_once = max(1, ralloc / width);
441
    short int pack = SIMD_BLOCK / sizeof(MRSESDataType);
442
    short int dalloc = calc_alloc(pack * width * width, 128);
443
    short int index_segment_size = (HW_ALIGN / sizeof(MRSESIntType));
444
445
    int spe_real_block = pack * at_once;
446
    MRSESIntType rpl_gen_k, rpl_gen_segment;
447
    MRSESIntType drp_gen[spe_real_block], rpl_gen[spe_real_block], rst_gen[spe_real_block];
448
    MRSESDataType result[spe_real_block], cur_result[spe_real_block];
449
//    MRSESDataType *result = mrses->result + block;
450
    
451
    volatile MRSESDataType *Afull = mrses->A;
452
    volatile MRSESDataType *Bfull = mrses->B;
453
    MRSESIntType *mrses_index = mrses->index;
454
455
    MRSESTemporaryData data;
456
457
    MRSESDataType *A;
458
    MRSESDataType *B;
459
    MRSESDataType *Ca;
460
    MRSESDataType *Cb;
461
    MRSESDataType *Ea;
462
    MRSESDataType *Eb;
463
    MRSESDataType *D;
464
    MRSESDataType *mean;
465
    MRSESDataType *mean_copy;
466
467
    volatile MRSESIntType *index_storage, *index_storage_k;
468
    volatile MRSESIntType *index, *index_k;
469
    volatile MRSESDataType *Mfull;
470
471
#ifdef USE_FAST_RANDOM
472
    unsigned int g_seed;
473
    struct timeval tvseed;
474
#endif /* USE_FAST_RANDOM */
475
476
#ifdef USE_FAST_RANDOM
477
    gettimeofday(&tvseed, NULL);
478
    g_seed = tvseed.tv_usec;
479
#endif /* USE_FAST_RANDOM */
480
481
    data = mrses_spe_malloc(mrses, rdch);
482
    if (!data) {
483
	printf("Memory allocation failed\n");
484
	exit(1);
485
    }
486
487
    A = data->A;
488
    B = data->B;
489
    Ca = data->Ca;
490
    Cb = data->Cb;
491
    Ea = data->Ea;
492
    Eb = data->Eb;
493
    D = data->D;
494
    mean = data->mean;
495
    mean_copy = data->mean_copy;
496
    index = data->index;
497
    index_storage = data->index_storage;
498
    Mfull = data->Mfull;
499
500
//    printf("Iterate %i, pack %i, at_once %i ralloc %i walloc %i\n", iterate_size, pack, at_once, ralloc, walloc);
501
    for (l = 0; l < iterate_size; l += pack * at_once) {
502
503
	for (j = 0; j < spe_real_block; j++) {
504
	    spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * properties), walloc * sizeof(MRSESIntType), rdch, MFC_GET_CMD);
505
	}
506
507
	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
508
509
	for (m = 0; m < pack; m++) {
510
	    for (j = 0, k = 0; j < at_once; ++j) {
511
		//printf("SPE: %i = %i %i %i\n", l + m * at_once + j, l, m, j);
512
		index_k = index + (m * at_once + j) * walloc;
513
		for (i = 0; i < width; ++i, ++k) {
514
		    idx = index_k[i];
515
		    //printf(" * %i\n", idx);
516
		    spu_mfcdma32((void *)(A + ((int)(m * ralloc + k)) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
517
		    spu_mfcdma32((void *)(B + ((int)(m * ralloc + k)) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
518
		    mean[k * pack + m] = Mfull[idx];
519
		}
520
	    }
521
522
	    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
523
524
	    mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca + m * ralloc * walloc, Cb + m * ralloc * walloc);
525
	}
526
527
	memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
528
529
	for (k = 0; k < spe_real_block; k++) {
530
	    rst_gen[k] = width;
531
	}
532
533
	mrses_spe_real_decompose(dist, cur_result, pack, at_once, dalloc, width, D, mean_copy);
534
535
536
//    }
537
//    return 0;
538
	    
539
//    {
540
541
542
	for (i = 0; i < iterations; ++i) 
543
	{
544
	    memcpy(mean_copy, mean, pack * ralloc * sizeof(MRSESDataType));
545
546
#ifndef HW_USE_BLOCKED_MULTIPLY
547
	    mrses_spe_real_multiply_recover(D, pack, at_once, dalloc, width, ralloc, walloc, Ca, Cb, drp_gen, rst_gen, Ea, Eb);
548
#endif /* HW_USE_BLOCKED_MULTIPLY */
549
550
	    for (m = 0, k = 0; m < pack; ++m) {
551
		for (j = 0; j < at_once; ++j, ++k) {
552
		    index_k = index + k * walloc;
553
554
	    	    drp_gen[k] = rnd(width);
555
		    rpl_gen[k] = rnd(properties - width) + width;
556
	
557
		    if ((rst_gen[k] < width)&&(rst_gen[k] != drp_gen[k])) {
558
			idx = index_k[rst_gen[k]];
559
			//printf("%i, Restoring: %i to index %i\n", k, rst_gen[k], idx);
560
	    		spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + rst_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
561
			spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + rst_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
562
		    }
563
564
		    
565
		    rpl_gen_k = rpl_gen[k];
566
		    if (rpl_gen_k < walloc) {
567
			idx = index_k[rpl_gen[k]];
568
		    } else {
569
			index_storage_k = index_storage + k * index_segment_size;
570
			rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
571
572
//			printf("%i, getting part of index %i, index segment %i\n", k + l, rpl_gen_k, rpl_gen_segment);
573
574
			spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * properties + rpl_gen_segment), HW_ALIGN, rdch, MFC_GET_CMD);
575
			(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
576
			
577
			idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
578
			//printf("done, index: %i\n", idx);
579
		    }
580
			
581
//		    printf("%i, receiving data for %i, index %i (pack %i)\n", k + l, drp_gen[k], idx, m);
582
		    spu_mfcdma32((void *)(A + ((int)(m * ralloc + j * width + drp_gen[k])) * aA), (unsigned int)(Afull + idx * alloc), cA, rdch, MFC_GET_CMD);
583
		    spu_mfcdma32((void *)(B + ((int)(m * ralloc + j * width + drp_gen[k])) * aB), (unsigned int)(Bfull + idx * alloc), cB, rdch, MFC_GET_CMD);
584
		    //puts("done");
585
		    mean_copy[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
586
		}
587
	        (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
588
589
590
#ifdef HW_USE_BLOCKED_MULTIPLY
591
		mrses_spe_real_multiply(D + m, pack, at_once, dalloc, width, ralloc, walloc, aA, aB, A + m * ralloc * aA, B + m * ralloc * aB, Ca, Cb);
592
#else  /* HW_USE_BLOCKED_MULTIPLY */
593
		mrses_spe_real_multiply_update(
594
		    D + m, 
595
		    pack, at_once, dalloc, width, ralloc, walloc, aA, aB, 
596
		    A + m * ralloc * aA, B + m * ralloc * aB, 
597
		    drp_gen + m * at_once,
598
		    Ea + m * at_once * walloc, Eb + m * at_once * walloc
599
		);
600
#endif /* HW_USE_BLOCKED_MULTIPLY */
601
	    }
602
	
603
	    mrses_spe_real_decompose(dist, result, pack, at_once, dalloc, width, D, mean_copy);
604
	    
605
	    for (m = 0, k = 0; m < pack; ++m) {
606
		for (j = 0; j < at_once; ++j, ++k) {
607
		    if (result[k] < cur_result[k]) {
608
			rst_gen[k] = drp_gen[k];
609
		    } else {
610
			rst_gen[k] = width + 1;
611
		    	cur_result[k] = result[k];
612
613
			index_k = index + k * walloc;
614
			
615
			rpl_gen_k = rpl_gen[k];
616
			if (rpl_gen_k < walloc) {
617
			    idx = index_k[rpl_gen_k];
618
			    SWAPij(index_k, drp_gen[k], rpl_gen_k);
619
			} else {
620
			    index_storage_k = index_storage + k * index_segment_size;
621
			    rpl_gen_segment = (rpl_gen_k / index_segment_size) * index_segment_size;
622
			    idx = index_storage_k[rpl_gen_k - rpl_gen_segment];
623
			    
624
			    index_storage_k[rpl_gen_k - rpl_gen_segment] = index_k[drp_gen[k]];
625
			    
626
			    //printf("%i, write back\n", k);
627
				// write back segment
628
			    spu_mfcdma32((void*)(index_storage_k), (unsigned int)(mrses_index + (l + k + block_group * iterate_size) * properties + rpl_gen_segment), HW_ALIGN, rdch, MFC_PUT_CMD);
629
		    	    //(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
630
			    
631
			    //puts("done");
632
			    index_k[drp_gen[k]] = idx;
633
			}
634
			mean[(j * width + drp_gen[k]) * pack + m] = Mfull[idx];
635
		    }
636
		}
637
	    }
638
639
	}
640
641
	    // We can write back initial part of index, but we don't really need to
642
	for (j = 0; j < spe_real_block; j++) {
643
	    spu_mfcdma32((void *)(index + j * walloc), (unsigned int)(mrses_index + (j + l + block_group * iterate_size) * properties), walloc * sizeof(MRSESIntType), rdch, MFC_PUT_CMD);
644
	}
645
	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
646
/*
647
	for (k = 0; k < spe_real_block; k++) {
648
	    printf("SPU result, block %i: %e\n", l + k, cur_result[k]);
649
	}
650
*/
651
    }
652
653
    return 0;
654
}
655
656
#include <simdmath.h>
657
int main(unsigned long long id __attribute__ ((unused)), unsigned long long argp, unsigned long long envp __attribute__ ((unused))) {
658
    short int size;
659
    volatile SPUParameters param;
660
    volatile MRSESContext mrses;
661
    unsigned int rdch; //, wrch;
662
663
#ifndef FIX_RANDOM
664
    struct timeval tv;
665
#endif /* FIX_RANDOM */
666
667
#ifdef TRACE_TIMINGS
668
    struct timeval tv1, tv2;
669
#endif /* TRACE_TIMINGS */
670
671
#ifdef TRACE_TIMINGS
672
    gettimeofday(&tv1, NULL);
673
#endif /* TRACE_TIMINGS */
674
    
675
    rdch = mfc_tag_reserve();
676
    if (rdch == MFC_TAG_INVALID) {
677
	printf("Unable to reserve DMA tag, returning");
678
	exit(1);
679
    }
680
681
    spu_writech(MFC_WrTagMask, 1 << rdch);
682
683
    spu_mfcdma32(&param, argp, sizeof(SPUParameters), rdch, MFC_GET_CMD);
684
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
685
686
687
    size = calc_alloc(sizeof(MRSESContextS), HW_ALIGN);
688
    mrses = (MRSESContext)malloc(size);
689
    spu_mfcdma32(mrses, (unsigned int)(param.mrses), size, rdch, MFC_GET_CMD);
690
    (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
691
692
#ifdef FIX_RANDOM
693
    srand(FIX_RANDOM);
694
#else /* FIX_RANDOM */
695
    gettimeofday(&tv, NULL);
696
    srand(tv.tv_usec);
697
    //srand(block);
698
#endif /* FIX_RANDOM */
699
700
701
    mrses_spe_iterate(param.block_group, mrses, rdch);
702
    
703
	// Optimize?
704
//    free(local_data);
705
//    local_data = NULL;
706
707
#ifdef TRACE_TIMINGS
708
    gettimeofday(&tv2, NULL);
709
    printf("SPU 0x%llx: %li ms\n", id, (tv2.tv_sec - tv1.tv_sec)*1000+(tv2.tv_usec - tv1.tv_usec)/1000);
710
#endif /* TRACE_TIMINGS */
711
712
    return 0;
713
}