9
#include "mrses_impl.h"
10
#include "mrses_ppu.h"
12
#ifdef USE_FAST_RANDOM
13
#define FASTRND(var, max, add) \
14
g_seed = (214013*g_seed+2531011); \
15
var = ((int)((max)*((g_seed>>16)&0x7FFF) / (0x7FFF + 1.0))) + add;
19
//#define rnd(i) ((int)((i) * ((random_r(&data->seed, &rndval)?0:rndval) / (RAND_MAX + 1.0))))
20
#define rnd(i) ((int)((i) * (rand_r(&data->seed) / (RAND_MAX + 1.0))))
23
static inline int mrses_ppu_real_run(
24
MRSESContext mrses, MRSESDataType *result,
25
int width, int width2, int nA, int nB,
26
MRSESDataType *A, MRSESDataType *B, MRSESDataType *mean,
27
MRSESDataType *C, MRSESDataType *Ca, MRSESDataType *Cb
33
MRSESDataType detAB, detC;
34
MRSESDataType rmahal, rcorr;
37
// We can save twice by not computing symmetric parts
38
//blas_gemm(CblasRowMajor, CblasNoTrans, CblasTrans, width, width, nA, 1, A, nA, A, nA, 0, Ca, width);
39
//blas_gemm(CblasRowMajor, CblasNoTrans, CblasTrans, width, width, nB, 1, B, nB, B, nB, 0, Cb, width);
40
//printf("%f %f\n", Ca[4], Ca[20]);
42
//blas_syrk(CblasRowMajor, CblasUpper, CblasTrans, nA, width, 1.0, A, nA, 0, Ca, width);
43
//blas_syrk(CblasRowMajor, CblasUpper, CblasTrans, nB, width, 1.0, B, nB, 0, Cb, width);
46
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nA, 1.0, A, nA, 0, Ca, width);
47
blas_syrk(CblasRowMajor, CblasUpper, CblasNoTrans, width, nB, 1.0, B, nB, 0, Cb, width);
49
memcpy(C, Ca, width2 * sizeof(MRSESDataType));
50
blas_axpy(width2, 1, Cb, 1, C, 1);
51
blas_scal(width2, 0.5, C, 1);
53
// PRINT_MATRIX("%+6.4f ", Ca, 5, 5, 5)
54
// PRINT_MATRIX("%+6.4f ", A, nA, 5, 16)
56
lapack_potrf(&hmode, &width, C, &width, &err);
59
lapack_potrf(&hmode, &width, Ca, &width, &err);
62
lapack_potrf(&hmode, &width, Cb, &width, &err);
66
detC = C[0]; detAB = Ca[0] * Cb[0];
67
for (i = width + 1; i < width2; i+= (width+1)) {
68
detAB *= (Ca[i] * Cb[i]);
72
/* we just computing sqrt(detX) actually */
73
rcorr = 2 * logf(detC * detC / detAB);
77
CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
78
1, width, 1, C, width, mean, width //1?
83
CblasRowMajor, CblasUpper, CblasTrans, CblasNonUnit,
84
width, C, width, mean, 1
88
rmahal = blas_dot(width, mean, 1, mean, 1);
90
switch (mrses->dist) {
92
*result = rmahal/8 + rcorr/4;
106
printf("Det: %f = %f %f (%e)\n", rmahal/8 + rcorr/4, rmahal, rcorr, detC*detC);
112
struct MRSESTemporaryDataT {
119
MRSESDataType *mean_copy;
121
#ifndef USE_FAST_RANDOM
122
// struct random_data seed;
124
#endif /* USE_FAST_RANDOM */
126
typedef struct MRSESTemporaryDataT MRSESTemporaryDataS;
127
typedef struct MRSESTemporaryDataT *MRSESTemporaryData;
129
static inline MRSESTemporaryData mrses_ppu_malloc(HWThread thr, MRSESContext mrses) {
130
unsigned char *alloc;
131
MRSESTemporaryData data;
133
int properties = mrses->properties;
137
#ifndef USE_FAST_RANDOM
140
# endif /* FIX_RANDOM */
141
#endif /* USE_FAST_RANDOM */
143
pos = calc_alloc(properties * sizeof(uint32_t), HW_ALIGN);
144
if (thr->data) return (MRSESTemporaryData)(thr->data + pos);
146
width = mrses->width;
147
width2 = width * width;
148
nAB = max(mrses->nA, mrses->nB);
151
calc_alloc(properties * sizeof(uint32_t), HW_ALIGN) +
152
calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN) +
153
2 * calc_alloc(width * nAB * sizeof(MRSESDataType), HW_ALIGN) +
154
3 * calc_alloc(width * width * sizeof(MRSESDataType), HW_ALIGN) +
155
2 * calc_alloc(width * sizeof(MRSESDataType), HW_ALIGN)
158
posix_memalign((void*)&alloc, HW_ALIGN, size);
159
if (!alloc) return NULL;
161
memset(alloc, 0, properties * sizeof(uint32_t));
163
data = (MRSESTemporaryData)(alloc + pos);
164
pos += calc_alloc(sizeof(MRSESTemporaryDataS), HW_ALIGN);
166
data->A = (MRSESDataType*)(alloc + pos);
167
pos += calc_alloc(width * nAB * sizeof(MRSESDataType), HW_ALIGN);
169
data->B = (MRSESDataType*)(alloc + pos);
170
pos += calc_alloc(width * nAB * sizeof(MRSESDataType), HW_ALIGN);
172
data->C = (MRSESDataType*)(alloc + pos);
173
pos += calc_alloc(width * width * sizeof(MRSESDataType), HW_ALIGN);
175
data->Ca = (MRSESDataType*)(alloc + pos);
176
pos += calc_alloc(width * width * sizeof(MRSESDataType), HW_ALIGN);
178
data->Cb = (MRSESDataType*)(alloc + pos);
179
pos += calc_alloc(width * width * sizeof(MRSESDataType), HW_ALIGN);
181
data->mean = (MRSESDataType*)(alloc + pos);
182
pos += calc_alloc(width * sizeof(MRSESDataType), HW_ALIGN);
184
data->mean_copy = (MRSESDataType*)(alloc + pos);
186
#ifndef USE_FAST_RANDOM
188
// srandom_r(FIX_RANDOM, &data->seed);
189
data->seed = FIX_RANDOM;
190
# else /* FIX_RANDOM */
191
gettimeofday(&tv, NULL);
192
data->seed = tv.tv_usec;
193
// memset(&data->seed, 0, sizeof(struct random_data));
194
// srandom_r(tv.tv_usec, &data->seed);
195
# endif /* FIX_RANDOM */
196
#endif /* USE_FAST_RANDOM */
198
thr->data = (void*)alloc;
204
int mrses_ppu_run(HWThread thr, void *hwctx __attribute__ ((unused)), int block, MRSESContext mrses) {
208
int width = mrses->width;
209
int width2 = width * width;
210
int alloc = mrses->alloc;
214
MRSESIntType *index = mrses->index + block * width;
215
MRSESDataType *result = mrses->result + block;
217
MRSESDataType *Afull = mrses->A;
218
MRSESDataType *Bfull = mrses->B;
219
MRSESDataType *Mfull = mrses->mean;
221
MRSESTemporaryData data;
230
data = mrses_ppu_malloc(thr, mrses);
240
for (i = 0; i < width; i++) {
243
memcpy(A + i * nA, Afull + idx * alloc, nA * sizeof(MRSESDataType));
244
memcpy(B + i * nB, Bfull + idx * alloc, nB * sizeof(MRSESDataType));
245
mean[i] = Mfull[idx];
248
err = mrses_ppu_real_run(
250
width, width2, nA, nB,
251
A, B, mean, C, Ca, Cb
255
MRSESDataType *result = mrses->result + block;
256
*result = 0; /* Just do octave computation instead */
264
int mrses_ppu_iterate(HWThread thr, void *hwctx __attribute__ ((unused)), int block_group, MRSESContext mrses) {
268
int iterate_size = mrses->iterate_size;
269
int block = block_group * iterate_size;
270
int block_end = block + iterate_size;
272
int width = mrses->width;
273
int width2 = width * width;
274
int alloc = mrses->alloc;
277
int properties = mrses->properties;
278
int iterations = mrses->iterations;
283
MRSESIntType *ires = mrses->ires;
284
MRSESIntType drp_gen, rpl_gen, rst_gen;
285
MRSESDataType result = 0, cur_result = 0;
287
MRSESDataType *Afull = mrses->A;
288
MRSESDataType *Bfull = mrses->B;
289
MRSESDataType *Mfull = mrses->mean;
291
MRSESTemporaryData data;
299
MRSESDataType *mean_copy;
302
struct timeval tv1, tv2;
303
#endif /* TRACE_TIMINGS */
305
#ifdef USE_FAST_RANDOM
307
# ifndef TRACE_TIMINGS
308
struct timeval tvseed;
309
# endif /* TRACE_TIMINGS */
310
#else /* USE_FAST_RANDOM */
312
#endif /* USE_FAST_RANDOM */
315
gettimeofday(&tv1, NULL);
316
#endif /* TRACE_TIMINGS */
318
#ifdef USE_FAST_RANDOM
319
# ifdef TRACE_TIMINGS
320
g_seed = tv1.tv_usec;
322
gettimeofday(&tvseed, NULL);
323
g_seed = tvseed.tv_usec;
324
# endif /* TRACE_TIMINGS */
325
#endif /* USE_FAST_RANDOM */
328
data = mrses_ppu_malloc(thr, mrses);
337
mean_copy = data->mean_copy;
339
hist = (uint32_t*)thr->data;
342
for (;block < block_end; ++block) {
343
index = mrses->index + block * properties;
346
for (i = 0; i < width; ++i) {
349
memcpy(A + i * nA, Afull + idx * alloc, nA * sizeof(MRSESDataType));
350
memcpy(B + i * nB, Bfull + idx * alloc, nB * sizeof(MRSESDataType));
351
mean[i] = Mfull[idx];
354
memcpy(mean_copy, mean, width * sizeof(MRSESDataType));
356
err = mrses_ppu_real_run(
358
width, width2, nA, nB,
359
A, B, mean_copy, C, Ca, Cb
363
for (i = 0; i < iterations; i++) {
364
#ifdef USE_FAST_RANDOM
365
FASTRND(drp_gen, width, 0);
366
FASTRND(rpl_gen, properties - width, width);
367
#else /* USE_FAST_RANDOM */
368
drp_gen = rnd(width);
369
rpl_gen = rnd(properties - width) + width;
370
#endif /* USE_FAST_RANDOM */
372
if ((rst_gen < width)&&(rst_gen != drp_gen)) {
373
idx = index[rst_gen];
374
memcpy(A + rst_gen * nA, Afull + idx * alloc, nA * sizeof(MRSESDataType));
375
memcpy(B + rst_gen * nB, Bfull + idx * alloc, nB * sizeof(MRSESDataType));
379
idx = index[rpl_gen];
380
memcpy(A + drp_gen * nA, Afull + idx * alloc, nA * sizeof(MRSESDataType));
381
memcpy(B + drp_gen * nB, Bfull + idx * alloc, nB * sizeof(MRSESDataType));
383
memcpy(mean_copy, mean, width * sizeof(MRSESDataType));
384
mean_copy[drp_gen] = Mfull[idx];
386
err = mrses_ppu_real_run(
388
width, width2, nA, nB,
389
A, B, mean_copy, C, Ca, Cb
392
if (result < cur_result) {
395
// printf("%i=%i %i=%i %f %f\n", drp_gen, index[drp_gen], rpl_gen, index[rpl_gen], cur_result, result);
397
SWAPij(index, drp_gen, rpl_gen);
399
mean[drp_gen] = Mfull[idx];
402
for (i = 0; i < width; i++) {
404
// printf("%5i ", index[i] + 1);
407
memcpy(ires + width * block, index, width * sizeof(MRSESIntType));
414
gettimeofday(&tv2, NULL);
415
printf("Thead %p: %li ms\n", thr, (tv2.tv_sec - tv1.tv_sec)*1000+(tv2.tv_usec - tv1.tv_usec)/1000);
416
#endif /* TRACE_TIMINGS */