13
#include "mrses_impl.h"
17
MRSES_ITERATE_ENTRY = 1
19
typedef enum MRSESEntriesT MRSESEntries;
21
MRSESContext mrses_create_context() {
23
posix_memalign((void*)&ctx, HW_ALIGN, calc_alloc(sizeof(MRSESContextS), HW_ALIGN));
25
memset(ctx, 0, sizeof(MRSESContextS));
30
int mrses_init_context(MRSESContext ctx, int properties, int nA, const MRSESDataType *A, int nB, const MRSESDataType *B) {
40
#endif /* HW_HAVE_SPU */
43
MRSESDataType *ones, *meanA, *meanB;
50
assert(properties > 0);
52
mrses_free_context(ctx);
55
alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
56
palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
57
matrix_size = properties * alloc;
62
ctx->properties = properties;
64
posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
65
posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
66
posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
67
posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
68
posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
72
if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
74
if (meanB) free(meanB);
78
for (i = 0; i < nAB; i++)
81
memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
82
// blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
83
// blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
85
for (i = 0; i < properties; i++) {
86
memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
90
aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
92
for (i = 0; i < properties; i++) {
93
memset(ctx->A + i * alloc + nA, 0, aA);
96
#endif /* HW_HAVE_SPU */
98
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
99
for (i = 0; i < nA; i++) {
100
blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
102
blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
105
for (i = 0; i < properties; i++) {
106
memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
110
aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
112
for (i = 0; i < properties; i++) {
113
memset(ctx->B + i * alloc + nB, 0, aB);
116
#endif /* HW_HAVE_SPU */
118
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
119
for (i = 0; i < nB; i++) {
120
blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
122
blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
124
blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
128
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
129
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
131
matrix_size = properties * nA;
132
memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
133
for (i = 0; i < nA; i++) {
134
blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
136
blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
138
matrix_size = properties * nB;
139
memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
140
for (i = 0; i < nB; i++) {
141
blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
143
blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
144
blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
148
//printf("%f %f.\n", meanA[4], meanA[5]);
149
//printf("%f %f.\n", meanB[4], meanB[5]);
154
ctx->sched = hw_sched_create();
155
hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
163
int mrses_init_context_from_double(MRSESContext ctx, int properties, int nA, const double *A, int nB, const double *B) {
173
#endif /* HW_HAVE_SPU */
176
MRSESDataType *ones, *meanA, *meanB;
183
assert(properties > 0);
185
mrses_free_context(ctx);
188
alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
189
palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
190
matrix_size = properties * alloc;
195
ctx->properties = properties;
197
posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
198
posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
199
posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
200
posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
201
posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
205
if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
206
if (ones) free(ones);
207
if (meanB) free(meanB);
211
for (i = 0; i < nAB; i++)
214
memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
217
for (i = 0; i < properties; i++) {
219
memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
221
for (j = 0; j < nA; j++) {
222
ctx->A[i * alloc + j] = A[i * nA + j];
227
aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
229
for (i = 0; i < properties; i++) {
230
memset(ctx->A + i * alloc + nA, 0, aA);
233
#endif /* HW_HAVE_SPU */
235
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
236
for (i = 0; i < nA; i++) {
237
blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
239
blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
242
for (i = 0; i < properties; i++) {
244
memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
246
for (j = 0; j < nA; j++) {
247
ctx->B[i * alloc + j] = B[i * nA + j];
252
aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
254
for (i = 0; i < properties; i++) {
255
memset(ctx->B + i * alloc + nB, 0, aB);
258
#endif /* HW_HAVE_SPU */
260
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
261
for (i = 0; i < nB; i++) {
262
blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
264
blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
266
blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
270
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
271
blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
273
matrix_size = properties * nA;
274
memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
275
for (i = 0; i < nA; i++) {
276
blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
278
blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
280
matrix_size = properties * nB;
281
memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
282
for (i = 0; i < nB; i++) {
283
blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
285
blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
286
blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
290
//printf("%f %f.\n", meanA[4], meanA[5]);
291
//printf("%f %f.\n", meanB[4], meanB[5]);
296
ctx->sched = hw_sched_create();
297
hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
307
int mrses_set_distance_mode(MRSESContext ctx, MRSESDistance dist) {
309
assert(dist < MRSES_DISTANCE_LAST);
315
int mrses_prepare(MRSESContext ctx, int width, int block_size) {
319
#endif /* HW_HAVE_SPU */
326
# ifdef HW_USE_BLOCKED_MULTIPLY
327
at_once = SPE_BLOCK / width;
328
if (at_once < 1) at_once = 1;
329
# else /* HW_USE_BLOCKED_MULTIPLY */
331
#endif /* HW_USE_BLOCKED_MULTIPLY */
333
pack = SIMD_BLOCK / sizeof(MRSESDataType);
336
index_block = HW_ALIGN / min(sizeof(MRSESDataType), sizeof(MRSESIntType));
338
ctx->iterate_size = lcm3(at_once, index_block, HW_ITERATE_BLOCKS);
339
#else /* HW_HAVE_SPU */
340
ctx->iterate_size = HW_ITERATE_BLOCKS;
341
#endif /* HW_HAVE_SPU */
343
reportMessage("block size: %i", ctx->iterate_size);
345
ctx->max_block_size = calc_alloc(block_size, ctx->iterate_size);
350
void mrses_free_context(MRSESContext ctx) {
353
if (ctx->sched) hw_sched_destroy(ctx->sched);
355
if (ctx->index) free(ctx->index);
356
if (ctx->mean) free(ctx->mean);
357
if (ctx->B) free(ctx->B);
358
if (ctx->A) free(ctx->A);
360
memset(ctx, 0, sizeof(MRSESContextS));
363
void mrses_destroy_context(MRSESContext ctx) {
364
mrses_free_context(ctx);
369
int mrses_compute(MRSESContext ctx, int block_size, MRSESIntType *index, MRSESDataType *res) {
371
assert(block_size > 0);
372
assert(block_size <= ctx->max_block_size);
377
ctx->block_size = block_size;
382
hw_sched_schedule_task(ctx->sched, ctx, MRSES_RUN_ENTRY);
383
hw_sched_wait_task(ctx->sched);
390
int mrses_iterate(MRSESContext ctx, int iterations, int block_size, MRSESIntType *ires) {
391
int i, width, properties;
394
#endif /* FIX_RANDOM */
396
MRSESIntType *index, *iptr, *iend;
399
assert(iterations > 0);
400
assert(block_size > 0);
401
assert(block_size <= ctx->max_block_size);
403
block_size = calc_alloc(block_size, ctx->iterate_size);
406
properties = ctx->properties;
410
#else /* FIX_RANDOM */
411
gettimeofday(&tv, NULL);
413
#endif /* FIX_RANDOM */
416
if (!ires) hw_sched_wait_task(ctx->sched);
419
posix_memalign((void*)&index, HW_ALIGN, (block_size * properties)*sizeof(MRSESIntType));
425
for (i = 0; i < properties; i++) {
429
iend = index + block_size * properties;
430
for (iptr = index + properties; iptr < iend; iptr += properties) {
431
memcpy(iptr, index, properties * sizeof(MRSESIntType));
433
for (i = 0; i < width; i++) {
434
SWAPi(iptr, properties, i);
437
for (i = 0; i < width; i++) {
438
SWAPi(index, properties, i);
442
for (i = 0; i < 5; i++) {
444
for (j = 0; j < 5; j++) {
445
printf("%5i ", index[i * properties + j]);
452
ctx->block_size = block_size / ctx->iterate_size;
455
ctx->iterations = iterations;
460
hw_sched_schedule_task(ctx->sched, ctx, MRSES_ITERATE_ENTRY);
463
hw_sched_wait_task(ctx->sched);
464
free(ctx->index); ctx->index = NULL;
470
int mrses_get_results(MRSESContext ctx, uint32_t *hist) {
473
HWSched sched = ctx->sched;
474
HWThread *thr = sched->thread;
475
int n_threads = sched->n_threads;
476
int properties = ctx->properties;
478
uint32_t *thr_hist = NULL;
481
hw_sched_wait_task(ctx->sched);
482
free(ctx->index); ctx->index = NULL;
485
for (i = 0; i < n_threads; i++) {
486
thr_hist = (uint32_t*)(thr[i]->data);
491
memcpy(hist, thr_hist, properties * sizeof(uint32_t));
492
memset(thr_hist, 0, properties * sizeof(uint32_t));
495
for (++i; i < n_threads; i++) {
496
thr_hist = (uint32_t*)(thr[i]->data);
497
if (!thr_hist) continue;
499
for (j = 0; j < properties; j++) {
500
hist[j] += thr_hist[j];
502
memset(thr_hist, 0, properties * sizeof(uint32_t));