/ani/mrses

To get this branch, use:
bzr branch http://suren.me/webbzr/ani/mrses
1 by Suren A. Chilingaryan
Initial import
1
#include <stdlib.h>
2
#include <stdio.h>
3
#include <errno.h>
4
#include <string.h>
5
#include <sys/time.h>
6
7
#include <cblas.h>
8
9
#include <assert.h>
10
11
#include "msg.h"
12
#include "hw_sched.h"
13
#include "mrses_impl.h"
14
15
enum MRSESEntriesT {
16
    MRSES_RUN_ENTRY = 0,
17
    MRSES_ITERATE_ENTRY = 1
18
};
19
typedef enum MRSESEntriesT MRSESEntries;
20
21
MRSESContext mrses_create_context() {
22
    MRSESContext ctx;
23
    posix_memalign((void*)&ctx, HW_ALIGN, calc_alloc(sizeof(MRSESContextS), HW_ALIGN));
24
    if (ctx) {
25
        memset(ctx, 0, sizeof(MRSESContextS));
26
    }
27
    return ctx;
28
}
29
30
int mrses_init_context(MRSESContext ctx, int properties, int nA, const MRSESDataType *A, int nB, const MRSESDataType *B) {
31
    int err = 0;
32
    int i;
33
    int nAB;
34
35
    int alloc;
36
    int palloc;
37
38
#ifdef HW_HAVE_SPU
39
    int aA, aB;
40
#endif /* HW_HAVE_SPU */
41
42
    int matrix_size;
43
    MRSESDataType *ones, *meanA, *meanB;
44
45
    assert(ctx);
46
    assert(A);
47
    assert(B);
48
    assert(nA > 0);
49
    assert(nB > 0);
50
    assert(properties > 0);
51
52
    mrses_free_context(ctx);
53
54
    nAB = max(nA, nB);    
55
    alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
56
    palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
57
    matrix_size = properties * alloc;
58
    
59
    ctx->nA = nA;
60
    ctx->nB = nB;
61
    ctx->alloc = alloc;
62
    ctx->properties = properties;    
63
64
    posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
65
    posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
66
    posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
67
    posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
68
    posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
69
70
    meanA = ctx->mean;
71
72
    if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
73
	if (ones) free(ones);
74
	if (meanB) free(meanB);
75
	return 1;
76
    }
77
	
78
    for (i = 0; i < nAB; i++) 
79
	ones[i] = 1.;
80
    
81
    memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
82
//    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
83
//    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
84
85
    for (i = 0; i < properties; i++) {
86
	memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
87
    }
88
89
#ifdef HW_HAVE_SPU
90
    aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
91
    if (aA) {
92
	for (i = 0; i < properties; i++) {
93
	    memset(ctx->A + i * alloc + nA, 0, aA);
94
	}
95
    }
96
#endif /* HW_HAVE_SPU */
97
98
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
99
    for (i = 0; i < nA; i++) {
100
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
101
    }
102
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
103
104
105
    for (i = 0; i < properties; i++) {
106
	memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
107
    }
108
109
#ifdef HW_HAVE_SPU
110
    aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
111
    if (aB) {
112
	for (i = 0; i < properties; i++) {
113
	    memset(ctx->B + i * alloc + nB, 0, aB);
114
	}
115
    }
116
#endif /* HW_HAVE_SPU */
117
118
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
119
    for (i = 0; i < nB; i++) {
120
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
121
    }
122
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
123
124
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
125
126
127
/*
128
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
129
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
130
131
    matrix_size = properties * nA;
132
    memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
133
    for (i = 0; i < nA; i++) {
134
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
135
    }
136
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
137
138
    matrix_size = properties * nB;
139
    memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
140
    for (i = 0; i < nB; i++) {
141
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
142
    }
143
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
144
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
145
*/    
146
147
/*
148
    //printf("%f %f.\n", meanA[4], meanA[5]);
149
    //printf("%f %f.\n", meanB[4], meanB[5]);
150
*/
151
152
153
154
    ctx->sched = hw_sched_create();
155
    hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
156
157
    free(ones);
158
    free(meanB);
159
    
160
    return err;
161
}
162
163
int mrses_init_context_from_double(MRSESContext ctx, int properties, int nA, const double *A, int nB, const double *B) {
164
    int err = 0;
165
    int i, j;
166
    int nAB;
167
168
    int alloc;
169
    int palloc;
170
171
#ifdef HW_HAVE_SPU
172
    int aA, aB;
173
#endif /* HW_HAVE_SPU */
174
175
    int matrix_size;
176
    MRSESDataType *ones, *meanA, *meanB;
177
178
    assert(ctx);
179
    assert(A);
180
    assert(B);
181
    assert(nA > 0);
182
    assert(nB > 0);
183
    assert(properties > 0);
184
185
    mrses_free_context(ctx);
186
187
    nAB = max(nA, nB);    
188
    alloc = calc_alloc(nAB * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
189
    palloc = calc_alloc(properties * sizeof(MRSESDataType), HW_ALIGN) / sizeof(MRSESDataType);
190
    matrix_size = properties * alloc;
191
    
192
    ctx->nA = nA;
193
    ctx->nB = nB;
194
    ctx->alloc = alloc;
195
    ctx->properties = properties;    
196
197
    posix_memalign((void*)&ctx->A, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
198
    posix_memalign((void*)&ctx->B, HW_ALIGN, (alloc * properties)*sizeof(MRSESDataType));
199
    posix_memalign((void*)&ctx->mean, HW_ALIGN, palloc * sizeof(MRSESDataType));
200
    posix_memalign((void*)&meanB, HW_ALIGN, properties*sizeof(MRSESDataType));
201
    posix_memalign((void*)&ones, HW_ALIGN, nAB*sizeof(MRSESDataType));
202
203
    meanA = ctx->mean;
204
205
    if ((!ctx->A)||(!ctx->B)||(!meanA)||(!meanB)||(!ones)) {
206
	if (ones) free(ones);
207
	if (meanB) free(meanB);
208
	return 1;
209
    }
210
	
211
    for (i = 0; i < nAB; i++) 
212
	ones[i] = 1.;
213
    
214
    memset(ctx->A, 0x7F, alloc * properties * sizeof(MRSESDataType));
215
216
217
    for (i = 0; i < properties; i++) {
218
	/* DS: Change1, 
219
	memcpy(ctx->A + i * alloc, A + i * nA, nA * sizeof(MRSESDataType));
220
	*/
221
	for (j = 0; j < nA; j++) {
222
	    ctx->A[i * alloc + j] = A[i * nA + j];
223
	}
224
    }
225
226
#ifdef HW_HAVE_SPU
227
    aA = calc_alloc(nA * sizeof(MRSESDataType), HW_ALIGN) - nA * sizeof(MRSESDataType);
228
    if (aA) {
229
	for (i = 0; i < properties; i++) {
230
	    memset(ctx->A + i * alloc + nA, 0, aA);
231
	}
232
    }
233
#endif /* HW_HAVE_SPU */
234
235
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, ctx->A, alloc, ones, 1, 0, meanA, 1);
236
    for (i = 0; i < nA; i++) {
237
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, alloc);
238
    }
239
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
240
241
242
    for (i = 0; i < properties; i++) {
243
	/* DS: Change2 
244
	    memcpy(ctx->B + i * alloc, B + i * nB, nB * sizeof(MRSESDataType));
245
	*/
246
	for (j = 0; j < nA; j++) {
247
	    ctx->B[i * alloc + j] = B[i * nA + j];
248
	}
249
    }
250
251
#ifdef HW_HAVE_SPU
252
    aB = calc_alloc(nB * sizeof(MRSESDataType), HW_ALIGN) - nB * sizeof(MRSESDataType);
253
    if (aB) {
254
	for (i = 0; i < properties; i++) {
255
	    memset(ctx->B + i * alloc + nB, 0, aB);
256
	}
257
    }
258
#endif /* HW_HAVE_SPU */
259
260
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, ctx->B, alloc, ones, 1, 0, meanB, 1);
261
    for (i = 0; i < nB; i++) {
262
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, alloc);
263
    }
264
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
265
266
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
267
268
269
/*
270
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nA, 1. / nA, A, nA, ones, 1, 0, meanA, 1);
271
    blas_gemv(CblasRowMajor,CblasNoTrans, properties, nB, 1. / nB, B, nB, ones, 1, 0, meanB, 1);
272
273
    matrix_size = properties * nA;
274
    memcpy(ctx->A, A, matrix_size * sizeof(MRSESDataType));
275
    for (i = 0; i < nA; i++) {
276
	blas_axpy(properties, -1, meanA, 1, ctx->A + i, nA);
277
    }
278
    blas_scal(matrix_size, 1.0 / sqrt(nA), ctx->A, 1);
279
280
    matrix_size = properties * nB;
281
    memcpy(ctx->B, B, matrix_size * sizeof(MRSESDataType));
282
    for (i = 0; i < nB; i++) {
283
	blas_axpy(properties, -1.0, meanB, 1, ctx->B + i, nB);
284
    }
285
    blas_scal(matrix_size, 1.0 / sqrt(nB), ctx->B, 1);
286
    blas_axpy(properties, -1.0, meanB, 1, meanA, 1);
287
*/    
288
289
/*
290
    //printf("%f %f.\n", meanA[4], meanA[5]);
291
    //printf("%f %f.\n", meanB[4], meanB[5]);
292
*/
293
294
295
296
    ctx->sched = hw_sched_create();
297
    hw_sched_set_sequential_mode(ctx->sched, &ctx->block_size, &ctx->cur_chunk);
298
299
    free(ones);
300
    free(meanB);
301
    
302
    return err;
303
}
304
305
306
307
int mrses_set_distance_mode(MRSESContext ctx, MRSESDistance dist) {
308
    assert(ctx); 
309
    assert(dist < MRSES_DISTANCE_LAST);
310
311
    ctx->dist = dist;
312
    return 0;
313
}
314
315
int mrses_prepare(MRSESContext ctx, int width, int block_size) {
316
#ifdef HW_HAVE_SPU
317
    int at_once, pack;
318
    int index_block;
319
#endif /* HW_HAVE_SPU */
320
321
    assert(ctx);
322
323
    ctx->width = width;
324
325
#ifdef HW_HAVE_SPU
326
# ifdef HW_USE_BLOCKED_MULTIPLY
327
    at_once = SPE_BLOCK / width;
328
    if (at_once < 1) at_once = 1;
329
# else /* HW_USE_BLOCKED_MULTIPLY */
330
    at_once = 1;
331
#endif /* HW_USE_BLOCKED_MULTIPLY */
332
333
    pack = SIMD_BLOCK / sizeof(MRSESDataType);
334
    at_once *= pack;
335
        
336
    index_block = HW_ALIGN / min(sizeof(MRSESDataType), sizeof(MRSESIntType));
337
338
    ctx->iterate_size = lcm3(at_once, index_block, HW_ITERATE_BLOCKS);
339
#else /* HW_HAVE_SPU */
340
    ctx->iterate_size = HW_ITERATE_BLOCKS;
341
#endif /* HW_HAVE_SPU */
342
343
    reportMessage("block size: %i", ctx->iterate_size);
344
345
    ctx->max_block_size = calc_alloc(block_size, ctx->iterate_size);
346
    
347
    return 0;
348
}
349
350
void mrses_free_context(MRSESContext ctx) {
351
    assert(ctx);
352
353
    if (ctx->sched) hw_sched_destroy(ctx->sched);
354
    
355
    if (ctx->index) free(ctx->index);
356
    if (ctx->mean) free(ctx->mean);
357
    if (ctx->B) free(ctx->B);
358
    if (ctx->A) free(ctx->A);
359
    
360
    memset(ctx, 0, sizeof(MRSESContextS));
361
}
362
363
void mrses_destroy_context(MRSESContext ctx) {
364
    mrses_free_context(ctx);
365
    free(ctx);
366
}
367
368
369
int mrses_compute(MRSESContext ctx, int block_size, MRSESIntType *index, MRSESDataType *res) {
370
    assert(ctx);
371
    assert(block_size > 0);
372
    assert(block_size <= ctx->max_block_size);
373
    assert(index);
374
    assert(res);
375
    assert(!ctx->index);
376
377
    ctx->block_size = block_size;
378
    ctx->cur_chunk = 0;
379
    ctx->index = index;
380
    ctx->result = res;
381
    
382
    hw_sched_schedule_task(ctx->sched, ctx, MRSES_RUN_ENTRY);
383
    hw_sched_wait_task(ctx->sched);
384
385
    ctx->index = NULL;
386
    
387
    return 0;
388
}
389
390
int mrses_iterate(MRSESContext ctx, int iterations, int block_size, MRSESIntType *ires) {
391
    int i, width, properties;
392
#ifndef FIX_RANDOM
393
    struct timeval tv;
394
#endif /* FIX_RANDOM */
395
396
    MRSESIntType *index, *iptr, *iend; 
397
    
398
    assert(ctx);
399
    assert(iterations > 0);
400
    assert(block_size > 0);
401
    assert(block_size <= ctx->max_block_size);
402
403
    block_size = calc_alloc(block_size, ctx->iterate_size);
404
405
    width = ctx->width;
406
    properties = ctx->properties;
407
408
#ifdef FIX_RANDOM
409
    srandom(FIX_RANDOM);
410
#else /* FIX_RANDOM */
411
    gettimeofday(&tv, NULL);
412
    srandom(tv.tv_usec);
413
#endif /* FIX_RANDOM */
414
415
    if (ctx->index) {
416
        if (!ires) hw_sched_wait_task(ctx->sched);
417
	index = ctx->index;
418
    } else {
419
        posix_memalign((void*)&index, HW_ALIGN, (block_size * properties)*sizeof(MRSESIntType));
420
	if (!index) {
421
	    return 1;
422
	}
423
    }
424
425
    for (i = 0; i < properties; i++) {
426
	index[i] = i;
427
    }
428
429
    iend = index + block_size * properties;    
430
    for (iptr = index + properties; iptr < iend; iptr += properties) {
431
	memcpy(iptr, index, properties * sizeof(MRSESIntType));
432
	
433
	for (i = 0; i < width; i++) {
434
	    SWAPi(iptr, properties, i);
435
	}
436
    }
437
    for (i = 0; i < width; i++) {
438
	SWAPi(index, properties, i);
439
    }
440
441
/*
442
    for (i = 0; i < 5; i++) {
443
	int j
444
	for (j = 0; j < 5; j++) {
445
	    printf("%5i ", index[i * properties + j]);
446
	}
447
	printf("\n");
448
    }
449
    printf("\n");
450
*/
451
    
452
    ctx->block_size = block_size / ctx->iterate_size;
453
    ctx->cur_chunk = 0;
454
455
    ctx->iterations = iterations;
456
    ctx->index = index;
457
    ctx->ires = ires;
458
    ctx->result = NULL;
459
460
    hw_sched_schedule_task(ctx->sched, ctx, MRSES_ITERATE_ENTRY);
461
462
    if (ires) {
463
	hw_sched_wait_task(ctx->sched);
464
	free(ctx->index); ctx->index = NULL;
465
    }
466
467
    return 0;
468
}
469
470
int mrses_get_results(MRSESContext ctx, uint32_t *hist) {
471
    int i, j;
472
    
473
    HWSched sched = ctx->sched;
474
    HWThread *thr = sched->thread;
475
    int n_threads = sched->n_threads;
476
    int properties = ctx->properties;
477
    
478
    uint32_t *thr_hist = NULL;
479
480
    if (ctx->index) {
481
        hw_sched_wait_task(ctx->sched);
482
	free(ctx->index); ctx->index = NULL;
483
    }
484
485
    for (i = 0; i < n_threads; i++) {
486
	thr_hist = (uint32_t*)(thr[i]->data);
487
	if (thr_hist) break;
488
    }
489
    
490
    if (thr_hist) {
491
	memcpy(hist, thr_hist, properties * sizeof(uint32_t));
492
	memset(thr_hist, 0, properties * sizeof(uint32_t));
493
    }
494
    
495
    for (++i; i < n_threads; i++) {
496
	thr_hist = (uint32_t*)(thr[i]->data);
497
	if (!thr_hist) continue;
498
	
499
	for (j = 0; j < properties; j++) {
500
	    hist[j] += thr_hist[j];
501
	}
502
	memset(thr_hist, 0, properties * sizeof(uint32_t));
503
    }
504
    
505
    return 0;
506
}