1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
#include <stdio.h>
#include <stdlib.h>
#include <mex.h>
#include "normxcorr_hw.h"
#include "local_sum.h"
#include "local_sum_kernel.cu"
#include <unistd.h>
int local_sum(TProcessingState *ps,
float *lsum, float *denom,
float *tmp1, float *tmp2,
float *in1, float *in2) {
int size = ps->subimage_size;
int lsize = ps->lsum_size;
int temp_size = ps->lsum_temp_size;
int alloc_size = ps->lsum_alloc_size;
int aligned_size = ps->lsum_aligned_size;
int short_size = ps->lsum_short_aligned_size;
// All arrays should be aligned
cudppMultiScan(ps->cudpp_plan, tmp1 + lsize * alloc_size, in1 + lsize * alloc_size, temp_size, size);
cudppMultiScan(ps->cudpp_plan, tmp2 + lsize * alloc_size, in2 + lsize * alloc_size, temp_size, size);
// Actually we can transpose here only a middle(image) rows, but in output lsum/denom
// arrays we should somehow manage to have zeros in other places. This will require
// memset which is not much faster? than our transpose routine, therefore it left
// as is for simplicity
dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);
return 0;
}
int local_sum_validate(TProcessingState *ps, int icp, const mxArray *lsum, const mxArray *denom) {
int res = 0;
#ifdef VALIDATE_LSUM
int size = ps->fft_size;
int size2 = size*size;
int alloc_size = ps->fft_alloc_size;
if (lsum&&denom) {
float *tmp = (float*)malloc(size2*sizeof(float));
cudaMemcpy(tmp, ps->cuda_lsum_buffer + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
float *real = (float*)mxGetData(lsum);
if (memcmp(tmp, real, size2*sizeof(float))) {
printf("lsum fault: %i\n", 1);
for (int i = 0; i < size2; i++) {
if (tmp[i] != real[i]) {
res = 1;
printf("lsum fault: %i %i - %f %f\n", i / size, i % size, tmp[i], real[i]);
break;
}
}
}
cudaMemcpy(tmp, ps->cuda_denom_buffer + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
real = (float*)mxGetData(denom);
if (memcmp(tmp, real, size2*sizeof(float))) {
for (int i = 0; i < size2; i++) {
float diff = (tmp[i] == real[i])?0:2*fabs(tmp[i] - real[i])/(tmp[i] = real[i]);
if (diff > 0.0001) {
res = 1;
printf("denom fault: %i %i - %f %f %f\n", i / size, i % size, tmp[i], real[i], diff);
break;
}
}
}
free(tmp);
}
#endif /* VALIDATE_LSUM */
return res;
}
|