6
#include "normxcorr_hw.h"
9
#include "local_sum_kernel.cu"
13
int local_sum(TProcessingState *ps,
14
float *lsum, float *denom,
15
float *tmp1, float *tmp2,
16
float *in1, float *in2) {
18
int size = ps->subimage_size;
19
int lsize = ps->lsum_size;
21
int temp_size = ps->lsum_temp_size;
22
int alloc_size = ps->lsum_alloc_size;
24
int aligned_size = ps->lsum_aligned_size;
25
int short_size = ps->lsum_short_aligned_size;
27
// All arrays should be aligned
28
cudppMultiScan(ps->cudpp_plan, tmp1 + lsize * alloc_size, in1 + lsize * alloc_size, temp_size, size);
29
cudppMultiScan(ps->cudpp_plan, tmp2 + lsize * alloc_size, in2 + lsize * alloc_size, temp_size, size);
31
// Actually we can transpose here only a middle(image) rows, but in output lsum/denom
32
// arrays we should somehow manage to have zeros in other places. This will require
33
// memset which is not much faster? than our transpose routine, therefore it left
34
// as is for simplicity
35
dim3 block_dim(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
36
dim3 grid_dim1(short_size / BLOCK_SIZE_2D, aligned_size / BLOCK_SIZE_2D, 1);
37
transpose1<<<grid_dim1,block_dim>>>(lsum, tmp1, alloc_size, alloc_size, lsize);
38
transpose1<<<grid_dim1,block_dim>>>(denom, tmp2, alloc_size, alloc_size, lsize);
40
cudppMultiScan(ps->cudpp_plan, tmp1, lsum, temp_size, size + lsize - 1);
41
cudppMultiScan(ps->cudpp_plan, tmp2, denom, temp_size, size + lsize - 1);
43
dim3 grid_dim2(short_size / BLOCK_SIZE_2D, short_size / BLOCK_SIZE_2D, 1);
44
transpose2<<<grid_dim2,block_dim>>>(lsum, denom, tmp1, tmp2, alloc_size, ps->fft_size, lsize);
49
int local_sum_validate(TProcessingState *ps, int icp, const mxArray *lsum, const mxArray *denom) {
52
int size = ps->fft_size;
53
int size2 = size*size;
54
int alloc_size = ps->fft_alloc_size;
57
float *tmp = (float*)malloc(size2*sizeof(float));
59
cudaMemcpy(tmp, ps->cuda_lsum_buffer + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
60
float *real = (float*)mxGetData(lsum);
61
if (memcmp(tmp, real, size2*sizeof(float))) {
62
printf("lsum fault: %i\n", 1);
63
for (int i = 0; i < size2; i++) {
64
if (tmp[i] != real[i]) {
66
printf("lsum fault: %i %i - %f %f\n", i / size, i % size, tmp[i], real[i]);
72
cudaMemcpy(tmp, ps->cuda_denom_buffer + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
73
real = (float*)mxGetData(denom);
74
if (memcmp(tmp, real, size2*sizeof(float))) {
75
for (int i = 0; i < size2; i++) {
76
float diff = (tmp[i] == real[i])?0:2*fabs(tmp[i] - real[i])/(tmp[i] = real[i]);
79
printf("denom fault: %i %i - %f %f %f\n", i / size, i % size, tmp[i], real[i], diff);
87
#endif /* VALIDATE_LSUM */