6
1
#include "normxcorr_hw.h"
9
#include "local_sum_kernel.cu"
13
int local_sum(TProcessingState *ps,
2
#include "local_sum_kernel.cu.h"
5
static inline int local_sum(TProcessingState *ps,
14
6
float *lsum, float *denom,
15
7
float *tmp1, float *tmp2,
16
8
float *in1, float *in2,
57
int local_sum_validate(TProcessingState *ps, int icp, const mxArray *lsum, const mxArray *denom) {
60
int size = ps->fft_size;
61
int size2 = size*size;
62
int alloc_size = ps->fft_alloc_size;
65
float *tmp = (float*)malloc(size2*sizeof(float));
67
cudaMemcpy(tmp, ps->cuda_lsum_cache + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
68
float *real = (float*)mxGetData(lsum);
69
if (memcmp(tmp, real, size2*sizeof(float))) {
70
printf("lsum fault: %i\n", 1);
71
for (int i = 0; i < size2; i++) {
72
if (tmp[i] != real[i]) {
74
printf("lsum fault: %i %i - %f %f\n", i / size, i % size, tmp[i], real[i]);
80
cudaMemcpy(tmp, ps->cuda_denom_cache + icp * alloc_size, size2*sizeof(float), cudaMemcpyDeviceToHost);
81
real = (float*)mxGetData(denom);
82
if (memcmp(tmp, real, size2*sizeof(float))) {
83
for (int i = 0; i < size2; i++) {
84
// Symmetric relative difference between the GPU value (tmp) and the reference
// value (real): 2*|a - b| / (a + b). The divisor must be the SUM of the two
// values. The previous `tmp[i] = real[i]` was an assignment: it overwrote the
// GPU value (so the fault message printed real[i] twice) and modified tmp[i]
// while the other operand of '/' was still reading it.
float diff = (tmp[i] == real[i])?0:2*fabs(tmp[i] - real[i])/(tmp[i] + real[i]);
87
printf("denom fault: %i %i - %f %f %f\n", i / size, i % size, tmp[i], real[i], diff);
95
#endif /* VALIDATE_LSUM */