1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dict_hw.h"
#include "normxcorr_hw.h"
/*
 * Copy a correlation result from device memory into a host buffer.
 *
 * The source region starts CP_BLOCK * ps->fft_alloc_size elements past
 * ps->cuda_temp_buffer (viewed as cufftReal) and spans
 * fft_size * fft_size elements.
 *
 * ps   processing state; fft_size, fft_alloc_size and cuda_temp_buffer are read
 * icp  not used: the source offset does not depend on it
 *      NOTE(review): confirm that ignoring icp here is intentional
 * res  host destination with room for fft_size * fft_size cufftReal values
 *
 * Returns 0 on success, otherwise the nonzero CUDA status of the failed copy
 * (the contents of res are then unspecified).
 */
int dictGetCorrelations(ProcessingState ps, int icp, float *res) {
    int size = ps->fft_size;
    int size2 = size * size;
    cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
    float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * ps->fft_alloc_size;
    int err;

    (void)icp;

    /* cudaSuccess is 0, so any nonzero status means the copy failed. */
    err = cudaMemcpy(res, cuda_final_buffer, size2 * sizeof(cufftReal), cudaMemcpyDeviceToHost);
    if (err != 0) {
        return err;
    }
    return 0;
}
/*
 * Fetch two rows of ps->ncp floats from the device and hand them to the caller.
 *
 * The rows are read from ps->cuda_points (row stride ps->ncp_alloc_size
 * floats) into a staging area that begins 6 * ps->points_alloc_size floats
 * into ps->points (row stride ps->points_alloc_size floats). The first staged
 * row is then copied to res_x and the second to res_y.
 *
 * ps     processing state; ncp, points_alloc_size, ncp_alloc_size, points and
 *        cuda_points are read, and the staging area in points is overwritten
 * res_x  host destination for the first row (ncp floats)
 * res_y  host destination for the second row (ncp floats)
 *
 * Returns 0 on success (including when there is nothing to copy), otherwise
 * the nonzero CUDA status of the failed transfer; res_x and res_y are left
 * untouched in that case.
 */
int dictGetCorrections(ProcessingState ps, float *res_x, float *res_y) {
    int ncp = ps->ncp;
    int points_alloc = ps->points_alloc_size;
    int ncp_alloc = ps->ncp_alloc_size;
    float *move_x = ps->points + 6 * points_alloc;
    float *move_y = move_x + points_alloc;
    int err;

    /* Nothing to transfer; also keeps a negative count from turning into a
       huge size_t byte count below. */
    if (ncp <= 0) {
        return 0;
    }

    /* cudaSuccess is 0, so any nonzero status means the copy failed. */
    err = cudaMemcpy2D(
        move_x, points_alloc * sizeof(float),
        ps->cuda_points, ncp_alloc * sizeof(float),
        ncp * sizeof(float), 2,
        cudaMemcpyDeviceToHost
    );
    if (err != 0) {
        /* Do not forward stale staging data to the caller. */
        return err;
    }

    memcpy(res_x, move_x, ncp * sizeof(float));
    memcpy(res_y, move_y, ncp * sizeof(float));
    return 0;
}
/*
 * Copy cached per-control-point arrays from the device to host buffers.
 *
 * Each array holds fft_size * fft_size floats and is located
 * icp * ps->fft_alloc_size floats into ps->cuda_lsum_cache or
 * ps->cuda_denom_cache. When ps->use_cache is zero, slot 0 is always read.
 *
 * ps     processing state; fft_size, fft_alloc_size, use_cache and the two
 *        cache pointers are read
 * icp    slot index into the caches (forced to 0 when caching is disabled)
 * lsum   host destination for the lsum cache entry, or NULL to skip it
 * denom  host destination for the denom cache entry, or NULL to skip it
 *
 * Returns 0 on success, otherwise the nonzero CUDA status of the first copy
 * that failed.
 */
int dictGetLocalSum(ProcessingState ps, int icp, float *lsum, float *denom) {
    int size = ps->fft_size;
    int alloc_size = ps->fft_alloc_size;
    size_t bytes = (size_t)size * size * sizeof(float);
    long long offset;
    int err;

    if (!ps->use_cache) icp = 0;

    /* Computed in 64-bit so a large slot index times the slot size cannot
       overflow a 32-bit int. */
    offset = (long long)icp * alloc_size;

    /* cudaSuccess is 0, so any nonzero status means the copy failed. */
    if (lsum) {
        err = cudaMemcpy(lsum, ps->cuda_lsum_cache + offset, bytes, cudaMemcpyDeviceToHost);
        if (err != 0) {
            return err;
        }
    }
    if (denom) {
        err = cudaMemcpy(denom, ps->cuda_denom_cache + offset, bytes, cudaMemcpyDeviceToHost);
        if (err != 0) {
            return err;
        }
    }
    return 0;
}
|