/*
 * normxcorr/trunk
 *
 * To get this branch, use:
 *   bzr branch http://suren.me/webbzr/normxcorr/trunk
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dict_hw.h"
#include "normxcorr_hw.h"

/**
 * Copies one correlation surface from device memory into a host buffer.
 *
 * The source region is ps->cuda_temp_buffer viewed as cufftReal, starting
 * CP_BLOCK * ps->fft_alloc_size elements from its beginning; exactly
 * fft_size * fft_size values are transferred.
 *
 * @param ps   processing state supplying the device buffer and FFT sizes
 * @param icp  not used by this function; kept so the signature is unchanged
 * @param res  host destination for fft_size * fft_size floats
 * @return 0 on success, otherwise the non-zero CUDA runtime status returned
 *         by cudaMemcpy (NOTE(review): map to the project's own error codes
 *         if callers expect those)
 */
int dictGetCorrelations(ProcessingState ps, int icp, float *res) {
    int size = ps->fft_size;
    /* Widen before multiplying so the byte count cannot overflow int. */
    size_t bytes = (size_t)size * size * sizeof(cufftReal);
    int status;

    cufftReal *cuda_result_buffer = (cufftReal*)ps->cuda_temp_buffer;
    float *cuda_final_buffer = cuda_result_buffer + CP_BLOCK * ps->fft_alloc_size;

    (void)icp;  /* intentionally unused */

    /* cudaSuccess is 0, so any non-zero status means the transfer failed. */
    status = (int)cudaMemcpy(res, cuda_final_buffer, bytes, cudaMemcpyDeviceToHost);
    if (status != 0) {
        return status;
    }

    return 0;
}

/**
 * Fetches the first two rows of ps->cuda_points from the device and hands
 * them to the caller as separate x and y arrays.
 *
 * The rows are first staged into ps->points at row offsets 6 and 7 (row
 * stride ps->points_alloc_size floats); the device-side row stride is
 * ps->ncp_alloc_size floats. Only the first ps->ncp values of each row are
 * transferred. The staged values are then copied into res_x and res_y.
 *
 * @param ps     processing state supplying the host/device point buffers
 * @param res_x  host destination for ncp floats from staged row 6; skipped
 *               when NULL
 * @param res_y  host destination for ncp floats from staged row 7; skipped
 *               when NULL
 * @return 0 on success, otherwise the non-zero CUDA runtime status returned
 *         by cudaMemcpy2D; on failure res_x/res_y are left untouched
 *         (NOTE(review): map to the project's own error codes if callers
 *         expect those)
 */
int dictGetCorrections(ProcessingState ps, float *res_x, float *res_y) {
    int ncp = ps->ncp;
    int points_alloc = ps->points_alloc_size;
    int ncp_alloc = ps->ncp_alloc_size;
    int status;

    float *move_x = ps->points + 6 * points_alloc;
    float *move_y = move_x + points_alloc;

    /* Two rows of ncp floats each; pitches differ between host and device. */
    status = (int)cudaMemcpy2D(
        move_x, points_alloc * sizeof(float),
        ps->cuda_points, ncp_alloc * sizeof(float),
        ncp * sizeof(float), 2,
        cudaMemcpyDeviceToHost
    );
    /* cudaSuccess is 0; do not hand stale staging data to the caller. */
    if (status != 0) {
        return status;
    }

    if (res_x) {
        memcpy(res_x, move_x, ncp * sizeof(float));
    }
    if (res_y) {
        memcpy(res_y, move_y, ncp * sizeof(float));
    }

    return 0;
}

/**
 * Copies cached local-sum and/or denominator data for one entry from the
 * device to the host.
 *
 * Each cache is indexed in units of ps->fft_alloc_size elements, and
 * fft_size * fft_size floats are transferred per requested output. When
 * ps->use_cache is zero, slot 0 is read regardless of icp.
 *
 * @param ps     processing state supplying the device caches and FFT sizes
 * @param icp    cache slot index (forced to 0 when ps->use_cache is zero)
 * @param lsum   host destination for data from ps->cuda_lsum_cache, or NULL
 *               to skip
 * @param denom  host destination for data from ps->cuda_denom_cache, or NULL
 *               to skip
 * @return 0 on success, otherwise the non-zero CUDA runtime status of the
 *         first cudaMemcpy that failed; later copies are not attempted
 *         (NOTE(review): map to the project's own error codes if callers
 *         expect those)
 */
int dictGetLocalSum(ProcessingState ps, int icp, float *lsum, float *denom) {
    int size = ps->fft_size;
    /* Widen before multiplying so the byte count cannot overflow int. */
    size_t bytes = (size_t)size * size * sizeof(float);
    int alloc_size = ps->fft_alloc_size;
    int status;

    if (!ps->use_cache) icp = 0;

    /* cudaSuccess is 0, so any non-zero status means the transfer failed. */
    if (lsum) {
        status = (int)cudaMemcpy(lsum, ps->cuda_lsum_cache + icp * alloc_size, bytes, cudaMemcpyDeviceToHost);
        if (status != 0) {
            return status;
        }
    }

    if (denom) {
        status = (int)cudaMemcpy(denom, ps->cuda_denom_cache + icp * alloc_size, bytes, cudaMemcpyDeviceToHost);
        if (status != 0) {
            return status;
        }
    }

    return 0;
}