/normxcorr/trunk

To get this branch, use:
bzr branch http://suren.me/webbzr/normxcorr/trunk

« back to all changes in this revision

Viewing changes to cuda/normxcorr_hw.cu

  • Committer: Suren A. Chilingaryan
  • Date: 2009-12-10 05:17:54 UTC
  • Revision ID: csa@dside.dyndns.org-20091210051754-1y0pgbfodgwr3okc
memcpy3D

Show diffs side-by-side

added added

removed removed

Lines of Context:
297
297
        }
298
298
 
299
299
        cudaMemcpy2D(
300
 
            img,
 
300
            img + i * alloc_size,
301
301
            size * sizeof(uint8_t),
302
302
            fullimg + (xstart * height + ystart),
303
303
            height * sizeof(uint8_t),
308
308
        
309
309
        cudaMemcpy2D(
310
310
            cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
311
 
            img, size * sizeof(uint8_t),
 
311
            img + i * alloc_size, size * sizeof(uint8_t),
312
312
            size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
313
313
        );
314
314
 
426
426
        }
427
427
 
428
428
        cudaMemcpy2D(
429
 
            img,
 
429
            img + i * size2,//alloc_size,
430
430
            size * sizeof(uint8_t),
431
431
            fullimg + (xstart * height + ystart),
432
432
            height * sizeof(uint8_t),
434
434
            size,
435
435
            cudaMemcpyHostToHost
436
436
        );
437
 
 
 
437
/*
438
438
        cudaMemcpy2D(
439
439
            cuda_input_buffer + i * side_alloc2, side_alloc * sizeof(uint8_t),
440
 
            img, size * sizeof(uint8_t),
 
440
            img + i * size2, size * sizeof(uint8_t),
441
441
            size * sizeof(uint8_t), size, cudaMemcpyHostToDevice
442
442
        );
 
443
*/
443
444
    }
444
445
 
 
446
    cudaMemcpy3DParms copy_params = { 0 };
 
447
    copy_params.dstPtr   = make_cudaPitchedPtr(
 
448
        cuda_input_buffer, side_alloc * sizeof(uint8_t), side_alloc, side_alloc
 
449
    );
 
450
    copy_params.srcPtr   = make_cudaPitchedPtr(
 
451
        img, size * sizeof(uint8_t), size, size
 
452
    );
 
453
    copy_params.extent   = make_cudaExtent(size * sizeof(uint8_t), size, ncp);
 
454
    copy_params.kind     = cudaMemcpyHostToDevice;
 
455
    cudaMemcpy3D(&copy_params);
 
456
 
 
457
 
445
458
    dim3 block_2d(BLOCK_SIZE_2D, BLOCK_SIZE_2D, 1);
446
459
    dim3 block_side_cp(SIDE_BLOCK_SIZE, CP_BLOCK_SIZE, 1);
447
460
 
448
 
    //int input_blocks = calc_blocks(size2, BLOCK_SIZE_2D);
449
 
 
450
 
 
451
461
    int cp_blocks = calc_blocks(ncp, CP_BLOCK_SIZE);
452
462
    int cp_blocks1 = calc_blocks(ncp, BLOCK_SIZE_1D);
453
463
    int side_blocks = calc_blocks(size, SIDE_BLOCK_SIZE);
454
464
    int fft_blocks = calc_blocks(fft_size, SIDE_BLOCK_SIZE);
455
465
    int input_blocks = side_blocks * side_blocks * SIDE_BLOCK_SIZE;
456
466
 
457
 
 
458
467
        // Computing sum and std
459
468
    int32_t *stat_buf = (int*)ps->cuda_temp_buffer;
460
469