249
218
dataPtr = (uint8_t*)mxGetData(data);
250
219
cudaMemcpy(cudaInputPtr, dataPtr, N2*sizeof(uint8_t), cudaMemcpyHostToDevice);
256
vecPack<<<input_grid_dim, input_block_dim>>>(cudaRealPtr, size, cudaInputPtr, N);
259
cudaMemcpy(ps->cuda_lsum_buffer + icp * alloc_size, mxGetData(lsum), size2*sizeof(float), cudaMemcpyHostToDevice);
260
cudaMemcpy(ps->cuda_denom_buffer + icp * alloc_size, mxGetData(denom), size2*sizeof(float), cudaMemcpyHostToDevice);
269
cudaMemcpy(ps->cuda_nonzero_buffer + icp * alloc_size, mxGetData(nonzero), ps->cuda_nonzero_items[icp]*sizeof(uint16_t), cudaMemcpyHostToDevice);
273
221
float *lsum_temp = ps->cuda_lsum_temp;
274
222
int step = ps->lsum_alloc_size * ps->lsum_alloc_size;