Commit f633017d134aafeb46b1d866c8bc1abac42b1599

Authored by DepthDeluxe
1 parent edd8d3e4

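Added proper block filling: launch one extra block per dimension and bounds-check in the kernel so the leftover rows and columns that do not fill a whole 8x8 block are also processed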

openbr/plugins/cuda/passthrough.cu
@@ -12,10 +12,14 @@ namespace br { namespace cuda { @@ -12,10 +12,14 @@ namespace br { namespace cuda {
12 int rowInd = blockIdx.y*blockDim.y+threadIdx.y; 12 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
13 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 13 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
14 14
  15 + // don't do anything if we are outside the allowable positions
  16 + if (rowInd >= rows || colInd >= cols)
  17 + return;
  18 +
15 uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd]; 19 uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd];
16 uint8_t* rowDstPtr = dstPtr + rowInd*dstStep; 20 uint8_t* rowDstPtr = dstPtr + rowInd*dstStep;
17 21
18 - rowDstPtr[colInd] = srcVal; 22 + rowDstPtr[colInd] = srcVal * 2;
19 } 23 }
20 24
21 void passthrough_wrapper(GpuMat& src, GpuMat& dst) { 25 void passthrough_wrapper(GpuMat& src, GpuMat& dst) {
@@ -28,8 +32,8 @@ namespace br { namespace cuda { @@ -28,8 +32,8 @@ namespace br { namespace cuda {
28 32
29 // make 8 * 8 = 64 square block 33 // make 8 * 8 = 64 square block
30 dim3 threadsPerBlock(8, 8); 34 dim3 threadsPerBlock(8, 8);
31 - dim3 numBlocks(imageWidth / threadsPerBlock.x,  
32 - imageHeight / threadsPerBlock.y); 35 + dim3 numBlocks(imageWidth / threadsPerBlock.x + 1,
  36 + imageHeight / threadsPerBlock.y + 1);
33 37
34 passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight); 38 passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight);
35 } 39 }