Commit f633017d134aafeb46b1d866c8bc1abac42b1599
1 parent
edd8d3e4
Added proper block filling so that leftover rows and columns that do not fill a whole thread block are still processed
Showing
1 changed file
with
7 additions
and
3 deletions
openbr/plugins/cuda/passthrough.cu
| ... | ... | @@ -12,10 +12,14 @@ namespace br { namespace cuda { |
| 12 | 12 | int rowInd = blockIdx.y*blockDim.y+threadIdx.y; |
| 13 | 13 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 14 | 14 | |
| 15 | + // don't do anything if we are outside the allowable positions | |
| 16 | + if (rowInd >= rows || colInd >= cols) | |
| 17 | + return; | |
| 18 | + | |
| 15 | 19 | uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd]; |
| 16 | 20 | uint8_t* rowDstPtr = dstPtr + rowInd*dstStep; |
| 17 | 21 | |
| 18 | - rowDstPtr[colInd] = srcVal; | |
| 22 | + rowDstPtr[colInd] = srcVal * 2; | |
| 19 | 23 | } |
| 20 | 24 | |
| 21 | 25 | void passthrough_wrapper(GpuMat& src, GpuMat& dst) { |
| ... | ... | @@ -28,8 +32,8 @@ namespace br { namespace cuda { |
| 28 | 32 | |
| 29 | 33 | // make 8 * 8 = 64 square block |
| 30 | 34 | dim3 threadsPerBlock(8, 8); |
| 31 | - dim3 numBlocks(imageWidth / threadsPerBlock.x, | |
| 32 | - imageHeight / threadsPerBlock.y); | |
| 35 | + dim3 numBlocks(imageWidth / threadsPerBlock.x + 1, | |
| 36 | + imageHeight / threadsPerBlock.y + 1); | |
| 33 | 37 | |
| 34 | 38 | passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight); |
| 35 | 39 | } | ... | ... |