Commit 93a9cf84e9ba742af7fe4d579706cbd53b0197fc (1 parent: e5a544f0)
Optimize the GPU grayscale transformation a little bit, but it's still slower than the CPU version
Showing 1 changed file with 8 additions and 8 deletions
openbr/plugins/cuda/cudargb2grayscale.cu
| ... | ... | @@ -19,22 +19,22 @@ namespace br{ namespace cuda { |
| 19 | 19 | int rowInd = blockIdx.y*blockDim.y+threadIdx.y; |
| 20 | 20 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 21 | 21 | int index = rowInd*cols + colInd; |
| 22 | - if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) { | |
| 23 | - if (rowInd >= rows || colInd >= cols) { | |
| 24 | - return; | |
| 25 | - } else { | |
| 26 | - return; | |
| 27 | - } | |
| 22 | + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) { | |
| 23 | + return; | |
| 28 | 24 | } |
| 25 | + int new_index = 3 * index; | |
| 26 | + float g = (float) srcPtr[new_index]; | |
| 27 | + float b = (float) srcPtr[new_index+1]; | |
| 28 | + float r = (float) srcPtr[new_index+2]; | |
| 29 | 29 | |
| 30 | - dstPtr[index] = 0.299f * srcPtr[3*index] + 0.587f * srcPtr[3*index+1] + 0.114f * srcPtr[3*index+2]; | |
| 30 | + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r); | |
| 31 | 31 | return; |
| 32 | 32 | } |
| 33 | 33 | |
| 34 | 34 | void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) |
| 35 | 35 | { |
| 36 | 36 | cudaError_t err; |
| 37 | - dim3 threadsPerBlock(8, 8); | |
| 37 | + dim3 threadsPerBlock(9, 9); | |
| 38 | 38 | dim3 numBlocks(cols/threadsPerBlock.x + 1, |
| 39 | 39 | rows/threadsPerBlock.y + 1); |
| 40 | 40 | CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); | ... | ... |