Commit 93a9cf84e9ba742af7fe4d579706cbd53b0197fc

Authored by boolli
1 parent e5a544f0

Optimize the GPU grayscale transformation a little bit, but it's still slower than the CPU version

openbr/plugins/cuda/cudargb2grayscale.cu
... ... @@ -19,22 +19,22 @@ namespace br{ namespace cuda {
19 19 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
20 20 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
21 21 int index = rowInd*cols + colInd;
22   - if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
23   - if (rowInd >= rows || colInd >= cols) {
24   - return;
25   - } else {
26   - return;
27   - }
  22 + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) {
  23 + return;
28 24 }
  25 + int new_index = 3 * index;
  26 + float g = (float) srcPtr[new_index];
  27 + float b = (float) srcPtr[new_index+1];
  28 + float r = (float) srcPtr[new_index+2];
29 29  
30   - dstPtr[index] = 0.299f * srcPtr[3*index] + 0.587f * srcPtr[3*index+1] + 0.114f * srcPtr[3*index+2];
  30 + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r);
31 31 return;
32 32 }
33 33  
34 34 void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
35 35 {
36 36 cudaError_t err;
37   - dim3 threadsPerBlock(8, 8);
  37 + dim3 threadsPerBlock(9, 9);
38 38 dim3 numBlocks(cols/threadsPerBlock.x + 1,
39 39 rows/threadsPerBlock.y + 1);
40 40 CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
... ...