Commit 93a9cf84e9ba742af7fe4d579706cbd53b0197fc

Authored by boolli
1 parent e5a544f0

Optimize the GPU grayscale transformation a little bit, but it's still slower than the CPU version

openbr/plugins/cuda/cudargb2grayscale.cu
@@ -19,22 +19,22 @@ namespace br{ namespace cuda { @@ -19,22 +19,22 @@ namespace br{ namespace cuda {
19 int rowInd = blockIdx.y*blockDim.y+threadIdx.y; 19 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
20 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 20 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
21 int index = rowInd*cols + colInd; 21 int index = rowInd*cols + colInd;
22 - if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {  
23 - if (rowInd >= rows || colInd >= cols) {  
24 - return;  
25 - } else {  
26 - return;  
27 - } 22 + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) {
  23 + return;
28 } 24 }
  25 + int new_index = 3 * index;
  26 + float g = (float) srcPtr[new_index];
  27 + float b = (float) srcPtr[new_index+1];
  28 + float r = (float) srcPtr[new_index+2];
29 29
30 - dstPtr[index] = 0.299f * srcPtr[3*index] + 0.587f * srcPtr[3*index+1] + 0.114f * srcPtr[3*index+2]; 30 + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r);
31 return; 31 return;
32 } 32 }
33 33
34 void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) 34 void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
35 { 35 {
36 cudaError_t err; 36 cudaError_t err;
37 - dim3 threadsPerBlock(8, 8); 37 + dim3 threadsPerBlock(9, 9);
38 dim3 numBlocks(cols/threadsPerBlock.x + 1, 38 dim3 numBlocks(cols/threadsPerBlock.x + 1,
39 rows/threadsPerBlock.y + 1); 39 rows/threadsPerBlock.y + 1);
40 CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); 40 CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);