Commit b99b23d0b534302818cb2d39f7a9cb6cc8f01e68

Authored by Colin Heinzmann
1 parent 155b284d

Increased parallelization of CUDAAffine: threadsPerBlock raised from (8, 8) to (32, 16) at both launch sites

openbr/plugins/cuda/cudaaffine.cu
@@ -172,7 +172,7 @@ namespace br { namespace cuda { namespace affine { @@ -172,7 +172,7 @@ namespace br { namespace cuda { namespace affine {
172 CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err); 172 CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err);
173 173
174 // call the bilinear kernel function 174 // call the bilinear kernel function
175 - dim3 threadsPerBlock(8, 8); 175 + dim3 threadsPerBlock(32, 16);
176 dim3 numBlocks(dstCols/threadsPerBlock.x + 1, 176 dim3 numBlocks(dstCols/threadsPerBlock.x + 1,
177 dstRows/threadsPerBlock.y + 1); 177 dstRows/threadsPerBlock.y + 1);
178 178
@@ -186,7 +186,7 @@ namespace br { namespace cuda { namespace affine { @@ -186,7 +186,7 @@ namespace br { namespace cuda { namespace affine {
186 cudaError_t err; 186 cudaError_t err;
187 double* gpuInverse; 187 double* gpuInverse;
188 188
189 - dim3 threadsPerBlock(8, 8); 189 + dim3 threadsPerBlock(32, 16);
190 dim3 numBlocks(dst_cols/threadsPerBlock.x + 1, 190 dim3 numBlocks(dst_cols/threadsPerBlock.x + 1,
191 dst_rows/threadsPerBlock.y + 1); 191 dst_rows/threadsPerBlock.y + 1);
192 192