Commit e600a994b86d4847bf980261ee4605643f8fd630
1 parent
ea4c68bd
up to ~1350 enrollments/sec on CUDALBP+CUDAPCA
Showing
1 changed file
with
14 additions
and
10 deletions
openbr/plugins/cuda/cudapca.cu
| @@ -10,17 +10,21 @@ using namespace cv; | @@ -10,17 +10,21 @@ using namespace cv; | ||
| 10 | using namespace cv::gpu; | 10 | using namespace cv::gpu; |
| 11 | 11 | ||
| 12 | namespace br { namespace cuda { namespace pca { | 12 | namespace br { namespace cuda { namespace pca { |
| 13 | - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { | 13 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { |
| 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; | 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 15 | + if (colInd >= evCols) { | ||
| 16 | + return; | ||
| 17 | + } | ||
| 15 | 18 | ||
| 16 | int stepNum = threadIdx.y; | 19 | int stepNum = threadIdx.y; |
| 17 | int iStart = stepNum*stepSize; | 20 | int iStart = stepNum*stepSize; |
| 18 | int iEnd = iStart+stepSize; | 21 | int iEnd = iStart+stepSize; |
| 19 | - | ||
| 20 | - // check dimensions | ||
| 21 | - if (colInd >= evCols) { | 22 | + if (iStart >= evRows) { |
| 22 | return; | 23 | return; |
| 23 | } | 24 | } |
| 25 | + if (iEnd > evRows) { | ||
| 26 | + iEnd = evRows; | ||
| 27 | + } | ||
| 24 | 28 | ||
| 25 | float acc = 0; | 29 | float acc = 0; |
| 26 | for (int i=iStart; i < iEnd; i++) { | 30 | for (int i=iStart; i < iEnd; i++) { |
| @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca { | @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca { | ||
| 30 | intermediaryBuffer[stepSize*stepNum + colInd] = acc; | 34 | intermediaryBuffer[stepSize*stepNum + colInd] = acc; |
| 31 | } | 35 | } |
| 32 | 36 | ||
| 33 | - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { | 37 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { |
| 34 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; | 38 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 35 | - | ||
| 36 | if (colInd >= evCols) { | 39 | if (colInd >= evCols) { |
| 37 | return; | 40 | return; |
| 38 | } | 41 | } |
| @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca { | @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca { | ||
| 62 | float* _cudaSrcPtr; | 65 | float* _cudaSrcPtr; |
| 63 | float* _cudaDstPtr; | 66 | float* _cudaDstPtr; |
| 64 | 67 | ||
| 65 | - int _numSteps = 2; int _stepSize; | 68 | + int _numSteps; int _stepSize; |
| 66 | float* intermediaryBuffer; | 69 | float* intermediaryBuffer; |
| 67 | 70 | ||
| 68 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { | 71 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { |
| @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca { | @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca { | ||
| 83 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); | 86 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); |
| 84 | 87 | ||
| 85 | // initialize the intermediary working space, | 88 | // initialize the intermediary working space, |
| 86 | - _stepSize = _evRows / _numSteps; | 89 | + _numSteps = 16; |
| 90 | + _stepSize = _evRows / _numSteps + 1; | ||
| 87 | CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); | 91 | CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); |
| 88 | } | 92 | } |
| 89 | 93 | ||
| @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca { | @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca { | ||
| 101 | // perform the multiplication | 105 | // perform the multiplication |
| 102 | dim3 threadsPerBlock2d(64, _numSteps); | 106 | dim3 threadsPerBlock2d(64, _numSteps); |
| 103 | dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); | 107 | dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); |
| 104 | - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); | 108 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); |
| 105 | CUDA_KERNEL_ERR_CHK(&err); | 109 | CUDA_KERNEL_ERR_CHK(&err); |
| 106 | 110 | ||
| 107 | threadsPerBlock = 64; | 111 | threadsPerBlock = 64; |
| 108 | numBlocks = _evCols / threadsPerBlock + 1; | 112 | numBlocks = _evCols / threadsPerBlock + 1; |
| 109 | - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); | 113 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); |
| 110 | CUDA_KERNEL_ERR_CHK(&err); | 114 | CUDA_KERNEL_ERR_CHK(&err); |
| 111 | 115 | ||
| 112 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | 116 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... |