Commit e600a994b86d4847bf980261ee4605643f8fd630
1 parent
ea4c68bd
up to ~1350 enrollments/sec on CUDALBP+CUDAPCA
Showing
1 changed file
with
14 additions
and
10 deletions
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -10,17 +10,21 @@ using namespace cv; |
| 10 | 10 | using namespace cv::gpu; |
| 11 | 11 | |
| 12 | 12 | namespace br { namespace cuda { namespace pca { |
| 13 | - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { | |
| 13 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { | |
| 14 | 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 15 | + if (colInd >= evCols) { | |
| 16 | + return; | |
| 17 | + } | |
| 15 | 18 | |
| 16 | 19 | int stepNum = threadIdx.y; |
| 17 | 20 | int iStart = stepNum*stepSize; |
| 18 | 21 | int iEnd = iStart+stepSize; |
| 19 | - | |
| 20 | - // check dimensions | |
| 21 | - if (colInd >= evCols) { | |
| 22 | + if (iStart >= evRows) { | |
| 22 | 23 | return; |
| 23 | 24 | } |
| 25 | + if (iEnd > evRows) { | |
| 26 | + iEnd = evRows; | |
| 27 | + } | |
| 24 | 28 | |
| 25 | 29 | float acc = 0; |
| 26 | 30 | for (int i=iStart; i < iEnd; i++) { |
| ... | ... | @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca { |
| 30 | 34 | intermediaryBuffer[stepSize*stepNum + colInd] = acc; |
| 31 | 35 | } |
| 32 | 36 | |
| 33 | - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { | |
| 37 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { | |
| 34 | 38 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 35 | - | |
| 36 | 39 | if (colInd >= evCols) { |
| 37 | 40 | return; |
| 38 | 41 | } |
| ... | ... | @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca { |
| 62 | 65 | float* _cudaSrcPtr; |
| 63 | 66 | float* _cudaDstPtr; |
| 64 | 67 | |
| 65 | - int _numSteps = 2; int _stepSize; | |
| 68 | + int _numSteps; int _stepSize; | |
| 66 | 69 | float* intermediaryBuffer; |
| 67 | 70 | |
| 68 | 71 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { |
| ... | ... | @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca { |
| 83 | 86 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); |
| 84 | 87 | |
| 85 | 88 | // initialize the intermediary working space, |
| 86 | - _stepSize = _evRows / _numSteps; | |
| 89 | + _numSteps = 16; | |
| 90 | + _stepSize = _evRows / _numSteps + 1; | |
| 87 | 91 | CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); |
| 88 | 92 | } |
| 89 | 93 | |
| ... | ... | @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca { |
| 101 | 105 | // perform the multiplication |
| 102 | 106 | dim3 threadsPerBlock2d(64, _numSteps); |
| 103 | 107 | dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); |
| 104 | - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); | |
| 108 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); | |
| 105 | 109 | CUDA_KERNEL_ERR_CHK(&err); |
| 106 | 110 | |
| 107 | 111 | threadsPerBlock = 64; |
| 108 | 112 | numBlocks = _evCols / threadsPerBlock + 1; |
| 109 | - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); | |
| 113 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); | |
| 110 | 114 | CUDA_KERNEL_ERR_CHK(&err); |
| 111 | 115 | |
| 112 | 116 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | ... | ... |