Commit e600a994b86d4847bf980261ee4605643f8fd630

Authored by DepthDeluxe
1 parent ea4c68bd

up to ~1350 enrollments/sec on CUDALBP+CUDAPCA

Showing 1 changed file with 14 additions and 10 deletions
openbr/plugins/cuda/cudapca.cu
@@ -10,17 +10,21 @@ using namespace cv; @@ -10,17 +10,21 @@ using namespace cv;
10 using namespace cv::gpu; 10 using namespace cv::gpu;
11 11
12 namespace br { namespace cuda { namespace pca { 12 namespace br { namespace cuda { namespace pca {
13 - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { 13 + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
14 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 14 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  15 + if (colInd >= evCols) {
  16 + return;
  17 + }
15 18
16 int stepNum = threadIdx.y; 19 int stepNum = threadIdx.y;
17 int iStart = stepNum*stepSize; 20 int iStart = stepNum*stepSize;
18 int iEnd = iStart+stepSize; 21 int iEnd = iStart+stepSize;
19 -  
20 - // check dimensions  
21 - if (colInd >= evCols) { 22 + if (iStart >= evRows) {
22 return; 23 return;
23 } 24 }
  25 + if (iEnd > evRows) {
  26 + iEnd = evRows;
  27 + }
24 28
25 float acc = 0; 29 float acc = 0;
26 for (int i=iStart; i < iEnd; i++) { 30 for (int i=iStart; i < iEnd; i++) {
@@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca { @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca {
30 intermediaryBuffer[stepSize*stepNum + colInd] = acc; 34 intermediaryBuffer[stepSize*stepNum + colInd] = acc;
31 } 35 }
32 36
33 - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { 37 + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) {
34 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 38 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
35 -  
36 if (colInd >= evCols) { 39 if (colInd >= evCols) {
37 return; 40 return;
38 } 41 }
@@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca { @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca {
62 float* _cudaSrcPtr; 65 float* _cudaSrcPtr;
63 float* _cudaDstPtr; 66 float* _cudaDstPtr;
64 67
65 - int _numSteps = 2; int _stepSize; 68 + int _numSteps; int _stepSize;
66 float* intermediaryBuffer; 69 float* intermediaryBuffer;
67 70
68 void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { 71 void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
@@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca { @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca {
83 CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); 86 CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
84 87
85 // initialize the intermediary working space, 88 // initialize the intermediary working space,
86 - _stepSize = _evRows / _numSteps; 89 + _numSteps = 16;
  90 + _stepSize = _evRows / _numSteps + 1;
87 CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); 91 CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err);
88 } 92 }
89 93
@@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca { @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca {
101 // perform the multiplication 105 // perform the multiplication
102 dim3 threadsPerBlock2d(64, _numSteps); 106 dim3 threadsPerBlock2d(64, _numSteps);
103 dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); 107 dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);
104 - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); 108 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize);
105 CUDA_KERNEL_ERR_CHK(&err); 109 CUDA_KERNEL_ERR_CHK(&err);
106 110
107 threadsPerBlock = 64; 111 threadsPerBlock = 64;
108 numBlocks = _evCols / threadsPerBlock + 1; 112 numBlocks = _evCols / threadsPerBlock + 1;
109 - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); 113 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize);
110 CUDA_KERNEL_ERR_CHK(&err); 114 CUDA_KERNEL_ERR_CHK(&err);
111 115
112 CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... 116 CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...