Commit e600a994b86d4847bf980261ee4605643f8fd630

Authored by DepthDeluxe
1 parent ea4c68bd

up to ~1350 enrollments/sec on CUDALBP+CUDAPCA

Showing 1 changed file with 14 additions and 10 deletions
openbr/plugins/cuda/cudapca.cu
... ... @@ -10,17 +10,21 @@ using namespace cv;
10 10 using namespace cv::gpu;
11 11  
12 12 namespace br { namespace cuda { namespace pca {
13   - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) {
  13 + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
14 14 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  15 + if (colInd >= evCols) {
  16 + return;
  17 + }
15 18  
16 19 int stepNum = threadIdx.y;
17 20 int iStart = stepNum*stepSize;
18 21 int iEnd = iStart+stepSize;
19   -
20   - // check dimensions
21   - if (colInd >= evCols) {
  22 + if (iStart >= evRows) {
22 23 return;
23 24 }
  25 + if (iEnd > evRows) {
  26 + iEnd = evRows;
  27 + }
24 28  
25 29 float acc = 0;
26 30 for (int i=iStart; i < iEnd; i++) {
... ... @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca {
30 34 intermediaryBuffer[stepSize*stepNum + colInd] = acc;
31 35 }
32 36  
33   - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) {
  37 + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) {
34 38 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
35   -
36 39 if (colInd >= evCols) {
37 40 return;
38 41 }
... ... @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca {
62 65 float* _cudaSrcPtr;
63 66 float* _cudaDstPtr;
64 67  
65   - int _numSteps = 2; int _stepSize;
  68 + int _numSteps; int _stepSize;
66 69 float* intermediaryBuffer;
67 70  
68 71 void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
... ... @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca {
83 86 CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
84 87  
85 88 // initialize the intermediary working space,
86   - _stepSize = _evRows / _numSteps;
  89 + _numSteps = 16;
  90 + _stepSize = _evRows / _numSteps + 1;
87 91 CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err);
88 92 }
89 93  
... ... @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca {
101 105 // perform the multiplication
102 106 dim3 threadsPerBlock2d(64, _numSteps);
103 107 dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);
104   - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize);
  108 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize);
105 109 CUDA_KERNEL_ERR_CHK(&err);
106 110  
107 111 threadsPerBlock = 64;
108 112 numBlocks = _evCols / threadsPerBlock + 1;
109   - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize);
  113 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize);
110 114 CUDA_KERNEL_ERR_CHK(&err);
111 115  
112 116 CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...
... ...