Commit ea4c68bdd990d3aa5fb1b3ee3f6c1a4c28b417b9

Authored by DepthDeluxe
1 parent 1a6637f6

2.5x performance improvement, parallelized by splitting up columns

openbr/plugins/cuda/cudapca.cpp
@@ -169,9 +169,6 @@ private:
     cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
     cout << "Keep: " << keep << endl;

-    cout << "Mean first value: " << mean(0, 0) << endl;
-
-
     // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
     // serialize the eigenvectors
     float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
openbr/plugins/cuda/cudapca.cu
@@ -10,19 +10,38 @@ using namespace cv;
 using namespace cv::gpu;

 namespace br { namespace cuda { namespace pca {
-  __global__ void multiplyKernel(float* src, float* dst, float* evPtr, int evRows, int evCols) {
+  __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) {
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;

+    int stepNum = threadIdx.y;
+    int iStart = stepNum*stepSize;
+    int iEnd = iStart+stepSize;
+
     // check dimensions
     if (colInd >= evCols) {
       return;
     }

     float acc = 0;
-    for (int i=0; i < evRows; i++) {
+    for (int i=iStart; i < iEnd; i++) {
       acc += evPtr[evCols*i + colInd] * src[i];
     }

+    intermediaryBuffer[stepSize*stepNum + colInd] = acc;
+  }
+
+  __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) {
+    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (colInd >= evCols) {
+      return;
+    }
+
+    float acc = 0;
+    for (int i = 0; i < numSteps; i++) {
+      acc += intermediaryBuffer[stepSize*i + colInd];
+    }
+
     dst[colInd] = acc;
   }
28 47
@@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca {
     float* _cudaSrcPtr;
     float* _cudaDstPtr;

+    int _numSteps = 2; int _stepSize;
+    float* intermediaryBuffer;
+
     void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
       _evRows = evRows; _evCols = evCols;
       _meanElems = meanElems;
@@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca {

       CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);
       CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
+
+      // initialize the intermediary working space,
+      _stepSize = _evRows / _numSteps;
+      CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err);
     }

     void wrapper(void* src, void** dst) {
-      // copy the image to the GPU
-      //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
       cudaError_t err;
-      CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err);
+      CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
+

       // subtract out the mean of the image (mean is 1xpixels in size)
       int threadsPerBlock = 64;
@@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca {
       CUDA_KERNEL_ERR_CHK(&err);

       // perform the multiplication
+      dim3 threadsPerBlock2d(64, _numSteps);
+      dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);
+      multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize);
+      CUDA_KERNEL_ERR_CHK(&err);
+
       threadsPerBlock = 64;
       numBlocks = _evCols / threadsPerBlock + 1;
-      multiplyKernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
+      multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize);
       CUDA_KERNEL_ERR_CHK(&err);

       CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...