Commit ea4c68bdd990d3aa5fb1b3ee3f6c1a4c28b417b9

Authored by DepthDeluxe
1 parent 1a6637f6

2.5x performance improvement, parallelized by splitting up columns

openbr/plugins/cuda/cudapca.cpp
... ... @@ -169,9 +169,6 @@ private:
169 169 cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
170 170 cout << "Keep: " << keep << endl;
171 171  
172   - cout << "Mean first value: " << mean(0, 0) << endl;
173   -
174   -
175 172 // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
176 173 // serialize the eigenvectors
177 174 float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
... ...
openbr/plugins/cuda/cudapca.cu
... ... @@ -10,19 +10,38 @@ using namespace cv;
10 10 using namespace cv::gpu;
11 11  
12 12 namespace br { namespace cuda { namespace pca {
13   - __global__ void multiplyKernel(float* src, float* dst, float* evPtr, int evRows, int evCols) {
  13 + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) {
14 14 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
15 15  
  16 + int stepNum = threadIdx.y;
  17 + int iStart = stepNum*stepSize;
  18 + int iEnd = iStart+stepSize;
  19 +
16 20 // check dimensions
17 21 if (colInd >= evCols) {
18 22 return;
19 23 }
20 24  
21 25 float acc = 0;
22   - for (int i=0; i < evRows; i++) {
  26 + for (int i=iStart; i < iEnd; i++) {
23 27 acc += evPtr[evCols*i + colInd] * src[i];
24 28 }
25 29  
  30 + intermediaryBuffer[stepSize*stepNum + colInd] = acc;
  31 + }
  32 +
  33 + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) {
  34 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  35 +
  36 + if (colInd >= evCols) {
  37 + return;
  38 + }
  39 +
  40 + float acc = 0;
  41 + for (int i = 0; i < numSteps; i++) {
  42 + acc += intermediaryBuffer[stepSize*i + colInd];
  43 + }
  44 +
26 45 dst[colInd] = acc;
27 46 }
28 47  
... ... @@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca {
43 62 float* _cudaSrcPtr;
44 63 float* _cudaDstPtr;
45 64  
  65 + int _numSteps = 2; int _stepSize;
  66 + float* intermediaryBuffer;
  67 +
46 68 void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
47 69 _evRows = evRows; _evCols = evCols;
48 70 _meanElems = meanElems;
... ... @@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca {
59 81  
60 82 CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);
61 83 CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
  84 +
  85 + // initialize the intermediary working space,
  86 + _stepSize = _evRows / _numSteps;
  87 + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err);
62 88 }
63 89  
64 90 void wrapper(void* src, void** dst) {
65   - // copy the image to the GPU
66   - //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
67 91 cudaError_t err;
68   - CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err);
  92 + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
  93 +
69 94  
70 95 // subtract out the mean of the image (mean is 1xpixels in size)
71 96 int threadsPerBlock = 64;
... ... @@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca {
74 99 CUDA_KERNEL_ERR_CHK(&err);
75 100  
76 101 // perform the multiplication
  102 + dim3 threadsPerBlock2d(64, _numSteps);
  103 + dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);
  104 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize);
  105 + CUDA_KERNEL_ERR_CHK(&err);
  106 +
77 107 threadsPerBlock = 64;
78 108 numBlocks = _evCols / threadsPerBlock + 1;
79   - multiplyKernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
  109 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize);
80 110 CUDA_KERNEL_ERR_CHK(&err);
81 111  
82 112 CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...
... ...