Commit ea4c68bdd990d3aa5fb1b3ee3f6c1a4c28b417b9
1 parent: 1a6637f6
2.5x performance improvement, parallelized by splitting up columns
Showing 2 changed files with 36 additions and 9 deletions
openbr/plugins/cuda/cudapca.cpp
| ... | ... | @@ -169,9 +169,6 @@ private: |
| 169 | 169 | cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; |
| 170 | 170 | cout << "Keep: " << keep << endl; |
| 171 | 171 | |
| 172 | - cout << "Mean first value: " << mean(0, 0) << endl; | |
| 173 | - | |
| 174 | - | |
| 175 | 172 | // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data |
| 176 | 173 | // serialize the eigenvectors |
| 177 | 174 | float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; | ... | ... |
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -10,19 +10,38 @@ using namespace cv; |
| 10 | 10 | using namespace cv::gpu; |
| 11 | 11 | |
| 12 | 12 | namespace br { namespace cuda { namespace pca { |
| 13 | - __global__ void multiplyKernel(float* src, float* dst, float* evPtr, int evRows, int evCols) { | |
| 13 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { | |
| 14 | 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 15 | 15 | |
| 16 | + int stepNum = threadIdx.y; | |
| 17 | + int iStart = stepNum*stepSize; | |
| 18 | + int iEnd = iStart+stepSize; | |
| 19 | + | |
| 16 | 20 | // check dimensions |
| 17 | 21 | if (colInd >= evCols) { |
| 18 | 22 | return; |
| 19 | 23 | } |
| 20 | 24 | |
| 21 | 25 | float acc = 0; |
| 22 | - for (int i=0; i < evRows; i++) { | |
| 26 | + for (int i=iStart; i < iEnd; i++) { | |
| 23 | 27 | acc += evPtr[evCols*i + colInd] * src[i]; |
| 24 | 28 | } |
| 25 | 29 | |
| 30 | + intermediaryBuffer[stepSize*stepNum + colInd] = acc; | |
| 31 | + } | |
| 32 | + | |
| 33 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { | |
| 34 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 35 | + | |
| 36 | + if (colInd >= evCols) { | |
| 37 | + return; | |
| 38 | + } | |
| 39 | + | |
| 40 | + float acc = 0; | |
| 41 | + for (int i = 0; i < numSteps; i++) { | |
| 42 | + acc += intermediaryBuffer[stepSize*i + colInd]; | |
| 43 | + } | |
| 44 | + | |
| 26 | 45 | dst[colInd] = acc; |
| 27 | 46 | } |
| 28 | 47 | |
| ... | ... | @@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca { |
| 43 | 62 | float* _cudaSrcPtr; |
| 44 | 63 | float* _cudaDstPtr; |
| 45 | 64 | |
| 65 | + int _numSteps = 2; int _stepSize; | |
| 66 | + float* intermediaryBuffer; | |
| 67 | + | |
| 46 | 68 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { |
| 47 | 69 | _evRows = evRows; _evCols = evCols; |
| 48 | 70 | _meanElems = meanElems; |
| ... | ... | @@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca { |
| 59 | 81 | |
| 60 | 82 | CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); |
| 61 | 83 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); |
| 84 | + | |
| 85 | + // initialize the intermediary working space, | |
| 86 | + _stepSize = _evRows / _numSteps; | |
| 87 | + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); | |
| 62 | 88 | } |
| 63 | 89 | |
| 64 | 90 | void wrapper(void* src, void** dst) { |
| 65 | - // copy the image to the GPU | |
| 66 | - //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); | |
| 67 | 91 | cudaError_t err; |
| 68 | - CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err); | |
| 92 | + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); | |
| 93 | + | |
| 69 | 94 | |
| 70 | 95 | // subtract out the mean of the image (mean is 1xpixels in size) |
| 71 | 96 | int threadsPerBlock = 64; |
| ... | ... | @@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca { |
| 74 | 99 | CUDA_KERNEL_ERR_CHK(&err); |
| 75 | 100 | |
| 76 | 101 | // perform the multiplication |
| 102 | + dim3 threadsPerBlock2d(64, _numSteps); | |
| 103 | + dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); | |
| 104 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); | |
| 105 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 106 | + | |
| 77 | 107 | threadsPerBlock = 64; |
| 78 | 108 | numBlocks = _evCols / threadsPerBlock + 1; |
| 79 | - multiplyKernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols); | |
| 109 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); | |
| 80 | 110 | CUDA_KERNEL_ERR_CHK(&err); |
| 81 | 111 | |
| 82 | 112 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | ... | ... |