Commit ea4c68bdd990d3aa5fb1b3ee3f6c1a4c28b417b9
1 parent
1a6637f6
2.5x performance improvement, parallelized by splitting up columns
Showing
2 changed files
with
36 additions
and
9 deletions
openbr/plugins/cuda/cudapca.cpp
| @@ -169,9 +169,6 @@ private: | @@ -169,9 +169,6 @@ private: | ||
| 169 | cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; | 169 | cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; |
| 170 | cout << "Keep: " << keep << endl; | 170 | cout << "Keep: " << keep << endl; |
| 171 | 171 | ||
| 172 | - cout << "Mean first value: " << mean(0, 0) << endl; | ||
| 173 | - | ||
| 174 | - | ||
| 175 | // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data | 172 | // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data |
| 176 | // serialize the eigenvectors | 173 | // serialize the eigenvectors |
| 177 | float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; | 174 | float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; |
openbr/plugins/cuda/cudapca.cu
| @@ -10,19 +10,38 @@ using namespace cv; | @@ -10,19 +10,38 @@ using namespace cv; | ||
| 10 | using namespace cv::gpu; | 10 | using namespace cv::gpu; |
| 11 | 11 | ||
| 12 | namespace br { namespace cuda { namespace pca { | 12 | namespace br { namespace cuda { namespace pca { |
| 13 | - __global__ void multiplyKernel(float* src, float* dst, float* evPtr, int evRows, int evCols) { | 13 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { |
| 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; | 14 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 15 | 15 | ||
| 16 | + int stepNum = threadIdx.y; | ||
| 17 | + int iStart = stepNum*stepSize; | ||
| 18 | + int iEnd = iStart+stepSize; | ||
| 19 | + | ||
| 16 | // check dimensions | 20 | // check dimensions |
| 17 | if (colInd >= evCols) { | 21 | if (colInd >= evCols) { |
| 18 | return; | 22 | return; |
| 19 | } | 23 | } |
| 20 | 24 | ||
| 21 | float acc = 0; | 25 | float acc = 0; |
| 22 | - for (int i=0; i < evRows; i++) { | 26 | + for (int i=iStart; i < iEnd; i++) { |
| 23 | acc += evPtr[evCols*i + colInd] * src[i]; | 27 | acc += evPtr[evCols*i + colInd] * src[i]; |
| 24 | } | 28 | } |
| 25 | 29 | ||
| 30 | + intermediaryBuffer[stepSize*stepNum + colInd] = acc; | ||
| 31 | + } | ||
| 32 | + | ||
| 33 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { | ||
| 34 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 35 | + | ||
| 36 | + if (colInd >= evCols) { | ||
| 37 | + return; | ||
| 38 | + } | ||
| 39 | + | ||
| 40 | + float acc = 0; | ||
| 41 | + for (int i = 0; i < numSteps; i++) { | ||
| 42 | + acc += intermediaryBuffer[stepSize*i + colInd]; | ||
| 43 | + } | ||
| 44 | + | ||
| 26 | dst[colInd] = acc; | 45 | dst[colInd] = acc; |
| 27 | } | 46 | } |
| 28 | 47 | ||
| @@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca { | @@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca { | ||
| 43 | float* _cudaSrcPtr; | 62 | float* _cudaSrcPtr; |
| 44 | float* _cudaDstPtr; | 63 | float* _cudaDstPtr; |
| 45 | 64 | ||
| 65 | + int _numSteps = 2; int _stepSize; | ||
| 66 | + float* intermediaryBuffer; | ||
| 67 | + | ||
| 46 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { | 68 | void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { |
| 47 | _evRows = evRows; _evCols = evCols; | 69 | _evRows = evRows; _evCols = evCols; |
| 48 | _meanElems = meanElems; | 70 | _meanElems = meanElems; |
| @@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca { | @@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca { | ||
| 59 | 81 | ||
| 60 | CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); | 82 | CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); |
| 61 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); | 83 | CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); |
| 84 | + | ||
| 85 | + // initialize the intermediary working space, | ||
| 86 | + _stepSize = _evRows / _numSteps; | ||
| 87 | + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); | ||
| 62 | } | 88 | } |
| 63 | 89 | ||
| 64 | void wrapper(void* src, void** dst) { | 90 | void wrapper(void* src, void** dst) { |
| 65 | - // copy the image to the GPU | ||
| 66 | - //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); | ||
| 67 | cudaError_t err; | 91 | cudaError_t err; |
| 68 | - CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err); | 92 | + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); |
| 93 | + | ||
| 69 | 94 | ||
| 70 | // subtract out the mean of the image (mean is 1xpixels in size) | 95 | // subtract out the mean of the image (mean is 1xpixels in size) |
| 71 | int threadsPerBlock = 64; | 96 | int threadsPerBlock = 64; |
| @@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca { | @@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca { | ||
| 74 | CUDA_KERNEL_ERR_CHK(&err); | 99 | CUDA_KERNEL_ERR_CHK(&err); |
| 75 | 100 | ||
| 76 | // perform the multiplication | 101 | // perform the multiplication |
| 102 | + dim3 threadsPerBlock2d(64, _numSteps); | ||
| 103 | + dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); | ||
| 104 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); | ||
| 105 | + CUDA_KERNEL_ERR_CHK(&err); | ||
| 106 | + | ||
| 77 | threadsPerBlock = 64; | 107 | threadsPerBlock = 64; |
| 78 | numBlocks = _evCols / threadsPerBlock + 1; | 108 | numBlocks = _evCols / threadsPerBlock + 1; |
| 79 | - multiplyKernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols); | 109 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); |
| 80 | CUDA_KERNEL_ERR_CHK(&err); | 110 | CUDA_KERNEL_ERR_CHK(&err); |
| 81 | 111 | ||
| 82 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | 112 | CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... |