diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index 5c197a1..f6fd3a1 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -10,17 +10,21 @@ using namespace cv; using namespace cv::gpu; namespace br { namespace cuda { namespace pca { - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { int colInd = blockIdx.x*blockDim.x+threadIdx.x; + if (colInd >= evCols) { + return; + } int stepNum = threadIdx.y; int iStart = stepNum*stepSize; int iEnd = iStart+stepSize; - - // check dimensions - if (colInd >= evCols) { + if (iStart >= evRows) { return; } + if (iEnd > evRows) { + iEnd = evRows; + } float acc = 0; for (int i=iStart; i < iEnd; i++) { @@ -30,9 +34,8 @@ namespace br { namespace cuda { namespace pca { intermediaryBuffer[stepSize*stepNum + colInd] = acc; } - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { int colInd = blockIdx.x*blockDim.x+threadIdx.x; - if (colInd >= evCols) { return; } @@ -62,7 +65,7 @@ namespace br { namespace cuda { namespace pca { float* _cudaSrcPtr; float* _cudaDstPtr; - int _numSteps = 2; int _stepSize; + int _numSteps; int _stepSize; float* intermediaryBuffer; void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { @@ -83,7 +86,8 @@ namespace br { namespace cuda { namespace pca { CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); // initialize the intermediary working space, - _stepSize = _evRows / _numSteps; + _numSteps = 16; + _stepSize = _evRows / _numSteps + 1; CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); } @@ -101,12 +105,12 @@ namespace br { namespace cuda { namespace pca { // perform the multiplication dim3 threadsPerBlock2d(64, _numSteps); dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); - multiplyKernel<<>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); + multiplyKernel<<>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); CUDA_KERNEL_ERR_CHK(&err); threadsPerBlock = 64; numBlocks = _evCols / threadsPerBlock + 1; - multiplyJoinKernel<<>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); + multiplyJoinKernel<<>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...