diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index 998404b..fbee173 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -169,9 +169,6 @@ private: cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; cout << "Keep: " << keep << endl; - cout << "Mean first value: " << mean(0, 0) << endl; - - // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data // serialize the eigenvectors float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index 8351e85..5c197a1 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -10,19 +10,38 @@ using namespace cv; using namespace cv::gpu; namespace br { namespace cuda { namespace pca { - __global__ void multiplyKernel(float* src, float* dst, float* evPtr, int evRows, int evCols) { + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evCols, int stepSize) { int colInd = blockIdx.x*blockDim.x+threadIdx.x; + int stepNum = threadIdx.y; + int iStart = stepNum*stepSize; + int iEnd = iStart+stepSize; + // check dimensions if (colInd >= evCols) { return; } float acc = 0; - for (int i=0; i < evRows; i++) { + for (int i=iStart; i < iEnd; i++) { acc += evPtr[evCols*i + colInd] * src[i]; } + intermediaryBuffer[stepSize*stepNum + colInd] = acc; + } + + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evCols, int numSteps, int stepSize) { + int colInd = blockIdx.x*blockDim.x+threadIdx.x; + + if (colInd >= evCols) { + return; + } + + float acc = 0; + for (int i = 0; i < numSteps; i++) { + acc += intermediaryBuffer[stepSize*i + colInd]; + } + dst[colInd] = acc; } @@ -43,6 +62,9 @@ namespace br { namespace cuda { namespace pca { float* _cudaSrcPtr; float* _cudaDstPtr; + int _numSteps = 2; int _stepSize; + float* intermediaryBuffer; + void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { _evRows = evRows; _evCols = evCols; _meanElems = meanElems; @@ -59,13 +81,16 @@ namespace br { namespace cuda { namespace pca { CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); + + // initialize the intermediary working space, + _stepSize = _evRows / _numSteps; + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); } void wrapper(void* src, void** dst) { - // copy the image to the GPU - //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); cudaError_t err; - CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err); + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); + // subtract out the mean of the image (mean is 1xpixels in size) int threadsPerBlock = 64; @@ -74,9 +99,14 @@ namespace br { namespace cuda { namespace pca { CUDA_KERNEL_ERR_CHK(&err); // perform the multiplication + dim3 threadsPerBlock2d(64, _numSteps); + dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); + multiplyKernel<<>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _stepSize); + CUDA_KERNEL_ERR_CHK(&err); + threadsPerBlock = 64; numBlocks = _evCols / threadsPerBlock + 1; - multiplyKernel<<>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols); + multiplyJoinKernel<<>>(intermediaryBuffer, (float*)(*dst), _evCols, _numSteps, _stepSize); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...