diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index f74de2f..dd51a81 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -94,6 +94,10 @@ private: int cols = *((int*)srcDataPtr[2]); int type = *((int*)srcDataPtr[3]); + if (type != CV_32FC1) { + qFatal("Requires single channel 32-bit floating point matrices."); + } + Mat mat = Mat(rows, cols, type); br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr(), rows, cols); trainingQlist.append(Template(mat)); @@ -102,9 +106,6 @@ private: // assemble a TemplateList from the list of data TemplateList trainingSet(trainingQlist); - if (trainingSet.first().m().type() != CV_32FC1) { - qFatal("Requires single channel 32-bit floating point matrices."); - } originalRows = trainingSet.first().m().rows; // get number of rows of first image int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image @@ -127,7 +128,7 @@ private: if (type != CV_32FC1) { cout << "ERR: Invalid image type" << endl; - return; + throw 0; } Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index 05c34d6..cc877ce 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -30,61 +30,67 @@ using namespace cv::gpu; */ namespace br { namespace cuda { namespace pca { - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { - int colInd = blockIdx.x*blockDim.x+threadIdx.x; - if (colInd >= evCols) { + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) { + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y; + + if (evIdx >= numEigenvectors || stepIdx >= numSteps) { return; } - int stepNum = threadIdx.y; - int iStart = stepNum*stepSize; - int iEnd = iStart+stepSize; - if (iStart >= evRows) { + float acc = 0; + int startIdx = stepSize*stepIdx; + int stopIdx = startIdx+stepSize; + if (startIdx >= numPixels) { return; } - if (iEnd > evRows) { - iEnd = evRows; + if (stopIdx >= numPixels) { + stopIdx = numPixels; } - - float acc = 0; - for (int i=iStart; i < iEnd; i++) { - acc += evPtr[evCols*i + colInd] * src[i]; + for(int i=startIdx; i < stopIdx; i++) { + acc += src[i]*evPtr[i*numEigenvectors + evIdx]; } - intermediaryBuffer[stepSize*stepNum + colInd] = acc; + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; } - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { - int colInd = blockIdx.x*blockDim.x+threadIdx.x; - if (colInd >= evCols) { + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) { + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; + if (evIdx >= numEigenvectors) { return; } float acc = 0; - for (int i = 0; i < numSteps; i++) { - acc += intermediaryBuffer[stepSize*i + colInd]; + for (int i=0; i < numSteps; i++) { + int ibIdx = i*stepSize + evIdx; + if (ibIdx >= numSteps*stepSize) { + break; + } + acc += intermediaryBuffer[ibIdx]; } - dst[colInd] = acc; + out[evIdx] = acc; } - __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { - int colInd = blockIdx.x*blockDim.x+threadIdx.x; + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) { + int idx = blockIdx.x*blockDim.x+threadIdx.x; // perform bound checking - if (colInd >= numCols) { + if (idx >= numElems) { return; } // subtract out the mean - out[colInd] -= mean[colInd]; + out[idx] -= mean[idx]; } + // _evRows: the number of pixels in the trained images + // _evCols: the number of eigenvectors + // _meanElems: the number of pixels in an image + // _stepSize: the number of pixels in a single step + // _numSteps: the number of steps required to complete operation float* cudaEvPtr; int _evRows; int _evCols; float* cudaMeanPtr; int _meanElems; - float* _cudaSrcPtr; - float* _cudaDstPtr; - int _numSteps; int _stepSize; float* intermediaryBuffer; @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca { CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); - CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); - CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); - // initialize the intermediary working space, - _numSteps = 16; - _stepSize = _evRows / _numSteps + 1; - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); + _stepSize = 2048; + _numSteps = _evRows / _stepSize + 1; + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err); } void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca { cudaError_t err; CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); - if (imgRows*imgCols != _evRows) { + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) { cout << "ERR: Image dimension mismatch!" << endl; throw 0; } - // subtract out the mean of the image (mean is 1xpixels in size) - int threadsPerBlock = 64; + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) + int threadsPerBlock = 512; int numBlocks = _meanElems / threadsPerBlock + 1; subtractMeanKernel<<>>((float*)src, cudaMeanPtr, _meanElems); CUDA_KERNEL_ERR_CHK(&err); - // perform the multiplication - dim3 threadsPerBlock2d(64, _numSteps); - dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); - multiplyKernel<<>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); + // perform matrix multiplication + dim3 threadsPerBlock2d(512, 1); + dim3 numBlocks2d( + _evCols / threadsPerBlock2d.x + 1, + _numSteps / threadsPerBlock2d.y + 1); + multiplyKernel<<>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems); CUDA_KERNEL_ERR_CHK(&err); - threadsPerBlock = 64; + threadsPerBlock = 512; numBlocks = _evCols / threadsPerBlock + 1; - multiplyJoinKernel<<>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); + multiplyJoinKernel<<>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize); CUDA_KERNEL_ERR_CHK(&err); - CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... + // free the src memory + CUDA_SAFE_FREE(src, &err); } }}}