Commit 155b284db2460355d749e88d5cebe197c9f22e64
1 parent 15c8a584
fixed memory index bug in PCA
Showing 2 changed files with 54 additions and 47 deletions
openbr/plugins/cuda/cudapca.cpp
| @@ -94,6 +94,10 @@ private: | @@ -94,6 +94,10 @@ private: | ||
| 94 | int cols = *((int*)srcDataPtr[2]); | 94 | int cols = *((int*)srcDataPtr[2]); |
| 95 | int type = *((int*)srcDataPtr[3]); | 95 | int type = *((int*)srcDataPtr[3]); |
| 96 | 96 | ||
| 97 | + if (type != CV_32FC1) { | ||
| 98 | + qFatal("Requires single channel 32-bit floating point matrices."); | ||
| 99 | + } | ||
| 100 | + | ||
| 97 | Mat mat = Mat(rows, cols, type); | 101 | Mat mat = Mat(rows, cols, type); |
| 98 | br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); | 102 | br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); |
| 99 | trainingQlist.append(Template(mat)); | 103 | trainingQlist.append(Template(mat)); |
| @@ -102,9 +106,6 @@ private: | @@ -102,9 +106,6 @@ private: | ||
| 102 | // assemble a TemplateList from the list of data | 106 | // assemble a TemplateList from the list of data |
| 103 | TemplateList trainingSet(trainingQlist); | 107 | TemplateList trainingSet(trainingQlist); |
| 104 | 108 | ||
| 105 | - if (trainingSet.first().m().type() != CV_32FC1) { | ||
| 106 | - qFatal("Requires single channel 32-bit floating point matrices."); | ||
| 107 | - } | ||
| 108 | 109 | ||
| 109 | originalRows = trainingSet.first().m().rows; // get number of rows of first image | 110 | originalRows = trainingSet.first().m().rows; // get number of rows of first image |
| 110 | int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image | 111 | int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image |
| @@ -127,7 +128,7 @@ private: | @@ -127,7 +128,7 @@ private: | ||
| 127 | 128 | ||
| 128 | if (type != CV_32FC1) { | 129 | if (type != CV_32FC1) { |
| 129 | cout << "ERR: Invalid image type" << endl; | 130 | cout << "ERR: Invalid image type" << endl; |
| 130 | - return; | 131 | + throw 0; |
| 131 | } | 132 | } |
| 132 | 133 | ||
| 133 | Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | 134 | Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); |
openbr/plugins/cuda/cudapca.cu
| @@ -30,61 +30,67 @@ using namespace cv::gpu; | @@ -30,61 +30,67 @@ using namespace cv::gpu; | ||
| 30 | */ | 30 | */ |
| 31 | 31 | ||
| 32 | namespace br { namespace cuda { namespace pca { | 32 | namespace br { namespace cuda { namespace pca { |
| 33 | - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { | ||
| 34 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 35 | - if (colInd >= evCols) { | 33 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) { |
| 34 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 35 | + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y; | ||
| 36 | + | ||
| 37 | + if (evIdx >= numEigenvectors || stepIdx >= numSteps) { | ||
| 36 | return; | 38 | return; |
| 37 | } | 39 | } |
| 38 | 40 | ||
| 39 | - int stepNum = threadIdx.y; | ||
| 40 | - int iStart = stepNum*stepSize; | ||
| 41 | - int iEnd = iStart+stepSize; | ||
| 42 | - if (iStart >= evRows) { | 41 | + float acc = 0; |
| 42 | + int startIdx = stepSize*stepIdx; | ||
| 43 | + int stopIdx = startIdx+stepSize; | ||
| 44 | + if (startIdx >= numPixels) { | ||
| 43 | return; | 45 | return; |
| 44 | } | 46 | } |
| 45 | - if (iEnd > evRows) { | ||
| 46 | - iEnd = evRows; | 47 | + if (stopIdx >= numPixels) { |
| 48 | + stopIdx = numPixels; | ||
| 47 | } | 49 | } |
| 48 | - | ||
| 49 | - float acc = 0; | ||
| 50 | - for (int i=iStart; i < iEnd; i++) { | ||
| 51 | - acc += evPtr[evCols*i + colInd] * src[i]; | 50 | + for(int i=startIdx; i < stopIdx; i++) { |
| 51 | + acc += src[i]*evPtr[i*numEigenvectors + evIdx]; | ||
| 52 | } | 52 | } |
| 53 | 53 | ||
| 54 | - intermediaryBuffer[stepSize*stepNum + colInd] = acc; | 54 | + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; |
| 55 | } | 55 | } |
| 56 | 56 | ||
| 57 | - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { | ||
| 58 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 59 | - if (colInd >= evCols) { | 57 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) { |
| 58 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 59 | + if (evIdx >= numEigenvectors) { | ||
| 60 | return; | 60 | return; |
| 61 | } | 61 | } |
| 62 | 62 | ||
| 63 | float acc = 0; | 63 | float acc = 0; |
| 64 | - for (int i = 0; i < numSteps; i++) { | ||
| 65 | - acc += intermediaryBuffer[stepSize*i + colInd]; | 64 | + for (int i=0; i < numSteps; i++) { |
| 65 | + int ibIdx = i*stepSize + evIdx; | ||
| 66 | + if (ibIdx >= numSteps*stepSize) { | ||
| 67 | + break; | ||
| 68 | + } | ||
| 69 | + acc += intermediaryBuffer[ibIdx]; | ||
| 66 | } | 70 | } |
| 67 | 71 | ||
| 68 | - dst[colInd] = acc; | 72 | + out[evIdx] = acc; |
| 69 | } | 73 | } |
| 70 | 74 | ||
| 71 | - __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { | ||
| 72 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | 75 | + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) { |
| 76 | + int idx = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 73 | 77 | ||
| 74 | // perform bound checking | 78 | // perform bound checking |
| 75 | - if (colInd >= numCols) { | 79 | + if (idx >= numElems) { |
| 76 | return; | 80 | return; |
| 77 | } | 81 | } |
| 78 | 82 | ||
| 79 | // subtract out the mean | 83 | // subtract out the mean |
| 80 | - out[colInd] -= mean[colInd]; | 84 | + out[idx] -= mean[idx]; |
| 81 | } | 85 | } |
| 82 | 86 | ||
| 87 | + // _evRows: the number of pixels in the trained images | ||
| 88 | + // _evCols: the number of eigenvectors | ||
| 89 | + // _meanElems: the number of pixels in an image | ||
| 90 | + // _stepSize: the number of pixels in a single step | ||
| 91 | + // _numSteps: the number of steps required to complete operation | ||
| 83 | float* cudaEvPtr; int _evRows; int _evCols; | 92 | float* cudaEvPtr; int _evRows; int _evCols; |
| 84 | float* cudaMeanPtr; int _meanElems; | 93 | float* cudaMeanPtr; int _meanElems; |
| 85 | - float* _cudaSrcPtr; | ||
| 86 | - float* _cudaDstPtr; | ||
| 87 | - | ||
| 88 | int _numSteps; int _stepSize; | 94 | int _numSteps; int _stepSize; |
| 89 | float* intermediaryBuffer; | 95 | float* intermediaryBuffer; |
| 90 | 96 | ||
| @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca { | @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca { | ||
| 102 | CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); | 108 | CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); |
| 103 | CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); | 109 | CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); |
| 104 | 110 | ||
| 105 | - CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); | ||
| 106 | - CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); | ||
| 107 | - | ||
| 108 | // initialize the intermediary working space, | 111 | // initialize the intermediary working space, |
| 109 | - _numSteps = 16; | ||
| 110 | - _stepSize = _evRows / _numSteps + 1; | ||
| 111 | - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); | 112 | + _stepSize = 2048; |
| 113 | + _numSteps = _evRows / _stepSize + 1; | ||
| 114 | + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err); | ||
| 112 | } | 115 | } |
| 113 | 116 | ||
| 114 | void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { | 117 | void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { |
| @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca { | @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca { | ||
| 120 | cudaError_t err; | 123 | cudaError_t err; |
| 121 | CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); | 124 | CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); |
| 122 | 125 | ||
| 123 | - if (imgRows*imgCols != _evRows) { | 126 | + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) { |
| 124 | cout << "ERR: Image dimension mismatch!" << endl; | 127 | cout << "ERR: Image dimension mismatch!" << endl; |
| 125 | throw 0; | 128 | throw 0; |
| 126 | } | 129 | } |
| 127 | 130 | ||
| 128 | - // subtract out the mean of the image (mean is 1xpixels in size) | ||
| 129 | - int threadsPerBlock = 64; | 131 | + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) |
| 132 | + int threadsPerBlock = 512; | ||
| 130 | int numBlocks = _meanElems / threadsPerBlock + 1; | 133 | int numBlocks = _meanElems / threadsPerBlock + 1; |
| 131 | subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); | 134 | subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); |
| 132 | CUDA_KERNEL_ERR_CHK(&err); | 135 | CUDA_KERNEL_ERR_CHK(&err); |
| 133 | 136 | ||
| 134 | - // perform the multiplication | ||
| 135 | - dim3 threadsPerBlock2d(64, _numSteps); | ||
| 136 | - dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); | ||
| 137 | - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); | 137 | + // perform matrix multiplication |
| 138 | + dim3 threadsPerBlock2d(512, 1); | ||
| 139 | + dim3 numBlocks2d( | ||
| 140 | + _evCols / threadsPerBlock2d.x + 1, | ||
| 141 | + _numSteps / threadsPerBlock2d.y + 1); | ||
| 142 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems); | ||
| 138 | CUDA_KERNEL_ERR_CHK(&err); | 143 | CUDA_KERNEL_ERR_CHK(&err); |
| 139 | 144 | ||
| 140 | - threadsPerBlock = 64; | 145 | + threadsPerBlock = 512; |
| 141 | numBlocks = _evCols / threadsPerBlock + 1; | 146 | numBlocks = _evCols / threadsPerBlock + 1; |
| 142 | - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); | 147 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize); |
| 143 | CUDA_KERNEL_ERR_CHK(&err); | 148 | CUDA_KERNEL_ERR_CHK(&err); |
| 144 | 149 | ||
| 145 | - CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | 150 | + // free the src memory |
| 151 | + CUDA_SAFE_FREE(src, &err); | ||
| 146 | } | 152 | } |
| 147 | }}} | 153 | }}} |