Commit 155b284db2460355d749e88d5cebe197c9f22e64 (1 parent: 15c8a584)
Commit message: "fixed memory index bug in PCA"
Showing 2 changed files with 54 additions and 47 deletions.
openbr/plugins/cuda/cudapca.cpp
| ... | ... | @@ -94,6 +94,10 @@ private: |
| 94 | 94 | int cols = *((int*)srcDataPtr[2]); |
| 95 | 95 | int type = *((int*)srcDataPtr[3]); |
| 96 | 96 | |
| 97 | + if (type != CV_32FC1) { | |
| 98 | + qFatal("Requires single channel 32-bit floating point matrices."); | |
| 99 | + } | |
| 100 | + | |
| 97 | 101 | Mat mat = Mat(rows, cols, type); |
| 98 | 102 | br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); |
| 99 | 103 | trainingQlist.append(Template(mat)); |
| ... | ... | @@ -102,9 +106,6 @@ private: |
| 102 | 106 | // assemble a TemplateList from the list of data |
| 103 | 107 | TemplateList trainingSet(trainingQlist); |
| 104 | 108 | |
| 105 | - if (trainingSet.first().m().type() != CV_32FC1) { | |
| 106 | - qFatal("Requires single channel 32-bit floating point matrices."); | |
| 107 | - } | |
| 108 | 109 | |
| 109 | 110 | originalRows = trainingSet.first().m().rows; // get number of rows of first image |
| 110 | 111 | int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image |
| ... | ... | @@ -127,7 +128,7 @@ private: |
| 127 | 128 | |
| 128 | 129 | if (type != CV_32FC1) { |
| 129 | 130 | cout << "ERR: Invalid image type" << endl; |
| 130 | - return; | |
| 131 | + throw 0; | |
| 131 | 132 | } |
| 132 | 133 | |
| 133 | 134 | Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); |
| ... | ... |
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -30,61 +30,67 @@ using namespace cv::gpu; |
| 30 | 30 | */ |
| 31 | 31 | |
| 32 | 32 | namespace br { namespace cuda { namespace pca { |
| 33 | - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { | |
| 34 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 35 | - if (colInd >= evCols) { | |
| 33 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) { | |
| 34 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 35 | + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y; | |
| 36 | + | |
| 37 | + if (evIdx >= numEigenvectors || stepIdx >= numSteps) { | |
| 36 | 38 | return; |
| 37 | 39 | } |
| 38 | 40 | |
| 39 | - int stepNum = threadIdx.y; | |
| 40 | - int iStart = stepNum*stepSize; | |
| 41 | - int iEnd = iStart+stepSize; | |
| 42 | - if (iStart >= evRows) { | |
| 41 | + float acc = 0; | |
| 42 | + int startIdx = stepSize*stepIdx; | |
| 43 | + int stopIdx = startIdx+stepSize; | |
| 44 | + if (startIdx >= numPixels) { | |
| 43 | 45 | return; |
| 44 | 46 | } |
| 45 | - if (iEnd > evRows) { | |
| 46 | - iEnd = evRows; | |
| 47 | + if (stopIdx >= numPixels) { | |
| 48 | + stopIdx = numPixels; | |
| 47 | 49 | } |
| 48 | - | |
| 49 | - float acc = 0; | |
| 50 | - for (int i=iStart; i < iEnd; i++) { | |
| 51 | - acc += evPtr[evCols*i + colInd] * src[i]; | |
| 50 | + for(int i=startIdx; i < stopIdx; i++) { | |
| 51 | + acc += src[i]*evPtr[i*numEigenvectors + evIdx]; | |
| 52 | 52 | } |
| 53 | 53 | |
| 54 | - intermediaryBuffer[stepSize*stepNum + colInd] = acc; | |
| 54 | + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; | |
| 55 | 55 | } |
| 56 | 56 | |
| 57 | - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) { | |
| 58 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 59 | - if (colInd >= evCols) { | |
| 57 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) { | |
| 58 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 59 | + if (evIdx >= numEigenvectors) { | |
| 60 | 60 | return; |
| 61 | 61 | } |
| 62 | 62 | |
| 63 | 63 | float acc = 0; |
| 64 | - for (int i = 0; i < numSteps; i++) { | |
| 65 | - acc += intermediaryBuffer[stepSize*i + colInd]; | |
| 64 | + for (int i=0; i < numSteps; i++) { | |
| 65 | + int ibIdx = i*stepSize + evIdx; | |
| 66 | + if (ibIdx >= numSteps*stepSize) { | |
| 67 | + break; | |
| 68 | + } | |
| 69 | + acc += intermediaryBuffer[ibIdx]; | |
| 66 | 70 | } |
| 67 | 71 | |
| 68 | - dst[colInd] = acc; | |
| 72 | + out[evIdx] = acc; | |
| 69 | 73 | } |
| 70 | 74 | |
| 71 | - __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { | |
| 72 | - int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 75 | + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) { | |
| 76 | + int idx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 73 | 77 | |
| 74 | 78 | // perform bound checking |
| 75 | - if (colInd >= numCols) { | |
| 79 | + if (idx >= numElems) { | |
| 76 | 80 | return; |
| 77 | 81 | } |
| 78 | 82 | |
| 79 | 83 | // subtract out the mean |
| 80 | - out[colInd] -= mean[colInd]; | |
| 84 | + out[idx] -= mean[idx]; | |
| 81 | 85 | } |
| 82 | 86 | |
| 87 | + // _evRows: the number of pixels in the trained images | |
| 88 | + // _evCols: the number of eigenvectors | |
| 89 | + // _meanElems: the number of pixels in an image | |
| 90 | + // _stepSize: the number of pixels in a single step | |
| 91 | + // _numSteps: the number of steps required to complete operation | |
| 83 | 92 | float* cudaEvPtr; int _evRows; int _evCols; |
| 84 | 93 | float* cudaMeanPtr; int _meanElems; |
| 85 | - float* _cudaSrcPtr; | |
| 86 | - float* _cudaDstPtr; | |
| 87 | - | |
| 88 | 94 | int _numSteps; int _stepSize; |
| 89 | 95 | float* intermediaryBuffer; |
| 90 | 96 | |
| ... | ... | @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca { |
| 102 | 108 | CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); |
| 103 | 109 | CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); |
| 104 | 110 | |
| 105 | - CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); | |
| 106 | - CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); | |
| 107 | - | |
| 108 | 111 | // initialize the intermediary working space, |
| 109 | - _numSteps = 16; | |
| 110 | - _stepSize = _evRows / _numSteps + 1; | |
| 111 | - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); | |
| 112 | + _stepSize = 2048; | |
| 113 | + _numSteps = _evRows / _stepSize + 1; | |
| 114 | + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err); | |
| 112 | 115 | } |
| 113 | 116 | |
| 114 | 117 | void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { |
| ... | ... | @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca { |
| 120 | 123 | cudaError_t err; |
| 121 | 124 | CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); |
| 122 | 125 | |
| 123 | - if (imgRows*imgCols != _evRows) { | |
| 126 | + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) { | |
| 124 | 127 | cout << "ERR: Image dimension mismatch!" << endl; |
| 125 | 128 | throw 0; |
| 126 | 129 | } |
| 127 | 130 | |
| 128 | - // subtract out the mean of the image (mean is 1xpixels in size) | |
| 129 | - int threadsPerBlock = 64; | |
| 131 | + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) | |
| 132 | + int threadsPerBlock = 512; | |
| 130 | 133 | int numBlocks = _meanElems / threadsPerBlock + 1; |
| 131 | 134 | subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); |
| 132 | 135 | CUDA_KERNEL_ERR_CHK(&err); |
| 133 | 136 | |
| 134 | - // perform the multiplication | |
| 135 | - dim3 threadsPerBlock2d(64, _numSteps); | |
| 136 | - dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1); | |
| 137 | - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); | |
| 137 | + // perform matrix multiplication | |
| 138 | + dim3 threadsPerBlock2d(512, 1); | |
| 139 | + dim3 numBlocks2d( | |
| 140 | + _evCols / threadsPerBlock2d.x + 1, | |
| 141 | + _numSteps / threadsPerBlock2d.y + 1); | |
| 142 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems); | |
| 138 | 143 | CUDA_KERNEL_ERR_CHK(&err); |
| 139 | 144 | |
| 140 | - threadsPerBlock = 64; | |
| 145 | + threadsPerBlock = 512; | |
| 141 | 146 | numBlocks = _evCols / threadsPerBlock + 1; |
| 142 | - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); | |
| 147 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize); | |
| 143 | 148 | CUDA_KERNEL_ERR_CHK(&err); |
| 144 | 149 | |
| 145 | - CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | |
| 150 | + // free the src memory | |
| 151 | + CUDA_SAFE_FREE(src, &err); | |
| 146 | 152 | } |
| 147 | 153 | }}} | ... | ... |