Commit 155b284db2460355d749e88d5cebe197c9f22e64

Authored by Colin Heinzmann
1 parent 15c8a584

fixed memory index bug in PCA

openbr/plugins/cuda/cudapca.cpp
@@ -94,6 +94,10 @@ private:
94 int cols = *((int*)srcDataPtr[2]); 94 int cols = *((int*)srcDataPtr[2]);
95 int type = *((int*)srcDataPtr[3]); 95 int type = *((int*)srcDataPtr[3]);
96 96
  97 + if (type != CV_32FC1) {
  98 + qFatal("Requires single channel 32-bit floating point matrices.");
  99 + }
  100 +
97 Mat mat = Mat(rows, cols, type); 101 Mat mat = Mat(rows, cols, type);
98 br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); 102 br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
99 trainingQlist.append(Template(mat)); 103 trainingQlist.append(Template(mat));
@@ -102,9 +106,6 @@ private:
102 // assemble a TemplateList from the list of data 106 // assemble a TemplateList from the list of data
103 TemplateList trainingSet(trainingQlist); 107 TemplateList trainingSet(trainingQlist);
104 108
105 - if (trainingSet.first().m().type() != CV_32FC1) {  
106 - qFatal("Requires single channel 32-bit floating point matrices.");  
107 - }  
108 109
109 originalRows = trainingSet.first().m().rows; // get number of rows of first image 110 originalRows = trainingSet.first().m().rows; // get number of rows of first image
110 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image 111 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
@@ -127,7 +128,7 @@ private:
127 128
128 if (type != CV_32FC1) { 129 if (type != CV_32FC1) {
129 cout << "ERR: Invalid image type" << endl; 130 cout << "ERR: Invalid image type" << endl;
130 - return; 131 + throw 0;
131 } 132 }
132 133
133 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); 134 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
openbr/plugins/cuda/cudapca.cu
@@ -30,61 +30,67 @@ using namespace cv::gpu;
30 */ 30 */
31 31
32 namespace br { namespace cuda { namespace pca { 32 namespace br { namespace cuda { namespace pca {
33 - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {  
34 - int colInd = blockIdx.x*blockDim.x+threadIdx.x;  
35 - if (colInd >= evCols) { 33 + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {
  34 + int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
  35 + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;
  36 +
  37 + if (evIdx >= numEigenvectors || stepIdx >= numSteps) {
36 return; 38 return;
37 } 39 }
38 40
39 - int stepNum = threadIdx.y;  
40 - int iStart = stepNum*stepSize;  
41 - int iEnd = iStart+stepSize;  
42 - if (iStart >= evRows) { 41 + float acc = 0;
  42 + int startIdx = stepSize*stepIdx;
  43 + int stopIdx = startIdx+stepSize;
  44 + if (startIdx >= numPixels) {
43 return; 45 return;
44 } 46 }
45 - if (iEnd > evRows) {  
46 - iEnd = evRows; 47 + if (stopIdx >= numPixels) {
  48 + stopIdx = numPixels;
47 } 49 }
48 -  
49 - float acc = 0;  
50 - for (int i=iStart; i < iEnd; i++) {  
51 - acc += evPtr[evCols*i + colInd] * src[i]; 50 + for(int i=startIdx; i < stopIdx; i++) {
  51 + acc += src[i]*evPtr[i*numEigenvectors + evIdx];
52 } 52 }
53 53
54 - intermediaryBuffer[stepSize*stepNum + colInd] = acc; 54 + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc;
55 } 55 }
56 56
57 - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) {  
58 - int colInd = blockIdx.x*blockDim.x+threadIdx.x;  
59 - if (colInd >= evCols) { 57 + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {
  58 + int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
  59 + if (evIdx >= numEigenvectors) {
60 return; 60 return;
61 } 61 }
62 62
63 float acc = 0; 63 float acc = 0;
64 - for (int i = 0; i < numSteps; i++) {  
65 - acc += intermediaryBuffer[stepSize*i + colInd]; 64 + for (int i=0; i < numSteps; i++) {
  65 + int ibIdx = i*stepSize + evIdx;
  66 + if (ibIdx >= numSteps*stepSize) {
  67 + break;
  68 + }
  69 + acc += intermediaryBuffer[ibIdx];
66 } 70 }
67 71
68 - dst[colInd] = acc; 72 + out[evIdx] = acc;
69 } 73 }
70 74
71 - __global__ void subtractMeanKernel(float* out, float* mean, int numCols) {  
72 - int colInd = blockIdx.x*blockDim.x+threadIdx.x; 75 + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {
  76 + int idx = blockIdx.x*blockDim.x+threadIdx.x;
73 77
74 // perform bound checking 78 // perform bound checking
75 - if (colInd >= numCols) { 79 + if (idx >= numElems) {
76 return; 80 return;
77 } 81 }
78 82
79 // subtract out the mean 83 // subtract out the mean
80 - out[colInd] -= mean[colInd]; 84 + out[idx] -= mean[idx];
81 } 85 }
82 86
  87 + // _evRows: the number of pixels in the trained images
  88 + // _evCols: the number of eigenvectors
  89 + // _meanElems: the number of pixels in an image
  90 + // _stepSize: the number of pixels in a single step
  91 + // _numSteps: the number of steps required to complete operation
83 float* cudaEvPtr; int _evRows; int _evCols; 92 float* cudaEvPtr; int _evRows; int _evCols;
84 float* cudaMeanPtr; int _meanElems; 93 float* cudaMeanPtr; int _meanElems;
85 - float* _cudaSrcPtr;  
86 - float* _cudaDstPtr;  
87 -  
88 int _numSteps; int _stepSize; 94 int _numSteps; int _stepSize;
89 float* intermediaryBuffer; 95 float* intermediaryBuffer;
90 96
@@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca { @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca {
102 CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); 108 CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
103 CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); 109 CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);
104 110
105 - CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);  
106 - CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);  
107 -  
108 // initialize the intermediary working space, 111 // initialize the intermediary working space,
109 - _numSteps = 16;  
110 - _stepSize = _evRows / _numSteps + 1;  
111 - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err); 112 + _stepSize = 2048;
  113 + _numSteps = _evRows / _stepSize + 1;
  114 + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);
112 } 115 }
113 116
114 void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { 117 void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {
@@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca { @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca {
120 cudaError_t err; 123 cudaError_t err;
121 CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); 124 CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
122 125
123 - if (imgRows*imgCols != _evRows) { 126 + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {
124 cout << "ERR: Image dimension mismatch!" << endl; 127 cout << "ERR: Image dimension mismatch!" << endl;
125 throw 0; 128 throw 0;
126 } 129 }
127 130
128 - // subtract out the mean of the image (mean is 1xpixels in size)  
129 - int threadsPerBlock = 64; 131 + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src)
  132 + int threadsPerBlock = 512;
130 int numBlocks = _meanElems / threadsPerBlock + 1; 133 int numBlocks = _meanElems / threadsPerBlock + 1;
131 subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); 134 subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
132 CUDA_KERNEL_ERR_CHK(&err); 135 CUDA_KERNEL_ERR_CHK(&err);
133 136
134 - // perform the multiplication  
135 - dim3 threadsPerBlock2d(64, _numSteps);  
136 - dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);  
137 - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize); 137 + // perform matrix multiplication
  138 + dim3 threadsPerBlock2d(512, 1);
  139 + dim3 numBlocks2d(
  140 + _evCols / threadsPerBlock2d.x + 1,
  141 + _numSteps / threadsPerBlock2d.y + 1);
  142 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);
138 CUDA_KERNEL_ERR_CHK(&err); 143 CUDA_KERNEL_ERR_CHK(&err);
139 144
140 - threadsPerBlock = 64; 145 + threadsPerBlock = 512;
141 numBlocks = _evCols / threadsPerBlock + 1; 146 numBlocks = _evCols / threadsPerBlock + 1;
142 - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize); 147 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);
143 CUDA_KERNEL_ERR_CHK(&err); 148 CUDA_KERNEL_ERR_CHK(&err);
144 149
145 - CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... 150 + // free the src memory
  151 + CUDA_SAFE_FREE(src, &err);
146 } 152 }
147 }}} 153 }}}