Commit 155b284db2460355d749e88d5cebe197c9f22e64

Authored by Colin Heinzmann
1 parent 15c8a584

Fixed memory indexing bug in the CUDA PCA plugin

openbr/plugins/cuda/cudapca.cpp
... ... @@ -94,6 +94,10 @@ private:
94 94 int cols = *((int*)srcDataPtr[2]);
95 95 int type = *((int*)srcDataPtr[3]);
96 96  
  97 + if (type != CV_32FC1) {
  98 + qFatal("Requires single channel 32-bit floating point matrices.");
  99 + }
  100 +
97 101 Mat mat = Mat(rows, cols, type);
98 102 br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
99 103 trainingQlist.append(Template(mat));
... ... @@ -102,9 +106,6 @@ private:
102 106 // assemble a TemplateList from the list of data
103 107 TemplateList trainingSet(trainingQlist);
104 108  
105   - if (trainingSet.first().m().type() != CV_32FC1) {
106   - qFatal("Requires single channel 32-bit floating point matrices.");
107   - }
108 109  
109 110 originalRows = trainingSet.first().m().rows; // get number of rows of first image
110 111 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
... ... @@ -127,7 +128,7 @@ private:
127 128  
128 129 if (type != CV_32FC1) {
129 130 cout << "ERR: Invalid image type" << endl;
130   - return;
  131 + throw 0;
131 132 }
132 133  
133 134 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
... ...
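For orientation, here is a minimal sketch of how the patched training path reads once the hunks above are applied. This is a reconstruction assembled from the diff, not the full source; the origin of cudaMemPtr and srcDataPtr lies outside the lines shown.

    // Reconstruction of the patched training path: the OpenCV type word from the
    // descriptor is now checked before a Mat is allocated around the reported
    // dimensions, so unsupported input aborts via qFatal instead of being
    // wrapped in a mis-typed matrix and copied back from the device.
    int rows = *((int*)srcDataPtr[1]);
    int cols = *((int*)srcDataPtr[2]);
    int type = *((int*)srcDataPtr[3]);

    if (type != CV_32FC1) {
      qFatal("Requires single channel 32-bit floating point matrices.");
    }

    Mat mat = Mat(rows, cols, type);   // host-side buffer filled by trainWrapper
    br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
    trainingQlist.append(Template(mat));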
openbr/plugins/cuda/cudapca.cu
... ... @@ -30,61 +30,67 @@ using namespace cv::gpu;
30 30 */
31 31  
32 32 namespace br { namespace cuda { namespace pca {
33   - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
34   - int colInd = blockIdx.x*blockDim.x+threadIdx.x;
35   - if (colInd >= evCols) {
  33 + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {
  34 + int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
  35 + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;
  36 +
  37 + if (evIdx >= numEigenvectors || stepIdx >= numSteps) {
36 38 return;
37 39 }
38 40  
39   - int stepNum = threadIdx.y;
40   - int iStart = stepNum*stepSize;
41   - int iEnd = iStart+stepSize;
42   - if (iStart >= evRows) {
  41 + float acc = 0;
  42 + int startIdx = stepSize*stepIdx;
  43 + int stopIdx = startIdx+stepSize;
  44 + if (startIdx >= numPixels) {
43 45 return;
44 46 }
45   - if (iEnd > evRows) {
46   - iEnd = evRows;
  47 + if (stopIdx >= numPixels) {
  48 + stopIdx = numPixels;
47 49 }
48   -
49   - float acc = 0;
50   - for (int i=iStart; i < iEnd; i++) {
51   - acc += evPtr[evCols*i + colInd] * src[i];
  50 + for(int i=startIdx; i < stopIdx; i++) {
  51 + acc += src[i]*evPtr[i*numEigenvectors + evIdx];
52 52 }
53 53  
54   - intermediaryBuffer[stepSize*stepNum + colInd] = acc;
  54 + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc;
55 55 }
56 56  
57   - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* dst, int evRows, int evCols, int numSteps, int stepSize) {
58   - int colInd = blockIdx.x*blockDim.x+threadIdx.x;
59   - if (colInd >= evCols) {
  57 + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {
  58 + int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
  59 + if (evIdx >= numEigenvectors) {
60 60 return;
61 61 }
62 62  
63 63 float acc = 0;
64   - for (int i = 0; i < numSteps; i++) {
65   - acc += intermediaryBuffer[stepSize*i + colInd];
  64 + for (int i=0; i < numSteps; i++) {
  65 + int ibIdx = i*stepSize + evIdx;
  66 + if (ibIdx >= numSteps*stepSize) {
  67 + break;
  68 + }
  69 + acc += intermediaryBuffer[ibIdx];
66 70 }
67 71  
68   - dst[colInd] = acc;
  72 + out[evIdx] = acc;
69 73 }
70 74  
71   - __global__ void subtractMeanKernel(float* out, float* mean, int numCols) {
72   - int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  75 + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {
  76 + int idx = blockIdx.x*blockDim.x+threadIdx.x;
73 77  
74 78 // perform bound checking
75   - if (colInd >= numCols) {
  79 + if (idx >= numElems) {
76 80 return;
77 81 }
78 82  
79 83 // subtract out the mean
80   - out[colInd] -= mean[colInd];
  84 + out[idx] -= mean[idx];
81 85 }
82 86  
  87 + // _evRows: the number of pixels in the trained images
  88 + // _evCols: the number of eigenvectors
  89 + // _meanElems: the number of pixels in an image
  90 + // _stepSize: the number of pixels in a single step
  91 + // _numSteps: the number of steps required to complete operation
83 92 float* cudaEvPtr; int _evRows; int _evCols;
84 93 float* cudaMeanPtr; int _meanElems;
85   - float* _cudaSrcPtr;
86   - float* _cudaDstPtr;
87   -
88 94 int _numSteps; int _stepSize;
89 95 float* intermediaryBuffer;
90 96  
... ... @@ -102,13 +108,10 @@ namespace br { namespace cuda { namespace pca {
102 108 CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
103 109 CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);
104 110  
105   - CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);
106   - CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
107   -
108 111 // initialize the intermediary working space,
109   - _numSteps = 16;
110   - _stepSize = _evRows / _numSteps + 1;
111   - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_evCols*sizeof(float), &err);
  112 + _stepSize = 2048;
  113 + _numSteps = _evRows / _stepSize + 1;
  114 + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);
112 115 }
113 116  
114 117 void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {
... ... @@ -120,28 +123,31 @@ namespace br { namespace cuda { namespace pca {
120 123 cudaError_t err;
121 124 CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
122 125  
123   - if (imgRows*imgCols != _evRows) {
  126 + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {
124 127 cout << "ERR: Image dimension mismatch!" << endl;
125 128 throw 0;
126 129 }
127 130  
128   - // subtract out the mean of the image (mean is 1xpixels in size)
129   - int threadsPerBlock = 64;
  131 + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src)
  132 + int threadsPerBlock = 512;
130 133 int numBlocks = _meanElems / threadsPerBlock + 1;
131 134 subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
132 135 CUDA_KERNEL_ERR_CHK(&err);
133 136  
134   - // perform the multiplication
135   - dim3 threadsPerBlock2d(64, _numSteps);
136   - dim3 numBlocks2d(_evCols / threadsPerBlock2d.x + 1, 1);
137   - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evRows, _evCols, _stepSize);
  137 + // perform matrix multiplication
  138 + dim3 threadsPerBlock2d(512, 1);
  139 + dim3 numBlocks2d(
  140 + _evCols / threadsPerBlock2d.x + 1,
  141 + _numSteps / threadsPerBlock2d.y + 1);
  142 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);
138 143 CUDA_KERNEL_ERR_CHK(&err);
139 144  
140   - threadsPerBlock = 64;
  145 + threadsPerBlock = 512;
141 146 numBlocks = _evCols / threadsPerBlock + 1;
142   - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)(*dst), _evRows, _evCols, _numSteps, _stepSize);
  147 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);
143 148 CUDA_KERNEL_ERR_CHK(&err);
144 149  
145   - CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...
  150 + // free the src memory
  151 + CUDA_SAFE_FREE(src, &err);
146 152 }
147 153 }}}
... ...
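For reference, a minimal CPU sketch (an illustration, not part of the patch) of the projection the three kernels now compute together: the mean is subtracted in place, each (step, eigenvector) pair accumulates a partial dot product over a 2048-pixel chunk of the image, and the join pass sums the partials per eigenvector. The function name cpuProject is hypothetical, and the sketch assumes numEigenvectors is at most stepSize, since the intermediary buffer reserves stepSize slots per step and uses only the first numEigenvectors of each.

    #include <algorithm>
    #include <vector>

    // Hypothetical CPU reference for the two-stage projection above. ev is
    // stored row-major with numEigenvectors columns, matching the
    // evPtr[i*numEigenvectors + evIdx] indexing used by multiplyKernel.
    static std::vector<float> cpuProject(std::vector<float> img,          // numPixels
                                         const std::vector<float>& mean,  // numPixels
                                         const std::vector<float>& ev,    // numPixels x numEigenvectors
                                         int numPixels, int numEigenvectors) {
      const int stepSize = 2048;                      // same fixed chunk as initWrapper
      const int numSteps = numPixels / stepSize + 1;  // e.g. 16384 pixels -> 16384/2048 + 1 = 9 steps

      // subtractMeanKernel: in-place mean subtraction
      for (int i = 0; i < numPixels; i++) img[i] -= mean[i];

      // multiplyKernel: one partial dot product per (step, eigenvector) pair
      std::vector<float> partial(numSteps * stepSize, 0.0f);
      for (int stepIdx = 0; stepIdx < numSteps; stepIdx++) {
        int start = stepIdx * stepSize;
        int stop  = std::min(start + stepSize, numPixels);
        for (int e = 0; e < numEigenvectors; e++) {
          float acc = 0;
          for (int i = start; i < stop; i++)
            acc += img[i] * ev[i * numEigenvectors + e];
          partial[stepIdx * stepSize + e] = acc;
        }
      }

      // multiplyJoinKernel: sum the partials for each eigenvector
      std::vector<float> out(numEigenvectors, 0.0f);
      for (int e = 0; e < numEigenvectors; e++)
        for (int stepIdx = 0; stepIdx < numSteps; stepIdx++)
          out[e] += partial[stepIdx * stepSize + e];
      return out;
    }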