Commit d025a5da1ba863560cc8bc9b8c955363f1f76ca1

Authored by DepthDeluxe
1 parent 7f5c5a5a

added streamlined CUDA execution, fixed memory problems

openbr/plugins/cuda/copyfrom.cpp
@@ -22,8 +22,6 @@ namespace br @@ -22,8 +22,6 @@ namespace br
22 private: 22 private:
23 void project(const Template &src, Template &dst) const 23 void project(const Template &src, Template &dst) const
24 { 24 {
25 - cout << "CUDACopyFrom Start" << endl << endl << endl;  
26 -  
27 // pull the data back out of the Mat 25 // pull the data back out of the Mat
28 void* const* dataPtr = src.m().ptr<void*>(); 26 void* const* dataPtr = src.m().ptr<void*>();
29 void* cudaMemPtr = dataPtr[0]; 27 void* cudaMemPtr = dataPtr[0];
@@ -44,8 +42,6 @@ private: @@ -44,8 +42,6 @@ private:
44 break; 42 break;
45 } 43 }
46 dst = dstMat; 44 dst = dstMat;
47 -  
48 - cout << "CUDACopyFrom End" << endl;  
49 } 45 }
50 }; 46 };
51 47
openbr/plugins/cuda/cudacvtfloat.cpp
@@ -27,8 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform @@ -27,8 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
27 public: 27 public:
28 void project(const Template &src, Template &dst) const 28 void project(const Template &src, Template &dst) const
29 { 29 {
30 - cout << "CUDACvtFloat Start" << endl;  
31 -  
32 void* const* srcDataPtr = src.m().ptr<void*>(); 30 void* const* srcDataPtr = src.m().ptr<void*>();
33 void* srcMemPtr = srcDataPtr[0]; 31 void* srcMemPtr = srcDataPtr[0];
34 int rows = *((int*)srcDataPtr[1]); 32 int rows = *((int*)srcDataPtr[1]);
@@ -51,8 +49,6 @@ class CUDACvtFloatTransform : public UntrainableTransform @@ -51,8 +49,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
51 49
52 br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols); 50 br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols);
53 dst = dstMat; 51 dst = dstMat;
54 -  
55 - cout << "CUDACvtFloat End" << endl;  
56 } 52 }
57 }; 53 };
58 54
openbr/plugins/cuda/cudacvtfloat.cu
@@ -31,6 +31,9 @@ namespace br { namespace cuda { namespace cudacvtfloat { @@ -31,6 +31,9 @@ namespace br { namespace cuda { namespace cudacvtfloat {
31 ); 31 );
32 32
33 kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols); 33 kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols);
  34 +
  35 + // free the src memory since it is now in a newly allocated dst
  36 + cudaFree((void*)src);
34 } 37 }
35 38
36 }}} 39 }}}
openbr/plugins/cuda/cudalbp.cpp
@@ -166,8 +166,6 @@ class CUDALBPTransform : public UntrainableTransform @@ -166,8 +166,6 @@ class CUDALBPTransform : public UntrainableTransform
166 //matManager->release(a); 166 //matManager->release(a);
167 //matManager->release(b); 167 //matManager->release(b);
168 168
169 - cout << "CUDALBP Start" << endl;  
170 -  
171 void* const* srcDataPtr = src.m().ptr<void*>(); 169 void* const* srcDataPtr = src.m().ptr<void*>();
172 void* cudaSrcPtr = srcDataPtr[0]; 170 void* cudaSrcPtr = srcDataPtr[0];
173 int rows = *((int*)srcDataPtr[1]); 171 int rows = *((int*)srcDataPtr[1]);
@@ -182,8 +180,6 @@ class CUDALBPTransform : public UntrainableTransform @@ -182,8 +180,6 @@ class CUDALBPTransform : public UntrainableTransform
182 180
183 br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); 181 br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);
184 dst = dstMat; 182 dst = dstMat;
185 -  
186 - cout << "CUDALBP End" << endl;  
187 } 183 }
188 }; 184 };
189 185
openbr/plugins/cuda/cudalbp.cu
@@ -47,6 +47,8 @@ namespace br { namespace cuda { @@ -47,6 +47,8 @@ namespace br { namespace cuda {
47 47
48 cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); 48 cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t));
49 cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); 49 cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  50 +
  51 + cudaFree(srcPtr);
50 } 52 }
51 53
52 void cudalbp_init_wrapper(uint8_t* cpuLut) { 54 void cudalbp_init_wrapper(uint8_t* cpuLut) {
openbr/plugins/cuda/cudapca.cpp
@@ -16,14 +16,23 @@ @@ -16,14 +16,23 @@
16 #include <iostream> 16 #include <iostream>
17 using namespace std; 17 using namespace std;
18 18
  19 +#include <QList>
  20 +
19 #include <Eigen/Dense> 21 #include <Eigen/Dense>
20 -#include <openbr/plugins/openbr_internal.h>  
21 22
  23 +#include <opencv2/opencv.hpp>
  24 +using namespace cv;
  25 +
  26 +#include <openbr/plugins/openbr_internal.h>
22 #include <openbr/core/common.h> 27 #include <openbr/core/common.h>
23 #include <openbr/core/eigenutils.h> 28 #include <openbr/core/eigenutils.h>
24 #include <openbr/core/opencvutils.h> 29 #include <openbr/core/opencvutils.h>
25 30
26 -#include "cudapca.hpp" 31 +namespace br { namespace cuda {
  32 + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
  33 + void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols);
  34 + void cudapca_projectwrapper(void* src, void** dst);
  35 +}}
27 36
28 namespace br 37 namespace br
29 { 38 {
@@ -71,14 +80,30 @@ private: @@ -71,14 +80,30 @@ private:
71 return (srcMap - mean).squaredNorm() - projMap.squaredNorm(); 80 return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
72 } 81 }
73 82
74 - void train(const TemplateList &trainingSet) 83 + void train(const TemplateList &cudaTrainingSet)
75 { 84 {
  85 + const int instances = cudaTrainingSet.size(); // get the number of training set instances
  86 + QList<Template> trainingQlist;
  87 + for(int i=0; i<instances; i++) {
  88 + Template currentTemplate = cudaTrainingSet[i];
  89 + void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
  90 + const void* cudaMemPtr = srcDataPtr[0];
  91 + int rows = *((int*)srcDataPtr[1]);
  92 + int cols = *((int*)srcDataPtr[2]);
  93 + int type = *((int*)srcDataPtr[3]);
  94 +
  95 + Mat mat = Mat(rows, cols, type);
  96 + br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
  97 + trainingQlist.append(Template(mat));
  98 + TemplateList trainingSet;
  99 + }
  100 + TemplateList trainingSet(trainingQlist);
  101 +
76 if (trainingSet.first().m().type() != CV_32FC1) 102 if (trainingSet.first().m().type() != CV_32FC1)
77 qFatal("Requires single channel 32-bit floating point matrices."); 103 qFatal("Requires single channel 32-bit floating point matrices.");
78 104
79 originalRows = trainingSet.first().m().rows; // get number of rows of first image 105 originalRows = trainingSet.first().m().rows; // get number of rows of first image
80 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image 106 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
81 - const int instances = trainingSet.size(); // get the number of training set instances  
82 107
83 // Map into 64-bit Eigen matrix 108 // Map into 64-bit Eigen matrix
84 Eigen::MatrixXd data(dimsIn, instances); // create a mat 109 Eigen::MatrixXd data(dimsIn, instances); // create a mat
@@ -90,10 +115,32 @@ private: @@ -90,10 +115,32 @@ private:
90 115
91 void project(const Template &src, Template &dst) const 116 void project(const Template &src, Template &dst) const
92 { 117 {
93 - dst = cv::Mat(1, keep, CV_32FC1); 118 +
  119 + void* const* srcDataPtr = src.m().ptr<void*>();
  120 + void* cudaMemPtr = srcDataPtr[0];
  121 + int rows = *((int*)srcDataPtr[1]);
  122 + int cols = *((int*)srcDataPtr[2]);
  123 + int type = *((int*)srcDataPtr[3]);
  124 +
  125 + if (type != CV_32FC1) {
  126 + cout << "ERR: Invalid image type" << endl;
  127 + return;
  128 + }
  129 +
  130 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  131 + void** dstDataPtr = dstMat.ptr<void*>();
  132 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
  133 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
  134 + dstDataPtr[3] = srcDataPtr[3];
  135 +
  136 + br::cuda::cudapca_projectwrapper(cudaMemPtr, &dstDataPtr[0]);
  137 +
  138 + dst = dstMat;
  139 +
  140 + //dst = cv::Mat(1, keep, CV_32FC1);
94 141
95 // perform the operation on the graphics card 142 // perform the operation on the graphics card
96 - cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>()); 143 + //cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
97 144
98 // Map Eigen into OpenCV 145 // Map Eigen into OpenCV
99 //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); 146 //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
openbr/plugins/cuda/cudapca.cu
@@ -7,8 +7,6 @@ using namespace std; @@ -7,8 +7,6 @@ using namespace std;
7 using namespace cv; 7 using namespace cv;
8 using namespace cv::gpu; 8 using namespace cv::gpu;
9 9
10 -#include "cudapca.hpp"  
11 -  
12 namespace br { namespace cuda { 10 namespace br { namespace cuda {
13 __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) { 11 __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) {
14 int rowInd = blockIdx.y*blockDim.y + threadIdx.y; 12 int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
@@ -77,116 +75,29 @@ namespace br { namespace cuda { @@ -77,116 +75,29 @@ namespace br { namespace cuda {
77 cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float)); 75 cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float));
78 } 76 }
79 77
80 - void cudapca_trainwrapper() {  
81 - /*  
82 - if (trainingSet[0].type() != CV_32FC1) {  
83 - std::cout << "ERR: Requires single 32-bit floating point matrix!";  
84 - return;  
85 - }  
86 -  
87 - cudaError_t status;  
88 -  
89 - const int numExamples = trainingSetSize;  
90 - int numPixels = trainingSet[0].rows * trainingSet[0].cols;  
91 -  
92 - // create a custom matrix  
93 - float* cudaDataPtr;  
94 - status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float));  
95 - if (status != cudaSuccess) {  
96 - std::cout << "ERR: Memory allocation" << std::endl;  
97 - return;  
98 - }  
99 -  
100 - // copy all the data to the graphics card  
101 - for (int i=0; i < numExamples; i++) {  
102 - status = cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr<float>(), numPixels*sizeof(float), cudaMemcpyHostToDevice);  
103 - if (status != cudaSuccess) {  
104 - std::cout << "ERR: Memcpy at index " << i << std::endl;  
105 - return;  
106 - }  
107 - }  
108 -  
109 - // start the core part of the algorithm  
110 - int numDimensions = numPixels;  
111 - const bool dominantEigenEstimation = (numDimensions > numExamples);  
112 -  
113 - // malloc and init mean  
114 - mean = new float[numDimensions];  
115 - for (int i=0; i < numDimensions; i++) {  
116 - mean[i] = 0;  
117 - }  
118 - float* cudaMeanPtr;  
119 - status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float));  
120 - if (status != cudaSuccess) {  
121 - std::cout << " ERR: Malloc of mean" << std::endl;  
122 - return;  
123 - }  
124 -  
125 - if (keep != 0) {  
126 - // compute the mean so we can subtract from data  
127 - for (int i=0; i < numExamples; i++) {  
128 - Mat& m = trainingSet[i];  
129 -  
130 - for (int j=0; j < numDimensions; j++) {  
131 - mean[j] += m.ptr<float>()[i*numDimensions + j];  
132 - }  
133 - }  
134 - for (int i=0; i < numDimensions; i++) {  
135 - mean[i] = mean[i] / numExamples;  
136 - }  
137 -  
138 - // copy mean over to graphics card  
139 - cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice);  
140 - if (status != cudaSuccess) {  
141 - std::cout << " ERR: Cpy of mean" << std::endl;  
142 - return;  
143 - }  
144 -  
145 - // set the thread dimensions and run the kernel  
146 - dim3 threadsPerBlock(64, 1);  
147 - dim3 numBlocks(numDimensions/threadsPerBlock.x + 1,  
148 - numExamples/threadsPerBlock.y + 1);  
149 -  
150 - subtractMean_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions);  
151 -  
152 - // calculate the covariance matrix using kernel  
153 - // malloc location for covariance matrix  
154 - float* cudaCovPtr;  
155 - status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float));  
156 - if (status != cudaSuccess) h  
157 - std::cout << " ERR: Cpy of mean" << std::endl;  
158 - return;  
159 - }  
160 -  
161 - // calculate the covariance matrix  
162 - threadsPerBlock = dim3(8, 8);  
163 - numBlocks = dim3(numExamples/threadsPerBlock.x + 1,  
164 - numExamples/threadsPerBlock.y + 1);  
165 - calculateCovariance_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions);  
166 -  
167 - // perform eigendecomposition  
168 - //std::cout << "Skipping eigendecomposition" << std::endl;  
169 - cusolverStatus_t cusolverStatus;  
170 - cusolverStatus = cusolverDnSgebrd(cusolverHandle,)  
171 - }  
172 - */ 78 + void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols) {
  79 + cudaMemcpy(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost);
173 } 80 }
174 81
175 - void cudapca_projectwrapper(float* src, float* dst) { 82 + void cudapca_projectwrapper(void* src, void** dst) {
176 // copy the image to the GPU 83 // copy the image to the GPU
177 - cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); 84 + //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
  85 +
  86 + cudaMalloc(dst, _evRows*_evCols*sizeof(float));
178 87
179 // subtract out the mean of the image (mean is 1xpixels in size) 88 // subtract out the mean of the image (mean is 1xpixels in size)
180 int threadsPerBlock = 64; 89 int threadsPerBlock = 64;
181 int numBlocks = _meanElems / threadsPerBlock + 1; 90 int numBlocks = _meanElems / threadsPerBlock + 1;
182 - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems); 91 + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
183 92
184 // perform the multiplication 93 // perform the multiplication
185 threadsPerBlock = 64; 94 threadsPerBlock = 64;
186 numBlocks = _evCols / threadsPerBlock + 1; 95 numBlocks = _evCols / threadsPerBlock + 1;
187 - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols); 96 + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
  97 +
  98 + //cudaFree(src); // TODO(colin): figure out why adding this free causes memory corruption...
188 99
189 // copy the data back to the CPU 100 // copy the data back to the CPU
190 - cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); 101 + //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
191 } 102 }
192 }} 103 }}
openbr/plugins/cuda/cudapca.hpp deleted
1 -#include <opencv2/opencv.hpp>  
2 -#include <opencv2/gpu/gpu.hpp>  
3 -  
4 -using namespace cv;  
5 -using namespace cv::gpu;  
6 -  
7 -namespace br { namespace cuda {  
8 - void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);  
9 - void cudapca_trainwrapper();  
10 -  
11 - void cudapca_projectwrapper(float* src, float* dst);  
12 -}}