Commit d025a5da1ba863560cc8bc9b8c955363f1f76ca1

Authored by DepthDeluxe
1 parent 7f5c5a5a

added streamlined CUDA execution, fixed memory problems

openbr/plugins/cuda/copyfrom.cpp
... ... @@ -22,8 +22,6 @@ namespace br
22 22 private:
23 23 void project(const Template &src, Template &dst) const
24 24 {
25   - cout << "CUDACopyFrom Start" << endl << endl << endl;
26   -
27 25 // pull the data back out of the Mat
28 26 void* const* dataPtr = src.m().ptr<void*>();
29 27 void* cudaMemPtr = dataPtr[0];
... ... @@ -44,8 +42,6 @@ private:
44 42 break;
45 43 }
46 44 dst = dstMat;
47   -
48   - cout << "CUDACopyFrom End" << endl;
49 45 }
50 46 };
51 47  
... ...
openbr/plugins/cuda/cudacvtfloat.cpp
... ... @@ -27,8 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
27 27 public:
28 28 void project(const Template &src, Template &dst) const
29 29 {
30   - cout << "CUDACvtFloat Start" << endl;
31   -
32 30 void* const* srcDataPtr = src.m().ptr<void*>();
33 31 void* srcMemPtr = srcDataPtr[0];
34 32 int rows = *((int*)srcDataPtr[1]);
... ... @@ -51,8 +49,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
51 49  
52 50 br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols);
53 51 dst = dstMat;
54   -
55   - cout << "CUDACvtFloat End" << endl;
56 52 }
57 53 };
58 54  
... ...
openbr/plugins/cuda/cudacvtfloat.cu
... ... @@ -31,6 +31,9 @@ namespace br { namespace cuda { namespace cudacvtfloat {
31 31 );
32 32  
33 33 kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols);
  34 +
  35 + // free the src memory since it is now in a newly allocated dst
  36 + cudaFree((void*)src);
34 37 }
35 38  
36 39 }}}
... ...
openbr/plugins/cuda/cudalbp.cpp
... ... @@ -166,8 +166,6 @@ class CUDALBPTransform : public UntrainableTransform
166 166 //matManager->release(a);
167 167 //matManager->release(b);
168 168  
169   - cout << "CUDALBP Start" << endl;
170   -
171 169 void* const* srcDataPtr = src.m().ptr<void*>();
172 170 void* cudaSrcPtr = srcDataPtr[0];
173 171 int rows = *((int*)srcDataPtr[1]);
... ... @@ -182,8 +180,6 @@ class CUDALBPTransform : public UntrainableTransform
182 180  
183 181 br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);
184 182 dst = dstMat;
185   -
186   - cout << "CUDALBP End" << endl;
187 183 }
188 184 };
189 185  
... ...
openbr/plugins/cuda/cudalbp.cu
... ... @@ -47,6 +47,8 @@ namespace br { namespace cuda {
47 47  
48 48 cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t));
49 49 cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  50 +
  51 + cudaFree(srcPtr);
50 52 }
51 53  
52 54 void cudalbp_init_wrapper(uint8_t* cpuLut) {
... ...
openbr/plugins/cuda/cudapca.cpp
... ... @@ -16,14 +16,23 @@
16 16 #include <iostream>
17 17 using namespace std;
18 18  
  19 +#include <QList>
  20 +
19 21 #include <Eigen/Dense>
20   -#include <openbr/plugins/openbr_internal.h>
21 22  
  23 +#include <opencv2/opencv.hpp>
  24 +using namespace cv;
  25 +
  26 +#include <openbr/plugins/openbr_internal.h>
22 27 #include <openbr/core/common.h>
23 28 #include <openbr/core/eigenutils.h>
24 29 #include <openbr/core/opencvutils.h>
25 30  
26   -#include "cudapca.hpp"
  31 +namespace br { namespace cuda {
  32 + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
  33 + void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols);
  34 + void cudapca_projectwrapper(void* src, void** dst);
  35 +}}
27 36  
28 37 namespace br
29 38 {
... ... @@ -71,14 +80,30 @@ private:
71 80 return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
72 81 }
73 82  
74   - void train(const TemplateList &trainingSet)
  83 + void train(const TemplateList &cudaTrainingSet)
75 84 {
  85 + const int instances = cudaTrainingSet.size(); // get the number of training set instances
  86 + QList<Template> trainingQlist;
  87 + for(int i=0; i<instances; i++) {
  88 + Template currentTemplate = cudaTrainingSet[i];
  89 + void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
  90 + const void* cudaMemPtr = srcDataPtr[0];
  91 + int rows = *((int*)srcDataPtr[1]);
  92 + int cols = *((int*)srcDataPtr[2]);
  93 + int type = *((int*)srcDataPtr[3]);
  94 +
  95 + Mat mat = Mat(rows, cols, type);
  96 + br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
  97 + trainingQlist.append(Template(mat));
  98 + TemplateList trainingSet;
  99 + }
  100 + TemplateList trainingSet(trainingQlist);
  101 +
76 102 if (trainingSet.first().m().type() != CV_32FC1)
77 103 qFatal("Requires single channel 32-bit floating point matrices.");
78 104  
79 105 originalRows = trainingSet.first().m().rows; // get number of rows of first image
80 106 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
81   - const int instances = trainingSet.size(); // get the number of training set instances
82 107  
83 108 // Map into 64-bit Eigen matrix
84 109 Eigen::MatrixXd data(dimsIn, instances); // create a mat
... ... @@ -90,10 +115,32 @@ private:
90 115  
91 116 void project(const Template &src, Template &dst) const
92 117 {
93   - dst = cv::Mat(1, keep, CV_32FC1);
  118 +
  119 + void* const* srcDataPtr = src.m().ptr<void*>();
  120 + void* cudaMemPtr = srcDataPtr[0];
  121 + int rows = *((int*)srcDataPtr[1]);
  122 + int cols = *((int*)srcDataPtr[2]);
  123 + int type = *((int*)srcDataPtr[3]);
  124 +
  125 + if (type != CV_32FC1) {
  126 + cout << "ERR: Invalid image type" << endl;
  127 + return;
  128 + }
  129 +
  130 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  131 + void** dstDataPtr = dstMat.ptr<void*>();
  132 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
  133 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
  134 + dstDataPtr[3] = srcDataPtr[3];
  135 +
  136 + br::cuda::cudapca_projectwrapper(cudaMemPtr, &dstDataPtr[0]);
  137 +
  138 + dst = dstMat;
  139 +
  140 + //dst = cv::Mat(1, keep, CV_32FC1);
94 141  
95 142 // perform the operation on the graphics card
96   - cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
  143 + //cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
97 144  
98 145 // Map Eigen into OpenCV
99 146 //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
... ...
openbr/plugins/cuda/cudapca.cu
... ... @@ -7,8 +7,6 @@ using namespace std;
7 7 using namespace cv;
8 8 using namespace cv::gpu;
9 9  
10   -#include "cudapca.hpp"
11   -
12 10 namespace br { namespace cuda {
13 11 __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) {
14 12 int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
... ... @@ -77,116 +75,29 @@ namespace br { namespace cuda {
77 75 cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float));
78 76 }
79 77  
80   - void cudapca_trainwrapper() {
81   - /*
82   - if (trainingSet[0].type() != CV_32FC1) {
83   - std::cout << "ERR: Requires single 32-bit floating point matrix!";
84   - return;
85   - }
86   -
87   - cudaError_t status;
88   -
89   - const int numExamples = trainingSetSize;
90   - int numPixels = trainingSet[0].rows * trainingSet[0].cols;
91   -
92   - // create a custom matrix
93   - float* cudaDataPtr;
94   - status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float));
95   - if (status != cudaSuccess) {
96   - std::cout << "ERR: Memory allocation" << std::endl;
97   - return;
98   - }
99   -
100   - // copy all the data to the graphics card
101   - for (int i=0; i < numExamples; i++) {
102   - status = cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr<float>(), numPixels*sizeof(float), cudaMemcpyHostToDevice);
103   - if (status != cudaSuccess) {
104   - std::cout << "ERR: Memcpy at index " << i << std::endl;
105   - return;
106   - }
107   - }
108   -
109   - // start the core part of the algorithm
110   - int numDimensions = numPixels;
111   - const bool dominantEigenEstimation = (numDimensions > numExamples);
112   -
113   - // malloc and init mean
114   - mean = new float[numDimensions];
115   - for (int i=0; i < numDimensions; i++) {
116   - mean[i] = 0;
117   - }
118   - float* cudaMeanPtr;
119   - status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float));
120   - if (status != cudaSuccess) {
121   - std::cout << " ERR: Malloc of mean" << std::endl;
122   - return;
123   - }
124   -
125   - if (keep != 0) {
126   - // compute the mean so we can subtract from data
127   - for (int i=0; i < numExamples; i++) {
128   - Mat& m = trainingSet[i];
129   -
130   - for (int j=0; j < numDimensions; j++) {
131   - mean[j] += m.ptr<float>()[i*numDimensions + j];
132   - }
133   - }
134   - for (int i=0; i < numDimensions; i++) {
135   - mean[i] = mean[i] / numExamples;
136   - }
137   -
138   - // copy mean over to graphics card
139   - cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice);
140   - if (status != cudaSuccess) {
141   - std::cout << " ERR: Cpy of mean" << std::endl;
142   - return;
143   - }
144   -
145   - // set the thread dimensions and run the kernel
146   - dim3 threadsPerBlock(64, 1);
147   - dim3 numBlocks(numDimensions/threadsPerBlock.x + 1,
148   - numExamples/threadsPerBlock.y + 1);
149   -
150   - subtractMean_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions);
151   -
152   - // calculate the covariance matrix using kernel
153   - // malloc location for covariance matrix
154   - float* cudaCovPtr;
155   - status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float));
156   - if (status != cudaSuccess) h
157   - std::cout << " ERR: Cpy of mean" << std::endl;
158   - return;
159   - }
160   -
161   - // calculate the covariance matrix
162   - threadsPerBlock = dim3(8, 8);
163   - numBlocks = dim3(numExamples/threadsPerBlock.x + 1,
164   - numExamples/threadsPerBlock.y + 1);
165   - calculateCovariance_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions);
166   -
167   - // perform eigendecomposition
168   - //std::cout << "Skipping eigendecomposition" << std::endl;
169   - cusolverStatus_t cusolverStatus;
170   - cusolverStatus = cusolverDnSgebrd(cusolverHandle,)
171   - }
172   - */
  78 + void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols) {
  79 + cudaMemcpy(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost);
173 80 }
174 81  
175   - void cudapca_projectwrapper(float* src, float* dst) {
  82 + void cudapca_projectwrapper(void* src, void** dst) {
176 83 // copy the image to the GPU
177   - cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
  84 + //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
  85 +
  86 + cudaMalloc(dst, _evRows*_evCols*sizeof(float));
178 87  
179 88 // subtract out the mean of the image (mean is 1xpixels in size)
180 89 int threadsPerBlock = 64;
181 90 int numBlocks = _meanElems / threadsPerBlock + 1;
182   - cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, cudaMeanPtr, _meanElems);
  91 + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
183 92  
184 93 // perform the multiplication
185 94 threadsPerBlock = 64;
186 95 numBlocks = _evCols / threadsPerBlock + 1;
187   - cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(_cudaSrcPtr, _cudaDstPtr, cudaEvPtr, _evRows, _evCols);
  96 + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
  97 +
  98 + //cudaFree(src); // TODO(colin): figure out why adding this free causes memory corruption...
188 99  
189 100 // copy the data back to the CPU
190   - cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
  101 + //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
191 102 }
192 103 }}
... ...
openbr/plugins/cuda/cudapca.hpp deleted
1   -#include <opencv2/opencv.hpp>
2   -#include <opencv2/gpu/gpu.hpp>
3   -
4   -using namespace cv;
5   -using namespace cv::gpu;
6   -
7   -namespace br { namespace cuda {
8   - void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
9   - void cudapca_trainwrapper();
10   -
11   - void cudapca_projectwrapper(float* src, float* dst);
12   -}}