Commit 59d38ff01fcfeb96b75fd610f8e180f11665788b

Authored by Josh Klontz
Committed by GitHub
2 parents a91c55a5 a7f8896e

Merge pull request #490 from DepthDeluxe/master

CUDA-accelerated PCA training and improved PCA projection speeds
openbr/plugins/cuda/cudadefines.hpp
@@ -19,6 +19,9 @@ @@ -19,6 +19,9 @@
19 using namespace std; 19 using namespace std;
20 #include <pthread.h> 20 #include <pthread.h>
21 21
  22 +#include <cublas_v2.h>
  23 +
  24 +
22 #define CUDA_SAFE_FREE(cudaPtr, errPtr) \ 25 #define CUDA_SAFE_FREE(cudaPtr, errPtr) \
23 /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \ 26 /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \
24 *errPtr = cudaFree(cudaPtr); \ 27 *errPtr = cudaFree(cudaPtr); \
@@ -48,3 +51,53 @@ using namespace std; @@ -48,3 +51,53 @@ using namespace std;
48 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ 51 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
49 throw 0; \ 52 throw 0; \
50 } 53 }
  54 +
  55 +#define CUBLAS_ERROR_CHECK(error) { \
  56 + switch (error) { \
  57 + case CUBLAS_STATUS_SUCCESS: \
  58 + break; \
  59 + case CUBLAS_STATUS_NOT_INITIALIZED: \
  60 + cout << "CUBLAS_STATUS_NOT_INITIALIZED" << endl; \
  61 + break; \
  62 + case CUBLAS_STATUS_ALLOC_FAILED: \
  63 + cout << "CUBLAS_STATUS_ALLOC_FAILED" << endl; \
  64 + break; \
  65 + case CUBLAS_STATUS_INVALID_VALUE: \
  66 + cout << "CUBLAS_STATUS_INVALID_VALUE" << endl; \
  67 + break; \
  68 + case CUBLAS_STATUS_ARCH_MISMATCH: \
  69 + cout << "CUBLAS_STATUS_ARCH_MISMATCH" << endl; \
  70 + break; \
  71 + case CUBLAS_STATUS_MAPPING_ERROR: \
  72 + cout << "CUBLAS_STATUS_MAPPING_ERROR" << endl; \
  73 + break; \
  74 + case CUBLAS_STATUS_EXECUTION_FAILED: \
  75 + cout << "CUBLAS_STATUS_EXECUTION_FAILED" << endl; \
  76 + break; \
  77 + case CUBLAS_STATUS_INTERNAL_ERROR: \
  78 + cout << "CUBLAS_STATUS_INTERNAL_ERROR" << endl; \
  79 + break; \
  80 + default: \
  81 + cout << "<unknown>: " << error << endl; \
  82 + break; \
  83 + } \
  84 +}
  85 +
  86 +#define CUSOLVER_ERROR_CHECK(error) { \
  87 + switch(error) { \
  88 + case CUSOLVER_STATUS_SUCCESS: \
  89 + break; \
  90 + case CUSOLVER_STATUS_NOT_INITIALIZED: \
  91 + cout << "CUSOLVER_STATUS_NOT_INITIALIZED" << endl; \
  92 + break; \
  93 + case CUSOLVER_STATUS_ALLOC_FAILED: \
  94 + cout << "CUSOLVER_STATUS_ALLOC_FAILED" << endl; \
  95 + break; \
  96 + case CUSOLVER_STATUS_ARCH_MISMATCH: \
  97 + cout << "CUSOLVER_STATUS_ARCH_MISMATCH" << endl; \
  98 + break; \
  99 + default: \
  100 + cout << "<unknown>: " << error << endl; \
  101 + break; \
  102 + } \
  103 +}
openbr/plugins/cuda/cudapca.cpp
@@ -30,11 +30,14 @@ using namespace cv; @@ -30,11 +30,14 @@ using namespace cv;
30 #include <openbr/core/eigenutils.h> 30 #include <openbr/core/eigenutils.h>
31 #include <openbr/core/opencvutils.h> 31 #include <openbr/core/opencvutils.h>
32 32
33 -// definitions from the CUDA source file 33 +#include <cuda_runtime.h>
  34 +#include <cublas_v2.h>
  35 +#include <cusolverDn.h>
  36 +#include "cudadefines.hpp"
  37 +
34 namespace br { namespace cuda { namespace pca { 38 namespace br { namespace cuda { namespace pca {
35 - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);  
36 - void trainWrapper(void* cudaSrc, float* dst, int rows, int cols);  
37 - void wrapper(void* src, void** dst, int imgRows, int imgCols); 39 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems);
  40 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems);
38 }}} 41 }}}
39 42
40 namespace br 43 namespace br
@@ -61,13 +64,26 @@ protected: @@ -61,13 +64,26 @@ protected:
61 BR_PROPERTY(int, drop, 0) 64 BR_PROPERTY(int, drop, 0)
62 BR_PROPERTY(bool, whiten, false) 65 BR_PROPERTY(bool, whiten, false)
63 66
64 - Eigen::VectorXf mean, eVals; 67 + Eigen::VectorXf mean;
  68 + Eigen::VectorXf eVals;
65 Eigen::MatrixXf eVecs; 69 Eigen::MatrixXf eVecs;
66 70
67 - int originalRows; 71 + cublasHandle_t cublasHandle;
  72 + float* cudaMeanPtr; // holds the "keep" long vector
  73 + float* cudaEvPtr; // holds all the eigenvectors
68 74
69 public: 75 public:
70 - CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {} 76 + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {
  77 + // try to initialize CUBLAS
  78 + cublasStatus_t status;
  79 + status = cublasCreate(&cublasHandle);
  80 + CUBLAS_ERROR_CHECK(status);
  81 + }
  82 +
  83 + ~CUDAPCATransform() {
  84 + // tear down CUBLAS
  85 + cublasDestroy(cublasHandle);
  86 + }
71 87
72 private: 88 private:
73 double residualReconstructionError(const Template &src) const 89 double residualReconstructionError(const Template &src) const
@@ -83,45 +99,38 @@ private: @@ -83,45 +99,38 @@ private:
83 99
84 void train(const TemplateList &cudaTrainingSet) 100 void train(const TemplateList &cudaTrainingSet)
85 { 101 {
86 - // copy the data back from the graphics card so the training can be done on the CPU  
87 - const int instances = cudaTrainingSet.size(); // get the number of training set instances  
88 - QList<Template> trainingQlist;  
89 - for(int i=0; i<instances; i++) {  
90 - Template currentTemplate = cudaTrainingSet[i];  
91 - void* const* srcDataPtr = currentTemplate.m().ptr<void*>();  
92 - void* cudaMemPtr = srcDataPtr[0];  
93 - int rows = *((int*)srcDataPtr[1]);  
94 - int cols = *((int*)srcDataPtr[2]);  
95 - int type = *((int*)srcDataPtr[3]);  
96 -  
97 - if (type != CV_32FC1) {  
98 - qFatal("Requires single channel 32-bit floating point matrices.");  
99 - }  
100 -  
101 - Mat mat = Mat(rows, cols, type);  
102 - br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);  
103 - trainingQlist.append(Template(mat));  
104 - }  
105 -  
106 - // assemble a TemplateList from the list of data  
107 - TemplateList trainingSet(trainingQlist);  
108 -  
109 -  
110 - originalRows = trainingSet.first().m().rows; // get number of rows of first image  
111 - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image 102 + cublasStatus_t cublasStatus;
  103 + cudaError_t cudaError;
  104 +
  105 + // put all the data into a single matrix to perform PCA
  106 + const int instances = cudaTrainingSet.size();
  107 + const int dimsIn = *(int*)cudaTrainingSet.first().m().ptr<void*>()[1]
  108 + * *(int*)cudaTrainingSet.first().m().ptr<void*>()[2];
  109 +
  110 + // copy the data over
  111 + double* cudaDataPtr;
  112 + CUDA_SAFE_MALLOC(&cudaDataPtr, instances*dimsIn*sizeof(cudaDataPtr[0]), &cudaError);
  113 + for (int i=0; i < instances; i++) {
  114 + br::cuda::pca::castFloatToDouble(
  115 + (float*)(cudaTrainingSet[i].m().ptr<void*>()[0]),
  116 + 1,
  117 + cudaDataPtr+i*dimsIn,
  118 + 1,
  119 + dimsIn
  120 + );
  121 + }
112 122
113 - // Map into 64-bit Eigen matrix  
114 - Eigen::MatrixXd data(dimsIn, instances); // create a mat  
115 - for (int i=0; i<instances; i++) {  
116 - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();  
117 - } 123 + trainCore(cudaDataPtr, dimsIn, instances);
118 124
119 - trainCore(data); 125 + CUDA_SAFE_FREE(cudaDataPtr, &cudaError);
120 } 126 }
121 127
122 void project(const Template &src, Template &dst) const 128 void project(const Template &src, Template &dst) const
123 { 129 {
  130 + cudaError_t cudaError;
  131 +
124 void* const* srcDataPtr = src.m().ptr<void*>(); 132 void* const* srcDataPtr = src.m().ptr<void*>();
  133 + float* srcGpuMatPtr = (float*)srcDataPtr[0];
125 int rows = *((int*)srcDataPtr[1]); 134 int rows = *((int*)srcDataPtr[1]);
126 int cols = *((int*)srcDataPtr[2]); 135 int cols = *((int*)srcDataPtr[2]);
127 int type = *((int*)srcDataPtr[3]); 136 int type = *((int*)srcDataPtr[3]);
@@ -131,137 +140,416 @@ private: @@ -131,137 +140,416 @@ private:
131 throw 0; 140 throw 0;
132 } 141 }
133 142
  143 + // save the destination rows
  144 + int dstRows = (int)keep;
  145 +
134 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); 146 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
135 void** dstDataPtr = dstMat.ptr<void*>(); 147 void** dstDataPtr = dstMat.ptr<void*>();
  148 + float** dstGpuMatPtrPtr = (float**)dstDataPtr;
136 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1; 149 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
137 - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; 150 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = dstRows;
138 dstDataPtr[3] = srcDataPtr[3]; 151 dstDataPtr[3] = srcDataPtr[3];
139 152
140 - cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);  
141 153
  154 + // allocate the memory and set to zero
  155 + //cout << "Allocating destination memory" << endl;
  156 + cublasStatus_t status;
  157 + cudaMalloc(dstGpuMatPtrPtr, dstRows*sizeof(float));
  158 + cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float));
  159 +
  160 + {
  161 + float negativeOne = -1.0f;
  162 + status = cublasSaxpy(
  163 + cublasHandle, // handle
  164 + dstRows, // vector length
  165 + &negativeOne, // alpha (-1)
  166 + cudaMeanPtr, // mean
  167 + 1, // stride
  168 + srcGpuMatPtr, // y, the source
  169 + 1 // stride
  170 + );
  171 + CUBLAS_ERROR_CHECK(status);
  172 + }
  173 +
  174 + {
  175 + float one = 1.0f;
  176 + float zero = 0.0f;
  177 + status = cublasSgemv(
  178 + cublasHandle, // handle
  179 + CUBLAS_OP_T, // transposed matrix-vector multiplication
  180 + eVecs.rows(), // # rows
  181 + eVecs.cols(), // # cols
  182 + &one, // alpha (1)
  183 + cudaEvPtr, // pointer to the matrix
  184 + eVecs.rows(), // leading dimension of matrix
  185 + srcGpuMatPtr, // vector for multiplication
  186 + 1, // stride (1)
  187 + &zero, // beta (0)
  188 + *dstGpuMatPtrPtr, // vector to store the result
  189 + 1 // stride (1)
  190 + );
  191 + CUBLAS_ERROR_CHECK(status);
  192 + }
  193 +
  194 + //cout << "Saving result" << endl;
142 dst = dstMat; 195 dst = dstMat;
  196 + CUDA_SAFE_FREE(srcGpuMatPtr, &cudaError);
143 } 197 }
144 198
145 void store(QDataStream &stream) const 199 void store(QDataStream &stream) const
146 { 200 {
147 - stream << keep << drop << whiten << originalRows << mean << eVals << eVecs; 201 + stream << keep << drop << whiten << mean << eVecs;
148 } 202 }
149 203
150 void load(QDataStream &stream) 204 void load(QDataStream &stream)
151 { 205 {
152 - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; 206 + stream >> keep >> drop >> whiten >> mean >> eVecs;
  207 +
  208 + //cout << "Starting load process" << endl;
  209 +
  210 + cudaError_t cudaError;
  211 + cublasStatus_t cublasStatus;
  212 + CUDA_SAFE_MALLOC(&cudaMeanPtr, mean.rows()*mean.cols()*sizeof(float), &cudaError);
  213 + CUDA_SAFE_MALLOC(&cudaEvPtr, eVecs.rows()*eVecs.cols()*sizeof(float), &cudaError);
  214 +
  215 + //cout << "Setting vector" << endl;
  216 + // load the mean vector into GPU memory
  217 + cublasStatus = cublasSetVector(
  218 + mean.rows()*mean.cols(),
  219 + sizeof(float),
  220 + mean.data(),
  221 + 1,
  222 + cudaMeanPtr,
  223 + 1
  224 + );
  225 + CUBLAS_ERROR_CHECK(cublasStatus);
  226 +
  227 + //cout << "Setting the matrix" << endl;
  228 + // load the eigenvector matrix into GPU memory
  229 + cublasStatus = cublasSetMatrix(
  230 + eVecs.rows(),
  231 + eVecs.cols(),
  232 + sizeof(float),
  233 + eVecs.data(),
  234 + eVecs.rows(),
  235 + cudaEvPtr,
  236 + eVecs.rows()
  237 + );
  238 + CUBLAS_ERROR_CHECK(cublasStatus);
  239 + }
153 240
154 - // serialize the eigenvectors  
155 - float* evBuffer = new float[eVecs.rows() * eVecs.cols()];  
156 - for (int i=0; i < eVecs.rows(); i++) {  
157 - for (int j=0; j < eVecs.cols(); j++) {  
158 - evBuffer[i*eVecs.cols() + j] = eVecs(i, j);  
159 - }  
160 - } 241 +protected:
  242 + void trainCore(double* cudaDataPtr, int dimsIn, int instances) {
  243 + cudaError_t cudaError;
  244 +
  245 + const bool dominantEigenEstimation = (dimsIn > instances);
  246 +
  247 + Eigen::MatrixXd allEVals, allEVecs;
161 248
162 - // serialize the mean  
163 - float* meanBuffer = new float[mean.rows() * mean.cols()];  
164 - for (int i=0; i < mean.rows(); i++) {  
165 - for (int j=0; j < mean.cols(); j++) {  
166 - meanBuffer[i*mean.cols() + j] = mean(i, j); 249 + // allocate the eigenvectors
  250 + if (dominantEigenEstimation) {
  251 + allEVals = Eigen::MatrixXd(instances, 1);
  252 + allEVecs = Eigen::MatrixXd(dimsIn, instances);
  253 + } else {
  254 + allEVals = Eigen::MatrixXd(dimsIn, 1);
  255 + allEVecs = Eigen::MatrixXd(dimsIn, dimsIn);
  256 + }
  257 +
  258 + if (keep != 0) {
  259 + performCovarianceSVD(cudaDataPtr, dimsIn, instances, allEVals, allEVecs);
  260 + } else {
  261 + // null case
  262 + mean = Eigen::VectorXf::Zero(dimsIn);
  263 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  264 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  265 + }
  266 +
  267 + // *****************
  268 + // We have now found the eigenvalues and eigenvectors
  269 + // *****************
  270 +
  271 + if (keep <= 0) {
  272 + keep = dimsIn - drop;
  273 + } else if (keep < 1) {
  274 + // Keep eigenvectors that retain a certain energy percentage.
  275 + const double totalEnergy = allEVals.sum();
  276 + if (totalEnergy == 0) {
  277 + keep = 0;
  278 + } else {
  279 + double currentEnergy = 0;
  280 + int i=0;
  281 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  282 + currentEnergy += allEVals(i);
  283 + i++;
  284 + }
  285 + keep = i - drop;
  286 + }
  287 + } else {
  288 + if (keep + drop > allEVals.rows()) {
  289 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  290 + keep = allEVals.rows() - drop;
167 } 291 }
168 - } 292 + }
169 293
170 - // call the wrapper function  
171 - cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); 294 + // Keep highest energy vectors
  295 + eVals = Eigen::VectorXf((int)keep, 1);
  296 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  297 + for (int i=0; i<keep; i++) {
  298 + int index = i+drop;
  299 + eVals(i) = allEVals(index);
  300 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  301 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  302 + }
172 303
173 - delete evBuffer;  
174 - delete meanBuffer; 304 + // Debug output
  305 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
175 } 306 }
176 307
177 -protected:  
178 - void trainCore(Eigen::MatrixXd data)  
179 - {  
180 - int dimsIn = data.rows();  
181 - int instances = data.cols();  
182 - const bool dominantEigenEstimation = (dimsIn > instances);  
183 -  
184 - Eigen::MatrixXd allEVals, allEVecs;  
185 - if (keep != 0) {  
186 - // Compute and remove mean  
187 - mean = Eigen::VectorXf(dimsIn);  
188 - for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;  
189 - for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);  
190 -  
191 - // Calculate covariance matrix  
192 - Eigen::MatrixXd cov;  
193 - if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);  
194 - else cov = data * data.transpose() / (instances-1.0);  
195 -  
196 - // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.  
197 - Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);  
198 - allEVals = eSolver.eigenvalues();  
199 - allEVecs = eSolver.eigenvectors();  
200 - if (dominantEigenEstimation) allEVecs = data * allEVecs;  
201 - } else {  
202 - // Null case  
203 - mean = Eigen::VectorXf::Zero(dimsIn);  
204 - allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);  
205 - allEVals = Eigen::VectorXd::Ones(dimsIn);  
206 - }  
207 -  
208 - if (keep <= 0) {  
209 - keep = dimsIn - drop;  
210 - } else if (keep < 1) {  
211 - // Keep eigenvectors that retain a certain energy percentage.  
212 - const double totalEnergy = allEVals.sum();  
213 - if (totalEnergy == 0) {  
214 - keep = 0;  
215 - } else {  
216 - double currentEnergy = 0;  
217 - int i=0;  
218 - while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {  
219 - currentEnergy += allEVals(allEVals.rows()-(i+1));  
220 - i++;  
221 - }  
222 - keep = i - drop;  
223 - }  
224 - } else {  
225 - if (keep + drop > allEVals.rows()) {  
226 - qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());  
227 - keep = allEVals.rows() - drop;  
228 - }  
229 - }  
230 -  
231 - // Keep highest energy vectors  
232 - eVals = Eigen::VectorXf((int)keep, 1);  
233 - eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);  
234 - for (int i=0; i<keep; i++) {  
235 - int index = allEVals.rows()-(i+drop+1);  
236 - eVals(i) = allEVals(index);  
237 - eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();  
238 - if (whiten) eVecs.col(i) /= sqrt(eVals(i));  
239 - }  
240 -  
241 - // Debug output  
242 - if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;  
243 - } 308 + // computes the covariance matrix and then pulls the eigenvalues+eigenvectors
  309 + // out of it using SVD of a symmetric matrix
  310 + void performCovarianceSVD(double* cudaDataPtr, int dimsIn, int instances, Eigen::MatrixXd& allEVals, Eigen::MatrixXd& allEVecs) {
  311 + cudaError_t cudaError;
  312 +
  313 + const bool dominantEigenEstimation = (dimsIn > instances);
  314 +
  315 + // used for temporary storage
  316 + Eigen::VectorXd meanDouble(dimsIn);
  317 +
  318 + // compute the mean
  319 + for (int i=0; i < dimsIn; i++) {
  320 + cublasDasum(
  321 + cublasHandle,
  322 + instances,
  323 + cudaDataPtr+i,
  324 + dimsIn,
  325 + meanDouble.data()+i
  326 + );
  327 + }
244 328
245 - void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const  
246 - {  
247 - const int originalCols = mean.rows() / originalRows;  
248 -  
249 - { // Write out mean image  
250 - cv::Mat out(originalRows, originalCols, CV_32FC1);  
251 - Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);  
252 - outMap = mean.col(0);  
253 - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");  
254 - }  
255 -  
256 - // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.  
257 - for (int k=0; k<(int)allEVals.size(); k++) {  
258 - if ((k < 8) || (k >= (int)allEVals.size()-16)) {  
259 - cv::Mat out(originalRows, originalCols, CV_64FC1);  
260 - Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);  
261 - outMap = allEVecs.col(k);  
262 - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");  
263 - }  
264 - } 329 + // put data back on GPU for further processing
  330 + double* cudaMeanDoublePtr;
  331 + CUDA_SAFE_MALLOC(&cudaMeanDoublePtr, dimsIn*sizeof(cudaMeanDoublePtr[0]), &cudaError);
  332 + cublasSetVector(
  333 + dimsIn,
  334 + sizeof(cudaMeanDoublePtr[0]),
  335 + meanDouble.data(),
  336 + 1,
  337 + cudaMeanDoublePtr,
  338 + 1
  339 + );
  340 +
  341 + // scale to calculate average
  342 + {
  343 + double scaleFactor = 1.0/(double)instances;
  344 + cublasDscal(
  345 + cublasHandle,
  346 + dimsIn,
  347 + &scaleFactor,
  348 + cudaMeanDoublePtr,
  349 + 1
  350 + );
  351 + }
  352 +
  353 + // subtract mean from data
  354 + for (int i=0; i < instances; i++) {
  355 + double negativeOne = -1.0;
  356 + cublasDaxpy(
  357 + cublasHandle,
  358 + dimsIn,
  359 + &negativeOne,
  360 + cudaMeanDoublePtr,
  361 + 1,
  362 + cudaDataPtr+i*dimsIn,
  363 + 1
  364 + );
  365 + }
  366 +
  367 + // convert to float form and copy the data back
  368 + CUDA_SAFE_MALLOC(&cudaMeanPtr, dimsIn*sizeof(cudaMeanPtr[0]), &cudaError);
  369 + br::cuda::pca::castDoubleToFloat(cudaMeanDoublePtr, 1, cudaMeanPtr, 1, dimsIn);
  370 +
  371 + // copy the data back
  372 + mean = Eigen::VectorXf(dimsIn);
  373 + cublasGetVector(
  374 + dimsIn,
  375 + sizeof(cudaMeanPtr[0]),
  376 + cudaMeanPtr,
  377 + 1,
  378 + mean.data(),
  379 + 1
  380 + );
  381 +
  382 + // free up the memory
  383 + CUDA_SAFE_FREE(cudaMeanDoublePtr, &cudaError);
  384 + CUDA_SAFE_FREE(cudaMeanPtr, &cudaError);
  385 +
  386 + // allocate space for the covariance matrix
  387 + double* cudaCovariancePtr;
  388 + int covRows = allEVals.rows();
  389 + CUDA_SAFE_MALLOC(&cudaCovariancePtr, covRows*covRows*sizeof(cudaCovariancePtr[0]), &cudaError);
  390 +
  391 + // compute the covariance matrix
  392 + if (dominantEigenEstimation) {
  393 + // cov = data.transpose() * data / (instances-1.0);
  394 + const double scaleFactor = 1.0/(instances-1.0);
  395 + const double zero = 0.0;
  396 + cublasDgemm(
  397 + cublasHandle,
  398 + CUBLAS_OP_T,
  399 + CUBLAS_OP_N,
  400 + instances,
  401 + instances,
  402 + dimsIn,
  403 + &scaleFactor,
  404 + cudaDataPtr,
  405 + dimsIn,
  406 + cudaDataPtr,
  407 + dimsIn,
  408 + &zero,
  409 + cudaCovariancePtr,
  410 + covRows
  411 + );
  412 + } else {
  413 + // cov = data * data.transpose() / (instances-1.0);
  414 + const double scaleFactor = 1.0/(instances-1.0);
  415 + const double zero = 0.0;
  416 + cublasDgemm(
  417 + cublasHandle,
  418 + CUBLAS_OP_N,
  419 + CUBLAS_OP_T,
  420 + dimsIn,
  421 + dimsIn,
  422 + instances,
  423 + &scaleFactor,
  424 + cudaDataPtr,
  425 + dimsIn,
  426 + cudaDataPtr,
  427 + dimsIn,
  428 + &zero,
  429 + cudaCovariancePtr,
  430 + covRows
  431 + );
  432 + }
  433 +
  434 + cusolverDnHandle_t cusolverHandle;
  435 + cusolverStatus_t cusolverStatus;
  436 + cusolverDnCreate(&cusolverHandle);
  437 +
  438 + // allocate appropriate working space
  439 + int svdLWork;
  440 + cusolverDnDgesvd_bufferSize(
  441 + cusolverHandle,
  442 + covRows,
  443 + covRows,
  444 + &svdLWork
  445 + );
  446 + double* cudaSvdWork;
  447 + CUDA_SAFE_MALLOC(&cudaSvdWork, svdLWork*sizeof(cudaSvdWork[0]), &cudaError);
  448 +
  449 + double* cudaUPtr;
  450 + CUDA_SAFE_MALLOC(&cudaUPtr, covRows*covRows*sizeof(cudaUPtr[0]), &cudaError);
  451 + double* cudaSPtr;
  452 + CUDA_SAFE_MALLOC(&cudaSPtr, covRows*sizeof(cudaSPtr[0]), &cudaError);
  453 + double* cudaVTPtr;
  454 + CUDA_SAFE_MALLOC(&cudaVTPtr, covRows*covRows*sizeof(cudaVTPtr[0]), &cudaError);
  455 +
  456 + int* cudaSvdDevInfoPtr;
  457 + CUDA_SAFE_MALLOC(&cudaSvdDevInfoPtr, sizeof(*cudaSvdDevInfoPtr), &cudaError);
  458 + int svdDevInfo;
  459 +
  460 + // perform SVD on an n x m matrix, in this case the matrix is the covariance
  461 + // matrix and is symmetric, meaning the SVD will calculate the eigenvalues
  462 + // and eigenvectors for us.
  463 + cusolverStatus = cusolverDnDgesvd(
  464 + cusolverHandle,
  465 + 'A', // all columns of unitary matrix
  466 + 'A', // all columns of array VT
  467 + covRows, // m
  468 + covRows, // n
  469 + cudaCovariancePtr, // decomposing the covariance matrix
  470 + covRows, // lda
  471 + cudaSPtr, // holds S
  472 + cudaUPtr, // holds U
  473 + covRows, // ldu
  474 + cudaVTPtr, // holds VT
  475 + covRows, // ldvt
  476 + cudaSvdWork, // work buffer ptr
  477 + svdLWork, // length of the work buffer
  478 + NULL, // rwork, not used for real data types
  479 + cudaSvdDevInfoPtr // devInfo pointer
  480 + );
  481 + CUSOLVER_ERROR_CHECK(cusolverStatus);
  482 +
  483 + // get the eigenvalues and free memory
  484 + cublasGetVector(
  485 + covRows,
  486 + sizeof(cudaSPtr[0]),
  487 + cudaSPtr,
  488 + 1,
  489 + allEVals.data(),
  490 + 1
  491 + );
  492 + CUDA_SAFE_FREE(cudaSvdWork, &cudaError);
  493 + CUDA_SAFE_FREE(cudaSPtr, &cudaError);
  494 + CUDA_SAFE_FREE(cudaVTPtr, &cudaError);
  495 + CUDA_SAFE_FREE(cudaSvdDevInfoPtr, &cudaError);
  496 +
  497 + // if this is a dominant eigen estimation, then perform matrix multiplication again
  498 + // if (dominantEigenEstimation) allEVecs = data * allEVecs;
  499 + if (dominantEigenEstimation) {
  500 + double* cudaMultedAllEVecs;
  501 + CUDA_SAFE_MALLOC(&cudaMultedAllEVecs, dimsIn*instances*sizeof(cudaMultedAllEVecs[0]), &cudaError);
  502 + const double one = 1.0;
  503 + const double zero = 0;
  504 +
  505 + cublasDgemm(
  506 + cublasHandle, // handle
  507 + CUBLAS_OP_N, // transa
  508 + CUBLAS_OP_N, // transb
  509 + dimsIn, // m
  510 + instances, // n
  511 + instances, // k
  512 + &one, // alpha
  513 + cudaDataPtr, // A
  514 + dimsIn, // lda
  515 + cudaUPtr, // B
  516 + instances, // ldb
  517 + &zero, // beta
  518 + cudaMultedAllEVecs, // C
  519 + dimsIn // ldc
  520 + );
  521 +
  522 + // get the eigenvectors from the multiplied value
  523 + cublasGetMatrix(
  524 + dimsIn,
  525 + instances,
  526 + sizeof(cudaMultedAllEVecs[0]),
  527 + cudaMultedAllEVecs,
  528 + dimsIn,
  529 + allEVecs.data(),
  530 + dimsIn
  531 + );
  532 +
  533 + // free the memory used for multiplication
  534 + CUDA_SAFE_FREE(cudaMultedAllEVecs, &cudaError);
  535 + } else {
  536 + // get the eigenvectors straight from the SVD
  537 + cublasGetMatrix(
  538 + covRows,
  539 + covRows,
  540 + sizeof(cudaUPtr[0]),
  541 + cudaUPtr,
  542 + covRows,
  543 + allEVecs.data(),
  544 + covRows
  545 + );
  546 + }
  547 +
  548 +
  549 + // free all the memory
  550 + CUDA_SAFE_FREE(cudaCovariancePtr, &cudaError);
  551 + CUDA_SAFE_FREE(cudaUPtr, &cudaError);
  552 + cusolverDnDestroy(cusolverHandle);
265 } 553 }
266 }; 554 };
267 555
openbr/plugins/cuda/cudapca.cu
1 -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  
2 - * Copyright 2016 Colin Heinzmann *  
3 - * *  
4 - * Licensed under the Apache License, Version 2.0 (the "License"); *  
5 - * you may not use this file except in compliance with the License. *  
6 - * You may obtain a copy of the License at *  
7 - * *  
8 - * http://www.apache.org/licenses/LICENSE-2.0 *  
9 - * *  
10 - * Unless required by applicable law or agreed to in writing, software *  
11 - * distributed under the License is distributed on an "AS IS" BASIS, *  
12 - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *  
13 - * See the License for the specific language governing permissions and *  
14 - * limitations under the License. *  
15 - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */  
16 -  
17 -#include <iostream>  
18 -using namespace std;  
19 -  
20 -#include <opencv2/opencv.hpp>  
21 -#include <opencv2/gpu/gpu.hpp>  
22 -  
23 #include "cudadefines.hpp" 1 #include "cudadefines.hpp"
24 2
25 -using namespace cv;  
26 -using namespace cv::gpu;  
27 -  
28 -/*  
29 - * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details  
30 - */  
31 -  
32 namespace br { namespace cuda { namespace pca { 3 namespace br { namespace cuda { namespace pca {
33 - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {  
34 - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;  
35 - int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;  
36 -  
37 - if (evIdx >= numEigenvectors || stepIdx >= numSteps) { 4 + __global__ void castFloatToDoubleKernel(float* a, int inca, double* b, int incb, int numElems) {
  5 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  6 + if (index >= numElems) {
38 return; 7 return;
39 } 8 }
40 9
41 - float acc = 0;  
42 - int startIdx = stepSize*stepIdx;  
43 - int stopIdx = startIdx+stepSize;  
44 - if (startIdx >= numPixels) {  
45 - return;  
46 - }  
47 - if (stopIdx >= numPixels) {  
48 - stopIdx = numPixels;  
49 - }  
50 - for(int i=startIdx; i < stopIdx; i++) {  
51 - acc += src[i]*evPtr[i*numEigenvectors + evIdx];  
52 - }  
53 -  
54 - intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; 10 + b[index*incb] = (double)a[index*inca];
55 } 11 }
56 12
57 - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {  
58 - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;  
59 - if (evIdx >= numEigenvectors) { 13 + __global__ void castDoubleToFloatKernel(double* a, int inca, float* b, int incb, int numElems) {
  14 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  15 + if (index >= numElems) {
60 return; 16 return;
61 } 17 }
62 18
63 - if (numSteps*stepSize+evIdx >= numEigenvectors) {  
64 - numSteps--;  
65 - }  
66 -  
67 - float acc = 0;  
68 - for (int i=0; i < numSteps; i++) {  
69 - int ibIdx = i*stepSize + evIdx;  
70 - acc += intermediaryBuffer[ibIdx];  
71 - }  
72 -  
73 - out[evIdx] = acc; 19 + b[index*incb] = (float)a[index*inca];
74 } 20 }
75 21
76 - __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {  
77 - int idx = blockIdx.x*blockDim.x+threadIdx.x;  
78 -  
79 - // perform bound checking  
80 - if (idx >= numElems) {  
81 - return;  
82 - }  
83 -  
84 - // subtract out the mean  
85 - out[idx] -= mean[idx];  
86 - }  
87 -  
88 - // _evRows: the number of pixels in the trained images  
89 - // _evCols: the number of eigenvectors  
90 - // _meanElems: the number of pixels in an image  
91 - // _stepSize: the number of pixels in a single step  
92 - // _numSteps: the number of steps required to complete operation  
93 - float* cudaEvPtr; int _evRows; int _evCols;  
94 - float* cudaMeanPtr; int _meanElems;  
95 - int _numSteps; int _stepSize;  
96 - float* intermediaryBuffer;  
97 -  
98 - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {  
99 - _evRows = evRows; _evCols = evCols;  
100 - _meanElems = meanElems;  
101 -  
102 - cudaError_t err;  
103 -  
104 - // copy the eigenvectors to the GPU  
105 - CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);  
106 - CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);  
107 -  
108 - // copy the mean to the GPU  
109 - CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);  
110 - CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);  
111 -  
112 - // initialize the intermediary working space,  
113 - _stepSize = 2048;  
114 - _numSteps = _evRows / _stepSize + 1;  
115 - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);  
116 - } 22 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems) {
  23 + int threadsPerBlock = 512;
  24 + int numBlocks = numElems / threadsPerBlock + 1;
117 25
118 - void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {  
119 - cudaError_t err;  
120 - CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err); 26 + castFloatToDoubleKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
121 } 27 }
122 28
123 - void wrapper(void* src, void** dst, int imgRows, int imgCols) {  
124 - cudaError_t err;  
125 - CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);  
126 -  
127 - if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {  
128 - cout << "ERR: Image dimension mismatch!" << endl;  
129 - throw 0;  
130 - }  
131 -  
132 - // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) 29 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems) {
133 int threadsPerBlock = 512; 30 int threadsPerBlock = 512;
134 - int numBlocks = _meanElems / threadsPerBlock + 1;  
135 - subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);  
136 - CUDA_KERNEL_ERR_CHK(&err);  
137 -  
138 - // perform matrix multiplication  
139 - dim3 threadsPerBlock2d(512, 1);  
140 - dim3 numBlocks2d(  
141 - _evCols / threadsPerBlock2d.x + 1,  
142 - _numSteps / threadsPerBlock2d.y + 1);  
143 - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);  
144 - CUDA_KERNEL_ERR_CHK(&err);  
145 -  
146 - threadsPerBlock = 512;  
147 - numBlocks = _evCols / threadsPerBlock + 1;  
148 - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);  
149 - CUDA_KERNEL_ERR_CHK(&err); 31 + int numBlocks = numElems / threadsPerBlock + 1;
150 32
151 - // free the src memory  
152 - CUDA_SAFE_FREE(src, &err); 33 + castDoubleToFloatKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
153 } 34 }
154 }}} 35 }}}
openbr/plugins/cuda/module.cmake
@@ -28,6 +28,6 @@ if(BR_WITH_CUDA) @@ -28,6 +28,6 @@ if(BR_WITH_CUDA)
28 28
29 # add the compiled source and libs into the build system 29 # add the compiled source and libs into the build system
30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ}) 30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
31 - set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES}) 31 + set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES} "cublas" "cusolver")
32 32
33 endif() 33 endif()