Commit 59d38ff01fcfeb96b75fd610f8e180f11665788b

Authored by Josh Klontz
Committed by GitHub
2 parents a91c55a5 a7f8896e

Merge pull request #490 from DepthDeluxe/master

CUDA-accelerated PCA training and improved PCA projection speeds
openbr/plugins/cuda/cudadefines.hpp
... ... @@ -19,6 +19,9 @@
19 19 using namespace std;
20 20 #include <pthread.h>
21 21  
  22 +#include <cublas_v2.h>
  23 +
  24 +
22 25 #define CUDA_SAFE_FREE(cudaPtr, errPtr) \
23 26 /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \
24 27 *errPtr = cudaFree(cudaPtr); \
... ... @@ -48,3 +51,53 @@ using namespace std;
48 51 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
49 52 throw 0; \
50 53 }
  54 +
  55 +#define CUBLAS_ERROR_CHECK(error) { \
  56 + switch (error) { \
  57 + case CUBLAS_STATUS_SUCCESS: \
  58 + break; \
  59 + case CUBLAS_STATUS_NOT_INITIALIZED: \
  60 + cout << "CUBLAS_STATUS_NOT_INITIALIZED" << endl; \
  61 + break; \
  62 + case CUBLAS_STATUS_ALLOC_FAILED: \
  63 + cout << "CUBLAS_STATUS_ALLOC_FAILED" << endl; \
  64 + break; \
  65 + case CUBLAS_STATUS_INVALID_VALUE: \
  66 + cout << "CUBLAS_STATUS_INVALID_VALUE" << endl;; \
  67 + break; \
  68 + case CUBLAS_STATUS_ARCH_MISMATCH: \
  69 + cout << "CUBLAS_STATUS_ARCH_MISMATCH" << endl;; \
  70 + break; \
  71 + case CUBLAS_STATUS_MAPPING_ERROR: \
  72 + cout << "CUBLAS_STATUS_MAPPING_ERROR" << endl; \
  73 + break; \
  74 + case CUBLAS_STATUS_EXECUTION_FAILED: \
  75 + cout << "CUBLAS_STATUS_EXECUTION_FAILED" << endl; \
  76 + break; \
  77 + case CUBLAS_STATUS_INTERNAL_ERROR: \
  78 + cout << "CUBLAS_STATUS_INTERNAL_ERROR" << endl; \
  79 + break; \
  80 + default: \
  81 + cout << "<unknown>: " << error << endl; \
  82 + break; \
  83 + } \
  84 +}
  85 +
  86 +#define CUSOLVER_ERROR_CHECK(error) { \
  87 + switch(error) { \
  88 + case CUSOLVER_STATUS_SUCCESS: \
  89 + break; \
  90 + case CUSOLVER_STATUS_NOT_INITIALIZED: \
  91 + cout << "CUSOLVER_STATUS_NOT_INITIALIZED" << endl; \
  92 + break; \
  93 + case CUSOLVER_STATUS_ALLOC_FAILED: \
  94 + cout << "CUSOLVER_STATUS_ALLOC_FAILED" << endl; \
  95 + break; \
  96 + case CUSOLVER_STATUS_ARCH_MISMATCH: \
  97 + cout << "CUSOLVER_STATUS_ARCH_MISMATCH" << endl; \
  98 + break; \
  99 + default: \
  100 + cout << "<unknown>: " << error << endl; \
  101 + break; \
  102 + } \
  103 +}
... ...
openbr/plugins/cuda/cudapca.cpp
... ... @@ -30,11 +30,14 @@ using namespace cv;
30 30 #include <openbr/core/eigenutils.h>
31 31 #include <openbr/core/opencvutils.h>
32 32  
33   -// definitions from the CUDA source file
  33 +#include <cuda_runtime.h>
  34 +#include <cublas_v2.h>
  35 +#include <cusolverDn.h>
  36 +#include "cudadefines.hpp"
  37 +
34 38 namespace br { namespace cuda { namespace pca {
35   - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
36   - void trainWrapper(void* cudaSrc, float* dst, int rows, int cols);
37   - void wrapper(void* src, void** dst, int imgRows, int imgCols);
  39 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems);
  40 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems);
38 41 }}}
39 42  
40 43 namespace br
... ... @@ -61,13 +64,26 @@ protected:
61 64 BR_PROPERTY(int, drop, 0)
62 65 BR_PROPERTY(bool, whiten, false)
63 66  
64   - Eigen::VectorXf mean, eVals;
  67 + Eigen::VectorXf mean;
  68 + Eigen::VectorXf eVals;
65 69 Eigen::MatrixXf eVecs;
66 70  
67   - int originalRows;
  71 + cublasHandle_t cublasHandle;
  72 + float* cudaMeanPtr; // holds the "keep" long vector
  73 + float* cudaEvPtr; // holds all the eigenvectors
68 74  
69 75 public:
70   - CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {}
  76 + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {
  77 + // try to initialize CUBLAS
  78 + cublasStatus_t status;
  79 + status = cublasCreate(&cublasHandle);
  80 + CUBLAS_ERROR_CHECK(status);
  81 + }
  82 +
  83 + ~CUDAPCATransform() {
  84 + // tear down CUBLAS
  85 + cublasDestroy(cublasHandle);
  86 + }
71 87  
72 88 private:
73 89 double residualReconstructionError(const Template &src) const
... ... @@ -83,45 +99,38 @@ private:
83 99  
84 100 void train(const TemplateList &cudaTrainingSet)
85 101 {
86   - // copy the data back from the graphics card so the training can be done on the CPU
87   - const int instances = cudaTrainingSet.size(); // get the number of training set instances
88   - QList<Template> trainingQlist;
89   - for(int i=0; i<instances; i++) {
90   - Template currentTemplate = cudaTrainingSet[i];
91   - void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
92   - void* cudaMemPtr = srcDataPtr[0];
93   - int rows = *((int*)srcDataPtr[1]);
94   - int cols = *((int*)srcDataPtr[2]);
95   - int type = *((int*)srcDataPtr[3]);
96   -
97   - if (type != CV_32FC1) {
98   - qFatal("Requires single channel 32-bit floating point matrices.");
99   - }
100   -
101   - Mat mat = Mat(rows, cols, type);
102   - br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
103   - trainingQlist.append(Template(mat));
104   - }
105   -
106   - // assemble a TemplateList from the list of data
107   - TemplateList trainingSet(trainingQlist);
108   -
109   -
110   - originalRows = trainingSet.first().m().rows; // get number of rows of first image
111   - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  102 + cublasStatus_t cublasStatus;
  103 + cudaError_t cudaError;
  104 +
  105 + // put all the data into a single matrix to perform PCA
  106 + const int instances = cudaTrainingSet.size();
  107 + const int dimsIn = *(int*)cudaTrainingSet.first().m().ptr<void*>()[1]
  108 + * *(int*)cudaTrainingSet.first().m().ptr<void*>()[2];
  109 +
  110 + // copy the data over
  111 + double* cudaDataPtr;
  112 + CUDA_SAFE_MALLOC(&cudaDataPtr, instances*dimsIn*sizeof(cudaDataPtr[0]), &cudaError);
  113 + for (int i=0; i < instances; i++) {
  114 + br::cuda::pca::castFloatToDouble(
  115 + (float*)(cudaTrainingSet[i].m().ptr<void*>()[0]),
  116 + 1,
  117 + cudaDataPtr+i*dimsIn,
  118 + 1,
  119 + dimsIn
  120 + );
  121 + }
112 122  
113   - // Map into 64-bit Eigen matrix
114   - Eigen::MatrixXd data(dimsIn, instances); // create a mat
115   - for (int i=0; i<instances; i++) {
116   - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
117   - }
  123 + trainCore(cudaDataPtr, dimsIn, instances);
118 124  
119   - trainCore(data);
  125 + CUDA_SAFE_FREE(cudaDataPtr, &cudaError);
120 126 }
121 127  
122 128 void project(const Template &src, Template &dst) const
123 129 {
  130 + cudaError_t cudaError;
  131 +
124 132 void* const* srcDataPtr = src.m().ptr<void*>();
  133 + float* srcGpuMatPtr = (float*)srcDataPtr[0];
125 134 int rows = *((int*)srcDataPtr[1]);
126 135 int cols = *((int*)srcDataPtr[2]);
127 136 int type = *((int*)srcDataPtr[3]);
... ... @@ -131,137 +140,416 @@ private:
131 140 throw 0;
132 141 }
133 142  
  143 + // save the destination rows
  144 + int dstRows = (int)keep;
  145 +
134 146 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
135 147 void** dstDataPtr = dstMat.ptr<void*>();
  148 + float** dstGpuMatPtrPtr = (float**)dstDataPtr;
136 149 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
137   - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
  150 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = dstRows;
138 151 dstDataPtr[3] = srcDataPtr[3];
139 152  
140   - cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
141 153  
  154 + // allocate the memory and set to zero
  155 + //cout << "Allocating destination memory" << endl;
  156 + cublasStatus_t status;
  157 + cudaMalloc(dstGpuMatPtrPtr, dstRows*sizeof(float));
  158 + cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float));
  159 +
  160 + {
  161 + float negativeOne = -1.0f;
  162 + status = cublasSaxpy(
  163 + cublasHandle, // handle
  164 + dstRows, // vector length
  165 + &negativeOne, // alpha (1)
  166 + cudaMeanPtr, // mean
  167 + 1, // stride
  168 + srcGpuMatPtr, // y, the source
  169 + 1 // stride
  170 + );
  171 + CUBLAS_ERROR_CHECK(status);
  172 + }
  173 +
  174 + {
  175 + float one = 1.0f;
  176 + float zero = 0.0f;
  177 + status = cublasSgemv(
  178 + cublasHandle, // handle
  179 + CUBLAS_OP_T, // normal vector multiplication
  180 + eVecs.rows(), // # rows
  181 + eVecs.cols(), // # cols
  182 + &one, // alpha (1)
  183 + cudaEvPtr, // pointer to the matrix
  184 + eVecs.rows(), // leading dimension of matrix
  185 + srcGpuMatPtr, // vector for multiplication
  186 + 1, // stride (1)
  187 + &zero, // beta (0)
  188 + *dstGpuMatPtrPtr, // vector to store the result
  189 + 1 // stride (1)
  190 + );
  191 + CUBLAS_ERROR_CHECK(status);
  192 + }
  193 +
  194 + //cout << "Saving result" << endl;
142 195 dst = dstMat;
  196 + CUDA_SAFE_FREE(srcGpuMatPtr, &cudaError);
143 197 }
144 198  
145 199 void store(QDataStream &stream) const
146 200 {
147   - stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
  201 + stream << keep << drop << whiten << mean << eVecs;
148 202 }
149 203  
150 204 void load(QDataStream &stream)
151 205 {
152   - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  206 + stream >> keep >> drop >> whiten >> mean >> eVecs;
  207 +
  208 + //cout << "Starting load process" << endl;
  209 +
  210 + cudaError_t cudaError;
  211 + cublasStatus_t cublasStatus;
  212 + CUDA_SAFE_MALLOC(&cudaMeanPtr, mean.rows()*mean.cols()*sizeof(float), &cudaError);
  213 + CUDA_SAFE_MALLOC(&cudaEvPtr, eVecs.rows()*eVecs.cols()*sizeof(float), &cudaError);
  214 +
  215 + //cout << "Setting vector" << endl;
  216 + // load the mean vector into GPU memory
  217 + cublasStatus = cublasSetVector(
  218 + mean.rows()*mean.cols(),
  219 + sizeof(float),
  220 + mean.data(),
  221 + 1,
  222 + cudaMeanPtr,
  223 + 1
  224 + );
  225 + CUBLAS_ERROR_CHECK(cublasStatus);
  226 +
  227 + //cout << "Setting the matrix" << endl;
  228 + // load the eigenvector matrix into GPU memory
  229 + cublasStatus = cublasSetMatrix(
  230 + eVecs.rows(),
  231 + eVecs.cols(),
  232 + sizeof(float),
  233 + eVecs.data(),
  234 + eVecs.rows(),
  235 + cudaEvPtr,
  236 + eVecs.rows()
  237 + );
  238 + CUBLAS_ERROR_CHECK(cublasStatus);
  239 + }
153 240  
154   - // serialize the eigenvectors
155   - float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
156   - for (int i=0; i < eVecs.rows(); i++) {
157   - for (int j=0; j < eVecs.cols(); j++) {
158   - evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
159   - }
160   - }
  241 +protected:
  242 + void trainCore(double* cudaDataPtr, int dimsIn, int instances) {
  243 + cudaError_t cudaError;
  244 +
  245 + const bool dominantEigenEstimation = (dimsIn > instances);
  246 +
  247 + Eigen::MatrixXd allEVals, allEVecs;
161 248  
162   - // serialize the mean
163   - float* meanBuffer = new float[mean.rows() * mean.cols()];
164   - for (int i=0; i < mean.rows(); i++) {
165   - for (int j=0; j < mean.cols(); j++) {
166   - meanBuffer[i*mean.cols() + j] = mean(i, j);
  249 + // allocate the eigenvectors
  250 + if (dominantEigenEstimation) {
  251 + allEVals = Eigen::MatrixXd(instances, 1);
  252 + allEVecs = Eigen::MatrixXd(dimsIn, instances);
  253 + } else {
  254 + allEVals = Eigen::MatrixXd(dimsIn, 1);
  255 + allEVecs = Eigen::MatrixXd(dimsIn, dimsIn);
  256 + }
  257 +
  258 + if (keep != 0) {
  259 + performCovarianceSVD(cudaDataPtr, dimsIn, instances, allEVals, allEVecs);
  260 + } else {
  261 + // null case
  262 + mean = Eigen::VectorXf::Zero(dimsIn);
  263 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  264 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  265 + }
  266 +
  267 + // *****************
  268 + // We have now found the eigenvalues and eigenvectors
  269 + // *****************
  270 +
  271 + if (keep <= 0) {
  272 + keep = dimsIn - drop;
  273 + } else if (keep < 1) {
  274 + // Keep eigenvectors that retain a certain energy percentage.
  275 + const double totalEnergy = allEVals.sum();
  276 + if (totalEnergy == 0) {
  277 + keep = 0;
  278 + } else {
  279 + double currentEnergy = 0;
  280 + int i=0;
  281 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  282 + currentEnergy += allEVals(i);
  283 + i++;
  284 + }
  285 + keep = i - drop;
  286 + }
  287 + } else {
  288 + if (keep + drop > allEVals.rows()) {
  289 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  290 + keep = allEVals.rows() - drop;
167 291 }
168   - }
  292 + }
169 293  
170   - // call the wrapper function
171   - cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
  294 + // Keep highest energy vectors
  295 + eVals = Eigen::VectorXf((int)keep, 1);
  296 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  297 + for (int i=0; i<keep; i++) {
  298 + int index = i+drop;
  299 + eVals(i) = allEVals(index);
  300 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  301 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  302 + }
172 303  
173   - delete evBuffer;
174   - delete meanBuffer;
  304 + // Debug output
  305 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
175 306 }
176 307  
177   -protected:
178   - void trainCore(Eigen::MatrixXd data)
179   - {
180   - int dimsIn = data.rows();
181   - int instances = data.cols();
182   - const bool dominantEigenEstimation = (dimsIn > instances);
183   -
184   - Eigen::MatrixXd allEVals, allEVecs;
185   - if (keep != 0) {
186   - // Compute and remove mean
187   - mean = Eigen::VectorXf(dimsIn);
188   - for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
189   - for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
190   -
191   - // Calculate covariance matrix
192   - Eigen::MatrixXd cov;
193   - if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
194   - else cov = data * data.transpose() / (instances-1.0);
195   -
196   - // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
197   - Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
198   - allEVals = eSolver.eigenvalues();
199   - allEVecs = eSolver.eigenvectors();
200   - if (dominantEigenEstimation) allEVecs = data * allEVecs;
201   - } else {
202   - // Null case
203   - mean = Eigen::VectorXf::Zero(dimsIn);
204   - allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
205   - allEVals = Eigen::VectorXd::Ones(dimsIn);
206   - }
207   -
208   - if (keep <= 0) {
209   - keep = dimsIn - drop;
210   - } else if (keep < 1) {
211   - // Keep eigenvectors that retain a certain energy percentage.
212   - const double totalEnergy = allEVals.sum();
213   - if (totalEnergy == 0) {
214   - keep = 0;
215   - } else {
216   - double currentEnergy = 0;
217   - int i=0;
218   - while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
219   - currentEnergy += allEVals(allEVals.rows()-(i+1));
220   - i++;
221   - }
222   - keep = i - drop;
223   - }
224   - } else {
225   - if (keep + drop > allEVals.rows()) {
226   - qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
227   - keep = allEVals.rows() - drop;
228   - }
229   - }
230   -
231   - // Keep highest energy vectors
232   - eVals = Eigen::VectorXf((int)keep, 1);
233   - eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
234   - for (int i=0; i<keep; i++) {
235   - int index = allEVals.rows()-(i+drop+1);
236   - eVals(i) = allEVals(index);
237   - eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
238   - if (whiten) eVecs.col(i) /= sqrt(eVals(i));
239   - }
240   -
241   - // Debug output
242   - if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
243   - }
  308 + // computes the covariance matrix and then pulls the eigenvalues+eigenvectors
  309 + // out of it using SVD of a symmetric matrix
  310 + void performCovarianceSVD(double* cudaDataPtr, int dimsIn, int instances, Eigen::MatrixXd& allEVals, Eigen::MatrixXd& allEVecs) {
  311 + cudaError_t cudaError;
  312 +
  313 + const bool dominantEigenEstimation = (dimsIn > instances);
  314 +
  315 + // used for temporary storage
  316 + Eigen::VectorXd meanDouble(dimsIn);
  317 +
  318 + // compute the mean
  319 + for (int i=0; i < dimsIn; i++) {
  320 + cublasDasum(
  321 + cublasHandle,
  322 + instances,
  323 + cudaDataPtr+i,
  324 + dimsIn,
  325 + meanDouble.data()+i
  326 + );
  327 + }
244 328  
245   - void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
246   - {
247   - const int originalCols = mean.rows() / originalRows;
248   -
249   - { // Write out mean image
250   - cv::Mat out(originalRows, originalCols, CV_32FC1);
251   - Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);
252   - outMap = mean.col(0);
253   - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");
254   - }
255   -
256   - // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.
257   - for (int k=0; k<(int)allEVals.size(); k++) {
258   - if ((k < 8) || (k >= (int)allEVals.size()-16)) {
259   - cv::Mat out(originalRows, originalCols, CV_64FC1);
260   - Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);
261   - outMap = allEVecs.col(k);
262   - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");
263   - }
264   - }
  329 + // put data back on GPU for further processing
  330 + double* cudaMeanDoublePtr;
  331 + CUDA_SAFE_MALLOC(&cudaMeanDoublePtr, dimsIn*sizeof(cudaMeanDoublePtr[0]), &cudaError);
  332 + cublasSetVector(
  333 + dimsIn,
  334 + sizeof(cudaMeanDoublePtr[0]),
  335 + meanDouble.data(),
  336 + 1,
  337 + cudaMeanDoublePtr,
  338 + 1
  339 + );
  340 +
  341 + // scale to calculate average
  342 + {
  343 + double scaleFactor = 1.0/(double)instances;
  344 + cublasDscal(
  345 + cublasHandle,
  346 + dimsIn,
  347 + &scaleFactor,
  348 + cudaMeanDoublePtr,
  349 + 1
  350 + );
  351 + }
  352 +
  353 + // subtract mean from data
  354 + for (int i=0; i < instances; i++) {
  355 + double negativeOne = -1.0;
  356 + cublasDaxpy(
  357 + cublasHandle,
  358 + dimsIn,
  359 + &negativeOne,
  360 + cudaMeanDoublePtr,
  361 + 1,
  362 + cudaDataPtr+i*dimsIn,
  363 + 1
  364 + );
  365 + }
  366 +
  367 + // convert to float form and copy the data back
  368 + CUDA_SAFE_MALLOC(&cudaMeanPtr, dimsIn*sizeof(cudaMeanPtr[0]), &cudaError);
  369 + br::cuda::pca::castDoubleToFloat(cudaMeanDoublePtr, 1, cudaMeanPtr, 1, dimsIn);
  370 +
  371 + // copy the data back
  372 + mean = Eigen::VectorXf(dimsIn);
  373 + cublasGetVector(
  374 + dimsIn,
  375 + sizeof(cudaMeanPtr[0]),
  376 + cudaMeanPtr,
  377 + 1,
  378 + mean.data(),
  379 + 1
  380 + );
  381 +
  382 + // free up the memory
  383 + CUDA_SAFE_FREE(cudaMeanDoublePtr, &cudaError);
  384 + CUDA_SAFE_FREE(cudaMeanPtr, &cudaError);
  385 +
  386 + // allocate space for the covariance matrix
  387 + double* cudaCovariancePtr;
  388 + int covRows = allEVals.rows();
  389 + CUDA_SAFE_MALLOC(&cudaCovariancePtr, covRows*covRows*sizeof(cudaCovariancePtr[0]), &cudaError);
  390 +
  391 + // compute the covariance matrix
  392 + if (dominantEigenEstimation) {
  393 + // cov = data.transpose() * data / (instances-1.0);
  394 + const double scaleFactor = 1.0/(instances-1.0);
  395 + const double zero = 0.0;
  396 + cublasDgemm(
  397 + cublasHandle,
  398 + CUBLAS_OP_T,
  399 + CUBLAS_OP_N,
  400 + instances,
  401 + instances,
  402 + dimsIn,
  403 + &scaleFactor,
  404 + cudaDataPtr,
  405 + dimsIn,
  406 + cudaDataPtr,
  407 + dimsIn,
  408 + &zero,
  409 + cudaCovariancePtr,
  410 + covRows
  411 + );
  412 + } else {
  413 + // cov = data * data.transpose() / (instances-1.0);
  414 + const double scaleFactor = 1.0/(instances-1.0);
  415 + const double zero = 0.0;
  416 + cublasDgemm(
  417 + cublasHandle,
  418 + CUBLAS_OP_N,
  419 + CUBLAS_OP_T,
  420 + dimsIn,
  421 + dimsIn,
  422 + instances,
  423 + &scaleFactor,
  424 + cudaDataPtr,
  425 + dimsIn,
  426 + cudaDataPtr,
  427 + dimsIn,
  428 + &zero,
  429 + cudaCovariancePtr,
  430 + covRows
  431 + );
  432 + }
  433 +
  434 + cusolverDnHandle_t cusolverHandle;
  435 + cusolverStatus_t cusolverStatus;
  436 + cusolverDnCreate(&cusolverHandle);
  437 +
  438 + // allocate appropriate working space
  439 + int svdLWork;
  440 + cusolverDnDgesvd_bufferSize(
  441 + cusolverHandle,
  442 + covRows,
  443 + covRows,
  444 + &svdLWork
  445 + );
  446 + double* cudaSvdWork;
  447 + CUDA_SAFE_MALLOC(&cudaSvdWork, svdLWork*sizeof(cudaSvdWork[0]), &cudaError);
  448 +
  449 + double* cudaUPtr;
  450 + CUDA_SAFE_MALLOC(&cudaUPtr, covRows*covRows*sizeof(cudaUPtr[0]), &cudaError);
  451 + double* cudaSPtr;
  452 + CUDA_SAFE_MALLOC(&cudaSPtr, covRows*sizeof(cudaSPtr[0]), &cudaError);
  453 + double* cudaVTPtr;
  454 + CUDA_SAFE_MALLOC(&cudaVTPtr, covRows*covRows*sizeof(cudaVTPtr[0]), &cudaError);
  455 +
  456 + int* cudaSvdDevInfoPtr;
  457 + CUDA_SAFE_MALLOC(&cudaSvdDevInfoPtr, sizeof(*cudaSvdDevInfoPtr), &cudaError);
  458 + int svdDevInfo;
  459 +
  460 + // perform SVD on an n x m matrix, in this case the matrix is the covariance
  461 + // matrix and is symmetric, meaning the SVD will calculate the eigenvalues
  462 + // and eigenvectors for us.
  463 + cusolverStatus = cusolverDnDgesvd(
  464 + cusolverHandle,
  465 + 'A', // all columns of unitary matrix
  466 + 'A', // all columns of array VT
  467 + covRows, // m
  468 + covRows, // n
  469 + cudaCovariancePtr, // decomposing the covariance matrix
  470 + covRows, // lda
  471 + cudaSPtr, // holds S
  472 + cudaUPtr, // holds U
  473 + covRows, // ldu
  474 + cudaVTPtr, // holds VT
  475 + covRows, // ldvt
  476 + cudaSvdWork, // work buffer ptr
  477 + svdLWork, // length of the work buffer
  478 + NULL, // rwork, not used for real data types
  479 + cudaSvdDevInfoPtr // devInfo pointer
  480 + );
  481 + CUSOLVER_ERROR_CHECK(cusolverStatus);
  482 +
  483 + // get the eigenvalues and free memory
  484 + cublasGetVector(
  485 + covRows,
  486 + sizeof(cudaSPtr[0]),
  487 + cudaSPtr,
  488 + 1,
  489 + allEVals.data(),
  490 + 1
  491 + );
  492 + CUDA_SAFE_FREE(cudaSvdWork, &cudaError);
  493 + CUDA_SAFE_FREE(cudaSPtr, &cudaError);
  494 + CUDA_SAFE_FREE(cudaVTPtr, &cudaError);
  495 + CUDA_SAFE_FREE(cudaSvdDevInfoPtr, &cudaError);
  496 +
  497 + // if this is a dominant eigen estimation, then perform matrix multiplication again
  498 + // if (dominantEigenEstimation) allEVecs = data * allEVecs;
  499 + if (dominantEigenEstimation) {
  500 + double* cudaMultedAllEVecs;
  501 + CUDA_SAFE_MALLOC(&cudaMultedAllEVecs, dimsIn*instances*sizeof(cudaMultedAllEVecs[0]), &cudaError);
  502 + const double one = 1.0;
  503 + const double zero = 0;
  504 +
  505 + cublasDgemm(
  506 + cublasHandle, // handle
  507 + CUBLAS_OP_N, // transa
  508 + CUBLAS_OP_N, // transb
  509 + dimsIn, // m
  510 + instances, // n
  511 + instances, // k
  512 + &one, // alpha
  513 + cudaDataPtr, // A
  514 + dimsIn, // lda
  515 + cudaUPtr, // B
  516 + instances, // ldb
  517 + &zero, // beta
  518 + cudaMultedAllEVecs, // C
  519 + dimsIn // ldc
  520 + );
  521 +
  522 + // get the eigenvectors from the multiplied value
  523 + cublasGetMatrix(
  524 + dimsIn,
  525 + instances,
  526 + sizeof(cudaMultedAllEVecs[0]),
  527 + cudaMultedAllEVecs,
  528 + dimsIn,
  529 + allEVecs.data(),
  530 + dimsIn
  531 + );
  532 +
  533 + // free the memory used for multiplication
  534 + CUDA_SAFE_FREE(cudaMultedAllEVecs, &cudaError);
  535 + } else {
  536 + // get the eigenvectors straight from the SVD
  537 + cublasGetMatrix(
  538 + covRows,
  539 + covRows,
  540 + sizeof(cudaUPtr[0]),
  541 + cudaUPtr,
  542 + covRows,
  543 + allEVecs.data(),
  544 + covRows
  545 + );
  546 + }
  547 +
  548 +
  549 + // free all the memory
  550 + CUDA_SAFE_FREE(cudaCovariancePtr, &cudaError);
  551 + CUDA_SAFE_FREE(cudaUPtr, &cudaError);
  552 + cusolverDnDestroy(cusolverHandle);
265 553 }
266 554 };
267 555  
... ...
openbr/plugins/cuda/cudapca.cu
1   -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2   - * Copyright 2016 Colin Heinzmann *
3   - * *
4   - * Licensed under the Apache License, Version 2.0 (the "License"); *
5   - * you may not use this file except in compliance with the License. *
6   - * You may obtain a copy of the License at *
7   - * *
8   - * http://www.apache.org/licenses/LICENSE-2.0 *
9   - * *
10   - * Unless required by applicable law or agreed to in writing, software *
11   - * distributed under the License is distributed on an "AS IS" BASIS, *
12   - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
13   - * See the License for the specific language governing permissions and *
14   - * limitations under the License. *
15   - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16   -
17   -#include <iostream>
18   -using namespace std;
19   -
20   -#include <opencv2/opencv.hpp>
21   -#include <opencv2/gpu/gpu.hpp>
22   -
23 1 #include "cudadefines.hpp"
24 2  
25   -using namespace cv;
26   -using namespace cv::gpu;
27   -
28   -/*
29   - * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details
30   - */
31   -
32 3 namespace br { namespace cuda { namespace pca {
33   - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {
34   - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
35   - int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;
36   -
37   - if (evIdx >= numEigenvectors || stepIdx >= numSteps) {
  4 + __global__ void castFloatToDoubleKernel(float* a, int inca, double* b, int incb, int numElems) {
  5 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  6 + if (index >= numElems) {
38 7 return;
39 8 }
40 9  
41   - float acc = 0;
42   - int startIdx = stepSize*stepIdx;
43   - int stopIdx = startIdx+stepSize;
44   - if (startIdx >= numPixels) {
45   - return;
46   - }
47   - if (stopIdx >= numPixels) {
48   - stopIdx = numPixels;
49   - }
50   - for(int i=startIdx; i < stopIdx; i++) {
51   - acc += src[i]*evPtr[i*numEigenvectors + evIdx];
52   - }
53   -
54   - intermediaryBuffer[stepIdx*stepSize + evIdx] = acc;
  10 + b[index*incb] = (double)a[index*inca];
55 11 }
56 12  
57   - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {
58   - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
59   - if (evIdx >= numEigenvectors) {
  13 + __global__ void castDoubleToFloatKernel(double* a, int inca, float* b, int incb, int numElems) {
  14 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  15 + if (index >= numElems) {
60 16 return;
61 17 }
62 18  
63   - if (numSteps*stepSize+evIdx >= numEigenvectors) {
64   - numSteps--;
65   - }
66   -
67   - float acc = 0;
68   - for (int i=0; i < numSteps; i++) {
69   - int ibIdx = i*stepSize + evIdx;
70   - acc += intermediaryBuffer[ibIdx];
71   - }
72   -
73   - out[evIdx] = acc;
  19 + b[index*incb] = (float)a[index*inca];
74 20 }
75 21  
76   - __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {
77   - int idx = blockIdx.x*blockDim.x+threadIdx.x;
78   -
79   - // perform bound checking
80   - if (idx >= numElems) {
81   - return;
82   - }
83   -
84   - // subtract out the mean
85   - out[idx] -= mean[idx];
86   - }
87   -
88   - // _evRows: the number of pixels in the trained images
89   - // _evCols: the number of eigenvectors
90   - // _meanElems: the number of pixels in an image
91   - // _stepSize: the number of pixels in a single step
92   - // _numSteps: the number of steps required to complete operation
93   - float* cudaEvPtr; int _evRows; int _evCols;
94   - float* cudaMeanPtr; int _meanElems;
95   - int _numSteps; int _stepSize;
96   - float* intermediaryBuffer;
97   -
98   - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
99   - _evRows = evRows; _evCols = evCols;
100   - _meanElems = meanElems;
101   -
102   - cudaError_t err;
103   -
104   - // copy the eigenvectors to the GPU
105   - CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);
106   - CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);
107   -
108   - // copy the mean to the GPU
109   - CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
110   - CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);
111   -
112   - // initialize the intermediary working space,
113   - _stepSize = 2048;
114   - _numSteps = _evRows / _stepSize + 1;
115   - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);
116   - }
  22 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems) {
  23 + int threadsPerBlock = 512;
  24 + int numBlocks = numElems / threadsPerBlock + 1;
117 25  
118   - void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {
119   - cudaError_t err;
120   - CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
  26 + castFloatToDoubleKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
121 27 }
122 28  
123   - void wrapper(void* src, void** dst, int imgRows, int imgCols) {
124   - cudaError_t err;
125   - CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
126   -
127   - if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {
128   - cout << "ERR: Image dimension mismatch!" << endl;
129   - throw 0;
130   - }
131   -
132   - // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src)
  29 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems) {
133 30 int threadsPerBlock = 512;
134   - int numBlocks = _meanElems / threadsPerBlock + 1;
135   - subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
136   - CUDA_KERNEL_ERR_CHK(&err);
137   -
138   - // perform matrix multiplication
139   - dim3 threadsPerBlock2d(512, 1);
140   - dim3 numBlocks2d(
141   - _evCols / threadsPerBlock2d.x + 1,
142   - _numSteps / threadsPerBlock2d.y + 1);
143   - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);
144   - CUDA_KERNEL_ERR_CHK(&err);
145   -
146   - threadsPerBlock = 512;
147   - numBlocks = _evCols / threadsPerBlock + 1;
148   - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);
149   - CUDA_KERNEL_ERR_CHK(&err);
  31 + int numBlocks = numElems / threadsPerBlock + 1;
150 32  
151   - // free the src memory
152   - CUDA_SAFE_FREE(src, &err);
  33 + castDoubleToFloatKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
153 34 }
154 35 }}}
... ...
openbr/plugins/cuda/module.cmake
... ... @@ -28,6 +28,6 @@ if(BR_WITH_CUDA)
28 28  
29 29 # add the compiled source and libs into the build system
30 30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
31   - set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES})
  31 + set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES} "cublas" "cusolver")
32 32  
33 33 endif()
... ...