Commit 233d59aefd4f351d8d9d4b14085fbdb0250e9cef

Authored by DepthDeluxe
1 parent b6adde60

pre-chopping

Showing 1 changed file with 392 additions and 96 deletions
openbr/plugins/cuda/cublaspca.cpp
... ... @@ -32,6 +32,7 @@ using namespace cv;
32 32  
33 33 #include <cuda_runtime.h>
34 34 #include <cublas_v2.h>
  35 +#include <cusolverDn.h>
35 36 #include "cudadefines.hpp"
36 37  
37 38 namespace br
... ... @@ -58,7 +59,7 @@ protected:
58 59 BR_PROPERTY(int, drop, 0)
59 60 BR_PROPERTY(bool, whiten, false)
60 61  
61   - Eigen::VectorXf mean, eVals;
  62 + Eigen::VectorXf mean;
62 63 Eigen::MatrixXf eVecs;
63 64  
64 65 int originalRows;
... ... @@ -94,41 +95,40 @@ private:
94 95  
95 96 void train(const TemplateList &cudaTrainingSet)
96 97 {
97   - // copy the data back from the graphics card so the training can be done on the CPU
98   - const int instances = cudaTrainingSet.size(); // get the number of training set instances
99   - QList<Template> trainingQlist;
100   - for(int i=0; i<instances; i++) {
101   - Template currentTemplate = cudaTrainingSet[i];
102   - void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
103   - void* cudaMemPtr = srcDataPtr[0];
104   - int rows = *((int*)srcDataPtr[1]);
105   - int cols = *((int*)srcDataPtr[2]);
106   - int type = *((int*)srcDataPtr[3]);
107   -
108   - if (type != CV_32FC1) {
109   - qFatal("Requires single channel 32-bit floating point matrices.");
110   - }
  98 + cublasStatus_t cublasStatus;
  99 + cudaError_t cudaError;
111 100  
112   - // copy GPU mat data back to the CPU so we can do the training on the CPU
113   - Mat mat = Mat(rows, cols, type);
114   - cudaError_t err;
115   - CUDA_SAFE_MEMCPY(mat.ptr<float>(), cudaMemPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
116   - trainingQlist.append(Template(mat));
117   - }
  101 + // put all the data into a single matrix to perform PCA
  102 + const int instances = cudaTrainingSet.size();
  103 + const int instanceSize = *(int*)cudaTrainingSet.first().m().ptr<void*>()[1]
  104 + * *(int*)cudaTrainingSet.first().m().ptr<void*>()[2];
118 105  
119   - // assemble a TemplateList from the list of data
120   - TemplateList trainingSet(trainingQlist);
  106 + // get all the vectors from memory
  107 + Eigen::MatrixXf data(instanceSize, instances);
  108 + for (int i=0; i < instances; i++) {
  109 + float* currentCudaMatPtr = (float*)cudaTrainingSet[i].m().ptr<void*>()[0];
121 110  
122   - originalRows = trainingSet.first().m().rows; // get number of rows of first image
123   - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  111 + cublasGetVector(
  112 + instanceSize,
  113 + sizeof(float),
  114 + currentCudaMatPtr,
  115 + 1,
  116 + data.data()+i*instanceSize,
  117 + 1
  118 + );
  119 + }
124 120  
125   - // Map into 64-bit Eigen matrix - perform the column major conversion
126   - Eigen::MatrixXd data(dimsIn, instances); // create a mat
127   - for (int i=0; i<instances; i++) {
128   - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
129   - }
  121 + Eigen::MatrixXd dataDouble(instanceSize, instances);
  122 + for (int i=0; i < instanceSize*instances; i++) {
  123 + dataDouble.data()[i] = (double)data.data()[i];
  124 + }
  125 +
  126 + // XXX: remove me
  127 + Eigen::MatrixXd test(3,3);
  128 + test << 1,2,3,4,5,6,7,8,9;
  129 + trainCore(test);
130 130  
131   - trainCore(data);
  131 + // trainCore(dataDouble);
132 132 }
133 133  
134 134 void project(const Template &src, Template &dst) const
... ... @@ -164,8 +164,6 @@ private:
164 164 cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float));
165 165  
166 166 {
167   - //cout << "Ax + y" << endl;
168   - // subtract out the average
169 167 float negativeOne = -1.0f;
170 168 status = cublasSaxpy(
171 169 cublasHandle, // handle
... ... @@ -180,8 +178,6 @@ private:
180 178 }
181 179  
182 180 {
183   - //cout << "Matrix-Vector multiplication" << endl;
184   -
185 181 float one = 1.0f;
186 182 float zero = 0.0f;
187 183 status = cublasSgemv(
... ... @@ -207,12 +203,12 @@ private:
207 203  
208 204 void store(QDataStream &stream) const
209 205 {
210   - stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
  206 + stream << keep << drop << whiten << originalRows << mean << eVecs;
211 207 }
212 208  
213 209 void load(QDataStream &stream)
214 210 {
215   - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  211 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVecs;
216 212  
217 213 //cout << "Starting load process" << endl;
218 214  
... ... @@ -248,73 +244,373 @@ private:
248 244 }
249 245  
250 246 protected:
251   - void trainCore(Eigen::MatrixXd data)
252   - {
253   - int dimsIn = data.rows();
254   - int instances = data.cols();
255   - const bool dominantEigenEstimation = (dimsIn > instances);
256   -
257   - Eigen::MatrixXd allEVals, allEVecs;
258   - if (keep != 0) {
259   - // Compute and remove mean
260   - mean = Eigen::VectorXf(dimsIn);
261   - for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
262   - for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
263   -
264   - // Calculate covariance matrix
265   - Eigen::MatrixXd cov;
266   - if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
267   - else cov = data * data.transpose() / (instances-1.0);
268   -
269   - // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
270   - Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
271   - allEVals = eSolver.eigenvalues();
272   - allEVecs = eSolver.eigenvectors();
273   - if (dominantEigenEstimation) allEVecs = data * allEVecs;
274   - } else {
275   - // Null case
276   - mean = Eigen::VectorXf::Zero(dimsIn);
277   - allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
278   - allEVals = Eigen::VectorXd::Ones(dimsIn);
  247 + void trainCore(Eigen::MatrixXd data) {
  248 + cudaError_t cudaError;
  249 +
  250 + // utility variables
  251 + const double one = 1.0;
  252 + const double negativeOne = -1.0;
  253 + const double zero = 0.0;
  254 +
  255 + static int numTimesThrough = 0;
  256 + numTimesThrough++;
  257 +
  258 + int dimsIn = data.rows(); // the number of rows of the covariance matrix
  259 + int instances = data.cols(); // the number of columns of the covariance matrix
  260 + const bool dominantEigenEstimation = (dimsIn > instances);
  261 +
  262 + // Compute and remove mean
  263 + //mean = Eigen::VectorXf(dimsIn);
  264 + //for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
  265 + //for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
  266 +
  267 + // allocate and place data in GPU memory
  268 + double* cudaDataPtr;
  269 + CUDA_SAFE_MALLOC(&cudaDataPtr, data.rows()*data.cols()*sizeof(cudaDataPtr[0]), &cudaError);
  270 + cublasSetMatrix(
  271 + data.rows(),
  272 + data.cols(),
  273 + sizeof(cudaDataPtr[0]),
  274 + data.data(),
  275 + data.rows(),
  276 + cudaDataPtr,
  277 + data.rows()
  278 + );
  279 +
  280 + // allocate space for the covariance matrix
  281 + double* cudaCovariancePtr;
  282 + int covRows = data.cols();
  283 + CUDA_SAFE_MALLOC(&cudaCovariancePtr, covRows*covRows*sizeof(cudaCovariancePtr[0]), &cudaError);
  284 +
  285 + // compute the covariance matrix
  286 + // cov = data.transpose() * data / (instances-1.0);
  287 + {
  288 + double scaleFactor = 1.0/(instances-1.0);
  289 + cublasDgemm(
  290 + cublasHandle,
  291 + CUBLAS_OP_T,
  292 + CUBLAS_OP_N,
  293 + data.cols(),
  294 + data.cols(),
  295 + data.rows(),
  296 + &scaleFactor,
  297 + cudaDataPtr,
  298 + data.rows(),
  299 + cudaDataPtr,
  300 + data.rows(),
  301 + &zero,
  302 + cudaCovariancePtr,
  303 + covRows
  304 + );
  305 + }
  306 +
  307 + // XXX: download the covariace matrix for debugging
  308 + Eigen::MatrixXd cov(covRows, covRows);
  309 + cublasGetMatrix(
  310 + covRows,
  311 + covRows,
  312 + sizeof(cov.data()[0]),
  313 + cudaCovariancePtr,
  314 + covRows,
  315 + cov.data(),
  316 + covRows
  317 + );
  318 +
  319 + // initialize cuSolver for the next part
  320 + cusolverDnHandle_t cusolverHandle;
  321 + cusolverDnCreate(&cusolverHandle);
  322 +
  323 + double* cudaEigenvaluesPtr;
  324 + {
  325 + double* cudaDiagonalPtr;
  326 + CUDA_SAFE_MALLOC(&cudaDiagonalPtr, covRows*sizeof(double), &cudaError);
  327 + double* cudaOffdiagonalPtr;
  328 + CUDA_SAFE_MALLOC(&cudaOffdiagonalPtr, covRows*sizeof(double), &cudaError);
  329 + double* cudaTauqPtr;
  330 + CUDA_SAFE_MALLOC(&cudaTauqPtr, covRows*sizeof(double), &cudaError);
  331 + double* cudaTaupPtr;
  332 + CUDA_SAFE_MALLOC(&cudaTaupPtr, covRows*sizeof(double), &cudaError);
  333 +
  334 + // calculate lwork
  335 + int lwork;
  336 + cusolverDnSgebrd_bufferSize(
  337 + cusolverHandle,
  338 + covRows,
  339 + covRows,
  340 + &lwork
  341 + );
  342 + double* cudaWorkBufferPtr;
  343 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, lwork, &cudaError);
  344 +
  345 + int* cudaDevInfoPtr;
  346 + CUDA_SAFE_MALLOC(&cudaDevInfoPtr, sizeof(int), &cudaError);
  347 +
  348 + // call the eigenvalue decomposer
  349 + cusolverDnDgebrd(
  350 + cusolverHandle,
  351 + covRows,
  352 + covRows,
  353 + cudaCovariancePtr,
  354 + covRows,
  355 + cudaDiagonalPtr,
  356 + cudaOffdiagonalPtr,
  357 + cudaTauqPtr,
  358 + cudaTaupPtr,
  359 + cudaWorkBufferPtr,
  360 + lwork,
  361 + cudaDevInfoPtr
  362 + );
  363 +
  364 + // the eigenvalues are on the diagonal
  365 + cudaEigenvaluesPtr = cudaOffdiagonalPtr;
  366 +
  367 + /*
  368 + // initialize the result buffers
  369 + double* cudaOffdiagonalPtr;
  370 + CUDA_SAFE_MALLOC(&cudaEigenvaluesPtr, covRows*sizeof(cudaEigenvaluesPtr[0]), &cudaError);
  371 + CUDA_SAFE_MALLOC(&cudaOffdiagonalPtr, covRows*sizeof(cudaOffdiagonalPtr[0]), &cudaError);
  372 +
  373 + // initialize the tauq and taup buffers
  374 + double* cudaTauqPtr;
  375 + double* cudaTaupPtr;
  376 + CUDA_SAFE_MALLOC(&cudaTauqPtr, covRows*sizeof(cudaTauqPtr[0]), &cudaError);
  377 + CUDA_SAFE_MALLOC(&cudaTaupPtr, covRows*sizeof(cudaTaupPtr[0]), &cudaError);
  378 +
  379 + // build the work buffer
  380 + double* cudaWorkPtr;
  381 + int workBufferSize;
  382 + cusolverDnSgesvd_bufferSize(cusolverHandle, covRows, covRows, &workBufferSize);
  383 + CUDA_SAFE_MALLOC(&cudaWorkPtr, workBufferSize, &cudaError);
  384 +
  385 + int* cudaDevInfoPtr;
  386 + CUDA_SAFE_MALLOC(&cudaDevInfoPtr, sizeof(*cudaDevInfoPtr), &cudaError);
  387 +
  388 + // now pull the eigenvalues out
  389 + cusolverStatus_t cusolverStatus;
  390 + cusolverStatus = cusolverDnDgebrd(
  391 + cusolverHandle, // handle
  392 + covRows, // rows of Matrix A
  393 + covRows, // cols of Matrix A
  394 + cudaCovariancePtr, // CUDA pointer to matrix A
  395 + covRows, // leading dimension of A
  396 + cudaEigenvaluesPtr, // diagonal elements of bidiagonal matrix
  397 + cudaOffdiagonalPtr, // off-diagonal elements of matrix
  398 + cudaTauqPtr,
  399 + cudaTaupPtr,
  400 + cudaWorkPtr,
  401 + workBufferSize,
  402 + cudaDevInfoPtr
  403 + );
  404 +
  405 + // print out the devInfo
  406 + int devInfo;
  407 + cudaMemcpy(&devInfo, cudaDevInfoPtr, sizeof(devInfo), cudaMemcpyDeviceToHost);
  408 +
  409 + // now we have the eigenvalues
  410 +
  411 + // XXX: the off diagonal values
  412 + Eigen::VectorXd offDiagonal(covRows);
  413 + cublasGetVector(
  414 + covRows,
  415 + sizeof(offDiagonal.data()[0]),
  416 + cudaOffdiagonalPtr,
  417 + 1,
  418 + offDiagonal.data(),
  419 + 1
  420 + );
  421 +
  422 + // clean up
  423 + CUDA_SAFE_FREE(cudaOffdiagonalPtr, &cudaError);
  424 + CUDA_SAFE_FREE(cudaTauqPtr, &cudaError);
  425 + CUDA_SAFE_FREE(cudaTaupPtr, &cudaError);
  426 + CUDA_SAFE_FREE(cudaWorkPtr, &cudaError);
  427 + CUDA_SAFE_FREE(cudaDevInfoPtr, &cudaError);
  428 + */
  429 + }
  430 +
  431 + // copy the eigenvalues back to the CPU
  432 + Eigen::VectorXd allEVals(covRows);
  433 + cublasGetVector(
  434 + covRows,
  435 + sizeof(allEVals.data()[0]),
  436 + cudaEigenvaluesPtr,
  437 + 1,
  438 + allEVals.data(),
  439 + 1
  440 + );
  441 + CUDA_SAFE_FREE(cudaEigenvaluesPtr, &cudaError);
  442 +
  443 + // now find the eigenvectors
  444 + Eigen::MatrixXd allEVecs(covRows, covRows);
  445 + double* cudaCoefficientMatrix;
  446 + CUDA_SAFE_MALLOC(&cudaCoefficientMatrix, covRows*covRows*sizeof(cudaCoefficientMatrix[0]), &cudaError);
  447 + for (int i=0; i < covRows; i++) {
  448 + // load cov into matrix
  449 + cublasSetMatrix(
  450 + covRows,
  451 + covRows,
  452 + sizeof(cov.data()[0]),
  453 + cov.data(),
  454 + covRows,
  455 + cudaCoefficientMatrix,
  456 + covRows
  457 + );
  458 +
  459 + // subtract out the Eigenvalue from the center of the matrix
  460 + // first copy the eigenvalue into a single buffer
  461 + double* cudaEigenvalueSubtractBuffer;
  462 + CUDA_SAFE_MALLOC(&cudaEigenvalueSubtractBuffer, covRows*sizeof(cudaEigenvalueSubtractBuffer[0]), &cudaError);
  463 + for(int j = 0; j < covRows; j++) {
  464 + cublasSetVector(
  465 + 1,
  466 + sizeof(cudaEigenvalueSubtractBuffer[0]),
  467 + &allEVals.data()[i],
  468 + 1,
  469 + &cudaEigenvalueSubtractBuffer[j],
  470 + 1
  471 + );
279 472 }
280 473  
281   - if (keep <= 0) {
282   - keep = dimsIn - drop;
283   - } else if (keep < 1) {
284   - // Keep eigenvectors that retain a certain energy percentage.
285   - const double totalEnergy = allEVals.sum();
286   - if (totalEnergy == 0) {
287   - keep = 0;
288   - } else {
289   - double currentEnergy = 0;
290   - int i=0;
291   - while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
292   - currentEnergy += allEVals(allEVals.rows()-(i+1));
293   - i++;
294   - }
295   - keep = i - drop;
296   - }
297   - } else {
298   - if (keep + drop > allEVals.rows()) {
299   - qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
300   - keep = allEVals.rows() - drop;
301   - }
  474 + // perform the subtraction
  475 + cublasDaxpy(
  476 + cublasHandle,
  477 + covRows,
  478 + &negativeOne,
  479 + cudaEigenvalueSubtractBuffer,
  480 + 1,
  481 + cudaCoefficientMatrix,
  482 + covRows+1 // move across the diagonal
  483 + );
  484 +
  485 + // perform the Cholesky factorization of the coefficient matrix
  486 + double* cudaWorkBufferPtr;
  487 + int lwork;
  488 + cusolverDnDpotrf_bufferSize(
  489 + cusolverHandle,
  490 + CUBLAS_FILL_MODE_UPPER,
  491 + covRows,
  492 + cudaCoefficientMatrix,
  493 + covRows,
  494 + &lwork
  495 + );
  496 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, lwork, &cudaError);
  497 +
  498 + int* cudaDevInfoPtr;
  499 + CUDA_SAFE_MALLOC(&cudaDevInfoPtr, sizeof(*cudaDevInfoPtr), &cudaError);
  500 +
  501 + cusolverDnDpotrf(
  502 + cusolverHandle,
  503 + CUBLAS_FILL_MODE_UPPER,
  504 + covRows,
  505 + cudaCoefficientMatrix,
  506 + covRows,
  507 + cudaWorkBufferPtr,
  508 + lwork,
  509 + cudaDevInfoPtr
  510 + );
  511 + int devInfo;
  512 + CUDA_SAFE_MEMCPY(&devInfo, cudaDevInfoPtr, sizeof(devInfo), cudaMemcpyDeviceToHost, &cudaError);
  513 + cout << "DevInfo: " << devInfo << endl;
  514 +
  515 +
  516 + // XXX: remove after dbugging
  517 + Eigen::MatrixXd anotherMatrix(covRows, covRows);
  518 + cublasGetMatrix(
  519 + covRows,
  520 + covRows,
  521 + sizeof(anotherMatrix.data()[0]),
  522 + cudaCoefficientMatrix,
  523 + 1,
  524 + anotherMatrix.data(),
  525 + 1
  526 + );
  527 +
  528 + // the first element of B is equal to the covariance matrix, the rest are zeroes
  529 + double* cudaBVector;
  530 + CUDA_SAFE_MALLOC(&cudaBVector, covRows*sizeof(cudaBVector[0]), &cudaError);
  531 + // load the top element to be the same as first of coefficient
  532 + // this results in the first variable being zero and assigning
  533 + // values for the rest of the matrix
  534 + cublasDcopy(
  535 + cublasHandle,
  536 + 1,
  537 + cudaCoefficientMatrix,
  538 + 1,
  539 + cudaBVector,
  540 + 1
  541 + );
  542 + // load the rest 0's
  543 + for (int j = 1; j < covRows; j++) {
  544 + cublasSetVector(
  545 + 1,
  546 + sizeof(cudaBVector[0]),
  547 + &zero,
  548 + 1,
  549 + &cudaBVector[j],
  550 + 1
  551 + );
302 552 }
303 553  
304   - // Keep highest energy vectors
305   - eVals = Eigen::VectorXf((int)keep, 1);
306   - eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
307   - for (int i=0; i<keep; i++) {
308   - int index = allEVals.rows()-(i+drop+1);
309   - eVals(i) = allEVals(index);
310   - eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
311   - if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  554 + // solve the system of linear equations
  555 + cusolverDnDpotrs(
  556 + cusolverHandle,
  557 + CUBLAS_FILL_MODE_LOWER,
  558 + covRows,
  559 + 1, // we are solving a single system of equations
  560 + cudaCoefficientMatrix,
  561 + covRows,
  562 + cudaBVector,
  563 + covRows,
  564 + cudaDevInfoPtr
  565 + );
  566 + CUDA_SAFE_MEMCPY(&devInfo, cudaDevInfoPtr, sizeof(devInfo), cudaMemcpyDeviceToHost, &cudaError);
  567 + cout << "DevInfo: " << devInfo << endl;
  568 +
  569 + // should have the solution
  570 + Eigen::VectorXd solutionVector(covRows);
  571 + cublasGetVector(
  572 + covRows,
  573 + sizeof(solutionVector.data()[0]),
  574 + solutionVector.data(),
  575 + 1,
  576 + cudaBVector,
  577 + 1
  578 + );
  579 +
  580 + cout << "solution: [";
  581 + for (int i=0; i < covRows; i++) {
  582 + cout << solutionVector.data()[i] << ", ";
312 583 }
  584 + cout << "];" << endl;
313 585  
314   - // Debug output
315   - if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
  586 + }
  587 +
  588 + // Keep eigenvectors that retain a certain energy percentage.
  589 + const float totalEnergy = allEVals.sum();
  590 + if (totalEnergy == 0) {
  591 + keep = 0;
  592 + } else {
  593 + float currentEnergy = 0;
  594 + int i=0;
  595 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  596 + currentEnergy += allEVals(allEVals.rows()-(i+1));
  597 + i++;
  598 + }
  599 + keep = i - drop;
  600 + }
  601 +
  602 + // Keep highest energy vectors
  603 + Eigen::VectorXf eVals = Eigen::VectorXf((int)keep, 1);
  604 + for (int i=0; i<keep; i++) {
  605 + int index = allEVals.rows()-(i+drop+1);
  606 + eVals(i) = allEVals(index);
  607 + }
  608 +
  609 + cusolverDnDestroy(cusolverHandle);
  610 + cout << "DONE" << endl;
316 611 }
317 612  
  613 +
318 614 void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
319 615 {
320 616 const int originalCols = mean.rows() / originalRows;
... ...