Commit 59d38ff01fcfeb96b75fd610f8e180f11665788b

Authored by Josh Klontz
Committed by GitHub
2 parents a91c55a5 a7f8896e

Merge pull request #490 from DepthDeluxe/master

CUDA-accelerated PCA training and improved PCA projection speeds
openbr/plugins/cuda/cudadefines.hpp
@@ -19,6 +19,9 @@ @@ -19,6 +19,9 @@
19 using namespace std; 19 using namespace std;
20 #include <pthread.h> 20 #include <pthread.h>
21 21
  22 +#include <cublas_v2.h>
  23 +
  24 +
22 #define CUDA_SAFE_FREE(cudaPtr, errPtr) \ 25 #define CUDA_SAFE_FREE(cudaPtr, errPtr) \
23 /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \ 26 /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \
24 *errPtr = cudaFree(cudaPtr); \ 27 *errPtr = cudaFree(cudaPtr); \
@@ -48,3 +51,53 @@ using namespace std; @@ -48,3 +51,53 @@ using namespace std;
48 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ 51 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
49 throw 0; \ 52 throw 0; \
50 } 53 }
  54 +
  55 +#define CUBLAS_ERROR_CHECK(error) { \
  56 + switch (error) { \
  57 + case CUBLAS_STATUS_SUCCESS: \
  58 + break; \
  59 + case CUBLAS_STATUS_NOT_INITIALIZED: \
  60 + cout << "CUBLAS_STATUS_NOT_INITIALIZED" << endl; \
  61 + break; \
  62 + case CUBLAS_STATUS_ALLOC_FAILED: \
  63 + cout << "CUBLAS_STATUS_ALLOC_FAILED" << endl; \
  64 + break; \
  65 + case CUBLAS_STATUS_INVALID_VALUE: \
  66 + cout << "CUBLAS_STATUS_INVALID_VALUE" << endl; \
  67 + break; \
  68 + case CUBLAS_STATUS_ARCH_MISMATCH: \
  69 + cout << "CUBLAS_STATUS_ARCH_MISMATCH" << endl; \
  70 + break; \
  71 + case CUBLAS_STATUS_MAPPING_ERROR: \
  72 + cout << "CUBLAS_STATUS_MAPPING_ERROR" << endl; \
  73 + break; \
  74 + case CUBLAS_STATUS_EXECUTION_FAILED: \
  75 + cout << "CUBLAS_STATUS_EXECUTION_FAILED" << endl; \
  76 + break; \
  77 + case CUBLAS_STATUS_INTERNAL_ERROR: \
  78 + cout << "CUBLAS_STATUS_INTERNAL_ERROR" << endl; \
  79 + break; \
  80 + default: \
  81 + cout << "<unknown>: " << error << endl; \
  82 + break; \
  83 + } \
  84 +}
  85 +
  86 +#define CUSOLVER_ERROR_CHECK(error) { \
  87 + switch(error) { \
  88 + case CUSOLVER_STATUS_SUCCESS: \
  89 + break; \
  90 + case CUSOLVER_STATUS_NOT_INITIALIZED: \
  91 + cout << "CUSOLVER_STATUS_NOT_INITIALIZED" << endl; \
  92 + break; \
  93 + case CUSOLVER_STATUS_ALLOC_FAILED: \
  94 + cout << "CUSOLVER_STATUS_ALLOC_FAILED" << endl; \
  95 + break; \
  96 + case CUSOLVER_STATUS_ARCH_MISMATCH: \
  97 + cout << "CUSOLVER_STATUS_ARCH_MISMATCH" << endl; \
  98 + break; \
  99 + default: \
  100 + cout << "<unknown>: " << error << endl; \
  101 + break; \
  102 + } \
  103 +}
openbr/plugins/cuda/cudapca.cpp
@@ -30,11 +30,14 @@ using namespace cv; @@ -30,11 +30,14 @@ using namespace cv;
30 #include <openbr/core/eigenutils.h> 30 #include <openbr/core/eigenutils.h>
31 #include <openbr/core/opencvutils.h> 31 #include <openbr/core/opencvutils.h>
32 32
33 -// definitions from the CUDA source file 33 +#include <cuda_runtime.h>
  34 +#include <cublas_v2.h>
  35 +#include <cusolverDn.h>
  36 +#include "cudadefines.hpp"
  37 +
34 namespace br { namespace cuda { namespace pca { 38 namespace br { namespace cuda { namespace pca {
35 - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);  
36 - void trainWrapper(void* cudaSrc, float* dst, int rows, int cols);  
37 - void wrapper(void* src, void** dst, int imgRows, int imgCols); 39 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems);
  40 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems);
38 }}} 41 }}}
39 42
40 namespace br 43 namespace br
@@ -61,13 +64,26 @@ protected: @@ -61,13 +64,26 @@ protected:
61 BR_PROPERTY(int, drop, 0) 64 BR_PROPERTY(int, drop, 0)
62 BR_PROPERTY(bool, whiten, false) 65 BR_PROPERTY(bool, whiten, false)
63 66
64 - Eigen::VectorXf mean, eVals; 67 + Eigen::VectorXf mean;
  68 + Eigen::VectorXf eVals;
65 Eigen::MatrixXf eVecs; 69 Eigen::MatrixXf eVecs;
66 70
67 - int originalRows; 71 + cublasHandle_t cublasHandle;
  72 + float* cudaMeanPtr; // holds the "keep" long vector
  73 + float* cudaEvPtr; // holds all the eigenvectors
68 74
69 public: 75 public:
70 - CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {} 76 + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {
  77 + // try to initialize CUBLAS
  78 + cublasStatus_t status;
  79 + status = cublasCreate(&cublasHandle);
  80 + CUBLAS_ERROR_CHECK(status);
  81 + }
  82 +
  83 + ~CUDAPCATransform() {
  84 + // tear down CUBLAS
  85 + cublasDestroy(cublasHandle);
  86 + }
71 87
72 private: 88 private:
73 double residualReconstructionError(const Template &src) const 89 double residualReconstructionError(const Template &src) const
@@ -83,45 +99,38 @@ private: @@ -83,45 +99,38 @@ private:
83 99
84 void train(const TemplateList &cudaTrainingSet) 100 void train(const TemplateList &cudaTrainingSet)
85 { 101 {
86 - // copy the data back from the graphics card so the training can be done on the CPU  
87 - const int instances = cudaTrainingSet.size(); // get the number of training set instances  
88 - QList<Template> trainingQlist;  
89 - for(int i=0; i<instances; i++) {  
90 - Template currentTemplate = cudaTrainingSet[i];  
91 - void* const* srcDataPtr = currentTemplate.m().ptr<void*>();  
92 - void* cudaMemPtr = srcDataPtr[0];  
93 - int rows = *((int*)srcDataPtr[1]);  
94 - int cols = *((int*)srcDataPtr[2]);  
95 - int type = *((int*)srcDataPtr[3]);  
96 -  
97 - if (type != CV_32FC1) {  
98 - qFatal("Requires single channel 32-bit floating point matrices.");  
99 - }  
100 -  
101 - Mat mat = Mat(rows, cols, type);  
102 - br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);  
103 - trainingQlist.append(Template(mat));  
104 - }  
105 -  
106 - // assemble a TemplateList from the list of data  
107 - TemplateList trainingSet(trainingQlist);  
108 -  
109 -  
110 - originalRows = trainingSet.first().m().rows; // get number of rows of first image  
111 - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image 102 + cublasStatus_t cublasStatus;
  103 + cudaError_t cudaError;
  104 +
  105 + // put all the data into a single matrix to perform PCA
  106 + const int instances = cudaTrainingSet.size();
  107 + const int dimsIn = *(int*)cudaTrainingSet.first().m().ptr<void*>()[1]
  108 + * *(int*)cudaTrainingSet.first().m().ptr<void*>()[2];
  109 +
  110 + // copy the data over
  111 + double* cudaDataPtr;
  112 + CUDA_SAFE_MALLOC(&cudaDataPtr, instances*dimsIn*sizeof(cudaDataPtr[0]), &cudaError);
  113 + for (int i=0; i < instances; i++) {
  114 + br::cuda::pca::castFloatToDouble(
  115 + (float*)(cudaTrainingSet[i].m().ptr<void*>()[0]),
  116 + 1,
  117 + cudaDataPtr+i*dimsIn,
  118 + 1,
  119 + dimsIn
  120 + );
  121 + }
112 122
113 - // Map into 64-bit Eigen matrix  
114 - Eigen::MatrixXd data(dimsIn, instances); // create a mat  
115 - for (int i=0; i<instances; i++) {  
116 - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();  
117 - } 123 + trainCore(cudaDataPtr, dimsIn, instances);
118 124
119 - trainCore(data); 125 + CUDA_SAFE_FREE(cudaDataPtr, &cudaError);
120 } 126 }
121 127
122 void project(const Template &src, Template &dst) const 128 void project(const Template &src, Template &dst) const
123 { 129 {
  130 + cudaError_t cudaError;
  131 +
124 void* const* srcDataPtr = src.m().ptr<void*>(); 132 void* const* srcDataPtr = src.m().ptr<void*>();
  133 + float* srcGpuMatPtr = (float*)srcDataPtr[0];
125 int rows = *((int*)srcDataPtr[1]); 134 int rows = *((int*)srcDataPtr[1]);
126 int cols = *((int*)srcDataPtr[2]); 135 int cols = *((int*)srcDataPtr[2]);
127 int type = *((int*)srcDataPtr[3]); 136 int type = *((int*)srcDataPtr[3]);
@@ -131,137 +140,416 @@ private: @@ -131,137 +140,416 @@ private:
131 throw 0; 140 throw 0;
132 } 141 }
133 142
  143 + // save the destination rows
  144 + int dstRows = (int)keep;
  145 +
134 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); 146 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
135 void** dstDataPtr = dstMat.ptr<void*>(); 147 void** dstDataPtr = dstMat.ptr<void*>();
  148 + float** dstGpuMatPtrPtr = (float**)dstDataPtr;
136 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1; 149 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
137 - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; 150 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = dstRows;
138 dstDataPtr[3] = srcDataPtr[3]; 151 dstDataPtr[3] = srcDataPtr[3];
139 152
140 - cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);  
141 153
  154 + // allocate the memory and set to zero
  155 + //cout << "Allocating destination memory" << endl;
  156 + cublasStatus_t status;
  157 + cudaMalloc(dstGpuMatPtrPtr, dstRows*sizeof(float));
  158 + cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float));
  159 +
  160 + {
  161 + float negativeOne = -1.0f;
  162 + status = cublasSaxpy(
  163 + cublasHandle, // handle
  164 + dstRows, // vector length
  165 + &negativeOne, // alpha (-1)
  166 + cudaMeanPtr, // mean
  167 + 1, // stride
  168 + srcGpuMatPtr, // y, the source
  169 + 1 // stride
  170 + );
  171 + CUBLAS_ERROR_CHECK(status);
  172 + }
  173 +
  174 + {
  175 + float one = 1.0f;
  176 + float zero = 0.0f;
  177 + status = cublasSgemv(
  178 + cublasHandle, // handle
  179 + CUBLAS_OP_T, // transposed matrix-vector multiplication
  180 + eVecs.rows(), // # rows
  181 + eVecs.cols(), // # cols
  182 + &one, // alpha (1)
  183 + cudaEvPtr, // pointer to the matrix
  184 + eVecs.rows(), // leading dimension of matrix
  185 + srcGpuMatPtr, // vector for multiplication
  186 + 1, // stride (1)
  187 + &zero, // beta (0)
  188 + *dstGpuMatPtrPtr, // vector to store the result
  189 + 1 // stride (1)
  190 + );
  191 + CUBLAS_ERROR_CHECK(status);
  192 + }
  193 +
  194 + //cout << "Saving result" << endl;
142 dst = dstMat; 195 dst = dstMat;
  196 + CUDA_SAFE_FREE(srcGpuMatPtr, &cudaError);
143 } 197 }
144 198
145 void store(QDataStream &stream) const 199 void store(QDataStream &stream) const
146 { 200 {
147 - stream << keep << drop << whiten << originalRows << mean << eVals << eVecs; 201 + stream << keep << drop << whiten << mean << eVecs;
148 } 202 }
149 203
150 void load(QDataStream &stream) 204 void load(QDataStream &stream)
151 { 205 {
152 - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; 206 + stream >> keep >> drop >> whiten >> mean >> eVecs;
  207 +
  208 + //cout << "Starting load process" << endl;
  209 +
  210 + cudaError_t cudaError;
  211 + cublasStatus_t cublasStatus;
  212 + CUDA_SAFE_MALLOC(&cudaMeanPtr, mean.rows()*mean.cols()*sizeof(float), &cudaError);
  213 + CUDA_SAFE_MALLOC(&cudaEvPtr, eVecs.rows()*eVecs.cols()*sizeof(float), &cudaError);
  214 +
  215 + //cout << "Setting vector" << endl;
  216 + // load the mean vector into GPU memory
  217 + cublasStatus = cublasSetVector(
  218 + mean.rows()*mean.cols(),
  219 + sizeof(float),
  220 + mean.data(),
  221 + 1,
  222 + cudaMeanPtr,
  223 + 1
  224 + );
  225 + CUBLAS_ERROR_CHECK(cublasStatus);
  226 +
  227 + //cout << "Setting the matrix" << endl;
  228 + // load the eigenvector matrix into GPU memory
  229 + cublasStatus = cublasSetMatrix(
  230 + eVecs.rows(),
  231 + eVecs.cols(),
  232 + sizeof(float),
  233 + eVecs.data(),
  234 + eVecs.rows(),
  235 + cudaEvPtr,
  236 + eVecs.rows()
  237 + );
  238 + CUBLAS_ERROR_CHECK(cublasStatus);
  239 + }
153 240
154 - // serialize the eigenvectors  
155 - float* evBuffer = new float[eVecs.rows() * eVecs.cols()];  
156 - for (int i=0; i < eVecs.rows(); i++) {  
157 - for (int j=0; j < eVecs.cols(); j++) {  
158 - evBuffer[i*eVecs.cols() + j] = eVecs(i, j);  
159 - }  
160 - } 241 +protected:
  242 + void trainCore(double* cudaDataPtr, int dimsIn, int instances) {
  243 + cudaError_t cudaError;
  244 +
  245 + const bool dominantEigenEstimation = (dimsIn > instances);
  246 +
  247 + Eigen::MatrixXd allEVals, allEVecs;
161 248
162 - // serialize the mean  
163 - float* meanBuffer = new float[mean.rows() * mean.cols()];  
164 - for (int i=0; i < mean.rows(); i++) {  
165 - for (int j=0; j < mean.cols(); j++) {  
166 - meanBuffer[i*mean.cols() + j] = mean(i, j); 249 + // allocate the eigenvectors
  250 + if (dominantEigenEstimation) {
  251 + allEVals = Eigen::MatrixXd(instances, 1);
  252 + allEVecs = Eigen::MatrixXd(dimsIn, instances);
  253 + } else {
  254 + allEVals = Eigen::MatrixXd(dimsIn, 1);
  255 + allEVecs = Eigen::MatrixXd(dimsIn, dimsIn);
  256 + }
  257 +
  258 + if (keep != 0) {
  259 + performCovarianceSVD(cudaDataPtr, dimsIn, instances, allEVals, allEVecs);
  260 + } else {
  261 + // null case
  262 + mean = Eigen::VectorXf::Zero(dimsIn);
  263 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  264 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  265 + }
  266 +
  267 + // *****************
  268 + // We have now found the eigenvalues and eigenvectors
  269 + // *****************
  270 +
  271 + if (keep <= 0) {
  272 + keep = dimsIn - drop;
  273 + } else if (keep < 1) {
  274 + // Keep eigenvectors that retain a certain energy percentage.
  275 + const double totalEnergy = allEVals.sum();
  276 + if (totalEnergy == 0) {
  277 + keep = 0;
  278 + } else {
  279 + double currentEnergy = 0;
  280 + int i=0;
  281 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  282 + currentEnergy += allEVals(i);
  283 + i++;
  284 + }
  285 + keep = i - drop;
  286 + }
  287 + } else {
  288 + if (keep + drop > allEVals.rows()) {
  289 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  290 + keep = allEVals.rows() - drop;
167 } 291 }
168 - } 292 + }
169 293
170 - // call the wrapper function  
171 - cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); 294 + // Keep highest energy vectors
  295 + eVals = Eigen::VectorXf((int)keep, 1);
  296 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  297 + for (int i=0; i<keep; i++) {
  298 + int index = i+drop;
  299 + eVals(i) = allEVals(index);
  300 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  301 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  302 + }
172 303
173 - delete evBuffer;  
174 - delete meanBuffer; 304 + // Debug output
  305 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
175 } 306 }
176 307
177 -protected:  
178 - void trainCore(Eigen::MatrixXd data)  
179 - {  
180 - int dimsIn = data.rows();  
181 - int instances = data.cols();  
182 - const bool dominantEigenEstimation = (dimsIn > instances);  
183 -  
184 - Eigen::MatrixXd allEVals, allEVecs;  
185 - if (keep != 0) {  
186 - // Compute and remove mean  
187 - mean = Eigen::VectorXf(dimsIn);  
188 - for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;  
189 - for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);  
190 -  
191 - // Calculate covariance matrix  
192 - Eigen::MatrixXd cov;  
193 - if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);  
194 - else cov = data * data.transpose() / (instances-1.0);  
195 -  
196 - // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.  
197 - Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);  
198 - allEVals = eSolver.eigenvalues();  
199 - allEVecs = eSolver.eigenvectors();  
200 - if (dominantEigenEstimation) allEVecs = data * allEVecs;  
201 - } else {  
202 - // Null case  
203 - mean = Eigen::VectorXf::Zero(dimsIn);  
204 - allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);  
205 - allEVals = Eigen::VectorXd::Ones(dimsIn);  
206 - }  
207 -  
208 - if (keep <= 0) {  
209 - keep = dimsIn - drop;  
210 - } else if (keep < 1) {  
211 - // Keep eigenvectors that retain a certain energy percentage.  
212 - const double totalEnergy = allEVals.sum();  
213 - if (totalEnergy == 0) {  
214 - keep = 0;  
215 - } else {  
216 - double currentEnergy = 0;  
217 - int i=0;  
218 - while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {  
219 - currentEnergy += allEVals(allEVals.rows()-(i+1));  
220 - i++;  
221 - }  
222 - keep = i - drop;  
223 - }  
224 - } else {  
225 - if (keep + drop > allEVals.rows()) {  
226 - qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());  
227 - keep = allEVals.rows() - drop;  
228 - }  
229 - }  
230 -  
231 - // Keep highest energy vectors  
232 - eVals = Eigen::VectorXf((int)keep, 1);  
233 - eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);  
234 - for (int i=0; i<keep; i++) {  
235 - int index = allEVals.rows()-(i+drop+1);  
236 - eVals(i) = allEVals(index);  
237 - eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();  
238 - if (whiten) eVecs.col(i) /= sqrt(eVals(i));  
239 - }  
240 -  
241 - // Debug output  
242 - if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;  
243 - } 308 + // computes the covariance matrix and then pulls the eigenvalues+eigenvectors
  309 + // out of it using SVD of a symmetric matrix
  310 + void performCovarianceSVD(double* cudaDataPtr, int dimsIn, int instances, Eigen::MatrixXd& allEVals, Eigen::MatrixXd& allEVecs) {
  311 + cudaError_t cudaError;
  312 +
  313 + const bool dominantEigenEstimation = (dimsIn > instances);
  314 +
  315 + // used for temporary storage
  316 + Eigen::VectorXd meanDouble(dimsIn);
  317 +
  318 + // compute the mean
  319 + for (int i=0; i < dimsIn; i++) {
  320 + cublasDasum(
  321 + cublasHandle,
  322 + instances,
  323 + cudaDataPtr+i,
  324 + dimsIn,
  325 + meanDouble.data()+i
  326 + );
  327 + }
244 328
245 - void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const  
246 - {  
247 - const int originalCols = mean.rows() / originalRows;  
248 -  
249 - { // Write out mean image  
250 - cv::Mat out(originalRows, originalCols, CV_32FC1);  
251 - Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);  
252 - outMap = mean.col(0);  
253 - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");  
254 - }  
255 -  
256 - // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.  
257 - for (int k=0; k<(int)allEVals.size(); k++) {  
258 - if ((k < 8) || (k >= (int)allEVals.size()-16)) {  
259 - cv::Mat out(originalRows, originalCols, CV_64FC1);  
260 - Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);  
261 - outMap = allEVecs.col(k);  
262 - // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");  
263 - }  
264 - } 329 + // put data back on GPU for further processing
  330 + double* cudaMeanDoublePtr;
  331 + CUDA_SAFE_MALLOC(&cudaMeanDoublePtr, dimsIn*sizeof(cudaMeanDoublePtr[0]), &cudaError);
  332 + cublasSetVector(
  333 + dimsIn,
  334 + sizeof(cudaMeanDoublePtr[0]),
  335 + meanDouble.data(),
  336 + 1,
  337 + cudaMeanDoublePtr,
  338 + 1
  339 + );
  340 +
  341 + // scale to calculate average
  342 + {
  343 + double scaleFactor = 1.0/(double)instances;
  344 + cublasDscal(
  345 + cublasHandle,
  346 + dimsIn,
  347 + &scaleFactor,
  348 + cudaMeanDoublePtr,
  349 + 1
  350 + );
  351 + }
  352 +
  353 + // subtract mean from data
  354 + for (int i=0; i < instances; i++) {
  355 + double negativeOne = -1.0;
  356 + cublasDaxpy(
  357 + cublasHandle,
  358 + dimsIn,
  359 + &negativeOne,
  360 + cudaMeanDoublePtr,
  361 + 1,
  362 + cudaDataPtr+i*dimsIn,
  363 + 1
  364 + );
  365 + }
  366 +
  367 + // convert to float form and copy the data back
  368 + CUDA_SAFE_MALLOC(&cudaMeanPtr, dimsIn*sizeof(cudaMeanPtr[0]), &cudaError);
  369 + br::cuda::pca::castDoubleToFloat(cudaMeanDoublePtr, 1, cudaMeanPtr, 1, dimsIn);
  370 +
  371 + // copy the data back
  372 + mean = Eigen::VectorXf(dimsIn);
  373 + cublasGetVector(
  374 + dimsIn,
  375 + sizeof(cudaMeanPtr[0]),
  376 + cudaMeanPtr,
  377 + 1,
  378 + mean.data(),
  379 + 1
  380 + );
  381 +
  382 + // free up the memory
  383 + CUDA_SAFE_FREE(cudaMeanDoublePtr, &cudaError);
  384 + CUDA_SAFE_FREE(cudaMeanPtr, &cudaError);
  385 +
  386 + // allocate space for the covariance matrix
  387 + double* cudaCovariancePtr;
  388 + int covRows = allEVals.rows();
  389 + CUDA_SAFE_MALLOC(&cudaCovariancePtr, covRows*covRows*sizeof(cudaCovariancePtr[0]), &cudaError);
  390 +
  391 + // compute the covariance matrix
  392 + if (dominantEigenEstimation) {
  393 + // cov = data.transpose() * data / (instances-1.0);
  394 + const double scaleFactor = 1.0/(instances-1.0);
  395 + const double zero = 0.0;
  396 + cublasDgemm(
  397 + cublasHandle,
  398 + CUBLAS_OP_T,
  399 + CUBLAS_OP_N,
  400 + instances,
  401 + instances,
  402 + dimsIn,
  403 + &scaleFactor,
  404 + cudaDataPtr,
  405 + dimsIn,
  406 + cudaDataPtr,
  407 + dimsIn,
  408 + &zero,
  409 + cudaCovariancePtr,
  410 + covRows
  411 + );
  412 + } else {
  413 + // cov = data * data.transpose() / (instances-1.0);
  414 + const double scaleFactor = 1.0/(instances-1.0);
  415 + const double zero = 0.0;
  416 + cublasDgemm(
  417 + cublasHandle,
  418 + CUBLAS_OP_N,
  419 + CUBLAS_OP_T,
  420 + dimsIn,
  421 + dimsIn,
  422 + instances,
  423 + &scaleFactor,
  424 + cudaDataPtr,
  425 + dimsIn,
  426 + cudaDataPtr,
  427 + dimsIn,
  428 + &zero,
  429 + cudaCovariancePtr,
  430 + covRows
  431 + );
  432 + }
  433 +
  434 + cusolverDnHandle_t cusolverHandle;
  435 + cusolverStatus_t cusolverStatus;
  436 + cusolverDnCreate(&cusolverHandle);
  437 +
  438 + // allocate appropriate working space
  439 + int svdLWork;
  440 + cusolverDnDgesvd_bufferSize(
  441 + cusolverHandle,
  442 + covRows,
  443 + covRows,
  444 + &svdLWork
  445 + );
  446 + double* cudaSvdWork;
  447 + CUDA_SAFE_MALLOC(&cudaSvdWork, svdLWork*sizeof(cudaSvdWork[0]), &cudaError);
  448 +
  449 + double* cudaUPtr;
  450 + CUDA_SAFE_MALLOC(&cudaUPtr, covRows*covRows*sizeof(cudaUPtr[0]), &cudaError);
  451 + double* cudaSPtr;
  452 + CUDA_SAFE_MALLOC(&cudaSPtr, covRows*sizeof(cudaSPtr[0]), &cudaError);
  453 + double* cudaVTPtr;
  454 + CUDA_SAFE_MALLOC(&cudaVTPtr, covRows*covRows*sizeof(cudaVTPtr[0]), &cudaError);
  455 +
  456 + int* cudaSvdDevInfoPtr;
  457 + CUDA_SAFE_MALLOC(&cudaSvdDevInfoPtr, sizeof(*cudaSvdDevInfoPtr), &cudaError);
  458 + int svdDevInfo;
  459 +
  460 + // perform SVD on an n x m matrix, in this case the matrix is the covariance
  461 + // matrix and is symmetric, meaning the SVD will calculate the eigenvalues
  462 + // and eigenvectors for us.
  463 + cusolverStatus = cusolverDnDgesvd(
  464 + cusolverHandle,
  465 + 'A', // all columns of unitary matrix
  466 + 'A', // all columns of array VT
  467 + covRows, // m
  468 + covRows, // n
  469 + cudaCovariancePtr, // decomposing the covariance matrix
  470 + covRows, // lda
  471 + cudaSPtr, // holds S
  472 + cudaUPtr, // holds U
  473 + covRows, // ldu
  474 + cudaVTPtr, // holds VT
  475 + covRows, // ldvt
  476 + cudaSvdWork, // work buffer ptr
  477 + svdLWork, // length of the work buffer
  478 + NULL, // rwork, not used for real data types
  479 + cudaSvdDevInfoPtr // devInfo pointer
  480 + );
  481 + CUSOLVER_ERROR_CHECK(cusolverStatus);
  482 +
  483 + // get the eigenvalues and free memory
  484 + cublasGetVector(
  485 + covRows,
  486 + sizeof(cudaSPtr[0]),
  487 + cudaSPtr,
  488 + 1,
  489 + allEVals.data(),
  490 + 1
  491 + );
  492 + CUDA_SAFE_FREE(cudaSvdWork, &cudaError);
  493 + CUDA_SAFE_FREE(cudaSPtr, &cudaError);
  494 + CUDA_SAFE_FREE(cudaVTPtr, &cudaError);
  495 + CUDA_SAFE_FREE(cudaSvdDevInfoPtr, &cudaError);
  496 +
  497 + // if this is a dominant eigen estimation, then perform matrix multiplication again
  498 + // if (dominantEigenEstimation) allEVecs = data * allEVecs;
  499 + if (dominantEigenEstimation) {
  500 + double* cudaMultedAllEVecs;
  501 + CUDA_SAFE_MALLOC(&cudaMultedAllEVecs, dimsIn*instances*sizeof(cudaMultedAllEVecs[0]), &cudaError);
  502 + const double one = 1.0;
  503 + const double zero = 0;
  504 +
  505 + cublasDgemm(
  506 + cublasHandle, // handle
  507 + CUBLAS_OP_N, // transa
  508 + CUBLAS_OP_N, // transb
  509 + dimsIn, // m
  510 + instances, // n
  511 + instances, // k
  512 + &one, // alpha
  513 + cudaDataPtr, // A
  514 + dimsIn, // lda
  515 + cudaUPtr, // B
  516 + instances, // ldb
  517 + &zero, // beta
  518 + cudaMultedAllEVecs, // C
  519 + dimsIn // ldc
  520 + );
  521 +
  522 + // get the eigenvectors from the multiplied value
  523 + cublasGetMatrix(
  524 + dimsIn,
  525 + instances,
  526 + sizeof(cudaMultedAllEVecs[0]),
  527 + cudaMultedAllEVecs,
  528 + dimsIn,
  529 + allEVecs.data(),
  530 + dimsIn
  531 + );
  532 +
  533 + // free the memory used for multiplication
  534 + CUDA_SAFE_FREE(cudaMultedAllEVecs, &cudaError);
  535 + } else {
  536 + // get the eigenvectors straight from the SVD
  537 + cublasGetMatrix(
  538 + covRows,
  539 + covRows,
  540 + sizeof(cudaUPtr[0]),
  541 + cudaUPtr,
  542 + covRows,
  543 + allEVecs.data(),
  544 + covRows
  545 + );
  546 + }
  547 +
  548 +
  549 + // free all the memory
  550 + CUDA_SAFE_FREE(cudaCovariancePtr, &cudaError);
  551 + CUDA_SAFE_FREE(cudaUPtr, &cudaError);
  552 + cusolverDnDestroy(cusolverHandle);
265 } 553 }
266 }; 554 };
267 555
openbr/plugins/cuda/cudapca.cu
1 -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *  
2 - * Copyright 2016 Colin Heinzmann *  
3 - * *  
4 - * Licensed under the Apache License, Version 2.0 (the "License"); *  
5 - * you may not use this file except in compliance with the License. *  
6 - * You may obtain a copy of the License at *  
7 - * *  
8 - * http://www.apache.org/licenses/LICENSE-2.0 *  
9 - * *  
10 - * Unless required by applicable law or agreed to in writing, software *  
11 - * distributed under the License is distributed on an "AS IS" BASIS, *  
12 - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *  
13 - * See the License for the specific language governing permissions and *  
14 - * limitations under the License. *  
15 - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */  
16 -  
17 -#include <iostream>  
18 -using namespace std;  
19 -  
20 -#include <opencv2/opencv.hpp>  
21 -#include <opencv2/gpu/gpu.hpp>  
22 -  
23 #include "cudadefines.hpp" 1 #include "cudadefines.hpp"
24 2
25 -using namespace cv;  
26 -using namespace cv::gpu;  
27 -  
28 -/*  
29 - * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details  
30 - */  
31 -  
32 namespace br { namespace cuda { namespace pca { 3 namespace br { namespace cuda { namespace pca {
33 - __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {  
34 - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;  
35 - int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;  
36 -  
37 - if (evIdx >= numEigenvectors || stepIdx >= numSteps) { 4 + __global__ void castFloatToDoubleKernel(float* a, int inca, double* b, int incb, int numElems) {
  5 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  6 + if (index >= numElems) {
38 return; 7 return;
39 } 8 }
40 9
41 - float acc = 0;  
42 - int startIdx = stepSize*stepIdx;  
43 - int stopIdx = startIdx+stepSize;  
44 - if (startIdx >= numPixels) {  
45 - return;  
46 - }  
47 - if (stopIdx >= numPixels) {  
48 - stopIdx = numPixels;  
49 - }  
50 - for(int i=startIdx; i < stopIdx; i++) {  
51 - acc += src[i]*evPtr[i*numEigenvectors + evIdx];  
52 - }  
53 -  
54 - intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; 10 + b[index*incb] = (double)a[index*inca];
55 } 11 }
56 12
57 - __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {  
58 - int evIdx = blockIdx.x*blockDim.x+threadIdx.x;  
59 - if (evIdx >= numEigenvectors) { 13 + __global__ void castDoubleToFloatKernel(double* a, int inca, float* b, int incb, int numElems) {
  14 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  15 + if (index >= numElems) {
60 return; 16 return;
61 } 17 }
62 18
63 - if (numSteps*stepSize+evIdx >= numEigenvectors) {  
64 - numSteps--;  
65 - }  
66 -  
67 - float acc = 0;  
68 - for (int i=0; i < numSteps; i++) {  
69 - int ibIdx = i*stepSize + evIdx;  
70 - acc += intermediaryBuffer[ibIdx];  
71 - }  
72 -  
73 - out[evIdx] = acc; 19 + b[index*incb] = (float)a[index*inca];
74 } 20 }
75 21
76 - __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {  
77 - int idx = blockIdx.x*blockDim.x+threadIdx.x;  
78 -  
79 - // perform bound checking  
80 - if (idx >= numElems) {  
81 - return;  
82 - }  
83 -  
84 - // subtract out the mean  
85 - out[idx] -= mean[idx];  
86 - }  
87 -  
88 - // _evRows: the number of pixels in the trained images  
89 - // _evCols: the number of eigenvectors  
90 - // _meanElems: the number of pixels in an image  
91 - // _stepSize: the number of pixels in a single step  
92 - // _numSteps: the number of steps required to complete operation  
93 - float* cudaEvPtr; int _evRows; int _evCols;  
94 - float* cudaMeanPtr; int _meanElems;  
95 - int _numSteps; int _stepSize;  
96 - float* intermediaryBuffer;  
97 -  
98 - void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {  
99 - _evRows = evRows; _evCols = evCols;  
100 - _meanElems = meanElems;  
101 -  
102 - cudaError_t err;  
103 -  
104 - // copy the eigenvectors to the GPU  
105 - CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);  
106 - CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);  
107 -  
108 - // copy the mean to the GPU  
109 - CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);  
110 - CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);  
111 -  
112 - // initialize the intermediary working space,  
113 - _stepSize = 2048;  
114 - _numSteps = _evRows / _stepSize + 1;  
115 - CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);  
116 - } 22 + void castFloatToDouble(float* a, int inca, double* b, int incb, int numElems) {
  23 + int threadsPerBlock = 512;
  24 + int numBlocks = numElems / threadsPerBlock + 1;
117 25
118 - void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {  
119 - cudaError_t err;  
120 - CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err); 26 + castFloatToDoubleKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
121 } 27 }
122 28
123 - void wrapper(void* src, void** dst, int imgRows, int imgCols) {  
124 - cudaError_t err;  
125 - CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);  
126 -  
127 - if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {  
128 - cout << "ERR: Image dimension mismatch!" << endl;  
129 - throw 0;  
130 - }  
131 -  
132 - // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) 29 + void castDoubleToFloat(double* a, int inca, float* b, int incb, int numElems) {
133 int threadsPerBlock = 512; 30 int threadsPerBlock = 512;
134 - int numBlocks = _meanElems / threadsPerBlock + 1;  
135 - subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);  
136 - CUDA_KERNEL_ERR_CHK(&err);  
137 -  
138 - // perform matrix multiplication  
139 - dim3 threadsPerBlock2d(512, 1);  
140 - dim3 numBlocks2d(  
141 - _evCols / threadsPerBlock2d.x + 1,  
142 - _numSteps / threadsPerBlock2d.y + 1);  
143 - multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);  
144 - CUDA_KERNEL_ERR_CHK(&err);  
145 -  
146 - threadsPerBlock = 512;  
147 - numBlocks = _evCols / threadsPerBlock + 1;  
148 - multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);  
149 - CUDA_KERNEL_ERR_CHK(&err); 31 + int numBlocks = numElems / threadsPerBlock + 1;
150 32
151 - // free the src memory  
152 - CUDA_SAFE_FREE(src, &err); 33 + castDoubleToFloatKernel<<<numBlocks, threadsPerBlock>>>(a, inca, b, incb, numElems);
153 } 34 }
154 }}} 35 }}}
openbr/plugins/cuda/module.cmake
@@ -28,6 +28,6 @@ if(BR_WITH_CUDA) @@ -28,6 +28,6 @@ if(BR_WITH_CUDA)
28 28
29 # add the compiled source and libs into the build system 29 # add the compiled source and libs into the build system
30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ}) 30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
31 - set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES}) 31 + set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES} "cublas" "cusolver")
32 32
33 endif() 33 endif()