Commit b6adde60b31838380945ae08efcaeaf42e028e50

Authored by DepthDeluxe
1 parent 237af8f5

added cuBLAS accelerated PCA

much faster than homebrew PCA
openbr/plugins/cuda/cublaspca.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +#include <unistd.h>
  20 +
  21 +#include <QList>
  22 +
  23 +#include <Eigen/Dense>
  24 +
  25 +#include <opencv2/opencv.hpp>
  26 +using namespace cv;
  27 +
  28 +#include <openbr/plugins/openbr_internal.h>
  29 +#include <openbr/core/common.h>
  30 +#include <openbr/core/eigenutils.h>
  31 +#include <openbr/core/opencvutils.h>
  32 +
  33 +#include <cuda_runtime.h>
  34 +#include <cublas_v2.h>
  35 +#include "cudadefines.hpp"
  36 +
  37 +namespace br
  38 +{
  39 +/*!
  40 + * \ingroup transforms
  41 + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
  42 + * \author Colin Heinzmann \cite DepthDeluxe
  43 + *
  44 + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
  45 + * \br_property int drop The number of leading eigen-dimensions to drop.
  46 + * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm)
  47 + */
  48 +class CUBLASPCATransform : public Transform
  49 +{
  50 + Q_OBJECT
  51 +
  52 +protected:
  53 + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false)
  54 + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false)
  55 + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false)
  56 +
  57 + BR_PROPERTY(float, keep, 0.95)
  58 + BR_PROPERTY(int, drop, 0)
  59 + BR_PROPERTY(bool, whiten, false)
  60 +
  61 + Eigen::VectorXf mean, eVals;
  62 + Eigen::MatrixXf eVecs;
  63 +
  64 + int originalRows;
  65 +
  66 + cublasHandle_t cublasHandle;
  67 + float* cudaMeanPtr; // holds the "keep" long vector
  68 + float* cudaEvPtr; // holds all the eigenvectors
  69 +
  70 +public:
  71 + CUBLASPCATransform() : keep(0.95), drop(0), whiten(false) {
  72 + // try to initialize CUBLAS
  73 + cublasStatus_t status;
  74 + status = cublasCreate(&cublasHandle);
  75 + CUBLAS_ERROR_CHECK(status);
  76 + }
  77 +
  78 + ~CUBLASPCATransform() {
  79 + // tear down CUBLAS
  80 + cublasDestroy(cublasHandle);
  81 + }
  82 +
  83 +private:
  84 + double residualReconstructionError(const Template &src) const
  85 + {
  86 + Template proj;
  87 + project(src, proj);
  88 +
  89 + Eigen::Map<const Eigen::VectorXf> srcMap(src.m().ptr<float>(), src.m().rows*src.m().cols);
  90 + Eigen::Map<Eigen::VectorXf> projMap(proj.m().ptr<float>(), keep);
  91 +
  92 + return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
  93 + }
  94 +
  95 + void train(const TemplateList &cudaTrainingSet)
  96 + {
  97 + // copy the data back from the graphics card so the training can be done on the CPU
  98 + const int instances = cudaTrainingSet.size(); // get the number of training set instances
  99 + QList<Template> trainingQlist;
  100 + for(int i=0; i<instances; i++) {
  101 + Template currentTemplate = cudaTrainingSet[i];
  102 + void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
  103 + void* cudaMemPtr = srcDataPtr[0];
  104 + int rows = *((int*)srcDataPtr[1]);
  105 + int cols = *((int*)srcDataPtr[2]);
  106 + int type = *((int*)srcDataPtr[3]);
  107 +
  108 + if (type != CV_32FC1) {
  109 + qFatal("Requires single channel 32-bit floating point matrices.");
  110 + }
  111 +
  112 + // copy GPU mat data back to the CPU so we can do the training on the CPU
  113 + Mat mat = Mat(rows, cols, type);
  114 + cudaError_t err;
  115 + CUDA_SAFE_MEMCPY(mat.ptr<float>(), cudaMemPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
  116 + trainingQlist.append(Template(mat));
  117 + }
  118 +
  119 + // assemble a TemplateList from the list of data
  120 + TemplateList trainingSet(trainingQlist);
  121 +
  122 + originalRows = trainingSet.first().m().rows; // get number of rows of first image
  123 + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  124 +
  125 + // Map into 64-bit Eigen matrix - perform the column major conversion
  126 + Eigen::MatrixXd data(dimsIn, instances); // create a mat
  127 + for (int i=0; i<instances; i++) {
  128 + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
  129 + }
  130 +
  131 + trainCore(data);
  132 + }
  133 +
  134 + void project(const Template &src, Template &dst) const
  135 + {
  136 + //cout << "Starting projection" << endl;
  137 +
  138 + void* const* srcDataPtr = src.m().ptr<void*>();
  139 + float* srcGpuMatPtr = (float*)srcDataPtr[0];
  140 + int rows = *((int*)srcDataPtr[1]);
  141 + int cols = *((int*)srcDataPtr[2]);
  142 + int type = *((int*)srcDataPtr[3]);
  143 +
  144 + if (type != CV_32FC1) {
  145 + cout << "ERR: Invalid image type" << endl;
  146 + throw 0;
  147 + }
  148 +
  149 + // save the destination rows
  150 + int dstRows = (int)keep;
  151 +
  152 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  153 + void** dstDataPtr = dstMat.ptr<void*>();
  154 + float** dstGpuMatPtrPtr = (float**)dstDataPtr;
  155 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
  156 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = dstRows;
  157 + dstDataPtr[3] = srcDataPtr[3];
  158 +
  159 +
  160 + // allocate the memory and set to zero
  161 + //cout << "Allocating destination memory" << endl;
  162 + cublasStatus_t status;
  163 + cudaMalloc(dstGpuMatPtrPtr, dstRows*sizeof(float));
  164 + cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float));
  165 +
  166 + {
  167 + //cout << "Ax + y" << endl;
  168 + // subtract out the average
  169 + float negativeOne = -1.0f;
  170 + status = cublasSaxpy(
  171 + cublasHandle, // handle
  172 + dstRows, // vector length
  173 + &negativeOne, // alpha (1)
  174 + cudaMeanPtr, // mean
  175 + 1, // stride
  176 + srcGpuMatPtr, // y, the source
  177 + 1 // stride
  178 + );
  179 + CUBLAS_ERROR_CHECK(status);
  180 + }
  181 +
  182 + {
  183 + //cout << "Matrix-Vector multiplication" << endl;
  184 +
  185 + float one = 1.0f;
  186 + float zero = 0.0f;
  187 + status = cublasSgemv(
  188 + cublasHandle, // handle
  189 + CUBLAS_OP_T, // normal vector multiplication
  190 + eVecs.rows(), // # rows
  191 + eVecs.cols(), // # cols
  192 + &one, // alpha (1)
  193 + cudaEvPtr, // pointer to the matrix
  194 + eVecs.rows(), // leading dimension of matrix
  195 + srcGpuMatPtr, // vector for multiplication
  196 + 1, // stride (1)
  197 + &zero, // beta (0)
  198 + *dstGpuMatPtrPtr, // vector to store the result
  199 + 1 // stride (1)
  200 + );
  201 + }
  202 +
  203 + //cout << "Saving result" << endl;
  204 + dst = dstMat;
  205 + cudaFree(srcGpuMatPtr);
  206 + }
  207 +
  208 + void store(QDataStream &stream) const
  209 + {
  210 + stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
  211 + }
  212 +
  213 + void load(QDataStream &stream)
  214 + {
  215 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  216 +
  217 + //cout << "Starting load process" << endl;
  218 +
  219 + cudaError_t cudaError;
  220 + cublasStatus_t cublasStatus;
  221 + CUDA_SAFE_MALLOC(&cudaMeanPtr, mean.rows()*mean.cols()*sizeof(float), &cudaError);
  222 + CUDA_SAFE_MALLOC(&cudaEvPtr, eVecs.rows()*eVecs.cols()*sizeof(float), &cudaError);
  223 +
  224 + //cout << "Setting vector" << endl;
  225 + // load the mean vector into GPU memory
  226 + cublasStatus = cublasSetVector(
  227 + mean.rows()*mean.cols(),
  228 + sizeof(float),
  229 + mean.data(),
  230 + 1,
  231 + cudaMeanPtr,
  232 + 1
  233 + );
  234 + CUBLAS_ERROR_CHECK(cublasStatus);
  235 +
  236 + //cout << "Setting the matrix" << endl;
  237 + // load the eigenvector matrix into GPU memory
  238 + cublasStatus = cublasSetMatrix(
  239 + eVecs.rows(),
  240 + eVecs.cols(),
  241 + sizeof(float),
  242 + eVecs.data(),
  243 + eVecs.rows(),
  244 + cudaEvPtr,
  245 + eVecs.rows()
  246 + );
  247 + CUBLAS_ERROR_CHECK(cublasStatus);
  248 + }
  249 +
  250 +protected:
  251 + void trainCore(Eigen::MatrixXd data)
  252 + {
  253 + int dimsIn = data.rows();
  254 + int instances = data.cols();
  255 + const bool dominantEigenEstimation = (dimsIn > instances);
  256 +
  257 + Eigen::MatrixXd allEVals, allEVecs;
  258 + if (keep != 0) {
  259 + // Compute and remove mean
  260 + mean = Eigen::VectorXf(dimsIn);
  261 + for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
  262 + for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
  263 +
  264 + // Calculate covariance matrix
  265 + Eigen::MatrixXd cov;
  266 + if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
  267 + else cov = data * data.transpose() / (instances-1.0);
  268 +
  269 + // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
  270 + Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
  271 + allEVals = eSolver.eigenvalues();
  272 + allEVecs = eSolver.eigenvectors();
  273 + if (dominantEigenEstimation) allEVecs = data * allEVecs;
  274 + } else {
  275 + // Null case
  276 + mean = Eigen::VectorXf::Zero(dimsIn);
  277 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  278 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  279 + }
  280 +
  281 + if (keep <= 0) {
  282 + keep = dimsIn - drop;
  283 + } else if (keep < 1) {
  284 + // Keep eigenvectors that retain a certain energy percentage.
  285 + const double totalEnergy = allEVals.sum();
  286 + if (totalEnergy == 0) {
  287 + keep = 0;
  288 + } else {
  289 + double currentEnergy = 0;
  290 + int i=0;
  291 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  292 + currentEnergy += allEVals(allEVals.rows()-(i+1));
  293 + i++;
  294 + }
  295 + keep = i - drop;
  296 + }
  297 + } else {
  298 + if (keep + drop > allEVals.rows()) {
  299 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  300 + keep = allEVals.rows() - drop;
  301 + }
  302 + }
  303 +
  304 + // Keep highest energy vectors
  305 + eVals = Eigen::VectorXf((int)keep, 1);
  306 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  307 + for (int i=0; i<keep; i++) {
  308 + int index = allEVals.rows()-(i+drop+1);
  309 + eVals(i) = allEVals(index);
  310 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  311 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  312 + }
  313 +
  314 + // Debug output
  315 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
  316 + }
  317 +
  318 + void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
  319 + {
  320 + const int originalCols = mean.rows() / originalRows;
  321 +
  322 + { // Write out mean image
  323 + cv::Mat out(originalRows, originalCols, CV_32FC1);
  324 + Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);
  325 + outMap = mean.col(0);
  326 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");
  327 + }
  328 +
  329 + // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.
  330 + for (int k=0; k<(int)allEVals.size(); k++) {
  331 + if ((k < 8) || (k >= (int)allEVals.size()-16)) {
  332 + cv::Mat out(originalRows, originalCols, CV_64FC1);
  333 + Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);
  334 + outMap = allEVecs.col(k);
  335 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");
  336 + }
  337 + }
  338 + }
  339 +};
  340 +
  341 +BR_REGISTER(Transform, CUBLASPCATransform)
  342 +} // namespace br
  343 +
  344 +#include "cuda/cublaspca.moc"
... ...
openbr/plugins/cuda/cudadefines.hpp
... ... @@ -48,3 +48,25 @@ using namespace std;
48 48 cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
49 49 throw 0; \
50 50 }
  51 +
// Checks a cublasStatus_t and prints the failing status by name.
// BUGFIX: each case now breaks (the original fell through every case, printing
// all subsequent messages for a single error) and the stray double semicolons
// are gone. Consistent with the CUDA_* macros above, a failure now aborts the
// operation with `throw 0` instead of silently continuing. Wrapped in
// do/while(0) so the macro expands to a single statement (dangling-else safe).
#define CUBLAS_ERROR_CHECK(error) \
  do { \
    if ((error) != CUBLAS_STATUS_SUCCESS) { \
      switch (error) { \
      case CUBLAS_STATUS_NOT_INITIALIZED: \
        cout << "CUBLAS_STATUS_NOT_INITIALIZED" << endl; break; \
      case CUBLAS_STATUS_ALLOC_FAILED: \
        cout << "CUBLAS_STATUS_ALLOC_FAILED" << endl; break; \
      case CUBLAS_STATUS_INVALID_VALUE: \
        cout << "CUBLAS_STATUS_INVALID_VALUE" << endl; break; \
      case CUBLAS_STATUS_ARCH_MISMATCH: \
        cout << "CUBLAS_STATUS_ARCH_MISMATCH" << endl; break; \
      case CUBLAS_STATUS_MAPPING_ERROR: \
        cout << "CUBLAS_STATUS_MAPPING_ERROR" << endl; break; \
      case CUBLAS_STATUS_EXECUTION_FAILED: \
        cout << "CUBLAS_STATUS_EXECUTION_FAILED" << endl; break; \
      case CUBLAS_STATUS_INTERNAL_ERROR: \
        cout << "CUBLAS_STATUS_INTERNAL_ERROR" << endl; break; \
      default: \
        cout << "<unknown>" << endl; break; \
      } \
      throw 0; \
    } \
  } while (0)
... ...
openbr/plugins/cuda/module.cmake
... ... @@ -28,6 +28,6 @@ if(BR_WITH_CUDA)
28 28  
29 29 # add the compiled source and libs into the build system
30 30 set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
31   - set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES})
  31 + set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES} "cublas")
32 32  
33 33 endif()
... ...