Commit b6adde60b31838380945ae08efcaeaf42e028e50
1 parent
237af8f5
added cuBLAS accelerated PCA
much faster than homebrew PCA
Showing
3 changed files
with
367 additions
and
1 deletion
openbr/plugins/cuda/cublaspca.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | ||
| 2 | + * Copyright 2016 Colin Heinzmann * | ||
| 3 | + * * | ||
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | ||
| 5 | + * you may not use this file except in compliance with the License. * | ||
| 6 | + * You may obtain a copy of the License at * | ||
| 7 | + * * | ||
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | ||
| 9 | + * * | ||
| 10 | + * Unless required by applicable law or agreed to in writing, software * | ||
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | ||
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | ||
| 13 | + * See the License for the specific language governing permissions and * | ||
| 14 | + * limitations under the License. * | ||
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | ||
| 16 | + | ||
| 17 | +#include <iostream> | ||
| 18 | +using namespace std; | ||
| 19 | +#include <unistd.h> | ||
| 20 | + | ||
| 21 | +#include <QList> | ||
| 22 | + | ||
| 23 | +#include <Eigen/Dense> | ||
| 24 | + | ||
| 25 | +#include <opencv2/opencv.hpp> | ||
| 26 | +using namespace cv; | ||
| 27 | + | ||
| 28 | +#include <openbr/plugins/openbr_internal.h> | ||
| 29 | +#include <openbr/core/common.h> | ||
| 30 | +#include <openbr/core/eigenutils.h> | ||
| 31 | +#include <openbr/core/opencvutils.h> | ||
| 32 | + | ||
| 33 | +#include <cuda_runtime.h> | ||
| 34 | +#include <cublas_v2.h> | ||
| 35 | +#include "cudadefines.hpp" | ||
| 36 | + | ||
| 37 | +namespace br | ||
| 38 | +{ | ||
| 39 | +/*! | ||
| 40 | + * \ingroup transforms | ||
| 41 | + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin. | ||
| 42 | + * \author Colin Heinzmann \cite DepthDeluxe | ||
| 43 | + * | ||
| 44 | + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95. | ||
| 45 | + * \br_property int drop The number of leading eigen-dimensions to drop. | ||
| 46 | + * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm) | ||
| 47 | + */ | ||
| 48 | +class CUBLASPCATransform : public Transform | ||
| 49 | +{ | ||
| 50 | + Q_OBJECT | ||
| 51 | + | ||
| 52 | +protected: | ||
| 53 | + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false) | ||
| 54 | + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false) | ||
| 55 | + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false) | ||
| 56 | + | ||
| 57 | + BR_PROPERTY(float, keep, 0.95) | ||
| 58 | + BR_PROPERTY(int, drop, 0) | ||
| 59 | + BR_PROPERTY(bool, whiten, false) | ||
| 60 | + | ||
| 61 | + Eigen::VectorXf mean, eVals; | ||
| 62 | + Eigen::MatrixXf eVecs; | ||
| 63 | + | ||
| 64 | + int originalRows; | ||
| 65 | + | ||
| 66 | + cublasHandle_t cublasHandle; | ||
| 67 | + float* cudaMeanPtr; // holds the "keep" long vector | ||
| 68 | + float* cudaEvPtr; // holds all the eigenvectors | ||
| 69 | + | ||
| 70 | +public: | ||
| 71 | + CUBLASPCATransform() : keep(0.95), drop(0), whiten(false) { | ||
| 72 | + // try to initialize CUBLAS | ||
| 73 | + cublasStatus_t status; | ||
| 74 | + status = cublasCreate(&cublasHandle); | ||
| 75 | + CUBLAS_ERROR_CHECK(status); | ||
| 76 | + } | ||
| 77 | + | ||
| 78 | + ~CUBLASPCATransform() { | ||
| 79 | + // tear down CUBLAS | ||
| 80 | + cublasDestroy(cublasHandle); | ||
| 81 | + } | ||
| 82 | + | ||
| 83 | +private: | ||
| 84 | + double residualReconstructionError(const Template &src) const | ||
| 85 | + { | ||
| 86 | + Template proj; | ||
| 87 | + project(src, proj); | ||
| 88 | + | ||
| 89 | + Eigen::Map<const Eigen::VectorXf> srcMap(src.m().ptr<float>(), src.m().rows*src.m().cols); | ||
| 90 | + Eigen::Map<Eigen::VectorXf> projMap(proj.m().ptr<float>(), keep); | ||
| 91 | + | ||
| 92 | + return (srcMap - mean).squaredNorm() - projMap.squaredNorm(); | ||
| 93 | + } | ||
| 94 | + | ||
| 95 | + void train(const TemplateList &cudaTrainingSet) | ||
| 96 | + { | ||
| 97 | + // copy the data back from the graphics card so the training can be done on the CPU | ||
| 98 | + const int instances = cudaTrainingSet.size(); // get the number of training set instances | ||
| 99 | + QList<Template> trainingQlist; | ||
| 100 | + for(int i=0; i<instances; i++) { | ||
| 101 | + Template currentTemplate = cudaTrainingSet[i]; | ||
| 102 | + void* const* srcDataPtr = currentTemplate.m().ptr<void*>(); | ||
| 103 | + void* cudaMemPtr = srcDataPtr[0]; | ||
| 104 | + int rows = *((int*)srcDataPtr[1]); | ||
| 105 | + int cols = *((int*)srcDataPtr[2]); | ||
| 106 | + int type = *((int*)srcDataPtr[3]); | ||
| 107 | + | ||
| 108 | + if (type != CV_32FC1) { | ||
| 109 | + qFatal("Requires single channel 32-bit floating point matrices."); | ||
| 110 | + } | ||
| 111 | + | ||
| 112 | + // copy GPU mat data back to the CPU so we can do the training on the CPU | ||
| 113 | + Mat mat = Mat(rows, cols, type); | ||
| 114 | + cudaError_t err; | ||
| 115 | + CUDA_SAFE_MEMCPY(mat.ptr<float>(), cudaMemPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err); | ||
| 116 | + trainingQlist.append(Template(mat)); | ||
| 117 | + } | ||
| 118 | + | ||
| 119 | + // assemble a TemplateList from the list of data | ||
| 120 | + TemplateList trainingSet(trainingQlist); | ||
| 121 | + | ||
| 122 | + originalRows = trainingSet.first().m().rows; // get number of rows of first image | ||
| 123 | + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image | ||
| 124 | + | ||
| 125 | + // Map into 64-bit Eigen matrix - perform the column major conversion | ||
| 126 | + Eigen::MatrixXd data(dimsIn, instances); // create a mat | ||
| 127 | + for (int i=0; i<instances; i++) { | ||
| 128 | + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); | ||
| 129 | + } | ||
| 130 | + | ||
| 131 | + trainCore(data); | ||
| 132 | + } | ||
| 133 | + | ||
| 134 | + void project(const Template &src, Template &dst) const | ||
| 135 | + { | ||
| 136 | + //cout << "Starting projection" << endl; | ||
| 137 | + | ||
| 138 | + void* const* srcDataPtr = src.m().ptr<void*>(); | ||
| 139 | + float* srcGpuMatPtr = (float*)srcDataPtr[0]; | ||
| 140 | + int rows = *((int*)srcDataPtr[1]); | ||
| 141 | + int cols = *((int*)srcDataPtr[2]); | ||
| 142 | + int type = *((int*)srcDataPtr[3]); | ||
| 143 | + | ||
| 144 | + if (type != CV_32FC1) { | ||
| 145 | + cout << "ERR: Invalid image type" << endl; | ||
| 146 | + throw 0; | ||
| 147 | + } | ||
| 148 | + | ||
| 149 | + // save the destination rows | ||
| 150 | + int dstRows = (int)keep; | ||
| 151 | + | ||
| 152 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | ||
| 153 | + void** dstDataPtr = dstMat.ptr<void*>(); | ||
| 154 | + float** dstGpuMatPtrPtr = (float**)dstDataPtr; | ||
| 155 | + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1; | ||
| 156 | + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = dstRows; | ||
| 157 | + dstDataPtr[3] = srcDataPtr[3]; | ||
| 158 | + | ||
| 159 | + | ||
| 160 | + // allocate the memory and set to zero | ||
| 161 | + //cout << "Allocating destination memory" << endl; | ||
| 162 | + cublasStatus_t status; | ||
| 163 | + cudaMalloc(dstGpuMatPtrPtr, dstRows*sizeof(float)); | ||
| 164 | + cudaMemset(*dstGpuMatPtrPtr, 0, dstRows*sizeof(float)); | ||
| 165 | + | ||
| 166 | + { | ||
| 167 | + //cout << "Ax + y" << endl; | ||
| 168 | + // subtract out the average | ||
| 169 | + float negativeOne = -1.0f; | ||
| 170 | + status = cublasSaxpy( | ||
| 171 | + cublasHandle, // handle | ||
| 172 | + dstRows, // vector length | ||
| 173 | + &negativeOne, // alpha (1) | ||
| 174 | + cudaMeanPtr, // mean | ||
| 175 | + 1, // stride | ||
| 176 | + srcGpuMatPtr, // y, the source | ||
| 177 | + 1 // stride | ||
| 178 | + ); | ||
| 179 | + CUBLAS_ERROR_CHECK(status); | ||
| 180 | + } | ||
| 181 | + | ||
| 182 | + { | ||
| 183 | + //cout << "Matrix-Vector multiplication" << endl; | ||
| 184 | + | ||
| 185 | + float one = 1.0f; | ||
| 186 | + float zero = 0.0f; | ||
| 187 | + status = cublasSgemv( | ||
| 188 | + cublasHandle, // handle | ||
| 189 | + CUBLAS_OP_T, // normal vector multiplication | ||
| 190 | + eVecs.rows(), // # rows | ||
| 191 | + eVecs.cols(), // # cols | ||
| 192 | + &one, // alpha (1) | ||
| 193 | + cudaEvPtr, // pointer to the matrix | ||
| 194 | + eVecs.rows(), // leading dimension of matrix | ||
| 195 | + srcGpuMatPtr, // vector for multiplication | ||
| 196 | + 1, // stride (1) | ||
| 197 | + &zero, // beta (0) | ||
| 198 | + *dstGpuMatPtrPtr, // vector to store the result | ||
| 199 | + 1 // stride (1) | ||
| 200 | + ); | ||
| 201 | + } | ||
| 202 | + | ||
| 203 | + //cout << "Saving result" << endl; | ||
| 204 | + dst = dstMat; | ||
| 205 | + cudaFree(srcGpuMatPtr); | ||
| 206 | + } | ||
| 207 | + | ||
| 208 | + void store(QDataStream &stream) const | ||
| 209 | + { | ||
| 210 | + stream << keep << drop << whiten << originalRows << mean << eVals << eVecs; | ||
| 211 | + } | ||
| 212 | + | ||
| 213 | + void load(QDataStream &stream) | ||
| 214 | + { | ||
| 215 | + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; | ||
| 216 | + | ||
| 217 | + //cout << "Starting load process" << endl; | ||
| 218 | + | ||
| 219 | + cudaError_t cudaError; | ||
| 220 | + cublasStatus_t cublasStatus; | ||
| 221 | + CUDA_SAFE_MALLOC(&cudaMeanPtr, mean.rows()*mean.cols()*sizeof(float), &cudaError); | ||
| 222 | + CUDA_SAFE_MALLOC(&cudaEvPtr, eVecs.rows()*eVecs.cols()*sizeof(float), &cudaError); | ||
| 223 | + | ||
| 224 | + //cout << "Setting vector" << endl; | ||
| 225 | + // load the mean vector into GPU memory | ||
| 226 | + cublasStatus = cublasSetVector( | ||
| 227 | + mean.rows()*mean.cols(), | ||
| 228 | + sizeof(float), | ||
| 229 | + mean.data(), | ||
| 230 | + 1, | ||
| 231 | + cudaMeanPtr, | ||
| 232 | + 1 | ||
| 233 | + ); | ||
| 234 | + CUBLAS_ERROR_CHECK(cublasStatus); | ||
| 235 | + | ||
| 236 | + //cout << "Setting the matrix" << endl; | ||
| 237 | + // load the eigenvector matrix into GPU memory | ||
| 238 | + cublasStatus = cublasSetMatrix( | ||
| 239 | + eVecs.rows(), | ||
| 240 | + eVecs.cols(), | ||
| 241 | + sizeof(float), | ||
| 242 | + eVecs.data(), | ||
| 243 | + eVecs.rows(), | ||
| 244 | + cudaEvPtr, | ||
| 245 | + eVecs.rows() | ||
| 246 | + ); | ||
| 247 | + CUBLAS_ERROR_CHECK(cublasStatus); | ||
| 248 | + } | ||
| 249 | + | ||
| 250 | +protected: | ||
| 251 | + void trainCore(Eigen::MatrixXd data) | ||
| 252 | + { | ||
| 253 | + int dimsIn = data.rows(); | ||
| 254 | + int instances = data.cols(); | ||
| 255 | + const bool dominantEigenEstimation = (dimsIn > instances); | ||
| 256 | + | ||
| 257 | + Eigen::MatrixXd allEVals, allEVecs; | ||
| 258 | + if (keep != 0) { | ||
| 259 | + // Compute and remove mean | ||
| 260 | + mean = Eigen::VectorXf(dimsIn); | ||
| 261 | + for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances; | ||
| 262 | + for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i); | ||
| 263 | + | ||
| 264 | + // Calculate covariance matrix | ||
| 265 | + Eigen::MatrixXd cov; | ||
| 266 | + if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0); | ||
| 267 | + else cov = data * data.transpose() / (instances-1.0); | ||
| 268 | + | ||
| 269 | + // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue. | ||
| 270 | + Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov); | ||
| 271 | + allEVals = eSolver.eigenvalues(); | ||
| 272 | + allEVecs = eSolver.eigenvectors(); | ||
| 273 | + if (dominantEigenEstimation) allEVecs = data * allEVecs; | ||
| 274 | + } else { | ||
| 275 | + // Null case | ||
| 276 | + mean = Eigen::VectorXf::Zero(dimsIn); | ||
| 277 | + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn); | ||
| 278 | + allEVals = Eigen::VectorXd::Ones(dimsIn); | ||
| 279 | + } | ||
| 280 | + | ||
| 281 | + if (keep <= 0) { | ||
| 282 | + keep = dimsIn - drop; | ||
| 283 | + } else if (keep < 1) { | ||
| 284 | + // Keep eigenvectors that retain a certain energy percentage. | ||
| 285 | + const double totalEnergy = allEVals.sum(); | ||
| 286 | + if (totalEnergy == 0) { | ||
| 287 | + keep = 0; | ||
| 288 | + } else { | ||
| 289 | + double currentEnergy = 0; | ||
| 290 | + int i=0; | ||
| 291 | + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) { | ||
| 292 | + currentEnergy += allEVals(allEVals.rows()-(i+1)); | ||
| 293 | + i++; | ||
| 294 | + } | ||
| 295 | + keep = i - drop; | ||
| 296 | + } | ||
| 297 | + } else { | ||
| 298 | + if (keep + drop > allEVals.rows()) { | ||
| 299 | + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows()); | ||
| 300 | + keep = allEVals.rows() - drop; | ||
| 301 | + } | ||
| 302 | + } | ||
| 303 | + | ||
| 304 | + // Keep highest energy vectors | ||
| 305 | + eVals = Eigen::VectorXf((int)keep, 1); | ||
| 306 | + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep); | ||
| 307 | + for (int i=0; i<keep; i++) { | ||
| 308 | + int index = allEVals.rows()-(i+drop+1); | ||
| 309 | + eVals(i) = allEVals(index); | ||
| 310 | + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm(); | ||
| 311 | + if (whiten) eVecs.col(i) /= sqrt(eVals(i)); | ||
| 312 | + } | ||
| 313 | + | ||
| 314 | + // Debug output | ||
| 315 | + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep; | ||
| 316 | + } | ||
| 317 | + | ||
| 318 | + void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const | ||
| 319 | + { | ||
| 320 | + const int originalCols = mean.rows() / originalRows; | ||
| 321 | + | ||
| 322 | + { // Write out mean image | ||
| 323 | + cv::Mat out(originalRows, originalCols, CV_32FC1); | ||
| 324 | + Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1); | ||
| 325 | + outMap = mean.col(0); | ||
| 326 | + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png"); | ||
| 327 | + } | ||
| 328 | + | ||
| 329 | + // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue. | ||
| 330 | + for (int k=0; k<(int)allEVals.size(); k++) { | ||
| 331 | + if ((k < 8) || (k >= (int)allEVals.size()-16)) { | ||
| 332 | + cv::Mat out(originalRows, originalCols, CV_64FC1); | ||
| 333 | + Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1); | ||
| 334 | + outMap = allEVecs.col(k); | ||
| 335 | + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png"); | ||
| 336 | + } | ||
| 337 | + } | ||
| 338 | + } | ||
| 339 | +}; | ||
| 340 | + | ||
| 341 | +BR_REGISTER(Transform, CUBLASPCATransform) | ||
| 342 | +} // namespace br | ||
| 343 | + | ||
| 344 | +#include "cuda/cublaspca.moc" |
openbr/plugins/cuda/cudadefines.hpp
| @@ -48,3 +48,25 @@ using namespace std; | @@ -48,3 +48,25 @@ using namespace std; | ||
| 48 | cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ | 48 | cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ |
| 49 | throw 0; \ | 49 | throw 0; \ |
| 50 | } | 50 | } |
| 51 | + | ||
// Checks a cublasStatus_t, prints a human-readable name for the failure, and
// aborts via `throw 0` — consistent with the CUDA_SAFE_* macros above.
// Fixes in this revision: each case now `break`s (previously every case fell
// through, printing all subsequent messages), the stray doubled semicolons are
// gone, the macro is do/while(0)-wrapped so it behaves as a single statement
// (safe before an `else`), and a failing status now throws instead of being
// printed and silently ignored.
#define CUBLAS_ERROR_CHECK(error) \
  do { \
    if ((error) != CUBLAS_STATUS_SUCCESS) { \
      switch (error) { \
        case CUBLAS_STATUS_NOT_INITIALIZED: \
          cout << "CUBLAS_STATUS_NOT_INITIALIZED" << endl; \
          break; \
        case CUBLAS_STATUS_ALLOC_FAILED: \
          cout << "CUBLAS_STATUS_ALLOC_FAILED" << endl; \
          break; \
        case CUBLAS_STATUS_INVALID_VALUE: \
          cout << "CUBLAS_STATUS_INVALID_VALUE" << endl; \
          break; \
        case CUBLAS_STATUS_ARCH_MISMATCH: \
          cout << "CUBLAS_STATUS_ARCH_MISMATCH" << endl; \
          break; \
        case CUBLAS_STATUS_MAPPING_ERROR: \
          cout << "CUBLAS_STATUS_MAPPING_ERROR" << endl; \
          break; \
        case CUBLAS_STATUS_EXECUTION_FAILED: \
          cout << "CUBLAS_STATUS_EXECUTION_FAILED" << endl; \
          break; \
        case CUBLAS_STATUS_INTERNAL_ERROR: \
          cout << "CUBLAS_STATUS_INTERNAL_ERROR" << endl; \
          break; \
        default: \
          cout << "<unknown>" << endl; \
          break; \
      } \
      throw 0; \
    } \
  } while (0)
openbr/plugins/cuda/module.cmake
| @@ -28,6 +28,6 @@ if(BR_WITH_CUDA) | @@ -28,6 +28,6 @@ if(BR_WITH_CUDA) | ||
| 28 | 28 | ||
| 29 | # add the compiled source and libs into the build system | 29 | # add the compiled source and libs into the build system |
| 30 | set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ}) | 30 | set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ}) |
| 31 | - set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES}) | 31 | + set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES} "cublas") |
| 32 | 32 | ||
| 33 | endif() | 33 | endif() |