Commit 39f67cdab4412630c8ed8c334668a98a8624ef82

Authored by DepthDeluxe
1 parent 7f8093c9

added CUDAPCA project acceleration

doesn't make things too much faster...
openbr/plugins/cuda/cudapca.cpp
... ... @@ -13,8 +13,8 @@
13 13 * See the License for the specific language governing permissions and *
14 14 * limitations under the License. *
15 15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16   -// CUDA includes
17   -#include <cusolverDn.h>
  16 +#include <iostream>
  17 +using namespace std;
18 18  
19 19 #include <Eigen/Dense>
20 20 #include <openbr/plugins/openbr_internal.h>
... ... @@ -23,6 +23,8 @@
23 23 #include <openbr/core/eigenutils.h>
24 24 #include <openbr/core/opencvutils.h>
25 25  
  26 +#include "cudapca.hpp"
  27 +
26 28 namespace br
27 29 {
28 30 /*!
... ... @@ -74,12 +76,12 @@ private:
74 76 if (trainingSet.first().m().type() != CV_32FC1)
75 77 qFatal("Requires single channel 32-bit floating point matrices.");
76 78  
77   - originalRows = trainingSet.first().m().rows;
78   - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols;
79   - const int instances = trainingSet.size();
  79 + originalRows = trainingSet.first().m().rows; // get number of rows of first image
  80 + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  81 + const int instances = trainingSet.size(); // get the number of training set instances
80 82  
81 83 // Map into 64-bit Eigen matrix
82   - Eigen::MatrixXd data(dimsIn, instances);
  84 + Eigen::MatrixXd data(dimsIn, instances); // create a mat
83 85 for (int i=0; i<instances; i++)
84 86 data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
85 87  
... ... @@ -90,12 +92,16 @@ private:
90 92 {
91 93 dst = cv::Mat(1, keep, CV_32FC1);
92 94  
  95 + // perform the operation on the graphics card
  96 + cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
  97 +
93 98 // Map Eigen into OpenCV
94   - Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1);
95   - Eigen::Map<Eigen::MatrixXf> outMap(dst.m().ptr<float>(), keep, 1);
  99 + //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
  100 + //Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1);
  101 + //Eigen::Map<Eigen::MatrixXf> outMap(dst.m().ptr<float>(), keep, 1);
96 102  
97 103 // Do projection
98   - outMap = eVecs.transpose() * (inMap - mean);
  104 + //cpuOutMap = eVecs.transpose() * (inMap - mean);
99 105 }
100 106  
101 107 void store(QDataStream &stream) const
... ... @@ -106,6 +112,41 @@ private:
106 112 void load(QDataStream &stream)
107 113 {
108 114 stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  115 +
  116 + cout << "Mean Dimensions" << endl;
  117 + cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
  118 + cout << "eVecs Dimensions" << endl;
  119 + cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
  120 + cout << "eVals Dimensions" << endl;
  121 + cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
  122 + cout << "Keep: " << keep << endl;
  123 +
  124 + cout << "Mean first value: " << mean(0, 0) << endl;
  125 +
  126 + // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
  127 + // serialize the eigenvectors
  128 + float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
  129 + for (int i=0; i < eVecs.rows(); i++) {
  130 + for (int j=0; j < eVecs.cols(); j++) {
  131 + evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
  132 + }
  133 + }
  134 +
  135 + // serialize the mean
  136 + float* meanBuffer = new float[mean.rows() * mean.cols()];
  137 + for (int i=0; i < mean.rows(); i++) {
  138 + for (int j=0; j < mean.cols(); j++) {
  139 + meanBuffer[i*mean.cols() + j] = mean(i, j);
  140 + }
  141 + }
  142 +
  143 + cout << "Meanbuffer first value: " << meanBuffer[0] << endl;
  144 +
  145 + // call the wrapper function
  146 + cuda::cudapca_loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows(), mean.cols(), keep);
  147 +
  148 + delete evBuffer;
  149 + delete meanBuffer;
109 150 }
110 151  
111 152 protected:
... ...
openbr/plugins/cuda/cudapca.cu 0 → 100644
  1 +#include <iostream>
  2 +using namespace std;
  3 +
  4 +#include <opencv2/opencv.hpp>
  5 +#include <opencv2/gpu/gpu.hpp>
  6 +
  7 +using namespace cv;
  8 +using namespace cv::gpu;
  9 +
  10 +#include "cudapca.hpp"
  11 +
  12 +namespace br { namespace cuda {
  13 + __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) {
  14 + int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
  15 + int colInd = blockIdx.x*blockDim.x + threadIdx.x;
  16 +
  17 + // this calculates trainingSet' * trainingSet
  18 + if (rowInd >= numRows || colInd >= numCols) {
  19 + return;
  20 + }
  21 +
  22 + // get a reference the value we wish to write
  23 + float& out = cov[rowInd*numRows + colInd];
  24 +
  25 + // calculate the value of this position
  26 + out = 0;
  27 + for (int i=0; i<numRows; i++) {
  28 + out += trainingSet[rowInd*numCols + colInd] * trainingSet[rowInd*numCols + numRows]; // XXX(colin): not sure if this is correct
  29 + }
  30 + out = out / (numRows-1);
  31 + }
  32 +
  33 + __global__ void cudapca_project_multiply_kernel(float* src, float* dst, float* evPtr, int evRows, int evCols) {
  34 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  35 +
  36 + // check dimensions
  37 + if (colInd >= evCols) {
  38 + return;
  39 + }
  40 +
  41 + dst[colInd] = 0;
  42 + for (int i=0; i < evRows; i++) {
  43 + dst[colInd] += evPtr[evCols*i + colInd] * src[i];
  44 + }
  45 + }
  46 +
  47 + __global__ void cudapca_project_subtractmean_kernel(float* out, float* mean, int cols) {
  48 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  49 +
  50 + // perform bound checking
  51 + if (colInd >= cols) {
  52 + return;
  53 + }
  54 +
  55 + // subtract out the mean
  56 + out[colInd] -= mean[colInd];
  57 + }
  58 +
  // Device-side copies of the trained PCA model, populated by
  // cudapca_loadwrapper() and read by cudapca_projectwrapper().
  float* cudaEvPtr; int _evRows; int _evCols;       // eigenvectors (row-major) and their dimensions
  float* cudaMeanPtr; int _meanRows; int _meanCols; // mean vector and its dimensions
  int _keep;                                        // number of floats copied back per projection
  62 +
  // Intentionally empty: no one-time CUDA state needs initializing yet.
  // NOTE(review): kept as a stub because it is declared in cudapca.hpp.
  void cudapca_initwrapper() {

  }
  66 +
  67 + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep) {
  68 + _evRows = evRows; _evCols = evCols;
  69 + _meanRows = meanRows; _meanCols = meanCols;
  70 + _keep = keep;
  71 +
  72 + // copy the eigenvectors to the GPU
  73 + cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float));
  74 + cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice);
  75 +
  76 + // copy the mean to the GPU
  77 + cudaMalloc(&cudaMeanPtr, meanRows*meanCols*sizeof(float));
  78 + cudaMemcpy(cudaMeanPtr, meanPtr, meanRows*meanCols*sizeof(float), cudaMemcpyHostToDevice);
  79 + }
  80 +
  // GPU-side PCA training -- NOT IMPLEMENTED.  The entire body below is
  // commented-out work in progress; training currently still happens on the
  // CPU (see cudapca.cpp).  NOTE(review): the dead code would not compile if
  // simply re-enabled: there is a stray `h` after one
  // `if (status != cudaSuccess)`, the final cusolverDnSgebrd call is
  // unfinished, and it references trainingSet/keep/mean variables that are
  // not parameters of this function.
  void cudapca_trainwrapper() {
    /*
    if (trainingSet[0].type() != CV_32FC1) {
      std::cout << "ERR: Requires single 32-bit floating point matrix!";
      return;
    }

    cudaError_t status;

    const int numExamples = trainingSetSize;
    int numPixels = trainingSet[0].rows * trainingSet[0].cols;

    // create a custom matrix
    float* cudaDataPtr;
    status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float));
    if (status != cudaSuccess) {
      std::cout << "ERR: Memory allocation" << std::endl;
      return;
    }

    // copy all the data to the graphics card
    for (int i=0; i < numExamples; i++) {
      status = cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr<float>(), numPixels*sizeof(float), cudaMemcpyHostToDevice);
      if (status != cudaSuccess) {
        std::cout << "ERR: Memcpy at index " << i << std::endl;
        return;
      }
    }

    // start the core part of the algorithm
    int numDimensions = numPixels;
    const bool dominantEigenEstimation = (numDimensions > numExamples);

    // malloc and init mean
    mean = new float[numDimensions];
    for (int i=0; i < numDimensions; i++) {
      mean[i] = 0;
    }
    float* cudaMeanPtr;
    status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float));
    if (status != cudaSuccess) {
      std::cout << " ERR: Malloc of mean" << std::endl;
      return;
    }

    if (keep != 0) {
      // compute the mean so we can subtract from data
      for (int i=0; i < numExamples; i++) {
        Mat& m = trainingSet[i];

        for (int j=0; j < numDimensions; j++) {
          mean[j] += m.ptr<float>()[i*numDimensions + j];
        }
      }
      for (int i=0; i < numDimensions; i++) {
        mean[i] = mean[i] / numExamples;
      }

      // copy mean over to graphics card
      cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice);
      if (status != cudaSuccess) {
        std::cout << " ERR: Cpy of mean" << std::endl;
        return;
      }

      // set the thread dimensions and run the kernel
      dim3 threadsPerBlock(64, 1);
      dim3 numBlocks(numDimensions/threadsPerBlock.x + 1,
                     numExamples/threadsPerBlock.y + 1);

      subtractMean_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions);

      // calculate the covariance matrix using kernel
      // malloc location for covariance matrix
      float* cudaCovPtr;
      status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float));
      if (status != cudaSuccess) h
        std::cout << " ERR: Cpy of mean" << std::endl;
        return;
      }

      // calculate the covariance matrix
      threadsPerBlock = dim3(8, 8);
      numBlocks = dim3(numExamples/threadsPerBlock.x + 1,
                       numExamples/threadsPerBlock.y + 1);
      calculateCovariance_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions);

      // perform eigendecomposition
      //std::cout << "Skipping eigendecomposition" << std::endl;
      cusolverStatus_t cusolverStatus;
      cusolverStatus = cusolverDnSgebrd(cusolverHandle,)
    }
    */
  }
  175 +
  176 + void cudapca_projectwrapper(float* src, float* dst) {
  177 + // copy the image to the GPU
  178 + float* cudaSrcPtr;
  179 + cudaMalloc(&cudaSrcPtr, _meanRows*_meanCols*sizeof(float));
  180 + cudaMemcpy(cudaSrcPtr, src, _meanRows*_meanCols*sizeof(float), cudaMemcpyHostToDevice);
  181 +
  182 + float* cudaDstPtr;
  183 + cudaMalloc(&cudaDstPtr, _keep*sizeof(float));
  184 +
  185 + // subtract out the mean of the image (mean is 1xpixels in size)
  186 + int threadsPerBlock = 64;
  187 + int numBlocks = _meanRows*_meanCols / threadsPerBlock;
  188 + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanRows*_meanCols);
  189 +
  190 + // perform the multiplication
  191 + threadsPerBlock = 64;
  192 + numBlocks = _keep / threadsPerBlock;
  193 + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols);
  194 +
  195 + // copy the data back to the CPU
  196 + cudaMemcpy(dst, cudaDstPtr, _keep*sizeof(float), cudaMemcpyDeviceToHost);
  197 +
  198 + cudaFree(cudaSrcPtr);
  199 + cudaFree(cudaDstPtr);
  200 + }
  201 +}}
... ...
openbr/plugins/cuda/cudapca.hpp 0 → 100644
// Public interface between the CPU-side PCA plugin (cudapca.cpp) and the
// CUDA implementation (cudapca.cu).
// Include guard added: the header previously had none, so double inclusion
// in one translation unit would be an error.
#ifndef OPENBR_CUDA_CUDAPCA_HPP
#define OPENBR_CUDA_CUDAPCA_HPP

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

// NOTE(review): using-directives in a header leak into every includer; kept
// only because existing translation units may already rely on them.
using namespace cv;
using namespace cv::gpu;

namespace br { namespace cuda {
  // One-time CUDA-side initialization (currently an empty stub).
  void cudapca_initwrapper();

  // Uploads the trained PCA model to the GPU and caches it for projection.
  // evPtr:   row-major eigenvector matrix, evRows x evCols
  // meanPtr: row-major mean, meanRows x meanCols
  // keep:    number of projected dimensions returned per sample
  void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep);

  // GPU-side training; body is currently commented out in cudapca.cu.
  void cudapca_trainwrapper();

  // Projects one image: dst = eVecs^T * (src - mean).
  // src must hold meanRows*meanCols floats; dst must hold keep floats.
  void cudapca_projectwrapper(float* src, float* dst);
}}

#endif // OPENBR_CUDA_CUDAPCA_HPP
... ...