Commit 39f67cdab4412630c8ed8c334668a98a8624ef82
1 parent
7f8093c9
added CUDAPCA project acceleration
doesn't make things too much faster...
Showing 3 changed files with 265 additions and 9 deletions.
openbr/plugins/cuda/cudapca.cpp
| @@ -13,8 +13,8 @@ | @@ -13,8 +13,8 @@ | ||
| 13 | * See the License for the specific language governing permissions and * | 13 | * See the License for the specific language governing permissions and * |
| 14 | * limitations under the License. * | 14 | * limitations under the License. * |
| 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 16 | -// CUDA includes | ||
| 17 | -#include <cusolverDn.h> | 16 | +#include <iostream> |
| 17 | +using namespace std; | ||
| 18 | 18 | ||
| 19 | #include <Eigen/Dense> | 19 | #include <Eigen/Dense> |
| 20 | #include <openbr/plugins/openbr_internal.h> | 20 | #include <openbr/plugins/openbr_internal.h> |
| @@ -23,6 +23,8 @@ | @@ -23,6 +23,8 @@ | ||
| 23 | #include <openbr/core/eigenutils.h> | 23 | #include <openbr/core/eigenutils.h> |
| 24 | #include <openbr/core/opencvutils.h> | 24 | #include <openbr/core/opencvutils.h> |
| 25 | 25 | ||
| 26 | +#include "cudapca.hpp" | ||
| 27 | + | ||
| 26 | namespace br | 28 | namespace br |
| 27 | { | 29 | { |
| 28 | /*! | 30 | /*! |
| @@ -74,12 +76,12 @@ private: | @@ -74,12 +76,12 @@ private: | ||
| 74 | if (trainingSet.first().m().type() != CV_32FC1) | 76 | if (trainingSet.first().m().type() != CV_32FC1) |
| 75 | qFatal("Requires single channel 32-bit floating point matrices."); | 77 | qFatal("Requires single channel 32-bit floating point matrices."); |
| 76 | 78 | ||
| 77 | - originalRows = trainingSet.first().m().rows; | ||
| 78 | - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; | ||
| 79 | - const int instances = trainingSet.size(); | 79 | + originalRows = trainingSet.first().m().rows; // get number of rows of first image |
| 80 | + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image | ||
| 81 | + const int instances = trainingSet.size(); // get the number of training set instances | ||
| 80 | 82 | ||
| 81 | // Map into 64-bit Eigen matrix | 83 | // Map into 64-bit Eigen matrix |
| 82 | - Eigen::MatrixXd data(dimsIn, instances); | 84 | + Eigen::MatrixXd data(dimsIn, instances); // create a mat |
| 83 | for (int i=0; i<instances; i++) | 85 | for (int i=0; i<instances; i++) |
| 84 | data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); | 86 | data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); |
| 85 | 87 | ||
| @@ -90,12 +92,16 @@ private: | @@ -90,12 +92,16 @@ private: | ||
| 90 | { | 92 | { |
| 91 | dst = cv::Mat(1, keep, CV_32FC1); | 93 | dst = cv::Mat(1, keep, CV_32FC1); |
| 92 | 94 | ||
| 95 | + // perform the operation on the graphics card | ||
| 96 | + cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>()); | ||
| 97 | + | ||
| 93 | // Map Eigen into OpenCV | 98 | // Map Eigen into OpenCV |
| 94 | - Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1); | ||
| 95 | - Eigen::Map<Eigen::MatrixXf> outMap(dst.m().ptr<float>(), keep, 1); | 99 | + //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); |
| 100 | + //Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1); | ||
| 101 | + //Eigen::Map<Eigen::MatrixXf> outMap(dst.m().ptr<float>(), keep, 1); | ||
| 96 | 102 | ||
| 97 | // Do projection | 103 | // Do projection |
| 98 | - outMap = eVecs.transpose() * (inMap - mean); | 104 | + //cpuOutMap = eVecs.transpose() * (inMap - mean); |
| 99 | } | 105 | } |
| 100 | 106 | ||
| 101 | void store(QDataStream &stream) const | 107 | void store(QDataStream &stream) const |
| @@ -106,6 +112,41 @@ private: | @@ -106,6 +112,41 @@ private: | ||
| 106 | void load(QDataStream &stream) | 112 | void load(QDataStream &stream) |
| 107 | { | 113 | { |
| 108 | stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; | 114 | stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; |
| 115 | + | ||
| 116 | + cout << "Mean Dimensions" << endl; | ||
| 117 | + cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; | ||
| 118 | + cout << "eVecs Dimensions" << endl; | ||
| 119 | + cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl; | ||
| 120 | + cout << "eVals Dimensions" << endl; | ||
| 121 | + cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; | ||
| 122 | + cout << "Keep: " << keep << endl; | ||
| 123 | + | ||
| 124 | + cout << "Mean first value: " << mean(0, 0) << endl; | ||
| 125 | + | ||
| 126 | + // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data | ||
| 127 | + // serialize the eigenvectors | ||
| 128 | + float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; | ||
| 129 | + for (int i=0; i < eVecs.rows(); i++) { | ||
| 130 | + for (int j=0; j < eVecs.cols(); j++) { | ||
| 131 | + evBuffer[i*eVecs.cols() + j] = eVecs(i, j); | ||
| 132 | + } | ||
| 133 | + } | ||
| 134 | + | ||
| 135 | + // serialize the mean | ||
| 136 | + float* meanBuffer = new float[mean.rows() * mean.cols()]; | ||
| 137 | + for (int i=0; i < mean.rows(); i++) { | ||
| 138 | + for (int j=0; j < mean.cols(); j++) { | ||
| 139 | + meanBuffer[i*mean.cols() + j] = mean(i, j); | ||
| 140 | + } | ||
| 141 | + } | ||
| 142 | + | ||
| 143 | + cout << "Meanbuffer first value: " << meanBuffer[0] << endl; | ||
| 144 | + | ||
| 145 | + // call the wrapper function | ||
| 146 | + cuda::cudapca_loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows(), mean.cols(), keep); | ||
| 147 | + | ||
| 148 | + delete evBuffer; | ||
| 149 | + delete meanBuffer; | ||
| 109 | } | 150 | } |
| 110 | 151 | ||
| 111 | protected: | 152 | protected: |
openbr/plugins/cuda/cudapca.cu
0 → 100644
| 1 | +#include <iostream> | ||
| 2 | +using namespace std; | ||
| 3 | + | ||
| 4 | +#include <opencv2/opencv.hpp> | ||
| 5 | +#include <opencv2/gpu/gpu.hpp> | ||
| 6 | + | ||
| 7 | +using namespace cv; | ||
| 8 | +using namespace cv::gpu; | ||
| 9 | + | ||
| 10 | +#include "cudapca.hpp" | ||
| 11 | + | ||
| 12 | +namespace br { namespace cuda { | ||
| 13 | + __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) { | ||
| 14 | + int rowInd = blockIdx.y*blockDim.y + threadIdx.y; | ||
| 15 | + int colInd = blockIdx.x*blockDim.x + threadIdx.x; | ||
| 16 | + | ||
| 17 | + // this calculates trainingSet' * trainingSet | ||
| 18 | + if (rowInd >= numRows || colInd >= numCols) { | ||
| 19 | + return; | ||
| 20 | + } | ||
| 21 | + | ||
| 22 | + // get a reference the value we wish to write | ||
| 23 | + float& out = cov[rowInd*numRows + colInd]; | ||
| 24 | + | ||
| 25 | + // calculate the value of this position | ||
| 26 | + out = 0; | ||
| 27 | + for (int i=0; i<numRows; i++) { | ||
| 28 | + out += trainingSet[rowInd*numCols + colInd] * trainingSet[rowInd*numCols + numRows]; // XXX(colin): not sure if this is correct | ||
| 29 | + } | ||
| 30 | + out = out / (numRows-1); | ||
| 31 | + } | ||
| 32 | + | ||
| 33 | + __global__ void cudapca_project_multiply_kernel(float* src, float* dst, float* evPtr, int evRows, int evCols) { | ||
| 34 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 35 | + | ||
| 36 | + // check dimensions | ||
| 37 | + if (colInd >= evCols) { | ||
| 38 | + return; | ||
| 39 | + } | ||
| 40 | + | ||
| 41 | + dst[colInd] = 0; | ||
| 42 | + for (int i=0; i < evRows; i++) { | ||
| 43 | + dst[colInd] += evPtr[evCols*i + colInd] * src[i]; | ||
| 44 | + } | ||
| 45 | + } | ||
| 46 | + | ||
| 47 | + __global__ void cudapca_project_subtractmean_kernel(float* out, float* mean, int cols) { | ||
| 48 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | ||
| 49 | + | ||
| 50 | + // perform bound checking | ||
| 51 | + if (colInd >= cols) { | ||
| 52 | + return; | ||
| 53 | + } | ||
| 54 | + | ||
| 55 | + // subtract out the mean | ||
| 56 | + out[colInd] -= mean[colInd]; | ||
| 57 | + } | ||
| 58 | + | ||
  // Device-side copies of the PCA model, populated by cudapca_loadwrapper()
  // and read by cudapca_projectwrapper(). Namespace-scope, so they are
  // zero-initialized before first use.
  float* cudaEvPtr; int _evRows; int _evCols;       // eigenvector matrix and its dimensions
  float* cudaMeanPtr; int _meanRows; int _meanCols; // mean vector and its dimensions
  int _keep;                                        // number of projected dimensions retained
| 62 | + | ||
  // One-time CUDA initialization hook; currently an intentional no-op
  // placeholder (declared in cudapca.hpp).
  void cudapca_initwrapper() {

  }
| 66 | + | ||
| 67 | + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep) { | ||
| 68 | + _evRows = evRows; _evCols = evCols; | ||
| 69 | + _meanRows = meanRows; _meanCols = meanCols; | ||
| 70 | + _keep = keep; | ||
| 71 | + | ||
| 72 | + // copy the eigenvectors to the GPU | ||
| 73 | + cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float)); | ||
| 74 | + cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice); | ||
| 75 | + | ||
| 76 | + // copy the mean to the GPU | ||
| 77 | + cudaMalloc(&cudaMeanPtr, meanRows*meanCols*sizeof(float)); | ||
| 78 | + cudaMemcpy(cudaMeanPtr, meanPtr, meanRows*meanCols*sizeof(float), cudaMemcpyHostToDevice); | ||
| 79 | + } | ||
| 80 | + | ||
  // Intended GPU implementation of PCA training. The entire body is
  // disabled (commented out): it is an unfinished draft that references
  // symbols not defined in this file (trainingSet, trainingSetSize, keep,
  // mean, subtractMean_kernel, cusolverHandle) and contains a stray `h`
  // token at the covariance cudaMalloc error check, so it would not
  // compile if enabled. The function is currently a no-op; training runs
  // on the CPU path in cudapca.cpp.
  void cudapca_trainwrapper() {
    /*
    if (trainingSet[0].type() != CV_32FC1) {
      std::cout << "ERR: Requires single 32-bit floating point matrix!";
      return;
    }

    cudaError_t status;

    const int numExamples = trainingSetSize;
    int numPixels = trainingSet[0].rows * trainingSet[0].cols;

    // create a custom matrix
    float* cudaDataPtr;
    status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float));
    if (status != cudaSuccess) {
      std::cout << "ERR: Memory allocation" << std::endl;
      return;
    }

    // copy all the data to the graphics card
    for (int i=0; i < numExamples; i++) {
      status = cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr<float>(), numPixels*sizeof(float), cudaMemcpyHostToDevice);
      if (status != cudaSuccess) {
        std::cout << "ERR: Memcpy at index " << i << std::endl;
        return;
      }
    }

    // start the core part of the algorithm
    int numDimensions = numPixels;
    const bool dominantEigenEstimation = (numDimensions > numExamples);

    // malloc and init mean
    mean = new float[numDimensions];
    for (int i=0; i < numDimensions; i++) {
      mean[i] = 0;
    }
    float* cudaMeanPtr;
    status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float));
    if (status != cudaSuccess) {
      std::cout << " ERR: Malloc of mean" << std::endl;
      return;
    }

    if (keep != 0) {
      // compute the mean so we can subtract from data
      for (int i=0; i < numExamples; i++) {
        Mat& m = trainingSet[i];

        for (int j=0; j < numDimensions; j++) {
          mean[j] += m.ptr<float>()[i*numDimensions + j];
        }
      }
      for (int i=0; i < numDimensions; i++) {
        mean[i] = mean[i] / numExamples;
      }

      // copy mean over to graphics card
      cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice);
      if (status != cudaSuccess) {
        std::cout << " ERR: Cpy of mean" << std::endl;
        return;
      }

      // set the thread dimensions and run the kernel
      dim3 threadsPerBlock(64, 1);
      dim3 numBlocks(numDimensions/threadsPerBlock.x + 1,
                     numExamples/threadsPerBlock.y + 1);

      subtractMean_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions);

      // calculate the covariance matrix using kernel
      // malloc location for covariance matrix
      float* cudaCovPtr;
      status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float));
      if (status != cudaSuccess) h
        std::cout << " ERR: Cpy of mean" << std::endl;
        return;
      }

      // calculate the covariance matrix
      threadsPerBlock = dim3(8, 8);
      numBlocks = dim3(numExamples/threadsPerBlock.x + 1,
                       numExamples/threadsPerBlock.y + 1);
      calculateCovariance_kernel<<<numBlocks, threadsPerBlock>>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions);

      // perform eigendecomposition
      //std::cout << "Skipping eigendecomposition" << std::endl;
      cusolverStatus_t cusolverStatus;
      cusolverStatus = cusolverDnSgebrd(cusolverHandle,)
    }
    */
  }
| 175 | + | ||
| 176 | + void cudapca_projectwrapper(float* src, float* dst) { | ||
| 177 | + // copy the image to the GPU | ||
| 178 | + float* cudaSrcPtr; | ||
| 179 | + cudaMalloc(&cudaSrcPtr, _meanRows*_meanCols*sizeof(float)); | ||
| 180 | + cudaMemcpy(cudaSrcPtr, src, _meanRows*_meanCols*sizeof(float), cudaMemcpyHostToDevice); | ||
| 181 | + | ||
| 182 | + float* cudaDstPtr; | ||
| 183 | + cudaMalloc(&cudaDstPtr, _keep*sizeof(float)); | ||
| 184 | + | ||
| 185 | + // subtract out the mean of the image (mean is 1xpixels in size) | ||
| 186 | + int threadsPerBlock = 64; | ||
| 187 | + int numBlocks = _meanRows*_meanCols / threadsPerBlock; | ||
| 188 | + cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanRows*_meanCols); | ||
| 189 | + | ||
| 190 | + // perform the multiplication | ||
| 191 | + threadsPerBlock = 64; | ||
| 192 | + numBlocks = _keep / threadsPerBlock; | ||
| 193 | + cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols); | ||
| 194 | + | ||
| 195 | + // copy the data back to the CPU | ||
| 196 | + cudaMemcpy(dst, cudaDstPtr, _keep*sizeof(float), cudaMemcpyDeviceToHost); | ||
| 197 | + | ||
| 198 | + cudaFree(cudaSrcPtr); | ||
| 199 | + cudaFree(cudaDstPtr); | ||
| 200 | + } | ||
| 201 | +}} |
openbr/plugins/cuda/cudapca.hpp
0 → 100644
// Host-side wrappers around the CUDA PCA kernels implemented in cudapca.cu.
// NOTE(review): using-directives in a header leak into every includer, and
// the declarations below don't reference cv types — consider removing both
// the includes and the using lines from this header.
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

using namespace cv;
using namespace cv::gpu;

namespace br { namespace cuda {
  // One-time CUDA initialization hook (currently a no-op in cudapca.cu).
  void cudapca_initwrapper();

  // Upload the PCA model to the GPU: evPtr is the row-major
  // evRows x evCols eigenvector matrix, meanPtr the row-major
  // meanRows x meanCols mean; keep is the number of output dimensions
  // retained by projection.
  void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep);
  // GPU PCA training (not implemented; body is commented out in cudapca.cu).
  void cudapca_trainwrapper();

  // Project one image (src) into PCA space, writing `keep` floats to dst.
  // cudapca_loadwrapper() must have been called first.
  void cudapca_projectwrapper(float* src, float* dst);
}}