diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index e7e5e0c..1e5845c 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * * limitations under the License. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -// CUDA includes -#include +#include +using namespace std; #include #include @@ -23,6 +23,8 @@ #include #include +#include "cudapca.hpp" + namespace br { /*! @@ -74,12 +76,12 @@ private: if (trainingSet.first().m().type() != CV_32FC1) qFatal("Requires single channel 32-bit floating point matrices."); - originalRows = trainingSet.first().m().rows; - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; - const int instances = trainingSet.size(); + originalRows = trainingSet.first().m().rows; // get number of rows of first image + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image + const int instances = trainingSet.size(); // get the number of training set instances // Map into 64-bit Eigen matrix - Eigen::MatrixXd data(dimsIn, instances); + Eigen::MatrixXd data(dimsIn, instances); // create a mat for (int i=0; i(trainingSet[i].m().ptr(), dimsIn, 1).cast(); @@ -90,12 +92,16 @@ private: { dst = cv::Mat(1, keep, CV_32FC1); + // perform the operation on the graphics card + cuda::cudapca_projectwrapper((float*)src.m().ptr(), (float*)dst.m().ptr()); + // Map Eigen into OpenCV - Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); - Eigen::Map outMap(dst.m().ptr(), keep, 1); + //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); + //Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); + //Eigen::Map outMap(dst.m().ptr(), keep, 1); // Do projection - outMap = eVecs.transpose() * (inMap - mean); + //cpuOutMap = eVecs.transpose() * (inMap - mean); } void store(QDataStream &stream) const @@ -106,6 +112,41 
@@ private:

     void load(QDataStream &stream)
     {
         stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
+
+        cout << "Mean Dimensions" << endl;
+        cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
+        cout << "eVecs Dimensions" << endl;
+        cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
+        cout << "eVals Dimensions" << endl;
+        cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
+        cout << "Keep: " << keep << endl;
+
+        cout << "Mean first value: " << mean(0, 0) << endl;
+
+        // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
+        // serialize the eigenvectors
+        float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
+        for (int i=0; i < eVecs.rows(); i++) {
+            for (int j=0; j < eVecs.cols(); j++) {
+                evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
+            }
+        }
+
+        // serialize the mean
+        float* meanBuffer = new float[mean.rows() * mean.cols()];
+        for (int i=0; i < mean.rows(); i++) {
+            for (int j=0; j < mean.cols(); j++) {
+                meanBuffer[i*mean.cols() + j] = mean(i, j);
+            }
+        }
+
+        cout << "Meanbuffer first value: " << meanBuffer[0] << endl;
+
+        // call the wrapper function
+        cuda::cudapca_loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows(), mean.cols(), keep);
+
+        delete[] evBuffer;
+        delete[] meanBuffer;
     }

 protected:
diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu
new file mode 100644
index 0000000..bd6f7fb
--- /dev/null
+++ b/openbr/plugins/cuda/cudapca.cu
@@ -0,0 +1,201 @@
+#include <iostream>
+using namespace std;
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+#include "cudapca.hpp"
+
+namespace br { namespace cuda {
+  __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) {
+    int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
+    int colInd = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // this calculates trainingSet' * trainingSet
+    if (rowInd >= numRows ||
colInd >= numCols) { + return; + } + + // get a reference the value we wish to write + float& out = cov[rowInd*numRows + colInd]; + + // calculate the value of this position + out = 0; + for (int i=0; i= evCols) { + return; + } + + dst[colInd] = 0; + for (int i=0; i < evRows; i++) { + dst[colInd] += evPtr[evCols*i + colInd] * src[i]; + } + } + + __global__ void cudapca_project_subtractmean_kernel(float* out, float* mean, int cols) { + int colInd = blockIdx.x*blockDim.x+threadIdx.x; + + // perform bound checking + if (colInd >= cols) { + return; + } + + // subtract out the mean + out[colInd] -= mean[colInd]; + } + + float* cudaEvPtr; int _evRows; int _evCols; + float* cudaMeanPtr; int _meanRows; int _meanCols; + int _keep; + + void cudapca_initwrapper() { + + } + + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep) { + _evRows = evRows; _evCols = evCols; + _meanRows = meanRows; _meanCols = meanCols; + _keep = keep; + + // copy the eigenvectors to the GPU + cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float)); + cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice); + + // copy the mean to the GPU + cudaMalloc(&cudaMeanPtr, meanRows*meanCols*sizeof(float)); + cudaMemcpy(cudaMeanPtr, meanPtr, meanRows*meanCols*sizeof(float), cudaMemcpyHostToDevice); + } + + void cudapca_trainwrapper() { + /* + if (trainingSet[0].type() != CV_32FC1) { + std::cout << "ERR: Requires single 32-bit floating point matrix!"; + return; + } + + cudaError_t status; + + const int numExamples = trainingSetSize; + int numPixels = trainingSet[0].rows * trainingSet[0].cols; + + // create a custom matrix + float* cudaDataPtr; + status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float)); + if (status != cudaSuccess) { + std::cout << "ERR: Memory allocation" << std::endl; + return; + } + + // copy all the data to the graphics card + for (int i=0; i < numExamples; i++) { + status = 
cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr(), numPixels*sizeof(float), cudaMemcpyHostToDevice); + if (status != cudaSuccess) { + std::cout << "ERR: Memcpy at index " << i << std::endl; + return; + } + } + + // start the core part of the algorithm + int numDimensions = numPixels; + const bool dominantEigenEstimation = (numDimensions > numExamples); + + // malloc and init mean + mean = new float[numDimensions]; + for (int i=0; i < numDimensions; i++) { + mean[i] = 0; + } + float* cudaMeanPtr; + status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float)); + if (status != cudaSuccess) { + std::cout << " ERR: Malloc of mean" << std::endl; + return; + } + + if (keep != 0) { + // compute the mean so we can subtract from data + for (int i=0; i < numExamples; i++) { + Mat& m = trainingSet[i]; + + for (int j=0; j < numDimensions; j++) { + mean[j] += m.ptr()[i*numDimensions + j]; + } + } + for (int i=0; i < numDimensions; i++) { + mean[i] = mean[i] / numExamples; + } + + // copy mean over to graphics card + cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice); + if (status != cudaSuccess) { + std::cout << " ERR: Cpy of mean" << std::endl; + return; + } + + // set the thread dimensions and run the kernel + dim3 threadsPerBlock(64, 1); + dim3 numBlocks(numDimensions/threadsPerBlock.x + 1, + numExamples/threadsPerBlock.y + 1); + + subtractMean_kernel<<>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions); + + // calculate the covariance matrix using kernel + // malloc location for covariance matrix + float* cudaCovPtr; + status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float)); + if (status != cudaSuccess) h + std::cout << " ERR: Cpy of mean" << std::endl; + return; + } + + // calculate the covariance matrix + threadsPerBlock = dim3(8, 8); + numBlocks = dim3(numExamples/threadsPerBlock.x + 1, + numExamples/threadsPerBlock.y + 1); + calculateCovariance_kernel<<>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions); + 
+    // perform eigendecomposition
+    //std::cout << "Skipping eigendecomposition" << std::endl;
+    cusolverStatus_t cusolverStatus;
+    cusolverStatus = cusolverDnSgebrd(cusolverHandle,)
+    }
+    */
+  }
+
+  void cudapca_projectwrapper(float* src, float* dst) {
+    // copy the image to the GPU
+    float* cudaSrcPtr;
+    cudaMalloc(&cudaSrcPtr, _meanRows*_meanCols*sizeof(float));
+    cudaMemcpy(cudaSrcPtr, src, _meanRows*_meanCols*sizeof(float), cudaMemcpyHostToDevice);
+
+    float* cudaDstPtr;
+    cudaMalloc(&cudaDstPtr, _keep*sizeof(float));
+
+    // subtract out the mean of the image (mean is 1xpixels in size)
+    int threadsPerBlock = 64;
+    int numBlocks = (_meanRows*_meanCols + threadsPerBlock - 1) / threadsPerBlock;
+    cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanRows*_meanCols);
+
+    // perform the multiplication
+    threadsPerBlock = 64;
+    numBlocks = (_keep + threadsPerBlock - 1) / threadsPerBlock;
+    cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols);
+
+    // copy the data back to the CPU
+    cudaMemcpy(dst, cudaDstPtr, _keep*sizeof(float), cudaMemcpyDeviceToHost);
+
+    cudaFree(cudaSrcPtr);
+    cudaFree(cudaDstPtr);
+  }
+}}
diff --git a/openbr/plugins/cuda/cudapca.hpp b/openbr/plugins/cuda/cudapca.hpp
new file mode 100644
index 0000000..b655e3e
--- /dev/null
+++ b/openbr/plugins/cuda/cudapca.hpp
@@ -0,0 +1,14 @@
+#include <opencv2/core/core.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  void cudapca_initwrapper();
+
+  void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep);
+  void cudapca_trainwrapper();
+
+  void cudapca_projectwrapper(float* src, float* dst);
+}}