From 5e16788d451105dcdc4fef2778996043ec3a40ad Mon Sep 17 00:00:00 2001 From: DepthDeluxe Date: Fri, 12 Feb 2016 15:38:42 -0500 Subject: [PATCH] added pipelined CUDALBP --- openbr/plugins/cuda/copyfrom.cpp | 9 +++++++++ openbr/plugins/cuda/cudalbp.cpp | 67 ++++++++++++++++++++++++++++++++++++++++--------------------------- openbr/plugins/cuda/cudalbp.cu | 58 ++++++++++++++++++++++++++++------------------------------ openbr/plugins/cuda/cudalbp.hpp | 9 --------- 4 files changed, 77 insertions(+), 66 deletions(-) delete mode 100644 openbr/plugins/cuda/cudalbp.hpp diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp index 38dc913..68b9507 100644 --- a/openbr/plugins/cuda/copyfrom.cpp +++ b/openbr/plugins/cuda/copyfrom.cpp @@ -23,6 +23,8 @@ namespace br private: void project(const Template &src, Template &dst) const { + cout << "CUDACopyFrom Start" << endl; + // pull the data back out of the Mat void* const* dataPtr = src.m().ptr(); void* cudaMemPtr = dataPtr[0]; @@ -30,9 +32,16 @@ private: int cols = *((int*)dataPtr[2]); int type = *((int*)dataPtr[3]); + cout << "cudaMemPtr: " << cudaMemPtr << endl; + cout << "rows: " << rows << endl; + cout << "cols: " << cols << endl; + cout << "type: " << type << endl; + dst = Mat(rows, cols, type); br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr(), rows, cols); + + cout << "CUDACopyFrom End" << endl; } }; diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp index 08f0227..ec6d71a 100644 --- a/openbr/plugins/cuda/cudalbp.cpp +++ b/openbr/plugins/cuda/cudalbp.cpp @@ -15,8 +15,7 @@ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include -//#include -//#include +using namespace std; #include #include @@ -32,7 +31,6 @@ #include -#include "cudalbp.hpp" #include "MatManager.hpp" using namespace cv; @@ -60,19 +58,17 @@ string type2str(int type) { return r; } -int ctr = 0; -pthread_mutex_t* uploadMutex = NULL; +namespace br { namespace cuda { + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols); + void cudalbp_init_wrapper(uint8_t* lut); +}} namespace br { - /*! * \ingroup transforms - * \brief Convert the image into a feature vector using Local Binary Patterns - * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.; - * "Face Description with Local Binary Patterns: Application to Face Recognition" - * Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006 - * \author Josh Klontz \cite jklontz + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA + * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli */ class CUDALBPTransform : public UntrainableTransform { @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform private: uchar lut[256]; - uint8_t* lutGpuPtr; uchar null; - cuda::MatManager* matManager; public: @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform matManager = new cuda::MatManager(10); // copy lut over to the GPU - br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr); + br::cuda::cudalbp_init_wrapper(lut); std::cout << "Initialized CUDALBP" << std::endl; } void project(const Template &src, Template &dst) const { - Mat& m = (Mat&)src.m(); - cuda::MatManager::matindex a; - cuda::MatManager::matindex b; - a = matManager->reserve(m); - matManager->upload(a, m); + //Mat& m = (Mat&)src.m(); + //cuda::MatManager::matindex a; + //cuda::MatManager::matindex b; + //a = matManager->reserve(m); + //matManager->upload(a, m); // reserve the second mat and check the dimensiosn - b = matManager->reserve(m); - - uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); - uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); - br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); + //b = matManager->reserve(m); - matManager->download(b, dst); + //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); + //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); + //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); + + //matManager->download(b, dst); // release both the mats - matManager->release(a); - matManager->release(b); + //matManager->release(a); + //matManager->release(b); + + cout << "CUDALBP Start" << endl; + + void* const* srcDataPtr = src.m().ptr(); + void* cudaSrcPtr = srcDataPtr[0]; + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + dstDataPtr[1] = srcDataPtr[1]; + dstDataPtr[2] = srcDataPtr[2]; + dstDataPtr[3] = srcDataPtr[3]; + + br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); + dst = dstMat; + + cout << "CUDALBP End" << endl; } }; diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu index 23aec27..69a3dd0 100644 --- a/openbr/plugins/cuda/cudalbp.cu +++ b/openbr/plugins/cuda/cudalbp.cu @@ -4,55 +4,53 @@ using namespace cv; using namespace cv::gpu; -#include "cudalbp.hpp" - namespace br { namespace cuda { - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) { - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col]; + uint8_t* lut; + + __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { + return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; } - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut) + __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) { int rowInd = blockIdx.y*blockDim.y+threadIdx.y; int colInd = blockIdx.x*blockDim.x+threadIdx.x; int radius = 1; // don't do anything if the index is out of bounds - if (rowInd >= rows || colInd >= cols) + if (rowInd >= rows || colInd >= cols) { return; - - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1 : 0)]; + } + + const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value + uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; // store calculated value away in the right place - uint8_t* dstRowPtr = dstPtr + rowInd*dstStep; - dstRowPtr[colInd] = val; + int index = rowInd*cols + colInd; + dstPtr[index] = val; } - void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { // make 8 * 8 = 64 square block dim3 threadsPerBlock(8, 8); + dim3 numBlocks(cols/threadsPerBlock.x + 1, + rows/threadsPerBlock.y + 1); - dim3 numBlocks(imageWidth/threadsPerBlock.x + 1, - imageHeight/threadsPerBlock.y + 1); - - //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols); - //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols); - //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y); - - cudalbp_kernel<<>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut); + cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); + cudalbp_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); } - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) { - cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t)); - cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); + void cudalbp_init_wrapper(uint8_t* cpuLut) { + cudaMalloc(&lut, 256*sizeof(uint8_t)); + cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); } }} diff --git a/openbr/plugins/cuda/cudalbp.hpp b/openbr/plugins/cuda/cudalbp.hpp deleted file mode 100644 index 7fa270c..0000000 --- a/openbr/plugins/cuda/cudalbp.hpp +++ /dev/null @@ -1,9 +0,0 @@ -#include - -using namespace cv; -using namespace cv::gpu; - -namespace br { namespace cuda { - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr); - void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step); -}} -- libgit2 0.21.4