Commit 5e16788d451105dcdc4fef2778996043ec3a40ad
1 parent
aaabb3cb
added pipelined CUDALBP
Showing
4 changed files
with
77 additions
and
66 deletions
openbr/plugins/cuda/copyfrom.cpp
| @@ -23,6 +23,8 @@ namespace br | @@ -23,6 +23,8 @@ namespace br | ||
| 23 | private: | 23 | private: |
| 24 | void project(const Template &src, Template &dst) const | 24 | void project(const Template &src, Template &dst) const |
| 25 | { | 25 | { |
| 26 | + cout << "CUDACopyFrom Start" << endl; | ||
| 27 | + | ||
| 26 | // pull the data back out of the Mat | 28 | // pull the data back out of the Mat |
| 27 | void* const* dataPtr = src.m().ptr<void*>(); | 29 | void* const* dataPtr = src.m().ptr<void*>(); |
| 28 | void* cudaMemPtr = dataPtr[0]; | 30 | void* cudaMemPtr = dataPtr[0]; |
| @@ -30,9 +32,16 @@ private: | @@ -30,9 +32,16 @@ private: | ||
| 30 | int cols = *((int*)dataPtr[2]); | 32 | int cols = *((int*)dataPtr[2]); |
| 31 | int type = *((int*)dataPtr[3]); | 33 | int type = *((int*)dataPtr[3]); |
| 32 | 34 | ||
| 35 | + cout << "cudaMemPtr: " << cudaMemPtr << endl; | ||
| 36 | + cout << "rows: " << rows << endl; | ||
| 37 | + cout << "cols: " << cols << endl; | ||
| 38 | + cout << "type: " << type << endl; | ||
| 39 | + | ||
| 33 | dst = Mat(rows, cols, type); | 40 | dst = Mat(rows, cols, type); |
| 34 | 41 | ||
| 35 | br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols); | 42 | br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols); |
| 43 | + | ||
| 44 | + cout << "CUDACopyFrom End" << endl; | ||
| 36 | } | 45 | } |
| 37 | }; | 46 | }; |
| 38 | 47 |
openbr/plugins/cuda/cudalbp.cpp
| @@ -15,8 +15,7 @@ | @@ -15,8 +15,7 @@ | ||
| 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 16 | 16 | ||
| 17 | #include <iostream> | 17 | #include <iostream> |
| 18 | -//#include <thread> | ||
| 19 | -//#include <mutex> | 18 | +using namespace std; |
| 20 | 19 | ||
| 21 | #include <sys/types.h> | 20 | #include <sys/types.h> |
| 22 | #include <unistd.h> | 21 | #include <unistd.h> |
| @@ -32,7 +31,6 @@ | @@ -32,7 +31,6 @@ | ||
| 32 | 31 | ||
| 33 | #include <openbr/plugins/openbr_internal.h> | 32 | #include <openbr/plugins/openbr_internal.h> |
| 34 | 33 | ||
| 35 | -#include "cudalbp.hpp" | ||
| 36 | #include "MatManager.hpp" | 34 | #include "MatManager.hpp" |
| 37 | 35 | ||
| 38 | using namespace cv; | 36 | using namespace cv; |
| @@ -60,19 +58,17 @@ string type2str(int type) { | @@ -60,19 +58,17 @@ string type2str(int type) { | ||
| 60 | return r; | 58 | return r; |
| 61 | } | 59 | } |
| 62 | 60 | ||
| 63 | -int ctr = 0; | ||
| 64 | -pthread_mutex_t* uploadMutex = NULL; | 61 | +namespace br { namespace cuda { |
| 62 | + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols); | ||
| 63 | + void cudalbp_init_wrapper(uint8_t* lut); | ||
| 64 | +}} | ||
| 65 | 65 | ||
| 66 | namespace br | 66 | namespace br |
| 67 | { | 67 | { |
| 68 | - | ||
| 69 | /*! | 68 | /*! |
| 70 | * \ingroup transforms | 69 | * \ingroup transforms |
| 71 | - * \brief Convert the image into a feature vector using Local Binary Patterns | ||
| 72 | - * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.; | ||
| 73 | - * "Face Description with Local Binary Patterns: Application to Face Recognition" | ||
| 74 | - * Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006 | ||
| 75 | - * \author Josh Klontz \cite jklontz | 70 | + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA |
| 71 | + * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli | ||
| 76 | */ | 72 | */ |
| 77 | class CUDALBPTransform : public UntrainableTransform | 73 | class CUDALBPTransform : public UntrainableTransform |
| 78 | { | 74 | { |
| @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform | @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform | ||
| 86 | 82 | ||
| 87 | private: | 83 | private: |
| 88 | uchar lut[256]; | 84 | uchar lut[256]; |
| 89 | - uint8_t* lutGpuPtr; | ||
| 90 | uchar null; | 85 | uchar null; |
| 91 | 86 | ||
| 92 | - | ||
| 93 | cuda::MatManager* matManager; | 87 | cuda::MatManager* matManager; |
| 94 | 88 | ||
| 95 | public: | 89 | public: |
| @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform | @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform | ||
| 146 | matManager = new cuda::MatManager(10); | 140 | matManager = new cuda::MatManager(10); |
| 147 | 141 | ||
| 148 | // copy lut over to the GPU | 142 | // copy lut over to the GPU |
| 149 | - br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr); | 143 | + br::cuda::cudalbp_init_wrapper(lut); |
| 150 | 144 | ||
| 151 | std::cout << "Initialized CUDALBP" << std::endl; | 145 | std::cout << "Initialized CUDALBP" << std::endl; |
| 152 | } | 146 | } |
| 153 | 147 | ||
| 154 | void project(const Template &src, Template &dst) const | 148 | void project(const Template &src, Template &dst) const |
| 155 | { | 149 | { |
| 156 | - Mat& m = (Mat&)src.m(); | ||
| 157 | - cuda::MatManager::matindex a; | ||
| 158 | - cuda::MatManager::matindex b; | ||
| 159 | - a = matManager->reserve(m); | ||
| 160 | - matManager->upload(a, m); | 150 | + //Mat& m = (Mat&)src.m(); |
| 151 | + //cuda::MatManager::matindex a; | ||
| 152 | + //cuda::MatManager::matindex b; | ||
| 153 | + //a = matManager->reserve(m); | ||
| 154 | + //matManager->upload(a, m); | ||
| 161 | 155 | ||
| 162 | // reserve the second mat and check the dimensiosn | 156 | // reserve the second mat and check the dimensiosn |
| 163 | - b = matManager->reserve(m); | ||
| 164 | - | ||
| 165 | - uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); | ||
| 166 | - uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); | ||
| 167 | - br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); | 157 | + //b = matManager->reserve(m); |
| 168 | 158 | ||
| 169 | - matManager->download(b, dst); | 159 | + //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); |
| 160 | + //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); | ||
| 161 | + //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); | ||
| 162 | + | ||
| 163 | + //matManager->download(b, dst); | ||
| 170 | 164 | ||
| 171 | // release both the mats | 165 | // release both the mats |
| 172 | - matManager->release(a); | ||
| 173 | - matManager->release(b); | 166 | + //matManager->release(a); |
| 167 | + //matManager->release(b); | ||
| 168 | + | ||
| 169 | + cout << "CUDALBP Start" << endl; | ||
| 170 | + | ||
| 171 | + void* const* srcDataPtr = src.m().ptr<void*>(); | ||
| 172 | + void* cudaSrcPtr = srcDataPtr[0]; | ||
| 173 | + int rows = *((int*)srcDataPtr[1]); | ||
| 174 | + int cols = *((int*)srcDataPtr[2]); | ||
| 175 | + int type = *((int*)srcDataPtr[3]); | ||
| 176 | + | ||
| 177 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | ||
| 178 | + void** dstDataPtr = dstMat.ptr<void*>(); | ||
| 179 | + dstDataPtr[1] = srcDataPtr[1]; | ||
| 180 | + dstDataPtr[2] = srcDataPtr[2]; | ||
| 181 | + dstDataPtr[3] = srcDataPtr[3]; | ||
| 182 | + | ||
| 183 | + br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); | ||
| 184 | + dst = dstMat; | ||
| 185 | + | ||
| 186 | + cout << "CUDALBP End" << endl; | ||
| 174 | } | 187 | } |
| 175 | }; | 188 | }; |
| 176 | 189 |
openbr/plugins/cuda/cudalbp.cu
| @@ -4,55 +4,53 @@ | @@ -4,55 +4,53 @@ | ||
| 4 | using namespace cv; | 4 | using namespace cv; |
| 5 | using namespace cv::gpu; | 5 | using namespace cv::gpu; |
| 6 | 6 | ||
| 7 | -#include "cudalbp.hpp" | ||
| 8 | - | ||
| 9 | namespace br { namespace cuda { | 7 | namespace br { namespace cuda { |
| 10 | - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) { | ||
| 11 | - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col]; | 8 | + uint8_t* lut; |
| 9 | + | ||
| 10 | + __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { | ||
| 11 | + return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; | ||
| 12 | } | 12 | } |
| 13 | 13 | ||
| 14 | - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut) | 14 | + __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) |
| 15 | { | 15 | { |
| 16 | int rowInd = blockIdx.y*blockDim.y+threadIdx.y; | 16 | int rowInd = blockIdx.y*blockDim.y+threadIdx.y; |
| 17 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; | 17 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 18 | int radius = 1; | 18 | int radius = 1; |
| 19 | 19 | ||
| 20 | // don't do anything if the index is out of bounds | 20 | // don't do anything if the index is out of bounds |
| 21 | - if (rowInd >= rows || colInd >= cols) | 21 | + if (rowInd >= rows || colInd >= cols) { |
| 22 | return; | 22 | return; |
| 23 | - | ||
| 24 | - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value | ||
| 25 | - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) | | ||
| 26 | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64 : 0) | | ||
| 27 | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32 : 0) | | ||
| 28 | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16 : 0) | | ||
| 29 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8 : 0) | | ||
| 30 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4 : 0) | | ||
| 31 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2 : 0) | | ||
| 32 | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1 : 0)]; | 23 | + } |
| 24 | + | ||
| 25 | + const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value | ||
| 26 | + uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | | ||
| 27 | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | | ||
| 28 | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | | ||
| 29 | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | | ||
| 30 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | | ||
| 31 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | | ||
| 32 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | | ||
| 33 | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; | ||
| 33 | 34 | ||
| 34 | // store calculated value away in the right place | 35 | // store calculated value away in the right place |
| 35 | - uint8_t* dstRowPtr = dstPtr + rowInd*dstStep; | ||
| 36 | - dstRowPtr[colInd] = val; | 36 | + int index = rowInd*cols + colInd; |
| 37 | + dstPtr[index] = val; | ||
| 37 | } | 38 | } |
| 38 | 39 | ||
| 39 | - void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) | 40 | + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) |
| 41 | + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) | ||
| 40 | { | 42 | { |
| 41 | // make 8 * 8 = 64 square block | 43 | // make 8 * 8 = 64 square block |
| 42 | dim3 threadsPerBlock(8, 8); | 44 | dim3 threadsPerBlock(8, 8); |
| 45 | + dim3 numBlocks(cols/threadsPerBlock.x + 1, | ||
| 46 | + rows/threadsPerBlock.y + 1); | ||
| 43 | 47 | ||
| 44 | - dim3 numBlocks(imageWidth/threadsPerBlock.x + 1, | ||
| 45 | - imageHeight/threadsPerBlock.y + 1); | ||
| 46 | - | ||
| 47 | - //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols); | ||
| 48 | - //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols); | ||
| 49 | - //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y); | ||
| 50 | - | ||
| 51 | - cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut); | 48 | + cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); |
| 49 | + cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); | ||
| 52 | } | 50 | } |
| 53 | 51 | ||
| 54 | - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) { | ||
| 55 | - cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t)); | ||
| 56 | - cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); | 52 | + void cudalbp_init_wrapper(uint8_t* cpuLut) { |
| 53 | + cudaMalloc(&lut, 256*sizeof(uint8_t)); | ||
| 54 | + cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); | ||
| 57 | } | 55 | } |
| 58 | }} | 56 | }} |
openbr/plugins/cuda/cudalbp.hpp deleted
| 1 | -#include <opencv2/gpu/gpu.hpp> | ||
| 2 | - | ||
| 3 | -using namespace cv; | ||
| 4 | -using namespace cv::gpu; | ||
| 5 | - | ||
| 6 | -namespace br { namespace cuda { | ||
| 7 | - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr); | ||
| 8 | - void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step); | ||
| 9 | -}} |