Commit 5e16788d451105dcdc4fef2778996043ec3a40ad
1 parent
aaabb3cb
added pipelined CUDALBP
Showing 4 changed files with 77 additions and 66 deletions
openbr/plugins/cuda/copyfrom.cpp
| ... | ... | @@ -23,6 +23,8 @@ namespace br |
| 23 | 23 | private: |
| 24 | 24 | void project(const Template &src, Template &dst) const |
| 25 | 25 | { |
| 26 | + cout << "CUDACopyFrom Start" << endl; | |
| 27 | + | |
| 26 | 28 | // pull the data back out of the Mat |
| 27 | 29 | void* const* dataPtr = src.m().ptr<void*>(); |
| 28 | 30 | void* cudaMemPtr = dataPtr[0]; |
| ... | ... | @@ -30,9 +32,16 @@ private: |
| 30 | 32 | int cols = *((int*)dataPtr[2]); |
| 31 | 33 | int type = *((int*)dataPtr[3]); |
| 32 | 34 | |
| 35 | + cout << "cudaMemPtr: " << cudaMemPtr << endl; | |
| 36 | + cout << "rows: " << rows << endl; | |
| 37 | + cout << "cols: " << cols << endl; | |
| 38 | + cout << "type: " << type << endl; | |
| 39 | + | |
| 33 | 40 | dst = Mat(rows, cols, type); |
| 34 | 41 | |
| 35 | 42 | br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols); |
| 43 | + | |
| 44 | + cout << "CUDACopyFrom End" << endl; | |
| 36 | 45 | } |
| 37 | 46 | }; |
| 38 | 47 | ... | ... |
openbr/plugins/cuda/cudalbp.cpp
| ... | ... | @@ -15,8 +15,7 @@ |
| 15 | 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 16 | 16 | |
| 17 | 17 | #include <iostream> |
| 18 | -//#include <thread> | |
| 19 | -//#include <mutex> | |
| 18 | +using namespace std; | |
| 20 | 19 | |
| 21 | 20 | #include <sys/types.h> |
| 22 | 21 | #include <unistd.h> |
| ... | ... | @@ -32,7 +31,6 @@ |
| 32 | 31 | |
| 33 | 32 | #include <openbr/plugins/openbr_internal.h> |
| 34 | 33 | |
| 35 | -#include "cudalbp.hpp" | |
| 36 | 34 | #include "MatManager.hpp" |
| 37 | 35 | |
| 38 | 36 | using namespace cv; |
| ... | ... | @@ -60,19 +58,17 @@ string type2str(int type) { |
| 60 | 58 | return r; |
| 61 | 59 | } |
| 62 | 60 | |
| 63 | -int ctr = 0; | |
| 64 | -pthread_mutex_t* uploadMutex = NULL; | |
| 61 | +namespace br { namespace cuda { | |
| 62 | + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols); | |
| 63 | + void cudalbp_init_wrapper(uint8_t* lut); | |
| 64 | +}} | |
| 65 | 65 | |
| 66 | 66 | namespace br |
| 67 | 67 | { |
| 68 | - | |
| 69 | 68 | /*! |
| 70 | 69 | * \ingroup transforms |
| 71 | - * \brief Convert the image into a feature vector using Local Binary Patterns | |
| 72 | - * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.; | |
| 73 | - * "Face Description with Local Binary Patterns: Application to Face Recognition" | |
| 74 | - * Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006 | |
| 75 | - * \author Josh Klontz \cite jklontz | |
| 70 | + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA | |
| 71 | + * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli | |
| 76 | 72 | */ |
| 77 | 73 | class CUDALBPTransform : public UntrainableTransform |
| 78 | 74 | { |
| ... | ... | @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform |
| 86 | 82 | |
| 87 | 83 | private: |
| 88 | 84 | uchar lut[256]; |
| 89 | - uint8_t* lutGpuPtr; | |
| 90 | 85 | uchar null; |
| 91 | 86 | |
| 92 | - | |
| 93 | 87 | cuda::MatManager* matManager; |
| 94 | 88 | |
| 95 | 89 | public: |
| ... | ... | @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform |
| 146 | 140 | matManager = new cuda::MatManager(10); |
| 147 | 141 | |
| 148 | 142 | // copy lut over to the GPU |
| 149 | - br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr); | |
| 143 | + br::cuda::cudalbp_init_wrapper(lut); | |
| 150 | 144 | |
| 151 | 145 | std::cout << "Initialized CUDALBP" << std::endl; |
| 152 | 146 | } |
| 153 | 147 | |
| 154 | 148 | void project(const Template &src, Template &dst) const |
| 155 | 149 | { |
| 156 | - Mat& m = (Mat&)src.m(); | |
| 157 | - cuda::MatManager::matindex a; | |
| 158 | - cuda::MatManager::matindex b; | |
| 159 | - a = matManager->reserve(m); | |
| 160 | - matManager->upload(a, m); | |
| 150 | + //Mat& m = (Mat&)src.m(); | |
| 151 | + //cuda::MatManager::matindex a; | |
| 152 | + //cuda::MatManager::matindex b; | |
| 153 | + //a = matManager->reserve(m); | |
| 154 | + //matManager->upload(a, m); | |
| 161 | 155 | |
| 162 | 156 | // reserve the second mat and check the dimensions |
| 163 | - b = matManager->reserve(m); | |
| 164 | - | |
| 165 | - uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); | |
| 166 | - uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); | |
| 167 | - br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); | |
| 157 | + //b = matManager->reserve(m); | |
| 168 | 158 | |
| 169 | - matManager->download(b, dst); | |
| 159 | + //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); | |
| 160 | + //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); | |
| 161 | + //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); | |
| 162 | + | |
| 163 | + //matManager->download(b, dst); | |
| 170 | 164 | |
| 171 | 165 | // release both the mats |
| 172 | - matManager->release(a); | |
| 173 | - matManager->release(b); | |
| 166 | + //matManager->release(a); | |
| 167 | + //matManager->release(b); | |
| 168 | + | |
| 169 | + cout << "CUDALBP Start" << endl; | |
| 170 | + | |
| 171 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 172 | + void* cudaSrcPtr = srcDataPtr[0]; | |
| 173 | + int rows = *((int*)srcDataPtr[1]); | |
| 174 | + int cols = *((int*)srcDataPtr[2]); | |
| 175 | + int type = *((int*)srcDataPtr[3]); | |
| 176 | + | |
| 177 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 178 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 179 | + dstDataPtr[1] = srcDataPtr[1]; | |
| 180 | + dstDataPtr[2] = srcDataPtr[2]; | |
| 181 | + dstDataPtr[3] = srcDataPtr[3]; | |
| 182 | + | |
| 183 | + br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); | |
| 184 | + dst = dstMat; | |
| 185 | + | |
| 186 | + cout << "CUDALBP End" << endl; | |
| 174 | 187 | } |
| 175 | 188 | }; |
| 176 | 189 | ... | ... |
openbr/plugins/cuda/cudalbp.cu
| ... | ... | @@ -4,55 +4,53 @@ |
| 4 | 4 | using namespace cv; |
| 5 | 5 | using namespace cv::gpu; |
| 6 | 6 | |
| 7 | -#include "cudalbp.hpp" | |
| 8 | - | |
| 9 | 7 | namespace br { namespace cuda { |
| 10 | - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) { | |
| 11 | - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col]; | |
| 8 | + uint8_t* lut; | |
| 9 | + | |
| 10 | + __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { | |
| 11 | + return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; | |
| 12 | 12 | } |
| 13 | 13 | |
| 14 | - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut) | |
| 14 | + __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) | |
| 15 | 15 | { |
| 16 | 16 | int rowInd = blockIdx.y*blockDim.y+threadIdx.y; |
| 17 | 17 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 18 | 18 | int radius = 1; |
| 19 | 19 | |
| 20 | 20 | // don't do anything if the index is out of bounds |
| 21 | - if (rowInd >= rows || colInd >= cols) | |
| 21 | + if (rowInd >= rows || colInd >= cols) { | |
| 22 | 22 | return; |
| 23 | - | |
| 24 | - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value | |
| 25 | - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) | | |
| 26 | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64 : 0) | | |
| 27 | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32 : 0) | | |
| 28 | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16 : 0) | | |
| 29 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8 : 0) | | |
| 30 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4 : 0) | | |
| 31 | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2 : 0) | | |
| 32 | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1 : 0)]; | |
| 23 | + } | |
| 24 | + | |
| 25 | + const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value | |
| 26 | + uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | | |
| 27 | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | | |
| 28 | + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | | |
| 29 | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | | |
| 30 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | | |
| 31 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | | |
| 32 | + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | | |
| 33 | + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; | |
| 33 | 34 | |
| 34 | 35 | // store calculated value away in the right place |
| 35 | - uint8_t* dstRowPtr = dstPtr + rowInd*dstStep; | |
| 36 | - dstRowPtr[colInd] = val; | |
| 36 | + int index = rowInd*cols + colInd; | |
| 37 | + dstPtr[index] = val; | |
| 37 | 38 | } |
| 38 | 39 | |
| 39 | - void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) | |
| 40 | + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) | |
| 41 | + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) | |
| 40 | 42 | { |
| 41 | 43 | // make 8 * 8 = 64 square block |
| 42 | 44 | dim3 threadsPerBlock(8, 8); |
| 45 | + dim3 numBlocks(cols/threadsPerBlock.x + 1, | |
| 46 | + rows/threadsPerBlock.y + 1); | |
| 43 | 47 | |
| 44 | - dim3 numBlocks(imageWidth/threadsPerBlock.x + 1, | |
| 45 | - imageHeight/threadsPerBlock.y + 1); | |
| 46 | - | |
| 47 | - //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols); | |
| 48 | - //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols); | |
| 49 | - //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y); | |
| 50 | - | |
| 51 | - cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut); | |
| 48 | + cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); | |
| 49 | + cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); | |
| 52 | 50 | } |
| 53 | 51 | |
| 54 | - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) { | |
| 55 | - cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t)); | |
| 56 | - cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); | |
| 52 | + void cudalbp_init_wrapper(uint8_t* cpuLut) { | |
| 53 | + cudaMalloc(&lut, 256*sizeof(uint8_t)); | |
| 54 | + cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); | |
| 57 | 55 | } |
| 58 | 56 | }} | ... | ... |
openbr/plugins/cuda/cudalbp.hpp deleted
| 1 | -#include <opencv2/gpu/gpu.hpp> | |
| 2 | - | |
| 3 | -using namespace cv; | |
| 4 | -using namespace cv::gpu; | |
| 5 | - | |
| 6 | -namespace br { namespace cuda { | |
| 7 | - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr); | |
| 8 | - void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step); | |
| 9 | -}} |