Commit 5e16788d451105dcdc4fef2778996043ec3a40ad

Authored by DepthDeluxe
1 parent aaabb3cb

added pipelined CUDALBP

openbr/plugins/cuda/copyfrom.cpp
... ... @@ -23,6 +23,8 @@ namespace br
23 23 private:
24 24 void project(const Template &src, Template &dst) const
25 25 {
  26 + cout << "CUDACopyFrom Start" << endl;
  27 +
26 28 // pull the data back out of the Mat
27 29 void* const* dataPtr = src.m().ptr<void*>();
28 30 void* cudaMemPtr = dataPtr[0];
... ... @@ -30,9 +32,16 @@ private:
30 32 int cols = *((int*)dataPtr[2]);
31 33 int type = *((int*)dataPtr[3]);
32 34  
  35 + cout << "cudaMemPtr: " << cudaMemPtr << endl;
  36 + cout << "rows: " << rows << endl;
  37 + cout << "cols: " << cols << endl;
  38 + cout << "type: " << type << endl;
  39 +
33 40 dst = Mat(rows, cols, type);
34 41  
35 42 br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols);
  43 +
  44 + cout << "CUDACopyFrom End" << endl;
36 45 }
37 46 };
38 47  
... ...
openbr/plugins/cuda/cudalbp.cpp
... ... @@ -15,8 +15,7 @@
15 15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16 16  
17 17 #include <iostream>
18   -//#include <thread>
19   -//#include <mutex>
  18 +using namespace std;
20 19  
21 20 #include <sys/types.h>
22 21 #include <unistd.h>
... ... @@ -32,7 +31,6 @@
32 31  
33 32 #include <openbr/plugins/openbr_internal.h>
34 33  
35   -#include "cudalbp.hpp"
36 34 #include "MatManager.hpp"
37 35  
38 36 using namespace cv;
... ... @@ -60,19 +58,17 @@ string type2str(int type) {
60 58 return r;
61 59 }
62 60  
63   -int ctr = 0;
64   -pthread_mutex_t* uploadMutex = NULL;
  61 +namespace br { namespace cuda {
  62 + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
  63 + void cudalbp_init_wrapper(uint8_t* lut);
  64 +}}
65 65  
66 66 namespace br
67 67 {
68   -
69 68 /*!
70 69 * \ingroup transforms
71   - * \brief Convert the image into a feature vector using Local Binary Patterns
72   - * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.;
73   - * "Face Description with Local Binary Patterns: Application to Face Recognition"
74   - * Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006
75   - * \author Josh Klontz \cite jklontz
  70 + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
  71 + * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
76 72 */
77 73 class CUDALBPTransform : public UntrainableTransform
78 74 {
... ... @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform
86 82  
87 83 private:
88 84 uchar lut[256];
89   - uint8_t* lutGpuPtr;
90 85 uchar null;
91 86  
92   -
93 87 cuda::MatManager* matManager;
94 88  
95 89 public:
... ... @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform
146 140 matManager = new cuda::MatManager(10);
147 141  
148 142 // copy lut over to the GPU
149   - br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr);
  143 + br::cuda::cudalbp_init_wrapper(lut);
150 144  
151 145 std::cout << "Initialized CUDALBP" << std::endl;
152 146 }
153 147  
154 148 void project(const Template &src, Template &dst) const
155 149 {
156   - Mat& m = (Mat&)src.m();
157   - cuda::MatManager::matindex a;
158   - cuda::MatManager::matindex b;
159   - a = matManager->reserve(m);
160   - matManager->upload(a, m);
  150 + //Mat& m = (Mat&)src.m();
  151 + //cuda::MatManager::matindex a;
  152 + //cuda::MatManager::matindex b;
  153 + //a = matManager->reserve(m);
  154 + //matManager->upload(a, m);
161 155  
162 156 // reserve the second mat and check the dimensiosn
163   - b = matManager->reserve(m);
164   -
165   - uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
166   - uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
167   - br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
  157 + //b = matManager->reserve(m);
168 158  
169   - matManager->download(b, dst);
  159 + //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
  160 + //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
  161 + //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
  162 +
  163 + //matManager->download(b, dst);
170 164  
171 165 // release both the mats
172   - matManager->release(a);
173   - matManager->release(b);
  166 + //matManager->release(a);
  167 + //matManager->release(b);
  168 +
  169 + cout << "CUDALBP Start" << endl;
  170 +
  171 + void* const* srcDataPtr = src.m().ptr<void*>();
  172 + void* cudaSrcPtr = srcDataPtr[0];
  173 + int rows = *((int*)srcDataPtr[1]);
  174 + int cols = *((int*)srcDataPtr[2]);
  175 + int type = *((int*)srcDataPtr[3]);
  176 +
  177 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  178 + void** dstDataPtr = dstMat.ptr<void*>();
  179 + dstDataPtr[1] = srcDataPtr[1];
  180 + dstDataPtr[2] = srcDataPtr[2];
  181 + dstDataPtr[3] = srcDataPtr[3];
  182 +
  183 + br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);
  184 + dst = dstMat;
  185 +
  186 + cout << "CUDALBP End" << endl;
174 187 }
175 188 };
176 189  
... ...
openbr/plugins/cuda/cudalbp.cu
... ... @@ -4,55 +4,53 @@
4 4 using namespace cv;
5 5 using namespace cv::gpu;
6 6  
7   -#include "cudalbp.hpp"
8   -
9 7 namespace br { namespace cuda {
10   - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) {
11   - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col];
  8 + uint8_t* lut;
  9 +
  10 + __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  11 + return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col];
12 12 }
13 13  
14   - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut)
  14 + __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
15 15 {
16 16 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
17 17 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
18 18 int radius = 1;
19 19  
20 20 // don't do anything if the index is out of bounds
21   - if (rowInd >= rows || colInd >= cols)
  21 + if (rowInd >= rows || colInd >= cols) {
22 22 return;
23   -
24   - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
25   - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) |
26   - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64 : 0) |
27   - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32 : 0) |
28   - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16 : 0) |
29   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8 : 0) |
30   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4 : 0) |
31   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2 : 0) |
32   - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1 : 0)];
  23 + }
  24 +
  25 + const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
  26 + uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
  27 + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
  28 + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
  29 + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
  30 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
  31 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
  32 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
  33 + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];
33 34  
34 35 // store calculated value away in the right place
35   - uint8_t* dstRowPtr = dstPtr + rowInd*dstStep;
36   - dstRowPtr[colInd] = val;
  36 + int index = rowInd*cols + colInd;
  37 + dstPtr[index] = val;
37 38 }
38 39  
39   - void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
  40 + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
  41 + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
40 42 {
41 43 // make 8 * 8 = 64 square block
42 44 dim3 threadsPerBlock(8, 8);
  45 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  46 + rows/threadsPerBlock.y + 1);
43 47  
44   - dim3 numBlocks(imageWidth/threadsPerBlock.x + 1,
45   - imageHeight/threadsPerBlock.y + 1);
46   -
47   - //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);
48   - //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
49   - //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
50   -
51   - cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut);
  48 + cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t));
  49 + cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
52 50 }
53 51  
54   - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) {
55   - cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t));
56   - cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
  52 + void cudalbp_init_wrapper(uint8_t* cpuLut) {
  53 + cudaMalloc(&lut, 256*sizeof(uint8_t));
  54 + cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
57 55 }
58 56 }}
... ...
openbr/plugins/cuda/cudalbp.hpp deleted
1   -#include <opencv2/gpu/gpu.hpp>
2   -
3   -using namespace cv;
4   -using namespace cv::gpu;
5   -
6   -namespace br { namespace cuda {
7   - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr);
8   - void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step);
9   -}}