Commit 5e16788d451105dcdc4fef2778996043ec3a40ad

Authored by DepthDeluxe
1 parent aaabb3cb

added pipelined CUDALBP

openbr/plugins/cuda/copyfrom.cpp
@@ -23,6 +23,8 @@ namespace br @@ -23,6 +23,8 @@ namespace br
23 private: 23 private:
24 void project(const Template &src, Template &dst) const 24 void project(const Template &src, Template &dst) const
25 { 25 {
  26 + cout << "CUDACopyFrom Start" << endl;
  27 +
26 // pull the data back out of the Mat 28 // pull the data back out of the Mat
27 void* const* dataPtr = src.m().ptr<void*>(); 29 void* const* dataPtr = src.m().ptr<void*>();
28 void* cudaMemPtr = dataPtr[0]; 30 void* cudaMemPtr = dataPtr[0];
@@ -30,9 +32,16 @@ private: @@ -30,9 +32,16 @@ private:
30 int cols = *((int*)dataPtr[2]); 32 int cols = *((int*)dataPtr[2]);
31 int type = *((int*)dataPtr[3]); 33 int type = *((int*)dataPtr[3]);
32 34
  35 + cout << "cudaMemPtr: " << cudaMemPtr << endl;
  36 + cout << "rows: " << rows << endl;
  37 + cout << "cols: " << cols << endl;
  38 + cout << "type: " << type << endl;
  39 +
33 dst = Mat(rows, cols, type); 40 dst = Mat(rows, cols, type);
34 41
35 br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols); 42 br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols);
  43 +
  44 + cout << "CUDACopyFrom End" << endl;
36 } 45 }
37 }; 46 };
38 47
openbr/plugins/cuda/cudalbp.cpp
@@ -15,8 +15,7 @@ @@ -15,8 +15,7 @@
15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16 16
17 #include <iostream> 17 #include <iostream>
18 -//#include <thread>  
19 -//#include <mutex> 18 +using namespace std;
20 19
21 #include <sys/types.h> 20 #include <sys/types.h>
22 #include <unistd.h> 21 #include <unistd.h>
@@ -32,7 +31,6 @@ @@ -32,7 +31,6 @@
32 31
33 #include <openbr/plugins/openbr_internal.h> 32 #include <openbr/plugins/openbr_internal.h>
34 33
35 -#include "cudalbp.hpp"  
36 #include "MatManager.hpp" 34 #include "MatManager.hpp"
37 35
38 using namespace cv; 36 using namespace cv;
@@ -60,19 +58,17 @@ string type2str(int type) { @@ -60,19 +58,17 @@ string type2str(int type) {
60 return r; 58 return r;
61 } 59 }
62 60
63 -int ctr = 0;  
64 -pthread_mutex_t* uploadMutex = NULL; 61 +namespace br { namespace cuda {
  62 + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
  63 + void cudalbp_init_wrapper(uint8_t* lut);
  64 +}}
65 65
66 namespace br 66 namespace br
67 { 67 {
68 -  
69 /*! 68 /*!
70 * \ingroup transforms 69 * \ingroup transforms
71 - * \brief Convert the image into a feature vector using Local Binary Patterns  
72 - * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.;  
73 - * "Face Description with Local Binary Patterns: Application to Face Recognition"  
74 - * Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006  
75 - * \author Josh Klontz \cite jklontz 70 + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
  71 + * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
76 */ 72 */
77 class CUDALBPTransform : public UntrainableTransform 73 class CUDALBPTransform : public UntrainableTransform
78 { 74 {
@@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform @@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform
86 82
87 private: 83 private:
88 uchar lut[256]; 84 uchar lut[256];
89 - uint8_t* lutGpuPtr;  
90 uchar null; 85 uchar null;
91 86
92 -  
93 cuda::MatManager* matManager; 87 cuda::MatManager* matManager;
94 88
95 public: 89 public:
@@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform @@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform
146 matManager = new cuda::MatManager(10); 140 matManager = new cuda::MatManager(10);
147 141
148 // copy lut over to the GPU 142 // copy lut over to the GPU
149 - br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr); 143 + br::cuda::cudalbp_init_wrapper(lut);
150 144
151 std::cout << "Initialized CUDALBP" << std::endl; 145 std::cout << "Initialized CUDALBP" << std::endl;
152 } 146 }
153 147
154 void project(const Template &src, Template &dst) const 148 void project(const Template &src, Template &dst) const
155 { 149 {
156 - Mat& m = (Mat&)src.m();  
157 - cuda::MatManager::matindex a;  
158 - cuda::MatManager::matindex b;  
159 - a = matManager->reserve(m);  
160 - matManager->upload(a, m); 150 + //Mat& m = (Mat&)src.m();
  151 + //cuda::MatManager::matindex a;
  152 + //cuda::MatManager::matindex b;
  153 + //a = matManager->reserve(m);
  154 + //matManager->upload(a, m);
161 155
162 // reserve the second mat and check the dimensiosn 156 // reserve the second mat and check the dimensiosn
163 - b = matManager->reserve(m);  
164 -  
165 - uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);  
166 - uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);  
167 - br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); 157 + //b = matManager->reserve(m);
168 158
169 - matManager->download(b, dst); 159 + //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
  160 + //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
  161 + //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
  162 +
  163 + //matManager->download(b, dst);
170 164
171 // release both the mats 165 // release both the mats
172 - matManager->release(a);  
173 - matManager->release(b); 166 + //matManager->release(a);
  167 + //matManager->release(b);
  168 +
  169 + cout << "CUDALBP Start" << endl;
  170 +
  171 + void* const* srcDataPtr = src.m().ptr<void*>();
  172 + void* cudaSrcPtr = srcDataPtr[0];
  173 + int rows = *((int*)srcDataPtr[1]);
  174 + int cols = *((int*)srcDataPtr[2]);
  175 + int type = *((int*)srcDataPtr[3]);
  176 +
  177 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  178 + void** dstDataPtr = dstMat.ptr<void*>();
  179 + dstDataPtr[1] = srcDataPtr[1];
  180 + dstDataPtr[2] = srcDataPtr[2];
  181 + dstDataPtr[3] = srcDataPtr[3];
  182 +
  183 + br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);
  184 + dst = dstMat;
  185 +
  186 + cout << "CUDALBP End" << endl;
174 } 187 }
175 }; 188 };
176 189
openbr/plugins/cuda/cudalbp.cu
@@ -4,55 +4,53 @@ @@ -4,55 +4,53 @@
4 using namespace cv; 4 using namespace cv;
5 using namespace cv::gpu; 5 using namespace cv::gpu;
6 6
7 -#include "cudalbp.hpp"  
8 -  
9 namespace br { namespace cuda { 7 namespace br { namespace cuda {
10 - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) {  
11 - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col]; 8 + uint8_t* lut;
  9 +
  10 + __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  11 + return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col];
12 } 12 }
13 13
14 - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut) 14 + __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
15 { 15 {
16 int rowInd = blockIdx.y*blockDim.y+threadIdx.y; 16 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
17 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 17 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
18 int radius = 1; 18 int radius = 1;
19 19
20 // don't do anything if the index is out of bounds 20 // don't do anything if the index is out of bounds
21 - if (rowInd >= rows || colInd >= cols) 21 + if (rowInd >= rows || colInd >= cols) {
22 return; 22 return;
23 -  
24 - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value  
25 - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) |  
26 - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64 : 0) |  
27 - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32 : 0) |  
28 - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16 : 0) |  
29 - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8 : 0) |  
30 - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4 : 0) |  
31 - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2 : 0) |  
32 - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1 : 0)]; 23 + }
  24 +
  25 + const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
  26 + uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
  27 + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
  28 + (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
  29 + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
  30 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
  31 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
  32 + (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
  33 + (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];
33 34
34 // store calculated value away in the right place 35 // store calculated value away in the right place
35 - uint8_t* dstRowPtr = dstPtr + rowInd*dstStep;  
36 - dstRowPtr[colInd] = val; 36 + int index = rowInd*cols + colInd;
  37 + dstPtr[index] = val;
37 } 38 }
38 39
39 - void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) 40 + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
  41 + void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
40 { 42 {
41 // make 8 * 8 = 64 square block 43 // make 8 * 8 = 64 square block
42 dim3 threadsPerBlock(8, 8); 44 dim3 threadsPerBlock(8, 8);
  45 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  46 + rows/threadsPerBlock.y + 1);
43 47
44 - dim3 numBlocks(imageWidth/threadsPerBlock.x + 1,  
45 - imageHeight/threadsPerBlock.y + 1);  
46 -  
47 - //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);  
48 - //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);  
49 - //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);  
50 -  
51 - cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut); 48 + cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t));
  49 + cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
52 } 50 }
53 51
54 - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) {  
55 - cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t));  
56 - cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); 52 + void cudalbp_init_wrapper(uint8_t* cpuLut) {
  53 + cudaMalloc(&lut, 256*sizeof(uint8_t));
  54 + cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
57 } 55 }
58 }} 56 }}
openbr/plugins/cuda/cudalbp.hpp deleted
1 -#include <opencv2/gpu/gpu.hpp>  
2 -  
3 -using namespace cv;  
4 -using namespace cv::gpu;  
5 -  
6 -namespace br { namespace cuda {  
7 - void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr);  
8 - void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step);  
9 -}}