added pipelined CUDALBP

DepthDeluxe
1 parent aaabb3cb
Showing 4 changed files with 77 additions and 66 deletions
openbr/plugins/cuda/copyfrom.cpp
openbr/plugins/cuda/cudalbp.cpp
openbr/plugins/cuda/cudalbp.cu
openbr/plugins/cuda/cudalbp.hpp
@@ -23,6 +23,8 @@ namespace br
 private:
     void project(const Template &src, Template &dst) const
     {
+      cout << "CUDACopyFrom Start" << endl;
+
       // pull the data back out of the Mat
       void* const* dataPtr = src.m().ptr<void*>();
       void* cudaMemPtr = dataPtr[0];
@@ -30,9 +32,16 @@ private:
       int cols = *((int*)dataPtr[2]);
       int type = *((int*)dataPtr[3]);
  
+      cout << "cudaMemPtr: " << cudaMemPtr << endl;
+      cout << "rows: " << rows << endl;
+      cout << "cols: " << cols << endl;
+      cout << "type: " << type << endl;
+
       dst = Mat(rows, cols, type);
  
       br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols);
+
+      cout << "CUDACopyFrom End" << endl;
     }
   };
  
@@ -15,8 +15,7 @@
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  
 #include <iostream>
-//#include <thread>
-//#include <mutex>
+using namespace std;
  
 #include <sys/types.h>
 #include <unistd.h>
@@ -32,7 +31,6 @@
  
 #include <openbr/plugins/openbr_internal.h>
  
-#include "cudalbp.hpp"
 #include "MatManager.hpp"
  
 using namespace cv;
@@ -60,19 +58,17 @@ string type2str(int type) {
   return r;
 }
  
-int ctr = 0;
-pthread_mutex_t* uploadMutex = NULL;
+namespace br { namespace cuda {  // forward declarations used in place of the removed "cudalbp.hpp" include
+  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);  // runs LBP on srcPtr; the output buffer address is written to *dstPtr
+  void cudalbp_init_wrapper(uint8_t* lut);  // copies the 256-entry host lookup table to the GPU
+}}
  
 namespace br
 {
-
 /*!
  * \ingroup transforms
- * \brief Convert the image into a feature vector using Local Binary Patterns
- * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.;
- *           "Face Description with Local Binary Patterns: Application to Face Recognition"
- *           Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006
- * \author Josh Klontz \cite jklontz
+ * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
+ * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
  */
 class CUDALBPTransform : public UntrainableTransform
 {
@@ -86,10 +82,8 @@ class CUDALBPTransform : public UntrainableTransform
  
   private:
     uchar lut[256];
-    uint8_t* lutGpuPtr;
     uchar null;
  
-
     cuda::MatManager* matManager;
  
   public:
@@ -146,31 +140,50 @@ class CUDALBPTransform : public UntrainableTransform
         matManager = new cuda::MatManager(10);
  
         // copy lut over to the GPU
-        br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr);
+        br::cuda::cudalbp_init_wrapper(lut);
  
         std::cout << "Initialized CUDALBP" << std::endl;
     }
  
     void project(const Template &src, Template &dst) const
     {  // pipelined LBP step: src.m() and dst carry a small table of pointers, not pixel data
-        Mat& m = (Mat&)src.m();
-        cuda::MatManager::matindex a;
-        cuda::MatManager::matindex b;
-        a = matManager->reserve(m);
-        matManager->upload(a, m);
+        //Mat& m = (Mat&)src.m();
+        //cuda::MatManager::matindex a;
+        //cuda::MatManager::matindex b;
+        //a = matManager->reserve(m);
+        //matManager->upload(a, m);
  
         // reserve the second mat and check the dimensions
-        b = matManager->reserve(m);
-        
-        uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
-        uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
-        br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
+        //b = matManager->reserve(m);
  
-        matManager->download(b, dst);
+        //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
+        //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
+        //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
+
+        //matManager->download(b, dst);
  
         // release both the mats
-        matManager->release(a);
-        matManager->release(b);
+        //matManager->release(a);
+        //matManager->release(b);
+
+        cout << "CUDALBP Start" << endl;
+
+        void* const* srcDataPtr = src.m().ptr<void*>();  // view the Mat payload as an array of pointers
+        void* cudaSrcPtr = srcDataPtr[0];  // slot 0: input buffer handed to the kernel (expected to be device memory)
+        int rows = *((int*)srcDataPtr[1]);  // slot 1: pointer to the row count
+        int cols = *((int*)srcDataPtr[2]);  // slot 2: pointer to the column count
+        int type = *((int*)srcDataPtr[3]);  // slot 3: pointer to the type code; NOTE(review): 'type' is never used below
+
+        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());  // output pointer table, same shape and type as the input table
+        void** dstDataPtr = dstMat.ptr<void*>();
+        dstDataPtr[1] = srcDataPtr[1];  // rows/cols/type pointers are shared with src, not copied
+        dstDataPtr[2] = srcDataPtr[2];  // NOTE(review): confirm the pointed-to ints outlive dst
+        dstDataPtr[3] = srcDataPtr[3];
+
+        br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);  // the wrapper stores the output buffer address in slot 0
+        dst = dstMat;
+
+        cout << "CUDALBP End" << endl;
     }
 };
  
@@ -4,55 +4,53 @@
 using namespace cv;
 using namespace cv::gpu;
  
-#include "cudalbp.hpp"
-
 namespace br { namespace cuda {
-  __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) {
-    return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col];
+  uint8_t* lut;  // address of the 256-byte lookup table allocated by cudalbp_init_wrapper; passed by value to the kernel
+
+  __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {  // pixel at (row, col) of a tightly packed image (row stride == cols), or 0 outside it
+    return (row < 0 || col < 0 || row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col];  // callers pass row-1 / col-1, so negative coordinates must be rejected too
   }
  
-  __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut)
+  __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)  // one thread per output pixel; expects a 2D launch covering cols (x) by rows (y)
   {
     int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
     int radius = 1;
  
     // don't do anything if the index is out of bounds
-    if (rowInd >= rows || colInd >= cols)
+    if (rowInd >= rows || colInd >= cols) {
       return;
-
-    const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
-    uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1   : 0)];
+    }
+
+    const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
+    uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |  // bit 7: (row-1, col-1); NOTE(review): row-1/col-1 go negative on the first row/column - confirm the helper rejects negative coordinates
+                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |  // bit 6: (row-1, col)
+                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |  // bit 5: (row-1, col+1)
+                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |  // bit 4: (row, col+1)
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |  // bit 3: (row+1, col+1)
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |  // bit 2: (row+1, col)
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |  // bit 1: (row+1, col-1)
+                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];  // bit 0: (row, col-1); the 8-bit pattern indexes the lookup table
  
     // store calculated value away in the right place
-    uint8_t* dstRowPtr = dstPtr + rowInd*dstStep;
-    dstRowPtr[colInd] = val;
+    int index = rowInd*cols + colInd;  // output is tightly packed: row stride equals cols
+    dstPtr[index] = val;
   }
  
-  void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
+  //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
+  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)  // allocates a rows*cols byte output buffer, stores its address in *dstPtr (null on allocation failure) and launches cudalbp_kernel on srcPtr
   {
     // make 8 * 8 = 64 square block
     dim3 threadsPerBlock(8, 8);
+    dim3 numBlocks(cols/threadsPerBlock.x + 1,  // always at least one block per axis; surplus threads exit via the kernel's bounds check
+                   rows/threadsPerBlock.y + 1);
  
-    dim3 numBlocks(imageWidth/threadsPerBlock.x + 1,
-                   imageHeight/threadsPerBlock.y + 1);
-
-    //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);
-    //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
-    //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
-
-    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut);
+    if (cudaMalloc(dstPtr, (size_t)rows*cols*sizeof(uint8_t)) != cudaSuccess) { *dstPtr = 0; return; }  // never launch on a failed allocation; size computed in size_t to avoid int overflow. The buffer is not freed here.
+    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
   }
  
-  void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) {
-    cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t));
-    cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
+  void cudalbp_init_wrapper(uint8_t* cpuLut) {  // uploads the 256-entry host lookup table and records the device address in the file-level 'lut'
+    cudaMalloc(&lut, 256*sizeof(uint8_t));  // NOTE(review): return code is ignored, and every call allocates a new buffer without freeing the previous one
+    cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);  // NOTE(review): return code is ignored
   }
 }}
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-namespace br { namespace cuda {
-  void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr);
-  void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step);
-}}