fixed the threading problem by adding locks

DepthDeluxe
1 parent cc916f3d
Showing 3 changed files with 103 additions and 37 deletions
openbr/plugins/cuda/cudalbp.cpp
openbr/plugins/cuda/cudalbp.cu
openbr/plugins/cuda/passthrough.cpp
@@ -14,6 +14,16 @@
  * limitations under the License.                                            *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+#include <iostream>
+//#include <thread>
+//#include <mutex>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <pthread.h>
+
+#include <opencv2/opencv.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
 #include <opencv2/imgproc/imgproc_c.h>
 #include <opencv2/highgui/highgui.hpp>
@@ -26,6 +36,32 @@
 using namespace cv;
+/*
+ * Builds a short human-readable name for an OpenCV matrix type code,
+ * in the form "<depth>C<channels>" (for example "8UC1").
+ *
+ * type: the integer type code; the bits selected by CV_MAT_DEPTH_MASK pick the
+ *       depth name and the bits above CV_CN_SHIFT hold (channel count - 1).
+ * Returns the formatted name; unknown depths are reported as "User".
+ */
+string type2str(int type) {
+  string r;
+
+  const int depth = type & CV_MAT_DEPTH_MASK;
+  // Keep the channel count in an int: a uchar would wrap for counts above 255.
+  int chans = 1 + (type >> CV_CN_SHIFT);
+
+  switch ( depth ) {
+    case CV_8U:  r = "8U"; break;
+    case CV_8S:  r = "8S"; break;
+    case CV_16U: r = "16U"; break;
+    case CV_16S: r = "16S"; break;
+    case CV_32S: r = "32S"; break;
+    case CV_32F: r = "32F"; break;
+    case CV_64F: r = "64F"; break;
+    default:     r = "User"; break;
+  }
+
+  r += "C";
+
+  // Emit every decimal digit of the channel count. Adding the count to '0'
+  // only yields a valid digit for 1-9 channels and prints punctuation beyond.
+  string digits;
+  do {
+    digits.insert(digits.begin(), (char)('0' + chans % 10));
+    chans /= 10;
+  } while (chans > 0);
+  r += digits;
+
+  return r;
+}
+
+int ctr = 0;
+pthread_mutex_t* uploadMutex = NULL;
+
 namespace br
 {
@@ -47,10 +83,15 @@ class CUDALBPTransform : public UntrainableTransform
     BR_PROPERTY(int, maxTransitions, 8)
     BR_PROPERTY(bool, rotationInvariant, false)
+  private:
     uchar lut[256];
     uint8_t* lutGpuPtr;
     uchar null;
+    //std::mutex uploadMutex;
+    pthread_mutex_t* uploadMutex;
+
+  public:
     /* Returns the number of 0->1 or 1->0 transitions in i */
     static int numTransitions(int i)
     {
@@ -100,24 +141,66 @@ class CUDALBPTransform : public UntrainableTransform
             if (!set[i])
                 lut[i] = null; // Set to null id
+
         // copy lut over to the GPU
         br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr);
+
+        // initialize the mutex
+        std::cout << "STARING EVERYTHING" << std::endl<< std::flush;
+        if (uploadMutex == NULL) {
+          uploadMutex = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t));
+          pthread_mutex_init(uploadMutex, NULL);
+        }
     }
     void project(const Template &src, Template &dst) const
     {
-      // assume we are using an 8-bit 1 channel image
-      GpuMat srcGpuMat, dstGpuMat;
-
-      // copy the data to the GPU
-      srcGpuMat.upload(src.m());
-      dstGpuMat.upload(src.m());
-
-      // call the kernel function
-      br::cuda::cudalbp_wrapper(srcGpuMat, dstGpuMat, lutGpuPtr);
-
-      // download the result
-      dstGpuMat.download(dst.m());
+        int myCtr = ctr++;
+        GpuMat a, b;
+        const Mat& m = src.m();
+
+        std::cout << "PID: " << getpid() << std::endl << std::flush;
+
+        //std::cout << "START: " << myCtr << std::endl << std::flush;
+
+
+        //std::cout << "Image type: " << type2str(m.type()) << std::endl << std::flush;
+        pthread_mutex_lock(uploadMutex);
+        a.create(m.size(), m.type());
+        b.create(m.size(), m.type());
+        pthread_mutex_unlock(uploadMutex);
+
+        pthread_mutex_lock(uploadMutex);
+        a.upload(m);
+        b.upload(m);
+        pthread_mutex_unlock(uploadMutex);
+
+        // resize the mats
+        //if (m.size() != srcGpuMat->size()) {
+        //  printf("resizing...\n");
+        //  srcGpuMat->release();                    dstGpuMat->release();
+        //  srcGpuMat->create(m.size(), CV_8UC1);    dstGpuMat->create(m.size(), CV_8UC1);
+        //}
+
+        // copy the data to the GPU
+        //srcGpuMat->upload(m);
+
+        // call the kernel function
+        //br::cuda::cudalbp_wrapper(*srcGpuMat, *dstGpuMat, lutGpuPtr);
+        pthread_mutex_lock(uploadMutex);
+        br::cuda::cudalbp_wrapper(a, b, lutGpuPtr);
+        pthread_mutex_unlock(uploadMutex);
+
+        // download the result to the destination
+        //dstGpuMat->download(dst.m());
+        pthread_mutex_lock(uploadMutex);
+        b.download(dst.m());
+        pthread_mutex_unlock(uploadMutex);
+
+        pthread_mutex_lock(uploadMutex);
+        a.release();
+        b.release();
+        pthread_mutex_unlock(uploadMutex);
     }
 };
@@ -51,9 +51,9 @@ namespace br { namespace cuda {
     dim3 numBlocks(imageWidth/threadsPerBlock.x + 1,
                    imageHeight/threadsPerBlock.y + 1);
-    printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);
-    printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
-    printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
+    //printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);
+    //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
+    //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
     cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageHeight, imageWidth, lut);
   }
@@ -10,28 +10,6 @@ using namespace cv::gpu;
 #include <iostream>
-string type2str(int type) {
-  string r;
-
-  uchar depth = type & CV_MAT_DEPTH_MASK;
-  uchar chans = 1 + (type >> CV_CN_SHIFT);
-
-  switch ( depth ) {
-    case CV_8U:  r = "8U"; break;
-    case CV_8S:  r = "8S"; break;
-    case CV_16U: r = "16U"; break;
-    case CV_16S: r = "16S"; break;
-    case CV_32S: r = "32S"; break;
-    case CV_32F: r = "32F"; break;
-    case CV_64F: r = "64F"; break;
-    default:     r = "User"; break;
-  }
-
-  r += "C";
-  r += (chans+'0');
-
-  return r;
-}
 namespace br
 {
@@ -54,6 +32,11 @@ private:
       dstGpuMat.download(dst.m());
       // TODO(colin): add delete code
+      srcGpuMat.release();
+      dstGpuMat.release();
+
+      printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty());
+      printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty());
     }
   };