processing can now be done totally on the graphics card

DepthDeluxe
1 parent 5e16788d
Showing 5 changed files with 48 additions and 27 deletions
openbr/plugins/cuda/copyfrom.cpp
openbr/plugins/cuda/copyfrom.cu
openbr/plugins/cuda/cudacvtfloat.cpp
openbr/plugins/cuda/cudacvtfloat.cu
openbr/plugins/cuda/cudacvtfloat.hpp
@@ -11,7 +11,7 @@ using namespace cv;
 // Forward declaration of the host-side wrapper that is defined in the CUDA
 // source: the definition copies rows*cols floats from the device buffer `src`
 // into `out` and then frees `src`.
 namespace br { namespace cuda { namespace cudacopyfrom {
   //template <typename T> void wrapper(void* src, T* out, int rows, int cols) {
-  void wrapper(void* src, unsigned char* out, const int rows, const int cols);
+  void wrapper(void* src, float* out, const int rows, const int cols);
 }}}
  
 namespace br
@@ -23,7 +23,7 @@ namespace br
 private:
     // Copies a float image back from the GPU: reads the pointer table stored in
     // src.m(), checks the recorded type, and assigns a freshly filled host Mat to dst.
     void project(const Template &src, Template &dst) const
     {
-      cout << "CUDACopyFrom Start" << endl;
+      cout << "CUDACopyFrom Start" << endl << endl << endl;
  
       // pull the data back out of the Mat (its payload is read as an array of void* entries)
       void* const* dataPtr = src.m().ptr<void*>();
@@ -32,16 +32,26 @@ private:
       int cols = *((int*)dataPtr[2]);
       int type = *((int*)dataPtr[3]);
  
+      if (type != CV_32FC1) {  // the wrapper below copies rows*cols*sizeof(float) bytes, so only CV_32FC1 is accepted
+        cout << "ERR: Invalid data type!" << endl;
+        return;  // NOTE(review): dst is left unassigned and the wrapper call (the only visible cudaFree of cudaMemPtr) is skipped
+      }
+
       cout << "cudaMemPtr: " << cudaMemPtr << endl;
       cout << "rows: " << rows << endl;
       cout << "cols: " << cols << endl;
       cout << "type: " << type << endl;
  
-      dst = Mat(rows, cols, type);
-
-      br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols);
+      Mat dstMat = Mat(rows, cols, type);  // host-side destination for the copied pixels
+      br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<float>(), rows, cols);  // device-to-host copy; the wrapper also frees cudaMemPtr
+      dst = dstMat;
  
       cout << "CUDACopyFrom End" << endl;
+
+      cout << "DST Data" << endl;  // debug dump of the resulting host Mat
+      cout << "rows: " << dstMat.rows << endl;
+      cout << "cols: " << dstMat.cols << endl;
+      cout << "type: " << dstMat.type() << endl;
     }
   };
  
 namespace br { namespace cuda { namespace cudacopyfrom {
   //template <typename T> void wrapper(void* src, T* out, int rows, int cols) {
   // Copies rows*cols floats from the device buffer `src` into the host buffer
   // `dst` (cudaMemcpyDeviceToHost), then releases `src` with cudaFree, so the
   // caller must not use `src` after this call.
   // NOTE(review): the return codes of cudaMemcpy and cudaFree are ignored.
-  void wrapper(void* src, unsigned char* out, const int rows, const int cols) {
-    cudaMemcpy(out, src, rows*cols*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+  void wrapper(void* src, float* dst, const int rows, const int cols) {
+    cudaMemcpy(dst, src, rows*cols*sizeof(float), cudaMemcpyDeviceToHost);  // blocking copy of the whole image
     cudaFree(src);
   }
 }}}
@@ -7,7 +7,10 @@ using namespace cv;
  
 #include <openbr/plugins/openbr_internal.h>
  
-#include "cudacvtfloat.hpp"
+
+namespace br { namespace cuda { namespace cudacvtfloat {  // forward declaration of the launcher, used instead of including cudacvtfloat.hpp
+  void wrapper(const unsigned char* src, void** dst, int rows, int cols);  // the definition cudaMallocs rows*cols floats into *dst and launches the conversion kernel on src
+}}}
  
 namespace br
 {
@@ -24,20 +27,32 @@ class CUDACvtFloatTransform : public UntrainableTransform
   public:
     // Converts an 8-bit single-channel image to float via the CUDA wrapper.
     // The payload of src.m() is read as an array of void* entries: entry 0 is the
     // image buffer handed to the kernel, entries 1-3 point at the rows, cols and
     // type ints. dst receives a Mat holding the same kind of pointer table.
     void project(const Template &src, Template &dst) const
     {
+      cout << "CUDACvtFloat Start" << endl;
+
+      void* const* srcDataPtr = src.m().ptr<void*>();
+      void* srcMemPtr = srcDataPtr[0];  // passed to the kernel below, so presumably device memory - TODO confirm against the producer
+      int rows = *((int*)srcDataPtr[1]);
+      int cols = *((int*)srcDataPtr[2]);
+      int type = *((int*)srcDataPtr[3]);
+
       // only 8-bit single-channel (CV_8UC1) input is accepted
       // TODO(colin): real exception handling
-      if (src.m().type() != CV_8UC1) {
+      if (type != CV_8UC1) {
         cout << "ERR: Invalid memory format" << endl;
         return;  // NOTE(review): dst is left unassigned on this path
       }
  
+      // build the destination mat (same shape and type as the source's pointer-table Mat)
+      Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
+      void** dstDataPtr = dstMat.ptr<void*>();
+      dstDataPtr[1] = srcDataPtr[1];  // rows/cols/type entries alias the very same ints as src
+      dstDataPtr[2] = srcDataPtr[2];
+      dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;  // NOTE(review): this write also changes the type value seen through src
  
-      int rows = src.m().rows;
-      int cols = src.m().cols;
-
-      dst = Mat(rows, cols, CV_32FC1);
+      br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols);  // wrapper stores the newly allocated output buffer in dstDataPtr[0]
+      dst = dstMat;  // NOTE(review): srcMemPtr is not released anywhere in this function - verify who frees it
  
-      br::cuda::cudacvtfloat::wrapper((const unsigned char*)src.m().ptr<unsigned char>(), dst.m().ptr<float>(), rows, cols);
+      cout << "CUDACvtFloat End" << endl;
     }
 };
  
@@ -14,13 +14,15 @@ namespace br { namespace cuda { namespace cudacvtfloat {
     dst[index] = (float)src[index];
   }
  
-  void wrapper(const unsigned char* src, float* dst, int rows, int cols) {
-    unsigned char* cudaSrc;
-    cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
-    cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
+  // Launches the unsigned-char-to-float conversion kernel without any host round trip.
+  //   src        - input pixels, handed to the kernel unchanged (must be readable from device code)
+  //   dst        - receives a buffer of rows*cols floats allocated here with cudaMalloc; not freed here
+  //   rows, cols - image dimensions, used for the allocation size and the launch grid
+  // NOTE(review): the results of cudaMalloc and of the kernel launch are not checked.
+  void wrapper(const unsigned char* src, void** dst, int rows, int cols) {
+    //unsigned char* cudaSrc;
+    //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
+    //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
  
-    float* cudaDst;
-    cudaMalloc(&cudaDst, rows*cols*sizeof(float));
+    //float* cudaDst;
+    //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
+
+    cudaMalloc(dst, rows*cols*sizeof(float));
  
     dim3 threadsPerBlock(8, 8);
     dim3 blocks(
@@ -28,10 +30,7 @@ namespace br { namespace cuda { namespace cudacvtfloat {
       rows / threadsPerBlock.y + 1
     );
  
-    kernel<<<threadsPerBlock, blocks>>>(cudaSrc, cudaDst, rows, cols);
-
-    // copy the data back to the destination
-    cudaMemcpy(dst, cudaDst, rows*cols*sizeof(float), cudaMemcpyDeviceToHost);
+    // execution configuration is <<<grid, block>>>: the block count goes first and the
+    // fixed 8x8 thread block second, otherwise the per-block size grows with the image
+    // and the launch is rejected once it exceeds the device's threads-per-block limit
+    kernel<<<blocks, threadsPerBlock>>>(src, (float*)(*dst), rows, cols);
   }
  
 }}}
-namespace br { namespace cuda { namespace cudacvtfloat {
-  void wrapper(const unsigned char* src, float* dst, int rows, int cols);
-}}}