From 4e59b2b58f7bd025265838a29835fbbfb5225c51 Mon Sep 17 00:00:00 2001
From: DepthDeluxe <cheinzmann3@gmail.com>
Date: Mon, 15 Feb 2016 18:37:28 -0500
Subject: [PATCH] full carryover support with proper memory management * wrapped all CUDA memory functions in safety calls * fixed CUDALBP invalid memory access which caused performance and stability issues * carryover support for all plugins, only copy over memory once throughout the whole computation

---
 openbr/plugins/cuda/copyfrom.cpp     |  5 ++---
 openbr/plugins/cuda/copyfrom.cu      |  7 +++++--
 openbr/plugins/cuda/copyto.cpp       | 23 +++++++++++------------
 openbr/plugins/cuda/copyto.cu        |  7 +++++--
 openbr/plugins/cuda/cudacvtfloat.cpp |  8 +++-----
 openbr/plugins/cuda/cudacvtfloat.cu  | 16 +++++++++++-----
 openbr/plugins/cuda/cudadefines.hpp  | 33 +++++++++++++++++++++++++++++++++
 openbr/plugins/cuda/cudalbp.cpp      |  7 +++----
 openbr/plugins/cuda/cudalbp.cu       | 31 +++++++++++++++++++++++--------
 openbr/plugins/cuda/cudapca.cpp      | 23 +++++++++++++----------
 openbr/plugins/cuda/cudapca.cu       | 30 +++++++++++++++++++-----------
 openbr/plugins/distance/dist.cpp     | 12 ++++++++++++
 12 files changed, 140 insertions(+), 62 deletions(-)
 create mode 100644 openbr/plugins/cuda/cudadefines.hpp
diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp
index 846b7ba..8b0a88b 100644
--- a/openbr/plugins/cuda/copyfrom.cpp
+++ b/openbr/plugins/cuda/copyfrom.cpp
@@ -24,7 +24,6 @@ private:
     {
       // pull the data back out of the Mat
       void* const* dataPtr = src.m().ptr<void*>();
-      void* cudaMemPtr = dataPtr[0];
       int rows = *((int*)dataPtr[1]);
       int cols = *((int*)dataPtr[2]);
       int type = *((int*)dataPtr[3]);
@@ -32,10 +31,10 @@ private:
       Mat dstMat = Mat(rows, cols, type);
       switch(type) {
       case CV_32FC1:
-        br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<float>(), rows, cols);
+        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<unsigned char>(), rows, cols);
+        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
         break;
       default:
         cout << "ERR: Invalid image format" << endl;
diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu
index 2a85f93..ddc77f8 100644
--- a/openbr/plugins/cuda/copyfrom.cu
+++ b/openbr/plugins/cuda/copyfrom.cu
@@ -1,7 +1,10 @@
+#include "cudadefines.hpp"
+
 namespace br { namespace cuda { namespace cudacopyfrom {
   template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
-    cudaMemcpy(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost);
-    cudaFree(src);
+    cudaError_t err;
+    CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
+    CUDA_SAFE_FREE(src, &err);
   }
 
   template void wrapper(void*, float*, int, int);
diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp
index 7288553..902c335 100644
--- a/openbr/plugins/cuda/copyto.cpp
+++ b/openbr/plugins/cuda/copyto.cpp
@@ -27,29 +27,28 @@ private:
       const int rows = srcMat.rows;
       const int cols = srcMat.cols;
 
+      // output will be a single pointer to graphics card memory
+      Mat dstMat = Mat(4, 1, DataType<void*>::type);
+      void** dstMatData = dstMat.ptr<void*>();
+
-      void* cudaMemPtr;
+      // slot 0 is filled by the copy wrapper; metadata is allocated only after a successful copy
       switch(srcMat.type()) {
       case CV_32FC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &cudaMemPtr, rows, cols);
+        br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &cudaMemPtr, rows, cols);
+        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
         break;
       default:
         cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl;
         return;
       }
 
-      // output will be a single pointer to graphics card memory
-      Mat dstMat = Mat(4, 1, DataType<void*>::type);
-      void** dstMatData = dstMat.ptr<void*>();
-
       // save cuda ptr, rows, cols, then type
-      dstMatData[0] = cudaMemPtr;
       dstMatData[1] = new int; *((int*)dstMatData[1]) = rows;
       dstMatData[2] = new int; *((int*)dstMatData[2]) = cols;
       dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type();
 
       dst = dstMat;
     }
   };
diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu
index feecb83..0742ff7 100644
--- a/openbr/plugins/cuda/copyto.cu
+++ b/openbr/plugins/cuda/copyto.cu
@@ -1,7 +1,10 @@
+#include "cudadefines.hpp"
+
 namespace br { namespace cuda { namespace cudacopyto {
   template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
-    cudaMalloc(out, rows*cols*sizeof(T));
-    cudaMemcpy(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice);
+    cudaError_t err;
+    CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
+    CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err);
   }
 
   template void wrapper(const float* in, void** out, const int rows, const int cols);
diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp
index fada942..d774a05 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cpp
+++ b/openbr/plugins/cuda/cudacvtfloat.cpp
@@ -1,15 +1,14 @@
 #include <iostream>
-#include <unistd.h>
 using namespace std;
+#include <unistd.h>
 
 #include <opencv2/opencv.hpp>
 using namespace cv;
 
 #include <openbr/plugins/openbr_internal.h>
 
-
 namespace br { namespace cuda { namespace cudacvtfloat {
-  void wrapper(const unsigned char* src, void** dst, int rows, int cols);
+  void wrapper(void* src, void** dst, int rows, int cols);
 }}}
 
 namespace br
@@ -28,7 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
     void project(const Template &src, Template &dst) const
     {
       void* const* srcDataPtr = src.m().ptr<void*>();
-      void* srcMemPtr = srcDataPtr[0];
       int rows = *((int*)srcDataPtr[1]);
       int cols = *((int*)srcDataPtr[2]);
       int type = *((int*)srcDataPtr[3]);
@@ -47,7 +45,7 @@ class CUDACvtFloatTransform : public UntrainableTransform
       dstDataPtr[2] = srcDataPtr[2];
       dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;
 
-      br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols);
+      br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
       dst = dstMat;
     }
 };
diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu
index 93651e6..70bf2e4 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cu
+++ b/openbr/plugins/cuda/cudacvtfloat.cu
@@ -1,3 +1,8 @@
+#include <iostream>
+using namespace std;
+
+#include "cudadefines.hpp"
+
 namespace br { namespace cuda { namespace cudacvtfloat {
 
   __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
@@ -14,15 +19,15 @@ namespace br { namespace cuda { namespace cudacvtfloat {
     dst[index] = (float)src[index];
   }
 
-  void wrapper(const unsigned char* src, void** dst, int rows, int cols) {
+  void wrapper(void* src, void** dst, int rows, int cols) {
     //unsigned char* cudaSrc;
     //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
     //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
 
     //float* cudaDst;
     //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
-
-    cudaMalloc(dst, rows*cols*sizeof(float));
+    cudaError_t err;
+    CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
 
     dim3 threadsPerBlock(8, 8);
     dim3 blocks(
@@ -30,10 +35,11 @@ namespace br { namespace cuda { namespace cudacvtfloat {
       rows / threadsPerBlock.y + 1
     );
 
-    kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols);
+    kernel<<<blocks, threadsPerBlock>>>((const unsigned char*)src, (float*)(*dst), rows, cols);  // <<<grid, block>>>: grid dimensions come first
+    CUDA_KERNEL_ERR_CHK(&err);
 
     // free the src memory since it is now in a newly allocated dst
-    cudaFree((void*)src);
+    CUDA_SAFE_FREE(src, &err);
   }
 
 }}}
diff --git a/openbr/plugins/cuda/cudadefines.hpp b/openbr/plugins/cuda/cudadefines.hpp
new file mode 100644
index 0000000..354be2b
--- /dev/null
+++ b/openbr/plugins/cuda/cudadefines.hpp
@@ -0,0 +1,33 @@
+#pragma once
+#include <iostream>
+using namespace std;
+#include <pthread.h>
+
+// Checked wrappers for CUDA runtime calls. Each macro stores the cudaError_t
+// through errPtr; on failure it prints the thread id, the error code and
+// cudaGetErrorString() to cout and throws the int 0. do { } while (0) keeps
+// every expansion a single statement, safe inside an unbraced if/else.
+#define CUDA_CHECKED_CALL(label, call, errPtr) \
+  do { \
+    *(errPtr) = (call); \
+    if (*(errPtr) != cudaSuccess) { \
+      cout << pthread_self() << ": " label "(" << *(errPtr) << "): " << cudaGetErrorString(*(errPtr)) << endl; \
+      throw 0; \
+    } \
+  } while (0)
+
+// Releases the device allocation at cudaPtr.
+#define CUDA_SAFE_FREE(cudaPtr, errPtr) \
+  CUDA_CHECKED_CALL("CUDA Free Error", cudaFree(cudaPtr), errPtr)
+
+// Allocates size bytes of device memory and stores the address in *cudaPtrPtr.
+#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \
+  CUDA_CHECKED_CALL("CUDA Malloc Error", cudaMalloc(cudaPtrPtr, size), errPtr)
+
+// Copies count bytes from srcPtr to dstPtr in the direction given by kind.
+#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \
+  CUDA_CHECKED_CALL("CUDA Memcpy Error", cudaMemcpy(dstPtr, srcPtr, count, kind), errPtr)
+
+// Call right after a kernel launch; reports the pending error without clearing it.
+#define CUDA_KERNEL_ERR_CHK(errPtr) \
+  CUDA_CHECKED_CALL("Kernel Call Err", cudaPeekAtLastError(), errPtr)
diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp
index 4cebf7c..2385c9b 100644
--- a/openbr/plugins/cuda/cudalbp.cpp
+++ b/openbr/plugins/cuda/cudalbp.cpp
@@ -84,7 +84,7 @@ class CUDALBPTransform : public UntrainableTransform
     uchar lut[256];
     uchar null;
 
-    cuda::MatManager* matManager;
+    //cuda::MatManager* matManager;
 
   public:
     /* Returns the number of 0->1 or 1->0 transitions in i */
@@ -137,7 +137,7 @@ class CUDALBPTransform : public UntrainableTransform
                 lut[i] = null; // Set to null id
 
         // init the mat manager for managing 10 mats
-        matManager = new cuda::MatManager(10);
+        //matManager = new cuda::MatManager(10);
 
         // copy lut over to the GPU
         br::cuda::cudalbp_init_wrapper(lut);
@@ -167,7 +167,6 @@ class CUDALBPTransform : public UntrainableTransform
         //matManager->release(b);
 
         void* const* srcDataPtr = src.m().ptr<void*>();
-        void* cudaSrcPtr = srcDataPtr[0];
         int rows = *((int*)srcDataPtr[1]);
         int cols = *((int*)srcDataPtr[2]);
         int type = *((int*)srcDataPtr[3]);
@@ -178,7 +177,7 @@ class CUDALBPTransform : public UntrainableTransform
         dstDataPtr[2] = srcDataPtr[2];
         dstDataPtr[3] = srcDataPtr[3];
 
-        br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols);
+        br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
         dst = dstMat;
     }
 };
diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu
index af4464c..bb5be9a 100644
--- a/openbr/plugins/cuda/cudalbp.cu
+++ b/openbr/plugins/cuda/cudalbp.cu
@@ -1,6 +1,11 @@
+#include <iostream>
+using namespace std;
+
 #include <opencv2/gpu/gpu.hpp>
 #include <stdio.h>
 
+#include "cudadefines.hpp"
+
 using namespace cv;
 using namespace cv::gpu;
 
@@ -8,7 +13,7 @@ namespace br { namespace cuda {
   uint8_t* lut;
 
   __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
-    return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col];
+    return (srcPtr + row*cols)[col];
   }
 
   __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
@@ -17,9 +22,16 @@ namespace br { namespace cuda {
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
     int radius = 1;
 
+    int index = rowInd*cols + colInd;
+
     // don't do anything if the index is out of bounds
-    if (rowInd >= rows || colInd >= cols) {
-      return;
+    if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
+      if (rowInd >= rows || colInd >= cols) {
+        return;
+      } else {
+        dstPtr[index] = 0;
+        return;
+      }
     }
 
     const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
@@ -33,26 +45,29 @@ namespace br { namespace cuda {
                       (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
 
     // store calculated value away in the right place
-    int index = rowInd*cols + colInd;
     dstPtr[index] = val;
   }
 
   //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
   void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
   {
+    cudaError_t err;
+
     // make 8 * 8 = 64 square block
     dim3 threadsPerBlock(8, 8);
     dim3 numBlocks(cols/threadsPerBlock.x + 1,
                    rows/threadsPerBlock.y + 1);
 
-    cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t));
+    CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
     cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
+    CUDA_KERNEL_ERR_CHK(&err);
 
-    cudaFree(srcPtr);
+    CUDA_SAFE_FREE(srcPtr, &err);
   }
 
   void cudalbp_init_wrapper(uint8_t* cpuLut) {
-    cudaMalloc(&lut, 256*sizeof(uint8_t));
-    cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
+    cudaError_t err;
+    CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
+    CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
   }
 }}
diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp
index 1ede027..7f56070 100644
--- a/openbr/plugins/cuda/cudapca.cpp
+++ b/openbr/plugins/cuda/cudapca.cpp
@@ -15,6 +15,7 @@
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 #include <iostream>
 using namespace std;
+#include <unistd.h>
 
 #include <QList>
 
@@ -30,7 +31,7 @@ using namespace cv;
 
 namespace br { namespace cuda {
   void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
-  void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols);
+  void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols);
   void cudapca_projectwrapper(void* src, void** dst);
 }}
 
@@ -82,12 +83,13 @@ private:
 
     void train(const TemplateList &cudaTrainingSet)
     {
+      // copy the data back from the graphics card so the training can be done on the CPU
         const int instances = cudaTrainingSet.size();       // get the number of training set instances
         QList<Template> trainingQlist;
         for(int i=0; i<instances; i++) {
           Template currentTemplate = cudaTrainingSet[i];
           void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
-          const void* cudaMemPtr = srcDataPtr[0];
+          void* cudaMemPtr = srcDataPtr[0];
           int rows = *((int*)srcDataPtr[1]);
           int cols = *((int*)srcDataPtr[2]);
           int type = *((int*)srcDataPtr[3]);
@@ -95,29 +97,30 @@ private:
           Mat mat = Mat(rows, cols, type);
           br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
           trainingQlist.append(Template(mat));
-        TemplateList trainingSet;
         }
+
+        // assemble a TemplateList from the list of data
         TemplateList trainingSet(trainingQlist);
 
-        if (trainingSet.first().m().type() != CV_32FC1)
-            qFatal("Requires single channel 32-bit floating point matrices.");
+        if (trainingSet.first().m().type() != CV_32FC1) {
+          qFatal("Requires single channel 32-bit floating point matrices.");
+        }
 
         originalRows = trainingSet.first().m().rows;    // get number of rows of first image
         int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
 
         // Map into 64-bit Eigen matrix
         Eigen::MatrixXd data(dimsIn, instances);        // create a mat
-        for (int i=0; i<instances; i++)
-            data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
+        for (int i=0; i<instances; i++) {
+          data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
+        }
 
         trainCore(data);
     }
 
     void project(const Template &src, Template &dst) const
     {
-
       void* const* srcDataPtr = src.m().ptr<void*>();
-      void* cudaMemPtr = srcDataPtr[0];
       int rows = *((int*)srcDataPtr[1]);
       int cols = *((int*)srcDataPtr[2]);
       int type = *((int*)srcDataPtr[3]);
@@ -133,7 +136,7 @@ private:
       dstDataPtr[2] = srcDataPtr[2];  *((int*)dstDataPtr[2]) = keep;
       dstDataPtr[3] = srcDataPtr[3];
 
-      br::cuda::cudapca_projectwrapper(cudaMemPtr, &dstDataPtr[0]);
+      br::cuda::cudapca_projectwrapper(srcDataPtr[0], &dstDataPtr[0]);
 
       dst = dstMat;
 
diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu
index bc75fcf..10a59fb 100644
--- a/openbr/plugins/cuda/cudapca.cu
+++ b/openbr/plugins/cuda/cudapca.cu
@@ -4,6 +4,8 @@ using namespace std;
 #include <opencv2/opencv.hpp>
 #include <opencv2/gpu/gpu.hpp>
 
+#include "cudadefines.hpp"
+
 using namespace cv;
 using namespace cv::gpu;
 
@@ -63,39 +65,45 @@ namespace br { namespace cuda {
     _evRows = evRows; _evCols = evCols;
     _meanElems = meanElems;
 
+    cudaError_t err;
+
     // copy the eigenvectors to the GPU
-    cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float));
-    cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice);
+    CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);
+    CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);
 
     // copy the mean to the GPU
-    cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float));
-    cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice);
+    CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
+    CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);
 
-    cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float));
-    cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float));
+    CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);
+    CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
   }
 
-  void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols) {
-    cudaMemcpy(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost);
+  void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols) {
+    cudaError_t err;
+    CUDA_SAFE_MEMCPY(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
+    CUDA_SAFE_FREE(cudaDataPtr, &err);
   }
 
   void cudapca_projectwrapper(void* src, void** dst) {
     // copy the image to the GPU
     //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
-
-    cudaMalloc(dst, _evRows*_evCols*sizeof(float));
+    cudaError_t err;
+    CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err);
 
     // subtract out the mean of the image (mean is 1xpixels in size)
     int threadsPerBlock = 64;
     int numBlocks = _meanElems / threadsPerBlock + 1;
     cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
+    CUDA_KERNEL_ERR_CHK(&err);
 
     // perform the multiplication
     threadsPerBlock = 64;
     numBlocks = _evCols / threadsPerBlock + 1;
     cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
+    CUDA_KERNEL_ERR_CHK(&err);
 
-    //cudaFree(src);    // TODO(colin): figure out why adding this free causes memory corruption...
+    CUDA_SAFE_FREE(src, &err);    // TODO(colin): figure out why adding this free causes memory corruption...
 
     // copy the data back to the CPU
     //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
diff --git a/openbr/plugins/distance/dist.cpp b/openbr/plugins/distance/dist.cpp
index 427dbd9..2f6ace8 100644
--- a/openbr/plugins/distance/dist.cpp
+++ b/openbr/plugins/distance/dist.cpp
@@ -14,6 +14,9 @@
  * limitations under the License.                                            *
  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+#include <iostream>
+using namespace std;
+
 #include <opencv2/imgproc/imgproc.hpp>
 #include <openbr/plugins/openbr_internal.h>
 
@@ -52,6 +55,15 @@ private:
 
     float compare(const Mat &a, const Mat &b) const
     {
+      /*
+      cout << "Mat A" << endl;
+      cout << "rows: " << a.rows << "\tcols: " << a.cols << endl;
+      cout << "a.ptr<float>()[0]: " << a.ptr<float>()[0] << endl;
+      cout << "Mat B" << endl;
+      cout << "rows: " << b.rows << "\tcols: " << b.cols << endl;
+      cout << "b.ptr<float>()[0]: " << b.ptr<float>()[0] << endl;
+      */
+
         if ((a.size != b.size) ||
             (a.type() != b.type()))
                 return -std::numeric_limits<float>::max();
--
libgit2 0.21.4