From 1dabf42dad2e33b7a9e3d11b4142a98fd53c5f9d Mon Sep 17 00:00:00 2001
From: DepthDeluxe <cheinzmann3@gmail.com>
Date: Wed, 30 Mar 2016 15:07:21 -0400
Subject: [PATCH] code refactoring for pull request

---
 openbr/core/cuda/MatManager.cu            | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 openbr/core/cuda/MatManager.hpp           |  39 +++++++++++++++++++++++++++++++++++++++
 openbr/plugins/cuda/CUDAL2.cpp            |  58 ----------------------------------------------------------
 openbr/plugins/cuda/CUDAL2.cu             |  68 --------------------------------------------------------------------
 openbr/plugins/cuda/MatManager.cu         | 116 --------------------------------------------------------------------------------------------------------------------
 openbr/plugins/cuda/MatManager.hpp        |  39 ---------------------------------------
 openbr/plugins/cuda/copyfrom.cpp          |  24 ++++++++++++------------
 openbr/plugins/cuda/copyfrom.cu           |   2 +-
 openbr/plugins/cuda/copyto.cpp            |  22 ++++++++++++++--------
 openbr/plugins/cuda/copyto.cu             |   4 +++-
 openbr/plugins/cuda/cudaaffine.cpp        | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------------------------------------------------------
 openbr/plugins/cuda/cudaaffine.cu         |  55 +++++++++++++++++++++++++++----------------------------
 openbr/plugins/cuda/cudacvtfloat.cpp      |   7 ++++---
 openbr/plugins/cuda/cudacvtfloat.cu       |   8 +-------
 openbr/plugins/cuda/cudal2.cpp            |  59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 openbr/plugins/cuda/cudal2.cu             |  65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 openbr/plugins/cuda/cudalbp.cpp           |  69 +++++++++++----------------------------------------------------------
 openbr/plugins/cuda/cudalbp.cu            |  36 ++++++++++++++++++++----------------
 openbr/plugins/cuda/cudapca.cpp           |  32 +++++---------------------------
 openbr/plugins/cuda/cudapca.cu            |  10 +++++-----
 openbr/plugins/cuda/cudargb2grayscale.cpp |  15 ++++++++-------
 openbr/plugins/cuda/cudargb2grayscale.cu  |  12 ++++++------
 openbr/plugins/cuda/passthrough.cpp       |  46 ----------------------------------------------
 openbr/plugins/cuda/passthrough.cu        |  43 -------------------------------------------
 openbr/plugins/cuda/passthrough.hpp       |   3 ---
 25 files changed, 542 insertions(+), 699 deletions(-)
 create mode 100644 openbr/core/cuda/MatManager.cu
 create mode 100644 openbr/core/cuda/MatManager.hpp
 delete mode 100644 openbr/plugins/cuda/CUDAL2.cpp
 delete mode 100644 openbr/plugins/cuda/CUDAL2.cu
 delete mode 100644 openbr/plugins/cuda/MatManager.cu
 delete mode 100644 openbr/plugins/cuda/MatManager.hpp
 create mode 100644 openbr/plugins/cuda/cudal2.cpp
 create mode 100644 openbr/plugins/cuda/cudal2.cu
 delete mode 100644 openbr/plugins/cuda/passthrough.cpp
 delete mode 100644 openbr/plugins/cuda/passthrough.cu
 delete mode 100644 openbr/plugins/cuda/passthrough.hpp

diff --git a/openbr/core/cuda/MatManager.cu b/openbr/core/cuda/MatManager.cu
new file mode 100644
index 0000000..4823249
--- /dev/null
+++ b/openbr/core/cuda/MatManager.cu
@@ -0,0 +1,116 @@
+#include <pthread.h>
+#include <semaphore.h>
+
+#include <opencv2/opencv.hpp>
+
+#include "MatManager.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  MatManager::MatManager(int num) {
+    _numMats = num;
+
+    // initialize the an array of Mats
+    _mats = (uint8_t**)malloc(num * sizeof(uint8_t*));
+    _matTaken = (bool*)malloc(num * sizeof(bool));
+    _matsDimension = (int*)malloc(num * sizeof(int));
+
+    for (int i=0; i < num; i++) {
+      cudaMalloc(&_mats[i], 1 * sizeof(uint8_t));
+
+      // initialize matTaken
+      _matTaken[i] = false;
+
+      // initialize all mat dimensions to be 1
+      _matsDimension[i] = 1;
+    }
+
+    // initialize the locks
+    _matTakenLock = new pthread_mutex_t;
+    pthread_mutex_init(_matTakenLock, NULL);
+    _matsDimensionLock = new pthread_mutex_t;
+    pthread_mutex_init(_matsDimensionLock, NULL);
+
+    // initialize the semaphore
+    _matSemaphore = new sem_t;
+    sem_init(_matSemaphore, 0, _numMats);
+  }
+
+  MatManager::matindex MatManager::reserve(Mat &mat) {
+    int reservedMatIndex = 0;
+
+    sem_wait(_matSemaphore);
+    pthread_mutex_lock(_matTakenLock);
+    int i;
+    for (i=0; i < _numMats; i++) {
+      if ( !_matTaken[i] ) {
+        _matTaken[i] = true;
+        reservedMatIndex = i;
+        break;
+      }
+    }
+    if (i == _numMats) {
+      std::cout << "Cannot reserve a mat. Not enough GpuMat resourses\n" << std::endl << std::flush;
+    }
+
+    pthread_mutex_unlock(_matTakenLock);
+
+    // reallocate if size does not match
+    pthread_mutex_lock(_matsDimensionLock);
+    if (_matsDimension[reservedMatIndex] != mat.rows * mat.cols) {
+      cudaFree(_mats[reservedMatIndex]); // free the previous memory first
+      cudaMalloc(&_mats[reservedMatIndex], mat.rows * mat.cols * sizeof(uint8_t));
+      // change the dimension of that matrix
+      _matsDimension[reservedMatIndex] = mat.rows * mat.cols;
+
+    }
+    pthread_mutex_unlock(_matsDimensionLock);
+    return reservedMatIndex;
+  }
+
+  void MatManager::upload(MatManager::matindex reservedMatIndex, Mat& mat) {
+    // copy the content of the Mat to GPU
+    uint8_t* reservedMat = _mats[reservedMatIndex];
+    cudaMemcpy(reservedMat, mat.ptr<uint8_t>(), mat.rows * mat.cols, cudaMemcpyHostToDevice);
+  }
+
+  void MatManager::download(MatManager::matindex reservedMatIndex, Mat& dstMat) {
+    // copy the mat data back
+    int dimension = dstMat.rows * dstMat.cols;
+    uint8_t* reservedMat = _mats[reservedMatIndex];
+    cudaMemcpy(dstMat.ptr<uint8_t>(), reservedMat, dimension, cudaMemcpyDeviceToHost);
+  }
+
+  void MatManager::release(MatManager::matindex reservedMatIndex) {
+    uint8_t* reservedMat = _mats[reservedMatIndex];
+    pthread_mutex_lock(_matTakenLock);
+    bool foundMatch = false;
+    for (int i=0; i < _numMats; i++) {
+      if (reservedMat == _mats[i]) {
+        _matTaken[i] = false;
+        foundMatch = true;
+      }
+    }
+    pthread_mutex_unlock(_matTakenLock);
+
+    // return unconditionally if we didn't find a match
+    if (!foundMatch) {
+      std::cout << "Reservedmat is not in the _mats array" << std::endl << std::flush;
+      return;
+    }
+    sem_post(_matSemaphore);
+  }
+
+  MatManager::~MatManager() {
+    // assume a single thread is destroying the manager
+    // TODO(colin): add the destroy code
+    //std::cout << "Start to destroy.." << std::endl << std::flush;
+  }
+
+  uint8_t* MatManager::get_mat_pointer_from_index(MatManager::matindex matIndex) {
+    return _mats[matIndex];
+  }
+
+}}
diff --git a/openbr/core/cuda/MatManager.hpp b/openbr/core/cuda/MatManager.hpp
new file mode 100644
index 0000000..e58ad42
--- /dev/null
+++ b/openbr/core/cuda/MatManager.hpp
@@ -0,0 +1,39 @@
+/*
+NOTES
+Mat reservations should return a handle instead of a pointer
+*/
+
+#include <pthread.h>
+#include <semaphore.h>
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  class MatManager {
+  private:
+    int _numMats;
+    uint8_t** _mats;         // holds all the mats
+    bool* _matTaken;       // holds whether or not they are taken
+    int* _matsDimension;   // holds the dimension of the Mats
+
+    pthread_mutex_t* _matTakenLock;            // lock for matTaken table
+    pthread_mutex_t* _matsDimensionLock;     // lock for _matsDimension table and _mats table
+    sem_t* _matSemaphore;
+
+  public:
+    typedef int matindex;
+    MatManager(int num);
+
+    int reserve(Mat &mat);
+    void upload(matindex reservedMatIndex, Mat& mat);
+    void download(matindex reservedMatIndex, Mat& dstMat);
+    void release(matindex matIndex);
+    uint8_t* get_mat_pointer_from_index(matindex matIndex);
+
+    ~MatManager();
+  };
+}}
diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/CUDAL2.cpp
deleted file mode 100644
index 6b248d0..0000000
--- a/openbr/plugins/cuda/CUDAL2.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
- * Copyright 2012 The MITRE Corporation                                      *
- *                                                                           *
- * Licensed under the Apache License, Version 2.0 (the "License");           *
- * you may not use this file except in compliance with the License.          *
- * You may obtain a copy of the License at                                   *
- *                                                                           *
- *     http://www.apache.org/licenses/LICENSE-2.0                            *
- *                                                                           *
- * Unless required by applicable law or agreed to in writing, software       *
- * distributed under the License is distributed on an "AS IS" BASIS,         *
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
- * See the License for the specific language governing permissions and       *
- * limitations under the License.                                            *
- * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
-#include <iostream>
-using namespace std;
-
-#include <openbr/plugins/openbr_internal.h>
-
-namespace br { namespace cuda { namespace L2{
-  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
-}}}
-
-namespace br
-{
-
-/*!
- * \ingroup distances
- * \brief L2 distance computed using eigen.
- * \author Josh Klontz \cite jklontz
- */
-class CUDAL2Distance : public UntrainableDistance
-{
-    Q_OBJECT
-
-    float compare(const cv::Mat &a, const cv::Mat &b) const
-    {
-      void* const* srcDataPtr = a.ptr<void*>();
-      float* cudaAPtr = (float*)srcDataPtr[0];
-      int rows = *((int*)srcDataPtr[1]);
-      int cols = *((int*)srcDataPtr[2]);
-
-      float* cudaBPtr = (float*)b.ptr<void*>()[0];
-
-      float out;
-      br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
-
-      return out;
-    }
-};
-
-BR_REGISTER(Distance, CUDAL2Distance)
-
-} // namespace br
-
-#include "cuda/CUDAL2.moc"
diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/CUDAL2.cu
deleted file mode 100644
index 8d52b35..0000000
--- a/openbr/plugins/cuda/CUDAL2.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <math.h>
-
-#include "cudadefines.hpp"
-
-namespace br { namespace cuda { namespace L2 {
-
-  __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) {
-    int index = blockIdx.x*blockDim.x+threadIdx.x;
-
-    if (index >= length) {
-      return;
-    }
-
-    // perform the subtraction in-place
-    // use b because it is the comparison
-    // image
-    workPtr[index] = aPtr[index] - bPtr[index];
-    workPtr[index] = workPtr[index] * workPtr[index];
-  }
-
-  __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) {
-    // make sure there is only one thread that we are calling
-    if (blockIdx.x != 0 || threadIdx.x != 0) {
-      return;
-    }
-
-    // sum up all the values
-    *outPtr = 0;
-    for (int i=0; i < length; i++) {
-      *outPtr = *outPtr + inPtr[i];
-    }
-
-    // take the square root
-    *outPtr = sqrtf(*outPtr);
-  }
-
-  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) {
-    cudaError_t err;
-    float* cudaOutPtr;
-    CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
-
-    float* cudaWorkBufferPtr;
-    CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
-
-    // perform the subtraction
-    int threadsPerBlock = 64;
-    int numBlocks = length / threadsPerBlock + 1;
-    my_subtract_kernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
-    CUDA_KERNEL_ERR_CHK(&err);
-
-    // perform the collapse
-    collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
-    CUDA_KERNEL_ERR_CHK(&err);
-
-    // copy the single value back to the destinsion
-    CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
-
-    CUDA_SAFE_FREE(cudaOutPtr, &err);
-
-    // do not free aPtr which should be the reference library
-    // only free bPtr, which is the image we are comparing
-    CUDA_SAFE_FREE(cudaBPtr, &err);
-    CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
-  }
-}}}
-
-// 128CUDAEigenfaces on 6400 ATT: 54.367s
-// 128CUDAEigenfacesL2 on 6400 ATT: 
diff --git a/openbr/plugins/cuda/MatManager.cu b/openbr/plugins/cuda/MatManager.cu
deleted file mode 100644
index 4823249..0000000
--- a/openbr/plugins/cuda/MatManager.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <pthread.h>
-#include <semaphore.h>
-
-#include <opencv2/opencv.hpp>
-
-#include "MatManager.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-namespace br { namespace cuda {
-  MatManager::MatManager(int num) {
-    _numMats = num;
-
-    // initialize the an array of Mats
-    _mats = (uint8_t**)malloc(num * sizeof(uint8_t*));
-    _matTaken = (bool*)malloc(num * sizeof(bool));
-    _matsDimension = (int*)malloc(num * sizeof(int));
-
-    for (int i=0; i < num; i++) {
-      cudaMalloc(&_mats[i], 1 * sizeof(uint8_t));
-
-      // initialize matTaken
-      _matTaken[i] = false;
-
-      // initialize all mat dimensions to be 1
-      _matsDimension[i] = 1;
-    }
-
-    // initialize the locks
-    _matTakenLock = new pthread_mutex_t;
-    pthread_mutex_init(_matTakenLock, NULL);
-    _matsDimensionLock = new pthread_mutex_t;
-    pthread_mutex_init(_matsDimensionLock, NULL);
-
-    // initialize the semaphore
-    _matSemaphore = new sem_t;
-    sem_init(_matSemaphore, 0, _numMats);
-  }
-
-  MatManager::matindex MatManager::reserve(Mat &mat) {
-    int reservedMatIndex = 0;
-
-    sem_wait(_matSemaphore);
-    pthread_mutex_lock(_matTakenLock);
-    int i;
-    for (i=0; i < _numMats; i++) {
-      if ( !_matTaken[i] ) {
-        _matTaken[i] = true;
-        reservedMatIndex = i;
-        break;
-      }
-    }
-    if (i == _numMats) {
-      std::cout << "Cannot reserve a mat. Not enough GpuMat resourses\n" << std::endl << std::flush;
-    }
-
-    pthread_mutex_unlock(_matTakenLock);
-
-    // reallocate if size does not match
-    pthread_mutex_lock(_matsDimensionLock);
-    if (_matsDimension[reservedMatIndex] != mat.rows * mat.cols) {
-      cudaFree(_mats[reservedMatIndex]); // free the previous memory first
-      cudaMalloc(&_mats[reservedMatIndex], mat.rows * mat.cols * sizeof(uint8_t));
-      // change the dimension of that matrix
-      _matsDimension[reservedMatIndex] = mat.rows * mat.cols;
-
-    }
-    pthread_mutex_unlock(_matsDimensionLock);
-    return reservedMatIndex;
-  }
-
-  void MatManager::upload(MatManager::matindex reservedMatIndex, Mat& mat) {
-    // copy the content of the Mat to GPU
-    uint8_t* reservedMat = _mats[reservedMatIndex];
-    cudaMemcpy(reservedMat, mat.ptr<uint8_t>(), mat.rows * mat.cols, cudaMemcpyHostToDevice);
-  }
-
-  void MatManager::download(MatManager::matindex reservedMatIndex, Mat& dstMat) {
-    // copy the mat data back
-    int dimension = dstMat.rows * dstMat.cols;
-    uint8_t* reservedMat = _mats[reservedMatIndex];
-    cudaMemcpy(dstMat.ptr<uint8_t>(), reservedMat, dimension, cudaMemcpyDeviceToHost);
-  }
-
-  void MatManager::release(MatManager::matindex reservedMatIndex) {
-    uint8_t* reservedMat = _mats[reservedMatIndex];
-    pthread_mutex_lock(_matTakenLock);
-    bool foundMatch = false;
-    for (int i=0; i < _numMats; i++) {
-      if (reservedMat == _mats[i]) {
-        _matTaken[i] = false;
-        foundMatch = true;
-      }
-    }
-    pthread_mutex_unlock(_matTakenLock);
-
-    // return unconditionally if we didn't find a match
-    if (!foundMatch) {
-      std::cout << "Reservedmat is not in the _mats array" << std::endl << std::flush;
-      return;
-    }
-    sem_post(_matSemaphore);
-  }
-
-  MatManager::~MatManager() {
-    // assume a single thread is destroying the manager
-    // TODO(colin): add the destroy code
-    //std::cout << "Start to destroy.." << std::endl << std::flush;
-  }
-
-  uint8_t* MatManager::get_mat_pointer_from_index(MatManager::matindex matIndex) {
-    return _mats[matIndex];
-  }
-
-}}
diff --git a/openbr/plugins/cuda/MatManager.hpp b/openbr/plugins/cuda/MatManager.hpp
deleted file mode 100644
index e58ad42..0000000
--- a/openbr/plugins/cuda/MatManager.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-NOTES
-Mat reservations should return a handle instead of a pointer
-*/
-
-#include <pthread.h>
-#include <semaphore.h>
-
-#include <opencv2/opencv.hpp>
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-namespace br { namespace cuda {
-  class MatManager {
-  private:
-    int _numMats;
-    uint8_t** _mats;         // holds all the mats
-    bool* _matTaken;       // holds whether or not they are taken
-    int* _matsDimension;   // holds the dimension of the Mats
-
-    pthread_mutex_t* _matTakenLock;            // lock for matTaken table
-    pthread_mutex_t* _matsDimensionLock;     // lock for _matsDimension table and _mats table
-    sem_t* _matSemaphore;
-
-  public:
-    typedef int matindex;
-    MatManager(int num);
-
-    int reserve(Mat &mat);
-    void upload(matindex reservedMatIndex, Mat& mat);
-    void download(matindex reservedMatIndex, Mat& dstMat);
-    void release(matindex matIndex);
-    uint8_t* get_mat_pointer_from_index(matindex matIndex);
-
-    ~MatManager();
-  };
-}}
diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp
index 4bfc0bf..287b3b9 100644
--- a/openbr/plugins/cuda/copyfrom.cpp
+++ b/openbr/plugins/cuda/copyfrom.cpp
@@ -2,21 +2,25 @@
 
 #include <opencv2/opencv.hpp>
 
-//#include <gperftools/profiler.h>
-
 #include <openbr/plugins/openbr_internal.h>
 
 using namespace std;
 
 using namespace cv;
 
-// extern CUDA declaration
-namespace br { namespace cuda { namespace cudacopyfrom {
+// CUDA functions for this plugin
+namespace br { namespace cuda { namespace copyfrom {
   template <typename T> void wrapper(void* src, T* out, int rows, int cols);
 }}}
 
 namespace br
 {
+  /*!
+  * \ingroup transforms
+  * \brief Copies a transform from the GPU to the CPU.
+  * \author Colin Heinzmann \cite DepthDeluxe
+  * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
+  */
   class CUDACopyFrom : public UntrainableTransform
   {
     Q_OBJECT
@@ -24,8 +28,6 @@ namespace br
 private:
     void project(const Template &src, Template &dst) const
     {
-//      ProfilerStart("PROFILEME.log");
-
       // pull the data back out of the Mat
       void* const* dataPtr = src.m().ptr<void*>();
       int rows = *((int*)dataPtr[1]);
@@ -35,21 +37,19 @@ private:
       Mat dstMat = Mat(rows, cols, type);
       switch(type) {
       case CV_32FC1:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
         break;
       case CV_8UC3:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
         break;
       default:
-        cout << "ERR: Invalid image format" << endl;
+        cout << "ERR: Invalid image type (" << type << ")" << endl;
         break;
       }
       dst = dstMat;
-
-//      ProfilerStop();
     }
   };
 
diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu
index ddc77f8..5574ea6 100644
--- a/openbr/plugins/cuda/copyfrom.cu
+++ b/openbr/plugins/cuda/copyfrom.cu
@@ -1,6 +1,6 @@
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacopyfrom {
+namespace br { namespace cuda { namespace copyfrom {
   template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
     cudaError_t err;
     CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp
index cc63e58..0fad607 100644
--- a/openbr/plugins/cuda/copyto.cpp
+++ b/openbr/plugins/cuda/copyto.cpp
@@ -8,14 +8,20 @@ using namespace std;
 
 using namespace cv;
 
-extern string type2str(int type);
-
-namespace br { namespace cuda { namespace cudacopyto {
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace copyto {
   template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols);
 }}}
 
 namespace br
 {
+
+  /*!
+  * \ingroup transforms
+  * \brief Copies a transform to the GPU.
+  * \author Colin Heinzmann \cite DepthDeluxe
+  * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
+  */
   class CUDACopyTo : public UntrainableTransform
   {
     Q_OBJECT
@@ -25,7 +31,7 @@ private:
     {
       const Mat& srcMat = src.m();
       const int rows = srcMat.rows;
-      const int cols = srcMat.cols; 
+      const int cols = srcMat.cols;
 
       // output will be a single pointer to graphics card memory
       Mat dstMat = Mat(4, 1, DataType<void*>::type);
@@ -39,16 +45,16 @@ private:
       void* cudaMemPtr;
       switch(srcMat.type()) {
       case CV_32FC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
+        cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
+        cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
         break;
       case CV_8UC3:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
+        cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
         break;
       default:
-        cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl;
+        cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl;
         return;
       }
 
diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu
index 0742ff7..4c66605 100644
--- a/openbr/plugins/cuda/copyto.cu
+++ b/openbr/plugins/cuda/copyto.cu
@@ -1,6 +1,7 @@
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacopyto {
+namespace br { namespace cuda { namespace copyto {
+
   template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
     cudaError_t err;
     CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
@@ -9,4 +10,5 @@ namespace br { namespace cuda { namespace cudacopyto {
 
   template void wrapper(const float* in, void** out, const int rows, const int cols);
   template void wrapper(const unsigned char* in, void** out, const int rows, const int cols);
+
 }}}
diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp
index 46dd71f..eec10f4 100644
--- a/openbr/plugins/cuda/cudaaffine.cpp
+++ b/openbr/plugins/cuda/cudaaffine.cpp
@@ -33,159 +33,158 @@ using namespace std;
 #include <openbr/plugins/openbr_internal.h>
 #include <openbr/core/opencvutils.h>
 
-#include "MatManager.hpp"
-
 using namespace cv;
 
-namespace br { namespace cuda {
-	void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace affine {
+	void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
+}}}
 
 namespace br
 {
 
-/*!
- * \ingroup transforms
- * \brief Performs a two or three point registration.
- * \author Josh Klontz \cite jklontz
- * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
- */
-class CUDAAffineTransform : public UntrainableTransform
-{
-    Q_OBJECT
-    Q_ENUMS(Method)
-
-public:
-    /*!< */
-    enum Method { Near = INTER_NEAREST,
-                  Area = INTER_AREA,
-                  Bilin = INTER_LINEAR,
-                  Cubic = INTER_CUBIC,
-                  Lanczo = INTER_LANCZOS4};
-
-private:
-    Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
-    Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
-    Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
-    Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
-    Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
-    Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
-    Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
-    Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
-    Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
-    Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
-    Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
-    BR_PROPERTY(int, width, 64)
-    BR_PROPERTY(int, height, 64)
-    BR_PROPERTY(float, x1, 0)
-    BR_PROPERTY(float, y1, 0)
-    BR_PROPERTY(float, x2, -1)
-    BR_PROPERTY(float, y2, -1)
-    BR_PROPERTY(float, x3, -1)
-    BR_PROPERTY(float, y3, -1)
-    BR_PROPERTY(Method, method, Bilin)
-    BR_PROPERTY(bool, storeAffine, false)
-    BR_PROPERTY(bool, warpPoints, false)
-
-    static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
-    {
-        float dx = b.x - a.x;
-        float dy = b.y - a.y;
-        return Point2f(a.x - dy, a.y + dx);
-    }
-
-    void project(const Template &src, Template &dst) const
-    {
-        const bool twoPoints = ((x3 == -1) || (y3 == -1));
-
-        Point2f dstPoints[3];
-        dstPoints[0] = Point2f(x1*width, y1*height);
-        dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
-        if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
-        else           dstPoints[2] = Point2f(x3*width, y3*height);
-
-        Point2f srcPoints[3];
-        if (src.file.contains("Affine_0") &&
-            src.file.contains("Affine_1") &&
-            (src.file.contains("Affine_2") || twoPoints)) {
-            srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
-            srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
-            if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
-        } else {
-            const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
-
-            if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
-                resize(src, dst, Size(width, height));
-                return;
-            } else {
-                srcPoints[0] = landmarks[0];
-                srcPoints[1] = landmarks[1];
-                if (!twoPoints) srcPoints[2] = landmarks[2];
-            }
-        }
-        if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
-
-        // Code section being altered (original)
-        //
-        // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
-        // warpAffine(src, dst, affineTransform, Size(width, height), method);
-        //
-        // end original
-
-        Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
-
-        void* const* srcDataPtr = src.m().ptr<void*>();
-        int rows = *((int*)srcDataPtr[1]);
-        int cols = *((int*)srcDataPtr[2]);
-        int type = *((int*)srcDataPtr[3]);
-
-        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
-        void** dstDataPtr = dstMat.ptr<void*>();
-
-        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height;  // rows
-        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width;   // cols
-        dstDataPtr[3] = srcDataPtr[3];
-
-        // Print the transform
-        //for(int x = 0; x < affineTransform.rows; x++){
-            //for(int y = 0; y < affineTransform.cols; y++){
-                //printf("%8.3f\t", affineTransform.at<double>(x, y));
-            //}
-            //printf("\n");
-        //}
-
-        br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
-
-        // end altered code
-
-        if (warpPoints) {
-            QList<QPointF> points = src.file.points();
-            QList<QPointF> rotatedPoints;
-            for (int i=0; i<points.size(); i++) {
-                rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
-                                             points.at(i).y()*affineTransform.at<double>(0,1)+
-                                             affineTransform.at<double>(0,2),
-                                             points.at(i).x()*affineTransform.at<double>(1,0)+
-                                             points.at(i).y()*affineTransform.at<double>(1,1)+
-                                             affineTransform.at<double>(1,2)));
-            }
-
-            dst.file.setPoints(rotatedPoints);
-        }
-
-        if (storeAffine) {
-            QList<float> affineParams;
-            for (int i = 0 ; i < 2; i++)
-                for (int j = 0; j < 3; j++)
-                    affineParams.append(affineTransform.at<double>(i, j));
-            dst.file.setList("affineParameters", affineParams);
-        }
-
-        dst = dstMat;
-    }
-};
-
-BR_REGISTER(Transform, CUDAAffineTransform)
+	/*!
+	* \ingroup transforms
+	* \brief Performs a two or three point registration on the GPU.  Modified from stock OpenBR implementation
+	* \author Greg Schrock \cite gls022
+	* \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
+	*/
+	class CUDAAffineTransform : public UntrainableTransform
+	{
+	    Q_OBJECT
+	    Q_ENUMS(Method)
+
+	public:
+	    /*!< */
+	    enum Method { Near = INTER_NEAREST,
+	                  Area = INTER_AREA,
+	                  Bilin = INTER_LINEAR,
+	                  Cubic = INTER_CUBIC,
+	                  Lanczo = INTER_LANCZOS4};
+
+	private:
+	    Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
+	    Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
+	    Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
+	    Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
+	    Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
+	    Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
+	    Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
+	    Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
+	    Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
+	    Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
+	    Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
+	    BR_PROPERTY(int, width, 64)
+	    BR_PROPERTY(int, height, 64)
+	    BR_PROPERTY(float, x1, 0)
+	    BR_PROPERTY(float, y1, 0)
+	    BR_PROPERTY(float, x2, -1)
+	    BR_PROPERTY(float, y2, -1)
+	    BR_PROPERTY(float, x3, -1)
+	    BR_PROPERTY(float, y3, -1)
+	    BR_PROPERTY(Method, method, Bilin)
+	    BR_PROPERTY(bool, storeAffine, false)
+	    BR_PROPERTY(bool, warpPoints, false)
+
+	    static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
+	    {
+	        float dx = b.x - a.x;
+	        float dy = b.y - a.y;
+	        return Point2f(a.x - dy, a.y + dx);
+	    }
+
+	    void project(const Template &src, Template &dst) const
+	    {
+	        const bool twoPoints = ((x3 == -1) || (y3 == -1));
+
+	        Point2f dstPoints[3];
+	        dstPoints[0] = Point2f(x1*width, y1*height);
+	        dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
+	        if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
+	        else           dstPoints[2] = Point2f(x3*width, y3*height);
+
+	        Point2f srcPoints[3];
+	        if (src.file.contains("Affine_0") &&
+	            src.file.contains("Affine_1") &&
+	            (src.file.contains("Affine_2") || twoPoints)) {
+	            srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
+	            srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
+	            if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
+	        } else {
+	            const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
+
+	            if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
+	                resize(src, dst, Size(width, height));
+	                return;
+	            } else {
+	                srcPoints[0] = landmarks[0];
+	                srcPoints[1] = landmarks[1];
+	                if (!twoPoints) srcPoints[2] = landmarks[2];
+	            }
+	        }
+	        if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
+
+	        // Code section being altered (original)
+	        //
+	        // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+	        // warpAffine(src, dst, affineTransform, Size(width, height), method);
+	        //
+	        // end original
+
+	        Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+
+	        void* const* srcDataPtr = src.m().ptr<void*>();
+	        int rows = *((int*)srcDataPtr[1]);
+	        int cols = *((int*)srcDataPtr[2]);
+	        int type = *((int*)srcDataPtr[3]);
+
+	        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
+	        void** dstDataPtr = dstMat.ptr<void*>();
+
+	        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height;  // rows
+	        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width;   // cols
+	        dstDataPtr[3] = srcDataPtr[3];
+
+	        // Print the transform
+	        //for(int x = 0; x < affineTransform.rows; x++){
+	            //for(int y = 0; y < affineTransform.cols; y++){
+	                //printf("%8.3f\t", affineTransform.at<double>(x, y));
+	            //}
+	            //printf("\n");
+	        //}
+
+	        cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
+
+	        // end altered code
+
+	        if (warpPoints) {
+	            QList<QPointF> points = src.file.points();
+	            QList<QPointF> rotatedPoints;
+	            for (int i=0; i<points.size(); i++) {
+	                rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
+	                                             points.at(i).y()*affineTransform.at<double>(0,1)+
+	                                             affineTransform.at<double>(0,2),
+	                                             points.at(i).x()*affineTransform.at<double>(1,0)+
+	                                             points.at(i).y()*affineTransform.at<double>(1,1)+
+	                                             affineTransform.at<double>(1,2)));
+	            }
+
+	            dst.file.setPoints(rotatedPoints);
+	        }
+
+	        if (storeAffine) {
+	            QList<float> affineParams;
+	            for (int i = 0 ; i < 2; i++)
+	                for (int j = 0; j < 3; j++)
+	                    affineParams.append(affineTransform.at<double>(i, j));
+	            dst.file.setList("affineParameters", affineParams);
+	        }
+
+	        dst = dstMat;
+	    }
+	};
+
+	BR_REGISTER(Transform, CUDAAffineTransform)
 
 } // namespace br
 
diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu
index 0c19dde..79cc244 100644
--- a/openbr/plugins/cuda/cudaaffine.cu
+++ b/openbr/plugins/cuda/cudaaffine.cu
@@ -11,30 +11,30 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br { namespace cuda {
+namespace br { namespace cuda { namespace affine {
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) {
         if (row < 0 || row > rows || col < 0 || col > cols) {
             if (row > rows || col > cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
         return (srcPtr + row*cols)[col];
     }
 
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_bilinear_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
         // don't do anything if the index is out of bounds
         if (row < 0 || row > rows || col < 0 || col > cols) {
             if (row > rows || col > cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
-        
+
         // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf
         // Bilinear Transformation
         // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy)
@@ -48,22 +48,22 @@ namespace br { namespace cuda {
         double d_row = row - row1;
         double d_col = col - col1;
 
-        int Q11 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
-        int Q21 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
-        int Q12 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
-        int Q22 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
+        int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
+        int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
+        int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
+        int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
 
         double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col));
         return ((uint8_t) round(val));
     }
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_distance_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
         // don't do anything if the index is out of bounds
         if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) {
             if (row >= rows || col >= cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
 
@@ -90,10 +90,10 @@ namespace br { namespace cuda {
         double w3 = d3/sum;
         double w4 = d4/sum;
 
-        uint8_t v1 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
-        uint8_t v2 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
-        uint8_t v3 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
-        uint8_t v4 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
+        uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
+        uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
+        uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
+        uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
 
         return round(w1*v1 + w2*v2 + w3*v3 + w4*v4);
     }
@@ -105,16 +105,16 @@ namespace br { namespace cuda {
      * src_row            - The computed source pixel row (mapping from this row)
      * src_col            - The computed source pixel column (mapping from this col)
      */
-    __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
+    __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
         *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6];
         *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
 
         //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]);
 
 		}
-				
 
-    __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
+
+    __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
         int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
         int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
         int dstIndex = dstRowInd*dst_cols + dstColInd;
@@ -134,15 +134,15 @@ namespace br { namespace cuda {
             }
         }
 
-        cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
-        //const uint8_t cval = cudaaffine_kernel_get_distance_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
-        const uint8_t cval = cudaaffine_kernel_get_bilinear_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
-        //const uint8_t cval = cudaaffine_kernel_get_pixel_value(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
+        getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
+        //const uint8_t cval = getDistancePixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
+        const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
+        //const uint8_t cval = getPixelValueDevice(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
 
         dstPtr[dstIndex] = cval;
     }
 
-    void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
+    void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
         cudaError_t err;
         double* gpuInverse;
 
@@ -152,7 +152,7 @@ namespace br { namespace cuda {
 
         //************************************************************************
         // Input affine is a 2x3 Mat whose transpose is used in the computations
-        // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] 
+        // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
         // See "Digital Image Warping" by George Wolburg (p. 50)
         //************************************************************************
 
@@ -210,7 +210,7 @@ namespace br { namespace cuda {
 
         CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);
 
-        cudaaffine_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
+        affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
         CUDA_KERNEL_ERR_CHK(&err);
 
         CUDA_SAFE_FREE(srcPtr, &err);
@@ -225,5 +225,4 @@ namespace br { namespace cuda {
         // }
         // printf("\n");
     }
-} // end cuda
-} // end br
+}}}
diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp
index d774a05..ced2b9c 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cpp
+++ b/openbr/plugins/cuda/cudacvtfloat.cpp
@@ -7,7 +7,8 @@ using namespace cv;
 
 #include <openbr/plugins/openbr_internal.h>
 
-namespace br { namespace cuda { namespace cudacvtfloat {
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace cvtfloat {
   void wrapper(void* src, void** dst, int rows, int cols);
 }}}
 
@@ -16,7 +17,7 @@ namespace br
 
 /*!
  * \ingroup transforms
- * \brief Converts byte to floating point
+ * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent.
  * \author Colin Heinzmann \cite DepthDeluxe
  */
 class CUDACvtFloatTransform : public UntrainableTransform
@@ -45,7 +46,7 @@ class CUDACvtFloatTransform : public UntrainableTransform
       dstDataPtr[2] = srcDataPtr[2];
       dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;
 
-      br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+      cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
       dst = dstMat;
     }
 };
diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu
index 70bf2e4..bcc4665 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cu
+++ b/openbr/plugins/cuda/cudacvtfloat.cu
@@ -3,7 +3,7 @@ using namespace std;
 
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacvtfloat {
+namespace br { namespace cuda { namespace cvtfloat {
 
   __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
     // get my index
@@ -20,12 +20,6 @@ namespace br { namespace cuda { namespace cudacvtfloat {
   }
 
   void wrapper(void* src, void** dst, int rows, int cols) {
-    //unsigned char* cudaSrc;
-    //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
-    //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
-
-    //float* cudaDst;
-    //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
     cudaError_t err;
     CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
 
diff --git a/openbr/plugins/cuda/cudal2.cpp b/openbr/plugins/cuda/cudal2.cpp
new file mode 100644
index 0000000..8cbc7c6
--- /dev/null
+++ b/openbr/plugins/cuda/cudal2.cpp
@@ -0,0 +1,59 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright 2012 The MITRE Corporation                                      *
+ *                                                                           *
+ * Licensed under the Apache License, Version 2.0 (the "License");           *
+ * you may not use this file except in compliance with the License.          *
+ * You may obtain a copy of the License at                                   *
+ *                                                                           *
+ *     http://www.apache.org/licenses/LICENSE-2.0                            *
+ *                                                                           *
+ * Unless required by applicable law or agreed to in writing, software       *
+ * distributed under the License is distributed on an "AS IS" BASIS,         *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
+ * See the License for the specific language governing permissions and       *
+ * limitations under the License.                                            *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <iostream>
+using namespace std;
+
+#include <openbr/plugins/openbr_internal.h>
+
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace L2 {
+  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
+}}}
+
+namespace br
+{
+
+/*!
+ * \ingroup distances
+ * \brief L2 distance computed using eigen.
+ * \author Colin Heinzmann \cite DepthDeluxe
+ */
+class CUDAL2Distance : public UntrainableDistance
+{
+    Q_OBJECT
+
+    float compare(const cv::Mat &a, const cv::Mat &b) const
+    {
+      void* const* srcDataPtr = a.ptr<void*>();
+      float* cudaAPtr = (float*)srcDataPtr[0];
+      int rows = *((int*)srcDataPtr[1]);
+      int cols = *((int*)srcDataPtr[2]);
+
+      float* cudaBPtr = (float*)b.ptr<void*>()[0];
+
+      float out;
+      cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
+
+      return out;
+    }
+};
+
+BR_REGISTER(Distance, CUDAL2Distance)
+
+} // namespace br
+
+#include "cuda/cudal2.moc"
diff --git a/openbr/plugins/cuda/cudal2.cu b/openbr/plugins/cuda/cudal2.cu
new file mode 100644
index 0000000..23db28d
--- /dev/null
+++ b/openbr/plugins/cuda/cudal2.cu
@@ -0,0 +1,65 @@
+#include <math.h>
+
+#include "cudadefines.hpp"
+
+namespace br { namespace cuda { namespace L2 {
+
+  __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (index >= length) {
+      return;
+    }
+
+    // perform the subtraction in-place
+    // use b because it is the comparison
+    // image
+    workPtr[index] = aPtr[index] - bPtr[index];
+    workPtr[index] = workPtr[index] * workPtr[index];
+  }
+
+  __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
+    // make sure there is only one thread that we are calling
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+      return;
+    }
+
+    // sum up all the values
+    *outPtr = 0;
+    for (int i=0; i < length; i++) {
+      *outPtr = *outPtr + inPtr[i];
+    }
+
+    // take the square root
+    *outPtr = sqrtf(*outPtr);
+  }
+
+  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) {
+    cudaError_t err;
+    float* cudaOutPtr;
+    CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
+
+    float* cudaWorkBufferPtr;
+    CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
+
+    // perform the subtraction
+    int threadsPerBlock = 64;
+    int numBlocks = length / threadsPerBlock + 1;
+    subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
+    CUDA_KERNEL_ERR_CHK(&err);
+
+    // perform the collapse
+    collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
+    CUDA_KERNEL_ERR_CHK(&err);
+
+    // copy the single value back to the destinsion
+    CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
+
+    CUDA_SAFE_FREE(cudaOutPtr, &err);
+
+    // do not free aPtr which should be the reference library
+    // only free bPtr, which is the image we are comparing
+    CUDA_SAFE_FREE(cudaBPtr, &err);
+    CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
+  }
+}}}
diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp
index 2385c9b..f2f2d9f 100644
--- a/openbr/plugins/cuda/cudalbp.cpp
+++ b/openbr/plugins/cuda/cudalbp.cpp
@@ -31,44 +31,21 @@ using namespace std;
 
 #include <openbr/plugins/openbr_internal.h>
 
-#include "MatManager.hpp"
-
 using namespace cv;
 
-string type2str(int type) {
-  string r;
-
-  uchar depth = type & CV_MAT_DEPTH_MASK;
-  uchar chans = 1 + (type >> CV_CN_SHIFT);
-
-  switch ( depth ) {
-    case CV_8U:  r = "8U"; break;
-    case CV_8S:  r = "8S"; break;
-    case CV_16U: r = "16U"; break;
-    case CV_16S: r = "16S"; break;
-    case CV_32S: r = "32S"; break;
-    case CV_32F: r = "32F"; break;
-    case CV_64F: r = "64F"; break;
-    default:     r = "User"; break;
-  }
-
-  r += "C";
-  r += (chans+'0');
-
-  return r;
-}
-
-namespace br { namespace cuda {
-  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
-  void cudalbp_init_wrapper(uint8_t* lut);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace lbp {
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
+  void initializeWrapper(uint8_t* lut);
+}}}
 
 namespace br
 {
 /*!
  * \ingroup transforms
- * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
- * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
+ * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA.  Modified from stock OpenBR plugin.
+ * \author Colin Heinzmann \cite DepthDeluxe
+ * \author Li Li \cite booli
  */
 class CUDALBPTransform : public UntrainableTransform
 {
@@ -84,8 +61,6 @@ class CUDALBPTransform : public UntrainableTransform
     uchar lut[256];
     uchar null;
 
-    //cuda::MatManager* matManager;
-
   public:
     /* Returns the number of 0->1 or 1->0 transitions in i */
     static int numTransitions(int i)
@@ -136,36 +111,14 @@ class CUDALBPTransform : public UntrainableTransform
             if (!set[i])
                 lut[i] = null; // Set to null id
 
-        // init the mat manager for managing 10 mats
-        //matManager = new cuda::MatManager(10);
-
         // copy lut over to the GPU
-        br::cuda::cudalbp_init_wrapper(lut);
+        cuda::lbp::initializeWrapper(lut);
 
         std::cout << "Initialized CUDALBP" << std::endl;
     }
 
     void project(const Template &src, Template &dst) const
     {
-        //Mat& m = (Mat&)src.m();
-        //cuda::MatManager::matindex a;
-        //cuda::MatManager::matindex b;
-        //a = matManager->reserve(m);
-        //matManager->upload(a, m);
-
-        // reserve the second mat and check the dimensiosn
-        //b = matManager->reserve(m);
-
-        //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
-        //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
-        //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
-
-        //matManager->download(b, dst);
-
-        // release both the mats
-        //matManager->release(a);
-        //matManager->release(b);
-
         void* const* srcDataPtr = src.m().ptr<void*>();
         int rows = *((int*)srcDataPtr[1]);
         int cols = *((int*)srcDataPtr[2]);
@@ -177,13 +130,13 @@ class CUDALBPTransform : public UntrainableTransform
         dstDataPtr[2] = srcDataPtr[2];
         dstDataPtr[3] = srcDataPtr[3];
 
-        br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+        cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
         dst = dstMat;
     }
 };
 
 BR_REGISTER(Transform, CUDALBPTransform)
 
-} // namespace br
+}
 
 #include "cuda/cudalbp.moc"
diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu
index bb5be9a..61548df 100644
--- a/openbr/plugins/cuda/cudalbp.cu
+++ b/openbr/plugins/cuda/cudalbp.cu
@@ -9,14 +9,18 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br { namespace cuda {
+/*
+ * These are the CUDA functions for CUDALBP.  See cudapca.cpp for more details
+ */
+
+namespace br { namespace cuda { namespace lbp {
   uint8_t* lut;
 
-  __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+  __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
     return (srcPtr + row*cols)[col];
   }
 
-  __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
+  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
   {
     int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -34,22 +38,22 @@ namespace br { namespace cuda {
       }
     }
 
-    const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
-    uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
+    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
+    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
 
     // store calculated value away in the right place
     dstPtr[index] = val;
   }
 
   //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
-  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
   {
     cudaError_t err;
 
@@ -59,15 +63,15 @@ namespace br { namespace cuda {
                    rows/threadsPerBlock.y + 1);
 
     CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
-    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
+    lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
     CUDA_KERNEL_ERR_CHK(&err);
 
     CUDA_SAFE_FREE(srcPtr, &err);
   }
 
-  void cudalbp_init_wrapper(uint8_t* cpuLut) {
+  void initializeWrapper(uint8_t* cpuLut) {
     cudaError_t err;
     CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
     CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
   }
-}}
+}}}
diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp
index fbee173..c1677c8 100644
--- a/openbr/plugins/cuda/cudapca.cpp
+++ b/openbr/plugins/cuda/cudapca.cpp
@@ -29,8 +29,9 @@ using namespace cv;
 #include <openbr/core/eigenutils.h>
 #include <openbr/core/opencvutils.h>
 
+// definitions from the CUDA source file
 namespace br { namespace cuda { namespace pca {
-  void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
+  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
   void wrapper(void* src, void** dst);
 }}}
 
@@ -38,9 +39,7 @@ namespace br
 {
 /*!
  * \ingroup transforms
- * \brief Projects input into learned Principal Component Analysis subspace using CUDA.
- * \author Brendan Klare \cite bklare
- * \author Josh Klontz \cite jklontz
+ * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
  * \author Colin Heinzmann \cite DepthDeluxe
  *
  * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
@@ -134,22 +133,9 @@ private:
       dstDataPtr[2] = srcDataPtr[2];  *((int*)dstDataPtr[2]) = keep;
       dstDataPtr[3] = srcDataPtr[3];
 
-      br::cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
+      cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
 
       dst = dstMat;
-
-        //dst = cv::Mat(1, keep, CV_32FC1);
-
-        // perform the operation on the graphics card
-        //cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
-
-        // Map Eigen into OpenCV
-        //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
-        //Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1);
-        //Eigen::Map<Eigen::MatrixXf> outMap(cpuDst.ptr<float>(), keep, 1);
-
-        // Do projection
-        //outMap = eVecs.transpose() * (inMap - mean);
     }
 
     void store(QDataStream &stream) const
@@ -161,14 +147,6 @@ private:
     {
         stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
 
-        cout << "Mean Dimensions" << endl;
-        cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
-        cout << "eVecs Dimensions" << endl;
-        cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
-        cout << "eVals Dimensions" << endl;
-        cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
-        cout << "Keep: " << keep << endl;
-
         // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
         // serialize the eigenvectors
         float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
@@ -187,7 +165,7 @@ private:
         }
 
         // call the wrapper function
-        br::cuda::pca::loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
+        cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
 
         delete evBuffer;
         delete meanBuffer;
diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu
index f6fd3a1..001dd49 100644
--- a/openbr/plugins/cuda/cudapca.cu
+++ b/openbr/plugins/cuda/cudapca.cu
@@ -9,6 +9,10 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
+/*
+ * These are the CUDA functions for CUDAPCA.  See cudapca.cpp for more details
+ */
+
 namespace br { namespace cuda { namespace pca {
   __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -68,7 +72,7 @@ namespace br { namespace cuda { namespace pca {
   int _numSteps; int _stepSize;
   float* intermediaryBuffer;
 
-  void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
+  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
     _evRows = evRows; _evCols = evCols;
     _meanElems = meanElems;
 
@@ -95,7 +99,6 @@ namespace br { namespace cuda { namespace pca {
     cudaError_t err;
     CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
 
-
     // subtract out the mean of the image (mean is 1xpixels in size)
     int threadsPerBlock = 64;
     int numBlocks = _meanElems / threadsPerBlock + 1;
@@ -114,8 +117,5 @@ namespace br { namespace cuda { namespace pca {
     CUDA_KERNEL_ERR_CHK(&err);
 
     CUDA_SAFE_FREE(src, &err);    // TODO(colin): figure out why adding this free causes memory corruption...
-
-    // copy the data back to the CPU
-    //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
   }
 }}}
diff --git a/openbr/plugins/cuda/cudargb2grayscale.cpp b/openbr/plugins/cuda/cudargb2grayscale.cpp
index 6f72728..1db7cca 100644
--- a/openbr/plugins/cuda/cudargb2grayscale.cpp
+++ b/openbr/plugins/cuda/cudargb2grayscale.cpp
@@ -25,17 +25,18 @@
 
 using namespace cv;
 
-namespace br { namespace cuda{
-  void cudargb2grayscale_wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace rgb2grayscale {
+  void wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
+}}}
 
 namespace br
 {
 
 /*!
  * \ingroup transforms
- * \brief Colorspace conversion.
- * \author Li Li \cite Josh Klontz \cite jklontz
+ * \brief Converts 3-channel images to grayscale
+ * \author Li Li \cite booli
  */
 class CUDARGB2GrayScaleTransform : public UntrainableTransform
 {
@@ -57,8 +58,8 @@ private:
         dstDataPtr[2] = srcDataPtr[2];
         dstDataPtr[3] = srcDataPtr[3];
         *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same
-       
-        br::cuda::cudargb2grayscale_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+
+        cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
         dst = dstMat;
 
         /*
diff --git a/openbr/plugins/cuda/cudargb2grayscale.cu b/openbr/plugins/cuda/cudargb2grayscale.cu
index 266389c..8efaafa 100644
--- a/openbr/plugins/cuda/cudargb2grayscale.cu
+++ b/openbr/plugins/cuda/cudargb2grayscale.cu
@@ -12,9 +12,9 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br{ namespace cuda {
+namespace br { namespace cuda { namespace rgb2grayscale {
 
-  __global__ void cudargb2grayscale_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
+  __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
   {
     int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -31,7 +31,7 @@ namespace br{ namespace cuda {
     return;
   }
 
-  void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
   {
     cudaError_t err;
     dim3 threadsPerBlock(9, 9);
@@ -39,9 +39,9 @@ namespace br{ namespace cuda {
                    rows/threadsPerBlock.y + 1);
     CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
 
-    cudargb2grayscale_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
+    kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
     CUDA_KERNEL_ERR_CHK(&err);
     CUDA_SAFE_FREE(srcPtr, &err);
-  } 
+  }
 
-}}
+}}}
diff --git a/openbr/plugins/cuda/passthrough.cpp b/openbr/plugins/cuda/passthrough.cpp
deleted file mode 100644
index f7018ea..0000000
--- a/openbr/plugins/cuda/passthrough.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <openbr/plugins/openbr_internal.h>
-
-#include <opencv2/imgproc/imgproc.hpp>
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-#include "passthrough.hpp"
-
-#include <iostream>
-
-
-namespace br
-{
-  class CUDAPassthroughTransform : public UntrainableTransform
-  {
-    Q_OBJECT
-
-private:
-    void project(const Template &src, Template &dst) const
-    {
-      // note: if you convert the image to grayscale, you get 8UC1
-
-      // upload the src mat to the GPU
-      GpuMat srcGpuMat, dstGpuMat;
-      srcGpuMat.upload(src.m());
-      dstGpuMat.upload(src.m());
-
-      br::cuda::passthrough_wrapper(srcGpuMat, dstGpuMat);
-
-      dstGpuMat.download(dst.m());
-
-      // TODO(colin): add delete code
-      srcGpuMat.release();
-      dstGpuMat.release();
-
-      printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty());
-      printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty());
-    }
-  };
-
-  BR_REGISTER(Transform, CUDAPassthroughTransform);
-}
-
-#include "cuda/passthrough.moc"
diff --git a/openbr/plugins/cuda/passthrough.cu b/openbr/plugins/cuda/passthrough.cu
deleted file mode 100644
index 49f94e9..0000000
--- a/openbr/plugins/cuda/passthrough.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-// note: Using 8-bit unsigned 1 channel images
-
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-#include "passthrough.hpp"
-
-namespace br { namespace cuda {
-  __global__ void passthrough_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int cols, int rows) {
-    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
-    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
-
-    // don't do anything if we are outside the allowable positions
-    if (rowInd >= rows || colInd >= cols)
-      return;
-
-    uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd];
-    uint8_t* rowDstPtr = dstPtr + rowInd*dstStep;
-
-    rowDstPtr[colInd] = srcVal;
-  }
-
-  void passthrough_wrapper(GpuMat& src, GpuMat& dst) {
-    // convert the GpuMats to pointers
-    uint8_t* srcPtr = (uint8_t*)src.data;
-    uint8_t* dstPtr = (uint8_t*)dst.data;
-
-    int imageWidth = src.cols;
-    int imageHeight = src.rows;
-
-    // make 8 * 8 = 64 square block
-    dim3 threadsPerBlock(8, 8);
-    dim3 numBlocks(imageWidth / threadsPerBlock.x + 1,
-                   imageHeight / threadsPerBlock.y + 1);
-
-    passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight);
-  }
-}}
-
-
-// read http://stackoverflow.com/questions/31927297/array-of-ptrstepszgpumat-to-a-c-cuda-kernel
diff --git a/openbr/plugins/cuda/passthrough.hpp b/openbr/plugins/cuda/passthrough.hpp
deleted file mode 100644
index 8749684..0000000
--- a/openbr/plugins/cuda/passthrough.hpp
+++ /dev/null
@@ -1,3 +0,0 @@
-namespace br { namespace cuda {
-  void passthrough_wrapper(GpuMat& src, GpuMat& dst);
-}}
--
libgit2 0.21.4