diff --git a/openbr/plugins/cuda/MatManager.cu b/openbr/core/cuda/MatManager.cu
index 4823249..4823249 100644
--- a/openbr/plugins/cuda/MatManager.cu
+++ b/openbr/core/cuda/MatManager.cu
diff --git a/openbr/plugins/cuda/MatManager.hpp b/openbr/core/cuda/MatManager.hpp
index e58ad42..e58ad42 100644
--- a/openbr/plugins/cuda/MatManager.hpp
+++ b/openbr/core/cuda/MatManager.hpp
diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp
index 4bfc0bf..287b3b9 100644
--- a/openbr/plugins/cuda/copyfrom.cpp
+++ b/openbr/plugins/cuda/copyfrom.cpp
@@ -2,21 +2,25 @@
 
 #include <opencv2/opencv.hpp>
 
-//#include <gperftools/profiler.h>
-
 #include <openbr/plugins/openbr_internal.h>
 
 using namespace std;
 
 using namespace cv;
 
-// extern CUDA declaration
-namespace br { namespace cuda { namespace cudacopyfrom {
+// CUDA functions for this plugin
+namespace br { namespace cuda { namespace copyfrom {
   template <typename T> void wrapper(void* src, T* out, int rows, int cols);
 }}}
 
 namespace br
 {
+  /*!
+  * \ingroup transforms
+  * \brief Copies a transform from the GPU to the CPU.
+  * \author Colin Heinzmann \cite DepthDeluxe
+  * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
+  */
   class CUDACopyFrom : public UntrainableTransform
   {
     Q_OBJECT
@@ -24,8 +28,6 @@ namespace br
 private:
     void project(const Template &src, Template &dst) const
     {
-//      ProfilerStart("PROFILEME.log");
-
       // pull the data back out of the Mat
       void* const* dataPtr = src.m().ptr<void*>();
       int rows = *((int*)dataPtr[1]);
@@ -35,21 +37,19 @@ private:
       Mat dstMat = Mat(rows, cols, type);
       switch(type) {
       case CV_32FC1:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
         break;
       case CV_8UC3:
-        br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
+        cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
         break;
       default:
-        cout << "ERR: Invalid image format" << endl;
+        cout << "ERR: Invalid image type (" << type << ")" << endl;
         break;
       }
       dst = dstMat;
-
-//      ProfilerStop();
     }
   };
 
diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu
index ddc77f8..5574ea6 100644
--- a/openbr/plugins/cuda/copyfrom.cu
+++ b/openbr/plugins/cuda/copyfrom.cu
@@ -1,6 +1,6 @@
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacopyfrom {
+namespace br { namespace cuda { namespace copyfrom {
   template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
     cudaError_t err;
     CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp
index cc63e58..0fad607 100644
--- a/openbr/plugins/cuda/copyto.cpp
+++ b/openbr/plugins/cuda/copyto.cpp
@@ -8,14 +8,20 @@ using namespace std;
 
 using namespace cv;
 
-extern string type2str(int type);
-
-namespace br { namespace cuda { namespace cudacopyto {
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace copyto {
   template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols);
 }}}
 
 namespace br
 {
+
+  /*!
+  * \ingroup transforms
+  * \brief Copies a transform to the GPU.
+  * \author Colin Heinzmann \cite DepthDeluxe
+  * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
+  */
   class CUDACopyTo : public UntrainableTransform
   {
     Q_OBJECT
@@ -25,7 +31,7 @@ private:
     {
       const Mat& srcMat = src.m();
       const int rows = srcMat.rows;
-      const int cols = srcMat.cols; 
+      const int cols = srcMat.cols;
 
       // output will be a single pointer to graphics card memory
       Mat dstMat = Mat(4, 1, DataType<void*>::type);
@@ -39,16 +45,16 @@ private:
       void* cudaMemPtr;
       switch(srcMat.type()) {
       case CV_32FC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
+        cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
         break;
       case CV_8UC1:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
+        cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
         break;
       case CV_8UC3:
-        br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
+        cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
         break;
       default:
-        cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl;
+        cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl;
         return;
       }
 
diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu
index 0742ff7..4c66605 100644
--- a/openbr/plugins/cuda/copyto.cu
+++ b/openbr/plugins/cuda/copyto.cu
@@ -1,6 +1,7 @@
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacopyto {
+namespace br { namespace cuda { namespace copyto {
+
   template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
     cudaError_t err;
     CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
@@ -9,4 +10,5 @@ namespace br { namespace cuda { namespace cudacopyto {
 
   template void wrapper(const float* in, void** out, const int rows, const int cols);
   template void wrapper(const unsigned char* in, void** out, const int rows, const int cols);
+
 }}}
diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp
index 46dd71f..eec10f4 100644
--- a/openbr/plugins/cuda/cudaaffine.cpp
+++ b/openbr/plugins/cuda/cudaaffine.cpp
@@ -33,159 +33,158 @@ using namespace std;
 #include <openbr/plugins/openbr_internal.h>
 #include <openbr/core/opencvutils.h>
 
-#include "MatManager.hpp"
-
 using namespace cv;
 
-namespace br { namespace cuda {
-	void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace affine {
+	void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
+}}}
 
 namespace br
 {
 
-/*!
- * \ingroup transforms
- * \brief Performs a two or three point registration.
- * \author Josh Klontz \cite jklontz
- * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
- */
-class CUDAAffineTransform : public UntrainableTransform
-{
-    Q_OBJECT
-    Q_ENUMS(Method)
-
-public:
-    /*!< */
-    enum Method { Near = INTER_NEAREST,
-                  Area = INTER_AREA,
-                  Bilin = INTER_LINEAR,
-                  Cubic = INTER_CUBIC,
-                  Lanczo = INTER_LANCZOS4};
-
-private:
-    Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
-    Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
-    Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
-    Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
-    Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
-    Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
-    Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
-    Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
-    Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
-    Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
-    Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
-    BR_PROPERTY(int, width, 64)
-    BR_PROPERTY(int, height, 64)
-    BR_PROPERTY(float, x1, 0)
-    BR_PROPERTY(float, y1, 0)
-    BR_PROPERTY(float, x2, -1)
-    BR_PROPERTY(float, y2, -1)
-    BR_PROPERTY(float, x3, -1)
-    BR_PROPERTY(float, y3, -1)
-    BR_PROPERTY(Method, method, Bilin)
-    BR_PROPERTY(bool, storeAffine, false)
-    BR_PROPERTY(bool, warpPoints, false)
-
-    static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
-    {
-        float dx = b.x - a.x;
-        float dy = b.y - a.y;
-        return Point2f(a.x - dy, a.y + dx);
-    }
-
-    void project(const Template &src, Template &dst) const
-    {
-        const bool twoPoints = ((x3 == -1) || (y3 == -1));
-
-        Point2f dstPoints[3];
-        dstPoints[0] = Point2f(x1*width, y1*height);
-        dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
-        if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
-        else           dstPoints[2] = Point2f(x3*width, y3*height);
-
-        Point2f srcPoints[3];
-        if (src.file.contains("Affine_0") &&
-            src.file.contains("Affine_1") &&
-            (src.file.contains("Affine_2") || twoPoints)) {
-            srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
-            srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
-            if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
-        } else {
-            const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
-
-            if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
-                resize(src, dst, Size(width, height));
-                return;
-            } else {
-                srcPoints[0] = landmarks[0];
-                srcPoints[1] = landmarks[1];
-                if (!twoPoints) srcPoints[2] = landmarks[2];
-            }
-        }
-        if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
-
-        // Code section being altered (original)
-        //
-        // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
-        // warpAffine(src, dst, affineTransform, Size(width, height), method);
-        //
-        // end original
-
-        Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
-
-        void* const* srcDataPtr = src.m().ptr<void*>();
-        int rows = *((int*)srcDataPtr[1]);
-        int cols = *((int*)srcDataPtr[2]);
-        int type = *((int*)srcDataPtr[3]);
-
-        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
-        void** dstDataPtr = dstMat.ptr<void*>();
-
-        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height;  // rows
-        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width;   // cols
-        dstDataPtr[3] = srcDataPtr[3];
-
-        // Print the transform
-        //for(int x = 0; x < affineTransform.rows; x++){
-            //for(int y = 0; y < affineTransform.cols; y++){
-                //printf("%8.3f\t", affineTransform.at<double>(x, y));
-            //}
-            //printf("\n");
-        //}
-
-        br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
-
-        // end altered code
-
-        if (warpPoints) {
-            QList<QPointF> points = src.file.points();
-            QList<QPointF> rotatedPoints;
-            for (int i=0; i<points.size(); i++) {
-                rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
-                                             points.at(i).y()*affineTransform.at<double>(0,1)+
-                                             affineTransform.at<double>(0,2),
-                                             points.at(i).x()*affineTransform.at<double>(1,0)+
-                                             points.at(i).y()*affineTransform.at<double>(1,1)+
-                                             affineTransform.at<double>(1,2)));
-            }
-
-            dst.file.setPoints(rotatedPoints);
-        }
-
-        if (storeAffine) {
-            QList<float> affineParams;
-            for (int i = 0 ; i < 2; i++)
-                for (int j = 0; j < 3; j++)
-                    affineParams.append(affineTransform.at<double>(i, j));
-            dst.file.setList("affineParameters", affineParams);
-        }
-
-        dst = dstMat;
-    }
-};
-
-BR_REGISTER(Transform, CUDAAffineTransform)
+	/*!
+	* \ingroup transforms
+	* \brief Performs a two or three point registration on the GPU.  Modified from stock OpenBR implementation
+	* \author Greg Schrock \cite gls022
+	* \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
+	*/
+	class CUDAAffineTransform : public UntrainableTransform
+	{
+	    Q_OBJECT
+	    Q_ENUMS(Method)
+
+	public:
+	    /*!< */
+	    enum Method { Near = INTER_NEAREST,
+	                  Area = INTER_AREA,
+	                  Bilin = INTER_LINEAR,
+	                  Cubic = INTER_CUBIC,
+	                  Lanczo = INTER_LANCZOS4};
+
+	private:
+	    Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
+	    Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
+	    Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
+	    Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
+	    Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
+	    Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
+	    Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
+	    Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
+	    Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
+	    Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
+	    Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
+	    BR_PROPERTY(int, width, 64)
+	    BR_PROPERTY(int, height, 64)
+	    BR_PROPERTY(float, x1, 0)
+	    BR_PROPERTY(float, y1, 0)
+	    BR_PROPERTY(float, x2, -1)
+	    BR_PROPERTY(float, y2, -1)
+	    BR_PROPERTY(float, x3, -1)
+	    BR_PROPERTY(float, y3, -1)
+	    BR_PROPERTY(Method, method, Bilin)
+	    BR_PROPERTY(bool, storeAffine, false)
+	    BR_PROPERTY(bool, warpPoints, false)
+
+	    static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
+	    {
+	        float dx = b.x - a.x;
+	        float dy = b.y - a.y;
+	        return Point2f(a.x - dy, a.y + dx);
+	    }
+
+	    void project(const Template &src, Template &dst) const
+	    {
+	        const bool twoPoints = ((x3 == -1) || (y3 == -1));
+
+	        Point2f dstPoints[3];
+	        dstPoints[0] = Point2f(x1*width, y1*height);
+	        dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
+	        if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
+	        else           dstPoints[2] = Point2f(x3*width, y3*height);
+
+	        Point2f srcPoints[3];
+	        if (src.file.contains("Affine_0") &&
+	            src.file.contains("Affine_1") &&
+	            (src.file.contains("Affine_2") || twoPoints)) {
+	            srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
+	            srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
+	            if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
+	        } else {
+	            const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
+
+	            if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
+	                resize(src, dst, Size(width, height));
+	                return;
+	            } else {
+	                srcPoints[0] = landmarks[0];
+	                srcPoints[1] = landmarks[1];
+	                if (!twoPoints) srcPoints[2] = landmarks[2];
+	            }
+	        }
+	        if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
+
+	        // Code section being altered (original)
+	        //
+	        // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+	        // warpAffine(src, dst, affineTransform, Size(width, height), method);
+	        //
+	        // end original
+
+	        Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+
+	        void* const* srcDataPtr = src.m().ptr<void*>();
+	        int rows = *((int*)srcDataPtr[1]);
+	        int cols = *((int*)srcDataPtr[2]);
+	        int type = *((int*)srcDataPtr[3]);
+
+	        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
+	        void** dstDataPtr = dstMat.ptr<void*>();
+
+	        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height;  // rows
+	        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width;   // cols
+	        dstDataPtr[3] = srcDataPtr[3];
+
+	        // Print the transform
+	        //for(int x = 0; x < affineTransform.rows; x++){
+	            //for(int y = 0; y < affineTransform.cols; y++){
+	                //printf("%8.3f\t", affineTransform.at<double>(x, y));
+	            //}
+	            //printf("\n");
+	        //}
+
+	        cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
+
+	        // end altered code
+
+	        if (warpPoints) {
+	            QList<QPointF> points = src.file.points();
+	            QList<QPointF> rotatedPoints;
+	            for (int i=0; i<points.size(); i++) {
+	                rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
+	                                             points.at(i).y()*affineTransform.at<double>(0,1)+
+	                                             affineTransform.at<double>(0,2),
+	                                             points.at(i).x()*affineTransform.at<double>(1,0)+
+	                                             points.at(i).y()*affineTransform.at<double>(1,1)+
+	                                             affineTransform.at<double>(1,2)));
+	            }
+
+	            dst.file.setPoints(rotatedPoints);
+	        }
+
+	        if (storeAffine) {
+	            QList<float> affineParams;
+	            for (int i = 0 ; i < 2; i++)
+	                for (int j = 0; j < 3; j++)
+	                    affineParams.append(affineTransform.at<double>(i, j));
+	            dst.file.setList("affineParameters", affineParams);
+	        }
+
+	        dst = dstMat;
+	    }
+	};
+
+	BR_REGISTER(Transform, CUDAAffineTransform)
 
 } // namespace br
 
diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu
index 0c19dde..79cc244 100644
--- a/openbr/plugins/cuda/cudaaffine.cu
+++ b/openbr/plugins/cuda/cudaaffine.cu
@@ -11,30 +11,30 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br { namespace cuda {
+namespace br { namespace cuda { namespace affine {
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) {
         if (row < 0 || row > rows || col < 0 || col > cols) {
             if (row > rows || col > cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
         return (srcPtr + row*cols)[col];
     }
 
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_bilinear_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
         // don't do anything if the index is out of bounds
         if (row < 0 || row > rows || col < 0 || col > cols) {
             if (row > rows || col > cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
-        
+
         // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf
         // Bilinear Transformation
         // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy)
@@ -48,22 +48,22 @@ namespace br { namespace cuda {
         double d_row = row - row1;
         double d_col = col - col1;
 
-        int Q11 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
-        int Q21 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
-        int Q12 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
-        int Q22 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
+        int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
+        int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
+        int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
+        int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
 
         double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col));
         return ((uint8_t) round(val));
     }
 
-    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_distance_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
+    __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
         // don't do anything if the index is out of bounds
         if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) {
             if (row >= rows || col >= cols) {
                 return 0;
             } else{
-                return 0; 
+                return 0;
             }
         }
 
@@ -90,10 +90,10 @@ namespace br { namespace cuda {
         double w3 = d3/sum;
         double w4 = d4/sum;
 
-        uint8_t v1 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
-        uint8_t v2 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
-        uint8_t v3 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
-        uint8_t v4 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
+        uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
+        uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
+        uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
+        uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
 
         return round(w1*v1 + w2*v2 + w3*v3 + w4*v4);
     }
@@ -105,16 +105,16 @@ namespace br { namespace cuda {
      * src_row            - The computed source pixel row (mapping from this row)
      * src_col            - The computed source pixel column (mapping from this col)
      */
-    __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
+    __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
         *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6];
         *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
 
         //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]);
 
 		}
-				
 
-    __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
+
+    __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
         int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
         int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
         int dstIndex = dstRowInd*dst_cols + dstColInd;
@@ -134,15 +134,15 @@ namespace br { namespace cuda {
             }
         }
 
-        cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
-        //const uint8_t cval = cudaaffine_kernel_get_distance_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
-        const uint8_t cval = cudaaffine_kernel_get_bilinear_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
-        //const uint8_t cval = cudaaffine_kernel_get_pixel_value(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
+        getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
+        //const uint8_t cval = getDistancePixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
+        const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
+        //const uint8_t cval = getPixelValueDevice(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
 
         dstPtr[dstIndex] = cval;
     }
 
-    void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
+    void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
         cudaError_t err;
         double* gpuInverse;
 
@@ -152,7 +152,7 @@ namespace br { namespace cuda {
 
         //************************************************************************
         // Input affine is a 2x3 Mat whose transpose is used in the computations
-        // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] 
+        // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
         // See "Digital Image Warping" by George Wolburg (p. 50)
         //************************************************************************
 
@@ -210,7 +210,7 @@ namespace br { namespace cuda {
 
         CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);
 
-        cudaaffine_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
+        affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
         CUDA_KERNEL_ERR_CHK(&err);
 
         CUDA_SAFE_FREE(srcPtr, &err);
@@ -225,5 +225,4 @@ namespace br { namespace cuda {
         // }
         // printf("\n");
     }
-} // end cuda
-} // end br
+}}}
diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp
index d774a05..ced2b9c 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cpp
+++ b/openbr/plugins/cuda/cudacvtfloat.cpp
@@ -7,7 +7,8 @@ using namespace cv;
 
 #include <openbr/plugins/openbr_internal.h>
 
-namespace br { namespace cuda { namespace cudacvtfloat {
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace cvtfloat {
   void wrapper(void* src, void** dst, int rows, int cols);
 }}}
 
@@ -16,7 +17,7 @@ namespace br
 
 /*!
  * \ingroup transforms
- * \brief Converts byte to floating point
+ * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent.
  * \author Colin Heinzmann \cite DepthDeluxe
  */
 class CUDACvtFloatTransform : public UntrainableTransform
@@ -45,7 +46,7 @@ class CUDACvtFloatTransform : public UntrainableTransform
       dstDataPtr[2] = srcDataPtr[2];
       dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;
 
-      br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+      cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
       dst = dstMat;
     }
 };
diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu
index 70bf2e4..bcc4665 100644
--- a/openbr/plugins/cuda/cudacvtfloat.cu
+++ b/openbr/plugins/cuda/cudacvtfloat.cu
@@ -3,7 +3,7 @@ using namespace std;
 
 #include "cudadefines.hpp"
 
-namespace br { namespace cuda { namespace cudacvtfloat {
+namespace br { namespace cuda { namespace cvtfloat {
 
   __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
     // get my index
@@ -20,12 +20,6 @@ namespace br { namespace cuda { namespace cudacvtfloat {
   }
 
   void wrapper(void* src, void** dst, int rows, int cols) {
-    //unsigned char* cudaSrc;
-    //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
-    //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
-
-    //float* cudaDst;
-    //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
     cudaError_t err;
     CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
 
diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/cudal2.cpp
index 6b248d0..8cbc7c6 100644
--- a/openbr/plugins/cuda/CUDAL2.cpp
+++ b/openbr/plugins/cuda/cudal2.cpp
@@ -19,7 +19,8 @@ using namespace std;
 
 #include <openbr/plugins/openbr_internal.h>
 
-namespace br { namespace cuda { namespace L2{
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace L2 {
   void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
 }}}
 
@@ -29,7 +30,7 @@ namespace br
 /*!
  * \ingroup distances
  * \brief L2 distance computed using eigen.
- * \author Josh Klontz \cite jklontz
+ * \author Colin Heinzmann \cite DepthDeluxe
  */
 class CUDAL2Distance : public UntrainableDistance
 {
@@ -45,7 +46,7 @@ class CUDAL2Distance : public UntrainableDistance
       float* cudaBPtr = (float*)b.ptr<void*>()[0];
 
       float out;
-      br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
+      cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
 
       return out;
     }
@@ -55,4 +56,4 @@ BR_REGISTER(Distance, CUDAL2Distance)
 
 } // namespace br
 
-#include "cuda/CUDAL2.moc"
+#include "cuda/cudal2.moc"
diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/cudal2.cu
index 8d52b35..23db28d 100644
--- a/openbr/plugins/cuda/CUDAL2.cu
+++ b/openbr/plugins/cuda/cudal2.cu
@@ -4,7 +4,7 @@
 
 namespace br { namespace cuda { namespace L2 {
 
-  __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) {
+  __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) {
     int index = blockIdx.x*blockDim.x+threadIdx.x;
 
     if (index >= length) {
@@ -18,7 +18,7 @@ namespace br { namespace cuda { namespace L2 {
     workPtr[index] = workPtr[index] * workPtr[index];
   }
 
-  __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) {
+  __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
     // make sure there is only one thread that we are calling
     if (blockIdx.x != 0 || threadIdx.x != 0) {
       return;
@@ -45,11 +45,11 @@ namespace br { namespace cuda { namespace L2 {
     // perform the subtraction
     int threadsPerBlock = 64;
     int numBlocks = length / threadsPerBlock + 1;
-    my_subtract_kernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
+    subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
     CUDA_KERNEL_ERR_CHK(&err);
 
     // perform the collapse
-    collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
+    collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
     CUDA_KERNEL_ERR_CHK(&err);
 
     // copy the single value back to the destinsion
@@ -63,6 +63,3 @@ namespace br { namespace cuda { namespace L2 {
     CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
   }
 }}}
-
-// 128CUDAEigenfaces on 6400 ATT: 54.367s
-// 128CUDAEigenfacesL2 on 6400 ATT: 
diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp
index 2385c9b..f2f2d9f 100644
--- a/openbr/plugins/cuda/cudalbp.cpp
+++ b/openbr/plugins/cuda/cudalbp.cpp
@@ -31,44 +31,21 @@ using namespace std;
 
 #include <openbr/plugins/openbr_internal.h>
 
-#include "MatManager.hpp"
-
 using namespace cv;
 
-string type2str(int type) {
-  string r;
-
-  uchar depth = type & CV_MAT_DEPTH_MASK;
-  uchar chans = 1 + (type >> CV_CN_SHIFT);
-
-  switch ( depth ) {
-    case CV_8U:  r = "8U"; break;
-    case CV_8S:  r = "8S"; break;
-    case CV_16U: r = "16U"; break;
-    case CV_16S: r = "16S"; break;
-    case CV_32S: r = "32S"; break;
-    case CV_32F: r = "32F"; break;
-    case CV_64F: r = "64F"; break;
-    default:     r = "User"; break;
-  }
-
-  r += "C";
-  r += (chans+'0');
-
-  return r;
-}
-
-namespace br { namespace cuda {
-  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
-  void cudalbp_init_wrapper(uint8_t* lut);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace lbp {
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
+  void initializeWrapper(uint8_t* lut);
+}}}
 
 namespace br
 {
 /*!
  * \ingroup transforms
- * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
- * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
+ * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA.  Modified from stock OpenBR plugin.
+ * \author Colin Heinzmann \cite DepthDeluxe
+ * \author Li Li \cite booli
  */
 class CUDALBPTransform : public UntrainableTransform
 {
@@ -84,8 +61,6 @@ class CUDALBPTransform : public UntrainableTransform
     uchar lut[256];
     uchar null;
 
-    //cuda::MatManager* matManager;
-
   public:
     /* Returns the number of 0->1 or 1->0 transitions in i */
     static int numTransitions(int i)
@@ -136,36 +111,14 @@ class CUDALBPTransform : public UntrainableTransform
             if (!set[i])
                 lut[i] = null; // Set to null id
 
-        // init the mat manager for managing 10 mats
-        //matManager = new cuda::MatManager(10);
-
         // copy lut over to the GPU
-        br::cuda::cudalbp_init_wrapper(lut);
+        cuda::lbp::initializeWrapper(lut);
 
         std::cout << "Initialized CUDALBP" << std::endl;
     }
 
     void project(const Template &src, Template &dst) const
     {
-        //Mat& m = (Mat&)src.m();
-        //cuda::MatManager::matindex a;
-        //cuda::MatManager::matindex b;
-        //a = matManager->reserve(m);
-        //matManager->upload(a, m);
-
-        // reserve the second mat and check the dimensiosn
-        //b = matManager->reserve(m);
-
-        //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
-        //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
-        //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
-
-        //matManager->download(b, dst);
-
-        // release both the mats
-        //matManager->release(a);
-        //matManager->release(b);
-
         void* const* srcDataPtr = src.m().ptr<void*>();
         int rows = *((int*)srcDataPtr[1]);
         int cols = *((int*)srcDataPtr[2]);
@@ -177,13 +130,13 @@ class CUDALBPTransform : public UntrainableTransform
         dstDataPtr[2] = srcDataPtr[2];
         dstDataPtr[3] = srcDataPtr[3];
 
-        br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+        cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
         dst = dstMat;
     }
 };
 
 BR_REGISTER(Transform, CUDALBPTransform)
 
-} // namespace br
+}
 
 #include "cuda/cudalbp.moc"
diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu
index bb5be9a..61548df 100644
--- a/openbr/plugins/cuda/cudalbp.cu
+++ b/openbr/plugins/cuda/cudalbp.cu
@@ -9,14 +9,18 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br { namespace cuda {
+/*
+ * These are the CUDA functions for CUDALBP.  See cudapca.cpp for more details
+ */
+
+namespace br { namespace cuda { namespace lbp {
   uint8_t* lut;
 
-  __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+  __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
     return (srcPtr + row*cols)[col];
   }
 
-  __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
+  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
   {
     int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -34,22 +38,22 @@ namespace br { namespace cuda {
       }
     }
 
-    const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
-    uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |
-                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
+    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
+    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
 
     // store calculated value away in the right place
     dstPtr[index] = val;
   }
 
   //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
-  void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
   {
     cudaError_t err;
 
@@ -59,15 +63,15 @@ namespace br { namespace cuda {
                    rows/threadsPerBlock.y + 1);
 
     CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
-    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
+    lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
     CUDA_KERNEL_ERR_CHK(&err);
 
     CUDA_SAFE_FREE(srcPtr, &err);
   }
 
-  void cudalbp_init_wrapper(uint8_t* cpuLut) {
+  void initializeWrapper(uint8_t* cpuLut) {
     cudaError_t err;
     CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
     CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
   }
-}}
+}}}
diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp
index fbee173..c1677c8 100644
--- a/openbr/plugins/cuda/cudapca.cpp
+++ b/openbr/plugins/cuda/cudapca.cpp
@@ -29,8 +29,9 @@ using namespace cv;
 #include <openbr/core/eigenutils.h>
 #include <openbr/core/opencvutils.h>
 
+// definitions from the CUDA source file
 namespace br { namespace cuda { namespace pca {
-  void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
+  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
   void wrapper(void* src, void** dst);
 }}}
 
@@ -38,9 +39,7 @@ namespace br
 {
 /*!
  * \ingroup transforms
- * \brief Projects input into learned Principal Component Analysis subspace using CUDA.
- * \author Brendan Klare \cite bklare
- * \author Josh Klontz \cite jklontz
+ * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
  * \author Colin Heinzmann \cite DepthDeluxe
  *
  * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
@@ -134,22 +133,9 @@ private:
       dstDataPtr[2] = srcDataPtr[2];  *((int*)dstDataPtr[2]) = keep;
       dstDataPtr[3] = srcDataPtr[3];
 
-      br::cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
+      cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
 
       dst = dstMat;
-
-        //dst = cv::Mat(1, keep, CV_32FC1);
-
-        // perform the operation on the graphics card
-        //cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
-
-        // Map Eigen into OpenCV
-        //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
-        //Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1);
-        //Eigen::Map<Eigen::MatrixXf> outMap(cpuDst.ptr<float>(), keep, 1);
-
-        // Do projection
-        //outMap = eVecs.transpose() * (inMap - mean);
     }
 
     void store(QDataStream &stream) const
@@ -161,14 +147,6 @@ private:
     {
         stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
 
-        cout << "Mean Dimensions" << endl;
-        cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
-        cout << "eVecs Dimensions" << endl;
-        cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
-        cout << "eVals Dimensions" << endl;
-        cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
-        cout << "Keep: " << keep << endl;
-
         // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
         // serialize the eigenvectors
         float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
@@ -187,7 +165,7 @@ private:
         }
 
         // call the wrapper function
-        br::cuda::pca::loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
+        cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
 
         delete evBuffer;
         delete meanBuffer;
diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu
index f6fd3a1..001dd49 100644
--- a/openbr/plugins/cuda/cudapca.cu
+++ b/openbr/plugins/cuda/cudapca.cu
@@ -9,6 +9,10 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
+/*
+ * These are the CUDA functions for CUDAPCA.  See cudapca.cpp for more details
+ */
+
 namespace br { namespace cuda { namespace pca {
   __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -68,7 +72,7 @@ namespace br { namespace cuda { namespace pca {
   int _numSteps; int _stepSize;
   float* intermediaryBuffer;
 
-  void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
+  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
     _evRows = evRows; _evCols = evCols;
     _meanElems = meanElems;
 
@@ -95,7 +99,6 @@ namespace br { namespace cuda { namespace pca {
     cudaError_t err;
     CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
 
-
     // subtract out the mean of the image (mean is 1xpixels in size)
     int threadsPerBlock = 64;
     int numBlocks = _meanElems / threadsPerBlock + 1;
@@ -114,8 +117,5 @@ namespace br { namespace cuda { namespace pca {
     CUDA_KERNEL_ERR_CHK(&err);
 
     CUDA_SAFE_FREE(src, &err);    // TODO(colin): figure out why adding this free causes memory corruption...
-
-    // copy the data back to the CPU
-    //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
   }
 }}}
diff --git a/openbr/plugins/cuda/cudargb2grayscale.cpp b/openbr/plugins/cuda/cudargb2grayscale.cpp
index 6f72728..1db7cca 100644
--- a/openbr/plugins/cuda/cudargb2grayscale.cpp
+++ b/openbr/plugins/cuda/cudargb2grayscale.cpp
@@ -25,17 +25,18 @@
 
 using namespace cv;
 
-namespace br { namespace cuda{
-  void cudargb2grayscale_wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
-}}
+// definitions from the CUDA source file
+namespace br { namespace cuda { namespace rgb2grayscale {
+  void wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
+}}}
 
 namespace br
 {
 
 /*!
  * \ingroup transforms
- * \brief Colorspace conversion.
- * \author Li Li \cite Josh Klontz \cite jklontz
+ * \brief Converts 3-channel images to grayscale
+ * \author Li Li \cite booli
  */
 class CUDARGB2GrayScaleTransform : public UntrainableTransform
 {
@@ -57,8 +58,8 @@ private:
         dstDataPtr[2] = srcDataPtr[2];
         dstDataPtr[3] = srcDataPtr[3];
         *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same
-       
-        br::cuda::cudargb2grayscale_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
+
+        cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
         dst = dstMat;
 
         /*
diff --git a/openbr/plugins/cuda/cudargb2grayscale.cu b/openbr/plugins/cuda/cudargb2grayscale.cu
index 266389c..8efaafa 100644
--- a/openbr/plugins/cuda/cudargb2grayscale.cu
+++ b/openbr/plugins/cuda/cudargb2grayscale.cu
@@ -12,9 +12,9 @@ using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-namespace br{ namespace cuda {
+namespace br { namespace cuda { namespace rgb2grayscale {
 
-  __global__ void cudargb2grayscale_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
+  __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
   {
     int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
     int colInd = blockIdx.x*blockDim.x+threadIdx.x;
@@ -31,7 +31,7 @@ namespace br{ namespace cuda {
     return;
   }
 
-  void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
   {
     cudaError_t err;
     dim3 threadsPerBlock(9, 9);
@@ -39,9 +39,9 @@ namespace br{ namespace cuda {
                    rows/threadsPerBlock.y + 1);
     CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
 
-    cudargb2grayscale_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
+    kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
     CUDA_KERNEL_ERR_CHK(&err);
     CUDA_SAFE_FREE(srcPtr, &err);
-  } 
+  }
 
-}}
+}}}
diff --git a/openbr/plugins/cuda/passthrough.cpp b/openbr/plugins/cuda/passthrough.cpp
deleted file mode 100644
index f7018ea..0000000
--- a/openbr/plugins/cuda/passthrough.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <openbr/plugins/openbr_internal.h>
-
-#include <opencv2/imgproc/imgproc.hpp>
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-#include "passthrough.hpp"
-
-#include <iostream>
-
-
-namespace br
-{
-  class CUDAPassthroughTransform : public UntrainableTransform
-  {
-    Q_OBJECT
-
-private:
-    void project(const Template &src, Template &dst) const
-    {
-      // note: if you convert the image to grayscale, you get 8UC1
-
-      // upload the src mat to the GPU
-      GpuMat srcGpuMat, dstGpuMat;
-      srcGpuMat.upload(src.m());
-      dstGpuMat.upload(src.m());
-
-      br::cuda::passthrough_wrapper(srcGpuMat, dstGpuMat);
-
-      dstGpuMat.download(dst.m());
-
-      // TODO(colin): add delete code
-      srcGpuMat.release();
-      dstGpuMat.release();
-
-      printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty());
-      printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty());
-    }
-  };
-
-  BR_REGISTER(Transform, CUDAPassthroughTransform);
-}
-
-#include "cuda/passthrough.moc"
diff --git a/openbr/plugins/cuda/passthrough.cu b/openbr/plugins/cuda/passthrough.cu
deleted file mode 100644
index 49f94e9..0000000
--- a/openbr/plugins/cuda/passthrough.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-// note: Using 8-bit unsigned 1 channel images
-
-#include <opencv2/gpu/gpu.hpp>
-
-using namespace cv;
-using namespace cv::gpu;
-
-#include "passthrough.hpp"
-
-namespace br { namespace cuda {
-  __global__ void passthrough_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int cols, int rows) {
-    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
-    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
-
-    // don't do anything if we are outside the allowable positions
-    if (rowInd >= rows || colInd >= cols)
-      return;
-
-    uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd];
-    uint8_t* rowDstPtr = dstPtr + rowInd*dstStep;
-
-    rowDstPtr[colInd] = srcVal;
-  }
-
-  void passthrough_wrapper(GpuMat& src, GpuMat& dst) {
-    // convert the GpuMats to pointers
-    uint8_t* srcPtr = (uint8_t*)src.data;
-    uint8_t* dstPtr = (uint8_t*)dst.data;
-
-    int imageWidth = src.cols;
-    int imageHeight = src.rows;
-
-    // make 8 * 8 = 64 square block
-    dim3 threadsPerBlock(8, 8);
-    dim3 numBlocks(imageWidth / threadsPerBlock.x + 1,
-                   imageHeight / threadsPerBlock.y + 1);
-
-    passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight);
-  }
-}}
-
-
-// read http://stackoverflow.com/questions/31927297/array-of-ptrstepszgpumat-to-a-c-cuda-kernel
diff --git a/openbr/plugins/cuda/passthrough.hpp b/openbr/plugins/cuda/passthrough.hpp
deleted file mode 100644
index 8749684..0000000
--- a/openbr/plugins/cuda/passthrough.hpp
+++ /dev/null
@@ -1,3 +0,0 @@
-namespace br { namespace cuda {
-  void passthrough_wrapper(GpuMat& src, GpuMat& dst);
-}}