Commit 53d01e939b2fc292f9a1143cfa24e62d007fc27f

Authored by Josh Klontz
2 parents 2849f3b9 552e99f9

Merge pull request #464 from CGLG/master

CUDA-accelerated plugin support
openbr/plugins/cuda/README.md 0 → 100644
  1 +# CUDA Plugins
  2 +CUDA plugins are very similar to normal plugins. A single plugin is split into
  3 +two files: the `.cpp` file with the BR standard plugin definition and the `.cu`
  4 +file with your kernel and wrapper functions.
  5 +
  6 +## The `.cpp` file
  7 +Every main plugin file must have the names of the kernel wrapper functions
  8 +defined at the top of the program. Once the definitions are there, just call
  9 +the CUDA functions as you need them
  10 +
  11 +## The `.cu` file
  12 +All functions within the CUDA file must be declared inside their own namespace
  13 +under `br::cuda`. For example the plugin `passthrough` must have all functions
  14 +inside it declared under the namespace `br::cuda::passthrough`.
  15 +
  16 +## CPU Template object format
  17 +Like any other BR Transform, the plugin must return an object for the next
  18 +plugin to consume. For performance reasons, we don't copy data to and from
  19 +the graphics card for every transform. Instead, we use this space to transfer
  20 +data about how to access the image data and its type. The Mat is an array of data type `void*`.
  21 +
  22 +Index | Item Name | Type | Description
  23 +--------|-------------|-----------|------------
  24 +0 | GpuData | void* | Pointer to the graphics card data
  25 +1 | rows | int | Number of rows in the Mat
  26 +2 | cols | int | Number of columns in the Mat
  27 +3 | type | int | OpenCV mat data type code (i.e. `mat.type()`)
  28 +
  29 +It is expected that the wrapper function does the proper GPU memory handling
  30 +to make sure that the GpuData pointer in the output mat is pointing to the
  31 +data that the plugin is outputting.
  32 +
  33 +## Example: Passthrough
  34 +This example plugin takes in input data and passes it straight to the output.
  35 +The BR transform calls the wrapper function which exists in the CUDA file which
  36 +in turn calls the kernel routine to copy the data in the GPU.
  37 +
  38 +**Note**: This program assumes that a previous Transform, namely `CUDACopyTo` has
  39 +copied the data to the GPU.
  40 +
  41 +### **passthrough.cpp**
  42 +```c++
  43 +#include <openbr/plugins/openbr_internal.h>
  44 +#include <opencv2/opencv.hpp>
  45 +
  46 +// wrapper function within the CUDA file
  47 +namespace br { namespace cuda { namespace passthrough {
  48 + void wrapper(void* srcGpuData, void** dstGpuData, int rows, int cols);
  49 +}}}
  50 +
  51 +#include <iostream>
  52 +namespace br
  53 +{
  54 + class CUDAPassthroughTransform : public UntrainableTransform
  55 + {
  56 + Q_OBJECT
  57 +
  58 + void project(const Template &src, Template &dst) {
  59 + // extract the parameters out of the Mat passed from the previous plugin
  60 + void* const* srcDataPtr = src.m().ptr<void*>();
  61 + int rows = *((int*)srcDataPtr[1]);
  62 + int cols = *((int*)srcDataPtr[2]);
  63 + int type = *((int*)srcDataPtr[3]);
  64 +
  65 + // generate a new Mat to be passed to the next plugin
  66 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  67 + void** dstDataPtr = dstMat.ptr<void*>();
  68 + dstDataPtr[1] = srcDataPtr[1];
  69 + dstDataPtr[2] = srcDataPtr[2];
  70 + dstDataPtr[3] = srcDataPtr[3];
  71 +
  72 + // call the wrapper and set the dst output to the newly created Mat
  73 + br::cuda::passthrough::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  74 + dst = dstMat;
  75 + }
  76 + };
  77 +
  78 + BR_REGISTER(Transform, CUDAPassthroughTransform);
  79 +}
  80 +
  81 +#include "cuda/passthrough.moc"
  82 +```
  83 +
  84 +### **passthrough.cu**
  85 +```c++
  86 +#include <opencv2/opencv.hpp>
  87 +
  88 +namespace br { namespace cuda { namespace passthrough {
  89 + __global__ void kernel(char* srcPtr, char* dstPtr, int rows, int cols) {
  90 + // get the current index
  91 + int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
  92 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  93 +
  94 + // don't do anything if we are outside the allowable positions
  95 + if (rowInd >= rows || colInd >= cols)
  96 + return;
  97 +
  98 + // write the input to the output
  99 + dstPtr[rowInd*cols + colInd] = srcPtr[rowInd*cols + colInd];
  100 + }
  101 +
  102 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) {
  103 + // allocate GPU memory to hold the output image
  104 + cudaMalloc(dstPtr, rows*cols*sizeof(char));
  105 +
  106 + dim3 threadsPerBlock(8, 8);
  107 + dim3 numBlocks(cols / threadsPerBlock.x + 1,
  108 + rows / threadsPerBlock.y + 1);
  109 +
  110 + // run the kernel function
  111 + kernel<<<numBlocks, threadsPerBlock>>>((char*)srcPtr, (char*)*dstPtr, rows, cols);
  112 +
  113 + // free the source memory as it isn't used anymore
  114 + cudaFree(srcPtr);
  115 + }
  116 +}}}
  122 +```
... ...
openbr/plugins/cuda/copyfrom.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +#include <opencv2/opencv.hpp>
  20 +
  21 +#include <openbr/plugins/openbr_internal.h>
  22 +
  23 +using namespace std;
  24 +
  25 +using namespace cv;
  26 +
  27 +// CUDA functions for this plugin
  28 +namespace br { namespace cuda { namespace copyfrom {
  29 + template <typename T> void wrapper(void* src, T* out, int rows, int cols);
  30 +}}}
  31 +
  32 +namespace br
  33 +{
  34 + /*!
  35 + * \ingroup transforms
  36 + * \brief Copies a transform from the GPU to the CPU.
  37 + * \author Colin Heinzmann \cite DepthDeluxe
  38 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  39 + */
  40 + class CUDACopyFrom : public UntrainableTransform
  41 + {
  42 + Q_OBJECT
  43 +
  44 +private:
  45 + void project(const Template &src, Template &dst) const
  46 + {
  47 + // pull the data back out of the Mat
  48 + void* const* dataPtr = src.m().ptr<void*>();
  49 + int rows = *((int*)dataPtr[1]);
  50 + int cols = *((int*)dataPtr[2]);
  51 + int type = *((int*)dataPtr[3]);
  52 +
  53 + Mat dstMat = Mat(rows, cols, type);
  54 + switch(type) {
  55 + case CV_32FC1:
  56 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
  57 + break;
  58 + case CV_8UC1:
  59 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
  60 + break;
  61 + case CV_8UC3:
  62 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
  63 + break;
  64 + default:
  65 + cout << "ERR: Invalid image type (" << type << ")" << endl;
  66 + break;
  67 + }
  68 + dst = dstMat;
  69 + }
  70 + };
  71 +
  72 + BR_REGISTER(Transform, CUDACopyFrom);
  73 +}
  74 +
  75 +#include "cuda/copyfrom.moc"
... ...
openbr/plugins/cuda/copyfrom.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include "cudadefines.hpp"
  18 +
namespace br { namespace cuda { namespace copyfrom {
  // Copies a GPU image buffer back to host memory, then releases the device
  // buffer.
  //   src        - device pointer holding rows*cols elements of type T
  //   dst        - pre-allocated host destination (e.g. a cv::Mat's data)
  //   rows, cols - image dimensions; callers fold the channel count into
  //                cols for multi-channel images
  // NOTE(review): frees `src` after the copy, so the device buffer must not
  // be reused by any later transform.
  template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
    cudaError_t err;
    CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
    CUDA_SAFE_FREE(src, &err);
  }

  // explicit instantiations for the types used by the CPU-side plugin
  template void wrapper(void*, float*, int, int);
  template void wrapper(void*, unsigned char*, int, int);
}}}
... ...
openbr/plugins/cuda/copyto.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +#include <opencv2/opencv.hpp>
  20 +
  21 +#include <openbr/plugins/openbr_internal.h>
  22 +
  23 +using namespace std;
  24 +
  25 +using namespace cv;
  26 +
  27 +// definitions from the CUDA source file
  28 +namespace br { namespace cuda { namespace copyto {
  29 + template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols);
  30 +}}}
  31 +
  32 +namespace br
  33 +{
  34 +
  35 + /*!
  36 + * \ingroup transforms
  37 + * \brief Copies a transform to the GPU.
  38 + * \author Colin Heinzmann \cite DepthDeluxe
  39 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  40 + */
  41 + class CUDACopyTo : public UntrainableTransform
  42 + {
  43 + Q_OBJECT
  44 +
  45 +private:
  46 + void project(const Template &src, Template &dst) const
  47 + {
  48 + const Mat& srcMat = src.m();
  49 + const int rows = srcMat.rows;
  50 + const int cols = srcMat.cols;
  51 +
  52 + // output will be a single pointer to graphics card memory
  53 + Mat dstMat = Mat(4, 1, DataType<void*>::type);
  54 + void** dstMatData = dstMat.ptr<void*>();
  55 +
  56 + // save cuda ptr, rows, cols, then type
  57 + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows;
  58 + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols;
  59 + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type();
  60 +
  61 + void* cudaMemPtr;
  62 + switch(srcMat.type()) {
  63 + case CV_32FC1:
  64 + cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
  65 + break;
  66 + case CV_8UC1:
  67 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
  68 + break;
  69 + case CV_8UC3:
  70 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
  71 + break;
  72 + default:
  73 + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl;
  74 + return;
  75 + }
  76 +
  77 + dst = dstMat;
  78 + }
  79 + };
  80 +
  81 + BR_REGISTER(Transform, CUDACopyTo);
  82 +}
  83 +
  84 +#include "cuda/copyto.moc"
... ...
openbr/plugins/cuda/copyto.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include "cudadefines.hpp"
  18 +
namespace br { namespace cuda { namespace copyto {

  // Copies a host image buffer into newly allocated GPU memory.
  //   in         - host source buffer of rows*cols elements of type T
  //   out        - receives the freshly cudaMalloc'd device pointer
  //   rows, cols - image dimensions; callers fold the channel count into
  //                cols for multi-channel images
  // NOTE(review): the device allocation is owned by whichever downstream
  // transform eventually frees it (e.g. copyfrom's wrapper).
  template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
    cudaError_t err;
    CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
    CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err);
  }

  // explicit instantiations for the types used by the CPU-side plugin
  template void wrapper(const float* in, void** out, const int rows, const int cols);
  template void wrapper(const unsigned char* in, void** out, const int rows, const int cols);

}}}
... ...
openbr/plugins/cuda/cudaaffine.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Greg Shrock, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +
  18 +
  19 +#include <iostream>
  20 +using namespace std;
  21 +
  22 +#include <sys/types.h>
  23 +#include <unistd.h>
  24 +
  25 +#include <pthread.h>
  26 +
  27 +#include <opencv2/opencv.hpp>
  28 +#include <opencv2/imgproc/imgproc.hpp>
  29 +#include <opencv2/imgproc/imgproc_c.h>
  30 +#include <opencv2/highgui/highgui.hpp>
  31 +#include <opencv2/highgui/highgui_c.h>
  32 +#include <limits>
  33 +
  34 +#include <openbr/plugins/openbr_internal.h>
  35 +#include <openbr/core/opencvutils.h>
  36 +
  37 +using namespace cv;
  38 +
  39 +// definitions from the CUDA source file
  40 +namespace br { namespace cuda { namespace affine {
  41 + void resizeWrapper(void* srcPtr, void** dstPtr, int src_rows, int src_cols, int dst_rows, int dst_cols);
  42 + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
  43 +}}}
  44 +
  45 +namespace br
  46 +{
  47 +
 /*!
  * \ingroup transforms
  * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation. Only supports single-point input bilinear transformation.
  * \author Greg Schrock \cite gls022
  * \author Colin Heinzmann \cite DepthDeluxe
  * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargement, Bilin for fast enlargement.
  * \note Expects the incoming Mat to be CUDA metadata ([0]=device ptr,
  *       [1]=rows, [2]=cols, [3]=type) as produced by CUDACopyTo; only
  *       CV_8UC1 images are accepted.
  */
 class CUDAAffineTransform : public UntrainableTransform
 {
     Q_OBJECT

     private:
     // output size and normalized registration-point placement
     Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
     Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
     Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
     Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
     BR_PROPERTY(int, width, 64)
     BR_PROPERTY(int, height, 64)
     BR_PROPERTY(float, x1, 0)
     BR_PROPERTY(float, y1, 0)

     // Completes a triangle from two points: returns b rotated +90 degrees
     // about a, giving a third non-collinear point for getAffineTransform.
     static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
     {
         float dx = b.x - a.x;
         float dy = b.y - a.y;
         return Point2f(a.x - dy, a.y + dx);
     }

     void project(const Template &src, Template &dst) const
     {
         // destination registration points derived from the output geometry
         Point2f dstPoints[3];
         dstPoints[0] = Point2f(x1*width, y1*height);
         dstPoints[1] = Point2f((1-x1)*width, (1-y1)*height);
         dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);

         Point2f srcPoints[3];
         // NOTE(review): Affine_2 is required to be present but is never
         // read below -- confirm whether that check is intentional
         if (src.file.contains("Affine_0") &&
             src.file.contains("Affine_1") &&
             src.file.contains("Affine_2")) {
             srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
             srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
         } else {
             const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());

             if (landmarks.size() < 2) {
                 // Fewer than two landmarks: fall back to a plain GPU resize
                 // to the requested width x height.
                 void* const* srcDataPtr = src.m().ptr<void*>();
                 int rows = *((int*)srcDataPtr[1]);
                 int cols = *((int*)srcDataPtr[2]);
                 int type = *((int*)srcDataPtr[3]);

                 if (type != CV_8UC1) {
                     cout << "ERR: Invalid image format!" << endl;
                     return;
                 }

                 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
                 void** dstDataPtr = dstMat.ptr<void*>();

                 // NOTE(review): the rows/cols slots are SHARED with src and
                 // then overwritten, so this also rewrites the source
                 // Template's metadata in place -- confirm nothing re-reads
                 // src after this transform
                 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
                 dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
                 dstDataPtr[3] = srcDataPtr[3];

                 cuda::affine::resizeWrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, height, width);
                 dst = dstMat;
                 return;
             } else {
                 srcPoints[0] = landmarks[0];
                 srcPoints[1] = landmarks[1];
             }
         }
         srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);

         // 2x3 affine map srcPoints -> dstPoints, computed on the CPU
         Mat affineTransform = getAffineTransform(srcPoints, dstPoints);

         // unpack the GPU metadata: [0]=device ptr, [1]=rows, [2]=cols, [3]=type
         void* const* srcDataPtr = src.m().ptr<void*>();
         int rows = *((int*)srcDataPtr[1]);
         int cols = *((int*)srcDataPtr[2]);
         int type = *((int*)srcDataPtr[3]);

         if (type != CV_8UC1) {
             cout << "ERR: Invalid image format!" << endl;
             return;
         }


         Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
         void** dstDataPtr = dstMat.ptr<void*>();

         // NOTE(review): same shared-slot overwrite as in the resize branch
         dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
         dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
         dstDataPtr[3] = srcDataPtr[3];

         cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);

         dst = dstMat;
     }
 };

 BR_REGISTER(Transform, CUDAAffineTransform)
  147 +
  148 +} // namespace br
  149 +
  150 +#include "cuda/cudaaffine.moc"
... ...
openbr/plugins/cuda/cudaaffine.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +
  21 +#include <opencv2/gpu/gpu.hpp>
  22 +#include <opencv2/opencv.hpp>
  23 +#include <stdio.h>
  24 +#include <math.h>
  25 +
  26 +#include "cudadefines.hpp"
  27 +
  28 +using namespace cv;
  29 +using namespace cv::gpu;
  30 +
  31 +namespace br { namespace cuda { namespace affine {
  32 +
  33 + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  34 + return (srcPtr + row*cols)[col];
  35 + }
  36 +
  37 +
  // Samples the image at a fractional (row, col) position using standard
  // bilinear interpolation over the four surrounding pixels.
  __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
    // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf
    // Bilinear Transformation
    // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy)

    // integer pixel coordinates bracketing the sample point
    int row1 = floor(row);
    int row2 = row1+1;

    int col1 = floor(col);
    int col2 = col1+1;

    // fractional offsets within the 2x2 cell
    double d_row = row - row1;
    double d_col = col - col1;

    // NOTE(review): no bounds check -- callers must guarantee that all four
    // bracketing pixels lie inside the image
    int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
    int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
    int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
    int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);

    // blend the four neighbors by their relative proximity
    double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col));
    return ((uint8_t) round(val));
  }
  60 +
  // Samples the image at a fractional (row, col) position by blending the
  // four surrounding pixels with distance-derived weights.
  // NOTE(review): the weights are d_i/sum, i.e. PROPORTIONAL to distance, so
  // farther neighbors receive larger weights -- verify this is intended.
  // Also appears unreferenced by the kernels in this file.
  __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
    // bracketing integer coordinates
    int row1 = floor(row);
    int row2 = row1+1;

    int col1 = floor(col);
    int col2 = col1+1;

    // squared fractional offsets used in the distance terms below
    double m1 = row2 - row;
    double m12 = m1*m1;

    double m2 = col - col1;
    double m22 = m2*m2;

    // distances from the sample point to each of the four neighbors
    double d1 = sqrt(m12 - 2*m1 + 1 + m22);
    double d2 = sqrt(m12 + m22);
    double d3 = sqrt(m12 - 2*m1 + 1 + m22 - 2*m2 + 1);
    double d4 = sqrt(m12 + m22 - 2*m2 + 1);
    double sum = d1 + d2 + d3 + d4;

    double w1 = d1/sum;
    double w2 = d2/sum;
    double w3 = d3/sum;
    double w4 = d4/sum;

    uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
    uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
    uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
    uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);

    return round(w1*v1 + w2*v2 + w3*v3 + w4*v4);
  }
  92 +
  /*
   * Maps a destination pixel back to its source location by applying the
   * inverse affine transform.
   *
   * trans_inv - A pointer to a one-dimensional representation of the inverse of the transform matrix 3x3
   * dst_row - The destination row (mapping to this row)
   * dst_col - The destination column (mapping to this column)
   * src_row - The computed source pixel row (mapping from this row)
   * src_col - The computed source pixel column (mapping from this col)
   */
  __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
    // row-vector convention: [x, y, 1] = [u, v, 1] * M, so elements
    // 0/3/6 feed the column coordinate and 1/4/7 feed the row coordinate
    *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6];
    *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
  }
  104 +
  // Resizes srcPtr (srcRows x srcCols) into dstPtr (dstRows x dstCols) using
  // bilinear interpolation; one thread per destination pixel. Pixels whose
  // 2x2 source neighborhood would fall outside the image are written as 0.
  __global__ void bilinearKernel(uint8_t* srcPtr, uint8_t* dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
    int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
    int dstIndex = dstRowInd*dstCols+dstColInd;

    // destination boundary checking
    if (dstRowInd >= dstRows || dstColInd >= dstCols) {
      return;
    }

    // get the reference indices and relative amounts
    float exactSrcRowInd = (float)dstRowInd / (float)dstRows * (float)srcRows;
    int minSrcRowInd = (int)exactSrcRowInd;
    int maxSrcRowInd = minSrcRowInd+1;
    float relSrcRowInd = 1.-(exactSrcRowInd-(float)minSrcRowInd);

    // get the reference indices and relative amounts
    // NOTE(review): column math uses double while row math uses float --
    // presumably an unintentional asymmetry; confirm before unifying
    double exactSrcColInd = (double)dstColInd / (double)dstCols * (double)srcCols;
    int minSrcColInd = (int)exactSrcColInd;
    int maxSrcColInd = minSrcColInd+1;
    float relSrcColInd = 1.-(exactSrcColInd-(float)minSrcColInd);

    // perform boundary checking
    if (minSrcRowInd < 0 || maxSrcRowInd >= srcRows || minSrcColInd < 0 || maxSrcColInd >= srcCols) {
      dstPtr[dstIndex] = 0;
      return;
    }

    // get each of the pixel values
    float topLeft = srcPtr[minSrcRowInd*srcCols+minSrcColInd];
    float topRight = srcPtr[minSrcRowInd*srcCols+maxSrcColInd];
    float bottomLeft = srcPtr[maxSrcRowInd*srcCols+minSrcColInd];
    float bottomRight = srcPtr[maxSrcRowInd*srcCols+maxSrcColInd];

    // blend the four neighbors by their relative proximity
    float out = relSrcRowInd*relSrcColInd*topLeft + relSrcRowInd*(1.-relSrcColInd)*topRight + (1.-relSrcRowInd)*relSrcColInd*bottomLeft + (1.-relSrcRowInd)*(1.-relSrcColInd)*bottomRight;

    dstPtr[dstIndex] = (int)out;
  }
  143 +
  // Fills each destination pixel by mapping it through the inverse affine
  // transform back into the source image and sampling bilinearly. The
  // one-pixel border of the destination is forced to 0.
  __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
    int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
    int dstIndex = dstRowInd*dst_cols + dstColInd;

    double srcRowPnt;
    double srcColPnt;

    // don't do anything if the index is out of bounds
    if (dstRowInd >= dst_rows || dstColInd >= dst_cols) {
      return;
    }
    // zero the outermost border of the output
    if (dstRowInd == 0 || dstRowInd == dst_rows-1 || dstColInd ==0 || dstColInd == dst_cols-1) {
      dstPtr[dstIndex] = 0;
      return;
    }

    getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
    // NOTE(review): the computed source coordinate is not clamped to the
    // image bounds; a transform mapping outside the source will read out of
    // bounds -- confirm upstream guarantees
    const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value

    dstPtr[dstIndex] = cval;
  }
  166 +
  // Host-side entry point: allocates the GPU output buffer, launches
  // bilinearKernel to resize the image, then frees the source device buffer.
  //   srcPtr - device pointer to the source image (consumed and freed here)
  //   dstPtr - receives the freshly allocated device output pointer
  void resizeWrapper(void* srcPtr, void** dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
    // perform bilinear filtering

    // allocate space for destination
    cudaError_t err;
    CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err);

    // call the bilinear kernel function; +1 block covers the remainder when
    // the dimensions are not multiples of the block size
    dim3 threadsPerBlock(32, 16);
    dim3 numBlocks(dstCols/threadsPerBlock.x + 1,
                   dstRows/threadsPerBlock.y + 1);

    bilinearKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)*dstPtr, srcRows, srcCols, dstRows, dstCols);
    CUDA_KERNEL_ERR_CHK(&err);

    CUDA_SAFE_FREE(srcPtr, &err);
  }
  184 +
  // Host-side entry point: inverts the 2x3 affine transform on the CPU,
  // uploads the inverse to the GPU, and launches affineKernel. The source
  // device buffer and the uploaded matrix are freed afterwards.
  //   srcPtr - device pointer to the CV_8UC1 source image (consumed here)
  //   dstPtr - receives the freshly allocated device output pointer
  void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
    cudaError_t err;
    double* gpuInverse;

    // +1 block covers the remainder for non-multiple dimensions
    dim3 threadsPerBlock(32, 16);
    dim3 numBlocks(dst_cols/threadsPerBlock.x + 1,
                   dst_rows/threadsPerBlock.y + 1);

    //************************************************************************
    // Input affine is a 2x3 Mat whose transpose is used in the computations
    // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
    // See "Digital Image Warping" by George Wolburg (p. 50)
    //************************************************************************

    // get new transform elements
    double a11 = affineTransform.at<double>(0, 0);
    double a12 = affineTransform.at<double>(1, 0);
    double a21 = affineTransform.at<double>(0, 1);
    double a22 = affineTransform.at<double>(1, 1);
    double a31 = affineTransform.at<double>(0, 2);
    double a32 = affineTransform.at<double>(1, 2);

    // compute transform inverse
    // NOTE(review): `det` actually holds the RECIPROCAL of the determinant
    double det = 1 / (a11*a22 - a21*a12);

    double affineInverse[9];
    affineInverse[0] = a22 * det;
    affineInverse[1] = -a12 * det;
    affineInverse[2] = 0;
    affineInverse[3] = -a21 * det;
    affineInverse[4] = a11 * det;
    affineInverse[5] = 0;
    affineInverse[6] = (a21*a32 - a31*a22) * det;
    affineInverse[7] = (a31*a12 - a11*a32) * det;
    affineInverse[8] = (a11*a22 - a21*a12) * det; // == 1 by construction

    // device buffers: output image and the 3x3 inverse matrix
    CUDA_SAFE_MALLOC(dstPtr, dst_rows*dst_cols*sizeof(uint8_t), &err);
    CUDA_SAFE_MALLOC(&gpuInverse, 3*3*sizeof(double), &err);

    CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);

    affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
    CUDA_KERNEL_ERR_CHK(&err);

    // the input image and the uploaded matrix are no longer needed
    CUDA_SAFE_FREE(srcPtr, &err);
    CUDA_SAFE_FREE(gpuInverse, &err);
  }
  232 +}}}
... ...
openbr/plugins/cuda/cudacvtfloat.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +#include <unistd.h>
  20 +
  21 +#include <opencv2/opencv.hpp>
  22 +using namespace cv;
  23 +
  24 +#include <openbr/plugins/openbr_internal.h>
  25 +
  26 +// definitions from the CUDA source file
  27 +namespace br { namespace cuda { namespace cvtfloat {
  28 + void wrapper(void* src, void** dst, int rows, int cols);
  29 +}}}
  30 +
  31 +namespace br
  32 +{
  33 +
/*!
 * \ingroup transforms
 * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent.
 * \author Colin Heinzmann \cite DepthDeluxe
 */
class CUDACvtFloatTransform : public UntrainableTransform
{
    Q_OBJECT

    public:
    void project(const Template &src, Template &dst) const
    {
        // The incoming Mat is not pixel data: it is the CUDA metadata array
        // described in cuda/README.md — [0] device data pointer, [1] rows*,
        // [2] cols*, [3] OpenCV type* (each stored as a void*).
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*)srcDataPtr[1]);
        int cols = *((int*)srcDataPtr[2]);
        int type = *((int*)srcDataPtr[3]);

        // assume the image type is 256-monochrome
        // TODO(colin): real exception handling
        if (type != CV_8UC1) {
            cout << "ERR: Invalid memory format" << endl;
            return;
        }

        // Build the destination metadata Mat.  The rows/cols/type pointers
        // are SHARED with the source (not copied), so assigning CV_32FC1
        // below also rewrites the source's type field.  The source is
        // consumed by this transform (the wrapper frees its device buffer),
        // so the aliasing is presumably intentional — TODO confirm.
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;

        // Allocates the float buffer on the device, converts, and frees the
        // 8-bit source buffer; dstDataPtr[0] receives the new allocation.
        cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  69 +
  70 +BR_REGISTER(Transform, CUDACvtFloatTransform)
  71 +
  72 +} // namespace br
  73 +
  74 +#include "cuda/cudacvtfloat.moc"
... ...
openbr/plugins/cuda/cudacvtfloat.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +
  21 +#include "cudadefines.hpp"
  22 +
  23 +namespace br { namespace cuda { namespace cvtfloat {
  24 +
  25 + __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
  26 + // get my index
  27 + int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
  28 + int colInd = blockIdx.x*blockDim.x + threadIdx.x;
  29 +
  30 + // bounds check
  31 + if (rowInd >= rows || colInd >= cols) {
  32 + return;
  33 + }
  34 +
  35 + int index = rowInd*cols + colInd;
  36 + dst[index] = (float)src[index];
  37 + }
  38 +
  39 + void wrapper(void* src, void** dst, int rows, int cols) {
  40 + cudaError_t err;
  41 + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
  42 +
  43 + dim3 threadsPerBlock(8, 8);
  44 + dim3 numBlocks(
  45 + cols / threadsPerBlock.x + 1,
  46 + rows / threadsPerBlock.y + 1
  47 + );
  48 +
  49 + kernel<<<numBlocks, threadsPerBlock>>>((const unsigned char*)src, (float*)(*dst), rows, cols);
  50 + CUDA_KERNEL_ERR_CHK(&err);
  51 +
  52 + // free the src memory since it is now in a newly allocated dst
  53 + CUDA_SAFE_FREE(src, &err);
  54 + }
  55 +
  56 +}}}
... ...
openbr/plugins/cuda/cudadefines.hpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +#include <pthread.h>
  21 +
// Error-checking wrappers around the CUDA runtime API.  Each macro stores
// the status in *errPtr, reports failures (tagged with the calling thread
// id) to stdout, and aborts by throwing.
//
// Every body is wrapped in do { ... } while (0) so each macro expands to a
// single statement.  Without this, `if (cond) CUDA_SAFE_FREE(p, &err);`
// would compile but leave the error check outside the `if` — a classic
// multi-statement macro bug.

// Frees device memory at cudaPtr.
#define CUDA_SAFE_FREE(cudaPtr, errPtr) \
  do { \
    *errPtr = cudaFree(cudaPtr); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Allocates `size` bytes of device memory into *cudaPtrPtr.
#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \
  do { \
    *errPtr = cudaMalloc(cudaPtrPtr, size); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Copies `count` bytes in the direction given by `kind`.
#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \
  do { \
    *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Checks for a failed kernel launch (launch-configuration errors only;
// asynchronous execution errors surface on a later synchronizing call).
#define CUDA_KERNEL_ERR_CHK(errPtr) \
  do { \
    *errPtr = cudaPeekAtLastError(); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)
... ...
openbr/plugins/cuda/cudal2.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <openbr/plugins/openbr_internal.h>
  21 +
  22 +// definitions from the CUDA source file
  23 +namespace br { namespace cuda { namespace L2 {
  24 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr);
  25 +}}}
  26 +
  27 +namespace br
  28 +{
  29 +
/*!
 * \ingroup distances
 * \brief L2 distance computed on the GPU with CUDA.
 * \author Colin Heinzmann \cite DepthDeluxe
 */
class CUDAL2Distance : public UntrainableDistance
{
    Q_OBJECT

    // Compares two single-channel 32-bit float matrices with the same total
    // element count.  Both inputs live in host memory here; the wrapper
    // uploads them to the device.
    //
    // NOTE(review): the CUDA wrapper returns the SUM of squared differences
    // — no square root is taken, so this is the squared L2 distance.  That
    // is rank-equivalent to true L2; confirm callers do not expect the
    // Euclidean norm itself.
    float compare(const cv::Mat &a, const cv::Mat &b) const
    {
        if (a.type() != CV_32FC1 || b.type() != CV_32FC1) {
            cout << "ERR: Type mismatch" << endl;
            throw 0;
        }
        if (a.rows*a.cols != b.rows*b.cols) {
            cout << "ERR: Dimension mismatch" << endl;
            throw 1;
        }

        float out;
        cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out);

        return out;
    }
};
  56 +
  57 +BR_REGISTER(Distance, CUDAL2Distance)
  58 +
  59 +} // namespace br
  60 +
  61 +#include "cuda/cudal2.moc"
... ...
openbr/plugins/cuda/cudal2.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +#include <math.h>
  17 +
  18 +
  19 +#include "cudadefines.hpp"
  20 +
  21 +namespace br { namespace cuda { namespace L2 {
  22 +
  23 + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) {
  24 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  25 +
  26 + if (index >= length) {
  27 + return;
  28 + }
  29 +
  30 + // perform the subtraction
  31 + float res = aPtr[index] - bPtr[index];
  32 + res = res * res;
  33 + workPtr[index] = res;
  34 + }
  35 +
  36 + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
  37 + // make sure there is only one thread that we are calling
  38 + if (blockIdx.x != 0 || threadIdx.x != 0) {
  39 + return;
  40 + }
  41 +
  42 + // sum up all the values
  43 + float acc = 0;
  44 + for (int i=0; i < length; i++) {
  45 + acc += inPtr[i];
  46 + }
  47 +
  48 + *outPtr = acc;
  49 + }
  50 +
  51 + float* cudaAPtr = NULL;
  52 + float* cudaBPtr = NULL;
  53 + float* cudaWorkBufferPtr = NULL;
  54 + float* cudaOutPtr = NULL;
  55 + int bufferLen = 0;
  56 +
  57 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) {
  58 + cudaError_t err;
  59 +
  60 + // allocate memory for the mats and copy data to graphics card
  61 + // only allocate if there is a mismatch in image size, otherwise
  62 + // use the existing allocated memory
  63 + if (length != bufferLen) {
  64 + if (cudaAPtr != NULL) {
  65 + CUDA_SAFE_FREE(cudaAPtr, &err);
  66 + CUDA_SAFE_FREE(cudaBPtr, &err);
  67 + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
  68 + CUDA_SAFE_FREE(cudaOutPtr, &err);
  69 + }
  70 + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err);
  71 + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err);
  72 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
  73 + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
  74 + bufferLen = length;
  75 + }
  76 +
  77 + // copy data over from CPU
  78 + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  79 + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  80 +
  81 + // perform the subtraction
  82 + int threadsPerBlock = 512;
  83 + int numBlocks = length / threadsPerBlock + 1;
  84 + subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
  85 + CUDA_KERNEL_ERR_CHK(&err);
  86 +
  87 + // perform the collapse
  88 + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
  89 + CUDA_KERNEL_ERR_CHK(&err);
  90 +
  91 + // copy the single value back to the destinsion
  92 + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
  93 + }
  94 +}}}
... ...
openbr/plugins/cuda/cudalbp.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <sys/types.h>
  21 +#include <unistd.h>
  22 +
  23 +#include <pthread.h>
  24 +
  25 +#include <opencv2/opencv.hpp>
  26 +#include <opencv2/imgproc/imgproc.hpp>
  27 +#include <opencv2/imgproc/imgproc_c.h>
  28 +#include <opencv2/highgui/highgui.hpp>
  29 +#include <opencv2/highgui/highgui_c.h>
  30 +#include <limits>
  31 +
  32 +#include <openbr/plugins/openbr_internal.h>
  33 +
  34 +using namespace cv;
  35 +
  36 +// definitions from the CUDA source file
  37 +namespace br { namespace cuda { namespace lbp {
  38 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
  39 + void initializeWrapper(uint8_t* lut);
  40 +}}}
  41 +
  42 +namespace br
  43 +{
/*!
 * \ingroup transforms
 * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin.
 * \author Colin Heinzmann \cite DepthDeluxe
 * \author Li Li \cite booli
 */
class CUDALBPTransform : public UntrainableTransform
{
    Q_OBJECT
    Q_PROPERTY(int radius READ get_radius WRITE set_radius RESET reset_radius STORED false)
    Q_PROPERTY(int maxTransitions READ get_maxTransitions WRITE set_maxTransitions RESET reset_maxTransitions STORED false)
    Q_PROPERTY(bool rotationInvariant READ get_rotationInvariant WRITE set_rotationInvariant RESET reset_rotationInvariant STORED false)
    BR_PROPERTY(int, radius, 1)
    BR_PROPERTY(int, maxTransitions, 8)
    BR_PROPERTY(bool, rotationInvariant, false)

    private:
    // lut maps each raw 8-neighbor bit pattern to a compact id; patterns
    // with too many transitions all map to the single `null` id.
    uchar lut[256];
    uchar null;

    public:
    /* Returns the number of 0->1 or 1->0 transitions in i */
    static int numTransitions(int i)
    {
        int transitions = 0;
        int curParity = i%2;
        // walk the 8 bits circularly (j%8 wraps bit 8 back to bit 0)
        for (int j=1; j<=8; j++) {
            int parity = (i>>(j%8)) % 2;
            if (parity != curParity) transitions++;
            curParity = parity;
        }
        return transitions;
    }

    // Returns the smallest value reachable by circularly rotating the 8-bit
    // pattern i.  Because the minimum is taken over all rotations (including
    // the identity), the result is always <= i.
    static int rotationInvariantEquivalent(int i)
    {
        int min = std::numeric_limits<int>::max();
        for (int j=0; j<8; j++) {
            bool parity = i % 2;
            i = i >> 1;
            if (parity) i+=128; // rotate the low bit around to bit 7
            min = std::min(min, i);
        }
        return min;
    }

    // Builds the 256-entry pattern->id lookup table and uploads it to the
    // GPU once, at plugin initialization.
    void init()
    {
        bool set[256];
        uchar uid = 0;
        for (int i=0; i<256; i++) {
            if (numTransitions(i) <= maxTransitions) {
                int id;
                if (rotationInvariant) {
                    int rie = rotationInvariantEquivalent(i);
                    // rie <= i, so lut[rie] was already assigned on an
                    // earlier iteration when i != rie
                    if (i == rie) id = uid++;
                    else id = lut[rie];
                } else id = uid++;
                lut[i] = id;
                set[i] = true;
            } else {
                set[i] = false;
            }
        }

        null = uid;
        for (int i=0; i<256; i++)
            if (!set[i])
                lut[i] = null; // Set to null id

        // copy lut over to the GPU
        cuda::lbp::initializeWrapper(lut);

        std::cout << "Initialized CUDALBP" << std::endl;
    }

    void project(const Template &src, Template &dst) const
    {
        // GPU metadata Mat (see cuda/README.md): [0] device data pointer,
        // [1] rows*, [2] cols*, [3] OpenCV type* — all stored as void*.
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*)srcDataPtr[1]);
        int cols = *((int*)srcDataPtr[2]);
        int type = *((int*)srcDataPtr[3]);

        // The metadata pointers are shared with (not copied from) the
        // source; the source is consumed by the wrapper below, which frees
        // its device buffer and allocates the output into dstDataPtr[0].
        // NOTE(review): `type` is read but never validated here, unlike the
        // other CUDA transforms — presumably CV_8UC1 is expected; confirm.
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3];

        cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  137 +
  138 +BR_REGISTER(Transform, CUDALBPTransform)
  139 +
  140 +}
  141 +
  142 +#include "cuda/cudalbp.moc"
... ...
openbr/plugins/cuda/cudalbp.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +#include <iostream>
  17 +
  18 +using namespace std;
  19 +
  20 +#include <opencv2/gpu/gpu.hpp>
  21 +#include <stdio.h>
  22 +
  23 +#include "cudadefines.hpp"
  24 +
  25 +using namespace cv;
  26 +using namespace cv::gpu;
  27 +
  28 +/*
  29 + * These are the CUDA functions for CUDALBP. See cudapca.cpp for more details
  30 + */
  31 +
  32 +namespace br { namespace cuda { namespace lbp {
  33 + uint8_t* lut;
  34 +
  35 + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  36 + return (srcPtr + row*cols)[col];
  37 + }
  38 +
  // Computes the LBP code of one pixel per thread: each of the 8 neighbors
  // is compared against the center value, the results are packed into a
  // byte, and that byte is mapped through the lookup table.
  //
  // NOTE(review): radius is hard-coded to 1 here — the plugin's `radius`
  // property on the CPU side is not forwarded; confirm this is intended.
  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
  {
    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
    int radius = 1;

    int index = rowInd*cols + colInd;

    // Threads fully outside the image exit; pixels on the 1-pixel border
    // (which lack a full neighborhood) are written as 0.
    if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
      if (rowInd >= rows || colInd >= cols) {
        return;
      } else {
        dstPtr[index] = 0;
        return;
      }
    }

    // Bit weights by neighbor position, starting at the top-left and moving
    // clockwise: 128, 64, 32, 16, 8, 4, 2, 1.  A neighbor contributes its
    // weight when it is >= the center value.
    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols); // center value
    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];

    // store calculated value away in the right place
    dstPtr[index] = val;
  }
  70 +
  71 + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
  72 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  73 + {
  74 + cudaError_t err;
  75 +
  76 + // make 8 * 8 = 64 square block
  77 + dim3 threadsPerBlock(8, 8);
  78 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  79 + rows/threadsPerBlock.y + 1);
  80 +
  81 + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
  82 + lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  83 + CUDA_KERNEL_ERR_CHK(&err);
  84 +
  85 + CUDA_SAFE_FREE(srcPtr, &err);
  86 + }
  87 +
  88 + void initializeWrapper(uint8_t* cpuLut) {
  89 + cudaError_t err;
  90 + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
  91 + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
  92 + }
  93 +}}}
... ...
openbr/plugins/cuda/cudapca.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +#include <unistd.h>
  20 +
  21 +#include <QList>
  22 +
  23 +#include <Eigen/Dense>
  24 +
  25 +#include <opencv2/opencv.hpp>
  26 +using namespace cv;
  27 +
  28 +#include <openbr/plugins/openbr_internal.h>
  29 +#include <openbr/core/common.h>
  30 +#include <openbr/core/eigenutils.h>
  31 +#include <openbr/core/opencvutils.h>
  32 +
  33 +// definitions from the CUDA source file
  34 +namespace br { namespace cuda { namespace pca {
  35 + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
  36 + void trainWrapper(void* cudaSrc, float* dst, int rows, int cols);
  37 + void wrapper(void* src, void** dst, int imgRows, int imgCols);
  38 +}}}
  39 +
  40 +namespace br
  41 +{
  42 +/*!
  43 + * \ingroup transforms
  44 + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
  45 + * \author Colin Heinzmann \cite DepthDeluxe
  46 + *
  47 + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
  48 + * \br_property int drop The number of leading eigen-dimensions to drop.
  49 + * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm)
  50 + */
  51 +class CUDAPCATransform : public Transform
  52 +{
  53 + Q_OBJECT
  54 +
  55 +protected:
  56 + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false)
  57 + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false)
  58 + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false)
  59 +
  60 + BR_PROPERTY(float, keep, 0.95)
  61 + BR_PROPERTY(int, drop, 0)
  62 + BR_PROPERTY(bool, whiten, false)
  63 +
  64 + Eigen::VectorXf mean, eVals;
  65 + Eigen::MatrixXf eVecs;
  66 +
  67 + int originalRows;
  68 +
  69 +public:
  70 + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {}
  71 +
  72 +private:
  73 + double residualReconstructionError(const Template &src) const
  74 + {
  75 + Template proj;
  76 + project(src, proj);
  77 +
  78 + Eigen::Map<const Eigen::VectorXf> srcMap(src.m().ptr<float>(), src.m().rows*src.m().cols);
  79 + Eigen::Map<Eigen::VectorXf> projMap(proj.m().ptr<float>(), keep);
  80 +
  81 + return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
  82 + }
  83 +
  84 + void train(const TemplateList &cudaTrainingSet)
  85 + {
  86 + // copy the data back from the graphics card so the training can be done on the CPU
  87 + const int instances = cudaTrainingSet.size(); // get the number of training set instances
  88 + QList<Template> trainingQlist;
  89 + for(int i=0; i<instances; i++) {
  90 + Template currentTemplate = cudaTrainingSet[i];
  91 + void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
  92 + void* cudaMemPtr = srcDataPtr[0];
  93 + int rows = *((int*)srcDataPtr[1]);
  94 + int cols = *((int*)srcDataPtr[2]);
  95 + int type = *((int*)srcDataPtr[3]);
  96 +
  97 + if (type != CV_32FC1) {
  98 + qFatal("Requires single channel 32-bit floating point matrices.");
  99 + }
  100 +
  101 + Mat mat = Mat(rows, cols, type);
  102 + br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
  103 + trainingQlist.append(Template(mat));
  104 + }
  105 +
  106 + // assemble a TemplateList from the list of data
  107 + TemplateList trainingSet(trainingQlist);
  108 +
  109 +
  110 + originalRows = trainingSet.first().m().rows; // get number of rows of first image
  111 + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  112 +
  113 + // Map into 64-bit Eigen matrix
  114 + Eigen::MatrixXd data(dimsIn, instances); // create a mat
  115 + for (int i=0; i<instances; i++) {
  116 + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
  117 + }
  118 +
  119 + trainCore(data);
  120 + }
  121 +
  122 + void project(const Template &src, Template &dst) const
  123 + {
  124 + void* const* srcDataPtr = src.m().ptr<void*>();
  125 + int rows = *((int*)srcDataPtr[1]);
  126 + int cols = *((int*)srcDataPtr[2]);
  127 + int type = *((int*)srcDataPtr[3]);
  128 +
  129 + if (type != CV_32FC1) {
  130 + cout << "ERR: Invalid image type" << endl;
  131 + throw 0;
  132 + }
  133 +
  134 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  135 + void** dstDataPtr = dstMat.ptr<void*>();
  136 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
  137 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
  138 + dstDataPtr[3] = srcDataPtr[3];
  139 +
  140 + cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  141 +
  142 + dst = dstMat;
  143 + }
  144 +
  145 + void store(QDataStream &stream) const
  146 + {
  147 + stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
  148 + }
  149 +
  150 + void load(QDataStream &stream)
  151 + {
  152 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  153 +
  154 + // serialize the eigenvectors
  155 + float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
  156 + for (int i=0; i < eVecs.rows(); i++) {
  157 + for (int j=0; j < eVecs.cols(); j++) {
  158 + evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
  159 + }
  160 + }
  161 +
  162 + // serialize the mean
  163 + float* meanBuffer = new float[mean.rows() * mean.cols()];
  164 + for (int i=0; i < mean.rows(); i++) {
  165 + for (int j=0; j < mean.cols(); j++) {
  166 + meanBuffer[i*mean.cols() + j] = mean(i, j);
  167 + }
  168 + }
  169 +
  170 + // call the wrapper function
  171 + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
  172 +
  173 + delete evBuffer;
  174 + delete meanBuffer;
  175 + }
  176 +
  177 +protected:
  178 + void trainCore(Eigen::MatrixXd data)
  179 + {
  180 + int dimsIn = data.rows();
  181 + int instances = data.cols();
  182 + const bool dominantEigenEstimation = (dimsIn > instances);
  183 +
  184 + Eigen::MatrixXd allEVals, allEVecs;
  185 + if (keep != 0) {
  186 + // Compute and remove mean
  187 + mean = Eigen::VectorXf(dimsIn);
  188 + for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
  189 + for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
  190 +
  191 + // Calculate covariance matrix
  192 + Eigen::MatrixXd cov;
  193 + if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
  194 + else cov = data * data.transpose() / (instances-1.0);
  195 +
  196 + // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
  197 + Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
  198 + allEVals = eSolver.eigenvalues();
  199 + allEVecs = eSolver.eigenvectors();
  200 + if (dominantEigenEstimation) allEVecs = data * allEVecs;
  201 + } else {
  202 + // Null case
  203 + mean = Eigen::VectorXf::Zero(dimsIn);
  204 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  205 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  206 + }
  207 +
  208 + if (keep <= 0) {
  209 + keep = dimsIn - drop;
  210 + } else if (keep < 1) {
  211 + // Keep eigenvectors that retain a certain energy percentage.
  212 + const double totalEnergy = allEVals.sum();
  213 + if (totalEnergy == 0) {
  214 + keep = 0;
  215 + } else {
  216 + double currentEnergy = 0;
  217 + int i=0;
  218 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  219 + currentEnergy += allEVals(allEVals.rows()-(i+1));
  220 + i++;
  221 + }
  222 + keep = i - drop;
  223 + }
  224 + } else {
  225 + if (keep + drop > allEVals.rows()) {
  226 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  227 + keep = allEVals.rows() - drop;
  228 + }
  229 + }
  230 +
  231 + // Keep highest energy vectors
  232 + eVals = Eigen::VectorXf((int)keep, 1);
  233 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  234 + for (int i=0; i<keep; i++) {
  235 + int index = allEVals.rows()-(i+drop+1);
  236 + eVals(i) = allEVals(index);
  237 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  238 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  239 + }
  240 +
  241 + // Debug output
  242 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
  243 + }
  244 +
  245 + void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
  246 + {
  247 + const int originalCols = mean.rows() / originalRows;
  248 +
  249 + { // Write out mean image
  250 + cv::Mat out(originalRows, originalCols, CV_32FC1);
  251 + Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);
  252 + outMap = mean.col(0);
  253 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");
  254 + }
  255 +
  256 + // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.
  257 + for (int k=0; k<(int)allEVals.size(); k++) {
  258 + if ((k < 8) || (k >= (int)allEVals.size()-16)) {
  259 + cv::Mat out(originalRows, originalCols, CV_64FC1);
  260 + Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);
  261 + outMap = allEVecs.col(k);
  262 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");
  263 + }
  264 + }
  265 + }
  266 +};
  267 +
  268 +BR_REGISTER(Transform, CUDAPCATransform)
  269 +} // namespace br
  270 +
  271 +#include "cuda/cudapca.moc"
... ...
openbr/plugins/cuda/cudapca.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <opencv2/opencv.hpp>
  21 +#include <opencv2/gpu/gpu.hpp>
  22 +
  23 +#include "cudadefines.hpp"
  24 +
  25 +using namespace cv;
  26 +using namespace cv::gpu;
  27 +
  28 +/*
  29 + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details
  30 + */
  31 +
  32 +namespace br { namespace cuda { namespace pca {
  // Partial projection of the image onto the eigenvector basis: each thread
  // accumulates the dot-product contribution of one (eigenvector, pixel-step)
  // pair over `stepSize` consecutive pixels.  multiplyJoinKernel then sums
  // the per-step partials into the final coefficient for each eigenvector.
  //
  // NOTE(review): the intermediary buffer is written with stride `stepSize`
  // but holds `numEigenvectors` values per step — if numEigenvectors can
  // exceed stepSize these writes would collide.  The read side
  // (multiplyJoinKernel) uses the same layout, so this is consistent; confirm
  // the host code guarantees stepSize >= numEigenvectors.
  __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {
    int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
    int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;

    if (evIdx >= numEigenvectors || stepIdx >= numSteps) {
      return;
    }

    float acc = 0;
    int startIdx = stepSize*stepIdx;
    int stopIdx = startIdx+stepSize;
    if (startIdx >= numPixels) {
      return;
    }
    if (stopIdx >= numPixels) {
      stopIdx = numPixels; // clamp the final (partial) step
    }
    // evPtr is laid out row-major with numEigenvectors columns
    for (int i=startIdx; i < stopIdx; i++) {
      acc += src[i]*evPtr[i*numEigenvectors + evIdx];
    }

    intermediaryBuffer[stepIdx*stepSize + evIdx] = acc;
  }
  56 +
  // Reduces the per-step partial sums written by multiplyKernel into the
  // final projection vector: out[ev] = sum over all steps of that
  // eigenvector's partial dot products. One thread per eigenvector.
  // NOTE(review): the guard below compares a pixel offset (numSteps*stepSize)
  // against numEigenvectors; it appears meant to drop an *empty* trailing
  // step (whose scratch slot multiplyKernel never wrote) and should compare
  // against the pixel count — as written it can also discard a valid partial
  // step. Confirm against the caller in cudapca.cpp.
  __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {
    int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
    if (evIdx >= numEigenvectors) {
      return;
    }

    if (numSteps*stepSize+evIdx >= numEigenvectors) {
      numSteps--;
    }

    float acc = 0;
    for (int i=0; i < numSteps; i++) {
      int ibIdx = i*stepSize + evIdx;  // same layout multiplyKernel wrote
      acc += intermediaryBuffer[ibIdx];
    }

    out[evIdx] = acc;
  }
  75 +
  76 + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {
  77 + int idx = blockIdx.x*blockDim.x+threadIdx.x;
  78 +
  79 + // perform bound checking
  80 + if (idx >= numElems) {
  81 + return;
  82 + }
  83 +
  84 + // subtract out the mean
  85 + out[idx] -= mean[idx];
  86 + }
  87 +
  // Module-level GPU state, populated once by initializeWrapper:
  //   cudaEvPtr:          eigenvector matrix on the device (_evRows x _evCols)
  //   _evRows:            the number of pixels in the trained images
  //   _evCols:            the number of eigenvectors
  //   cudaMeanPtr:        mean image on the device
  //   _meanElems:         the number of pixels in an image (length of the mean)
  //   _stepSize:          the number of pixels in a single reduction step
  //   _numSteps:          the number of steps required to cover one image
  //   intermediaryBuffer: device scratch shared by the two multiply kernels
  float* cudaEvPtr; int _evRows; int _evCols;
  float* cudaMeanPtr; int _meanElems;
  int _numSteps; int _stepSize;
  float* intermediaryBuffer;
  97 +
  // Uploads the trained PCA model (eigenvectors and mean) to the GPU and
  // allocates the scratch buffer shared by the projection kernels.
  //   evPtr, evRows, evCols: host eigenvector matrix (pixels x eigenvectors)
  //   meanPtr, meanElems:    host mean image, one float per pixel
  // NOTE(review): calling this a second time leaks the previous device
  // buffers — nothing frees cudaEvPtr/cudaMeanPtr/intermediaryBuffer first.
  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
    _evRows = evRows; _evCols = evCols;
    _meanElems = meanElems;

    cudaError_t err;

    // copy the eigenvectors to the GPU
    CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);
    CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);

    // copy the mean to the GPU
    CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
    CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);

    // initialize the intermediary working space
    // NOTE(review): when _evRows is an exact multiple of _stepSize this
    // over-counts by one (empty) step; the sizing also assumes
    // _evCols <= _stepSize given multiplyKernel's indexing — confirm.
    _stepSize = 2048;
    _numSteps = _evRows / _stepSize + 1;
    CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);
  }
  117 +
  118 + void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {
  119 + cudaError_t err;
  120 + CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
  121 + }
  122 +
  123 + void wrapper(void* src, void** dst, int imgRows, int imgCols) {
  124 + cudaError_t err;
  125 + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
  126 +
  127 + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {
  128 + cout << "ERR: Image dimension mismatch!" << endl;
  129 + throw 0;
  130 + }
  131 +
  132 + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src)
  133 + int threadsPerBlock = 512;
  134 + int numBlocks = _meanElems / threadsPerBlock + 1;
  135 + subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
  136 + CUDA_KERNEL_ERR_CHK(&err);
  137 +
  138 + // perform matrix multiplication
  139 + dim3 threadsPerBlock2d(512, 1);
  140 + dim3 numBlocks2d(
  141 + _evCols / threadsPerBlock2d.x + 1,
  142 + _numSteps / threadsPerBlock2d.y + 1);
  143 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);
  144 + CUDA_KERNEL_ERR_CHK(&err);
  145 +
  146 + threadsPerBlock = 512;
  147 + numBlocks = _evCols / threadsPerBlock + 1;
  148 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);
  149 + CUDA_KERNEL_ERR_CHK(&err);
  150 +
  151 + // free the src memory
  152 + CUDA_SAFE_FREE(src, &err);
  153 + }
  154 +}}}
... ...
openbr/plugins/cuda/cudargb2grayscale.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +#include <opencv2/imgproc/imgproc.hpp>
  19 +
  20 +#include <openbr/plugins/openbr_internal.h>
  21 +#include <openbr/core/opencvutils.h>
  22 +#include <thrust/host_vector.h>
  23 +#include <thrust/device_vector.h>
  24 +
  25 +
  26 +using namespace cv;
  27 +
// Forward declaration of the kernel wrapper implemented in
// cudargb2grayscale.cu (all CUDA code lives in the .cu file).
namespace br { namespace cuda { namespace rgb2grayscale {
  void wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
}}}
  32 +
  33 +namespace br
  34 +{
  35 +
  36 +/*!
  37 + * \ingroup transforms
  38 + * \brief Converts 3-channel images to grayscale
  39 + * \author Li Li \cite booli
  40 + */
class CUDARGB2GrayScaleTransform : public UntrainableTransform
{
    Q_OBJECT

public:

private:
    // Converts a 3-channel device image to single-channel grayscale.
    //
    // The incoming Mat follows the CUDA-plugin descriptor convention (see
    // plugins/cuda/README.md): it is an array of void* where
    //   [0] = device pointer to the pixel data
    //   [1] = pointer to the row count (int)
    //   [2] = pointer to the column count (int)
    //   [3] = pointer to the OpenCV type code (int)
    void project(const Template &src, Template &dst) const
    {
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*) srcDataPtr[1]);
        int cols = *((int*) srcDataPtr[2]);
        // NOTE(review): read but never checked — presumably the input is
        // expected to be CV_8UC3; confirm.
        int type = *((int*) srcDataPtr[3]);

        // build the output descriptor; the metadata pointers are shared with
        // the source descriptor rather than copied
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3];
        // NOTE(review): because dstDataPtr[3] aliases srcDataPtr[3], this
        // write also rewrites the *source* descriptor's type code — confirm
        // that is intended.
        *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same

        // the wrapper allocates the grayscale device buffer into
        // dstDataPtr[0] and frees the source device buffer
        cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  66 +
  67 +BR_REGISTER(Transform, CUDARGB2GrayScaleTransform)
  68 +
  69 +} // namespace br
  70 +
  71 +#include "imgproc/cudargb2grayscale.moc"
... ...
openbr/plugins/cuda/cudargb2grayscale.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <stdio.h>
  18 +#include <iostream>
  19 +#include <opencv2/opencv.hpp>
  20 +#include <opencv2/gpu/gpu.hpp>
  21 +#include "cudadefines.hpp"
  22 +#include <opencv2/imgproc/imgproc.hpp>
  23 +#include <opencv2/imgproc/imgproc_c.h>
  24 +#include <opencv2/highgui/highgui.hpp>
  25 +#include <opencv2/highgui/highgui_c.h>
  26 +
  27 +using namespace std;
  28 +using namespace cv;
  29 +using namespace cv::gpu;
  30 +
  31 +namespace br { namespace cuda { namespace rgb2grayscale {
  32 +
  33 + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
  34 + {
  35 + int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
  36 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  37 + int index = rowInd*cols + colInd;
  38 + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) {
  39 + return;
  40 + }
  41 + int new_index = 3 * index;
  42 + float g = (float) srcPtr[new_index];
  43 + float b = (float) srcPtr[new_index+1];
  44 + float r = (float) srcPtr[new_index+2];
  45 +
  46 + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r);
  47 + return;
  48 + }
  49 +
  50 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  51 + {
  52 + cudaError_t err;
  53 + dim3 threadsPerBlock(9, 9);
  54 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  55 + rows/threadsPerBlock.y + 1);
  56 + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
  57 +
  58 + kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
  59 + CUDA_KERNEL_ERR_CHK(&err);
  60 + CUDA_SAFE_FREE(srcPtr, &err);
  61 + }
  62 +
  63 +}}}
... ...
openbr/plugins/cuda/module.cmake 0 → 100644
# Optional CUDA plugin module: when BR_WITH_CUDA is ON, compiles every .cu
# file under openbr/plugins/cuda with nvcc and feeds the resulting objects
# (plus the plain .cpp plugin sources) into the main OpenBR build.
option(BR_WITH_CUDA "Build CUDA-accelerated plugins." OFF)
set(BR_CUDA_ARCH "sm_20" CACHE STRING "CUDA Architecture")

# only build this module if explicitly OK'ed
if(BR_WITH_CUDA)
  message(STATUS "Building with CUDA Support")
  find_package(CUDA REQUIRED)

  set(CUDA_SRC_DIR ${PROJECT_SOURCE_DIR}/openbr/plugins/cuda)

  # configure the compiler; -fPIC is needed because the objects end up in a
  # shared library
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=${BR_CUDA_ARCH} --compiler-options -fPIC)
  include_directories(${CUDA_INCLUDE_DIRS})

  # glob sources
  file(GLOB CUDA_CU_SRC ${CUDA_SRC_DIR}/*.cu)
  file(GLOB CUDA_CPP_SRC ${CUDA_SRC_DIR}/*.cpp)

  # compile each .cu to an object file (via FindCUDA's cuda_compile) and
  # accumulate the generated objects in CUDA_CU_OBJ
  foreach(FILE ${CUDA_CU_SRC})
    cuda_compile(FILE_O ${FILE})
    set(CUDA_CU_OBJ ${CUDA_CU_OBJ} ${FILE_O})
  endforeach()

  # ensure add_library knows these are external object files
  set_source_files_properties(${CUDA_CU_OBJ} PROPERTIES EXTERNAL_OBJECT true)

  # add the compiled sources and libs into the build system
  set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
  set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES})

endif()
... ...
openbr/plugins/plugins.cmake
... ... @@ -12,7 +12,15 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${BR_THIRDPARTY_PLUGINS_DIR})
12 12 file(GLOB SUBFILES plugins/*)
13 13 foreach(FILE ${SUBFILES})
14 14 if(IS_DIRECTORY ${FILE})
15   - set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE})
  15 + # check to see if there is a cmake file for the folder, if there is, then that
  16 + # file should be used to build the contents of the directory
  17 + if (EXISTS ${FILE}/module.cmake)
  18 + message(STATUS "importing ${FILE}/module.cmake")
  19 + include(${FILE}/module.cmake)
  20 + else()
  21 + message(STATUS "adding ${FILE}")
  22 + set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE})
  23 + endif()
16 24 endif()
17 25 endforeach()
18 26 set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} plugins/) # Remove this when finished with reorg
... ...