diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp
new file mode 100644
index 0000000..2d6d05f
--- /dev/null
+++ b/openbr/plugins/cuda/cudalbp.cpp
@@ -0,0 +1,128 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright 2012 The MITRE Corporation                                      *
+ *                                                                           *
+ * Licensed under the Apache License, Version 2.0 (the "License");           *
+ * you may not use this file except in compliance with the License.          *
+ * You may obtain a copy of the License at                                   *
+ *                                                                           *
+ *     http://www.apache.org/licenses/LICENSE-2.0                            *
+ *                                                                           *
+ * Unless required by applicable law or agreed to in writing, software       *
+ * distributed under the License is distributed on an "AS IS" BASIS,         *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
+ * See the License for the specific language governing permissions and       *
+ * limitations under the License.                                            *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/imgproc/imgproc_c.h>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/highgui/highgui_c.h>
+#include <limits>
+
+#include <openbr/plugins/openbr_internal.h>
+
+#include "cudalbp.hpp"
+
+using namespace cv;
+
+namespace br
+{
+
+/*!
+ * \ingroup transforms
+ * \brief Convert the image into a feature vector using Local Binary Patterns
+ * \br_paper Ahonen, T.; Hadid, A.; Pietikainen, M.;
+ *           "Face Description with Local Binary Patterns: Application to Face Recognition"
+ *           Pattern Analysis and Machine Intelligence, IEEE Transactions, vol.28, no.12, pp.2037-2041, Dec. 2006
+ * \author Josh Klontz \cite jklontz
+ */
+class CUDALBPTransform : public UntrainableTransform
+{
+    Q_OBJECT
+    Q_PROPERTY(int radius READ get_radius WRITE set_radius RESET reset_radius STORED false)
+    Q_PROPERTY(int maxTransitions READ get_maxTransitions WRITE set_maxTransitions RESET reset_maxTransitions STORED false)
+    Q_PROPERTY(bool rotationInvariant READ get_rotationInvariant WRITE set_rotationInvariant RESET reset_rotationInvariant STORED false)
+    BR_PROPERTY(int, radius, 1)
+    BR_PROPERTY(int, maxTransitions, 8)
+    BR_PROPERTY(bool, rotationInvariant, false)
+
+    uchar lut[256];
+    uint8_t* lutGpuPtr;
+    uchar null;
+
+    /* Returns the number of 0->1 or 1->0 transitions in i */
+    static int numTransitions(int i)
+    {
+        int transitions = 0;
+        int curParity = i%2;
+        for (int j=1; j<=8; j++) {
+            int parity = (i>>(j%8)) % 2;
+            if (parity != curParity) transitions++;
+            curParity = parity;
+        }
+        return transitions;
+    }
+
+    static int rotationInvariantEquivalent(int i)
+    {
+        int min = std::numeric_limits<int>::max();
+        for (int j=0; j<8; j++) {
+            bool parity = i % 2;
+            i = i >> 1;
+            if (parity) i+=128;
+            min = std::min(min, i);
+        }
+        return min;
+    }
+
+    void init()
+    {
+        bool set[256];
+        uchar uid = 0;
+        for (int i=0; i<256; i++) {
+            if (numTransitions(i) <= maxTransitions) {
+                int id;
+                if (rotationInvariant) {
+                    int rie = rotationInvariantEquivalent(i);
+                    if (i == rie) id = uid++;
+                    else          id = lut[rie];
+                } else            id = uid++;
+                lut[i] = id;
+                set[i] = true;
+            } else {
+                set[i] = false;
+            }
+        }
+
+        null = uid;
+        for (int i=0; i<256; i++)
+            if (!set[i])
+                lut[i] = null; // Set to null id
+
+        // copy lut over to the GPU
+        br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr);
+    }
+
+    void project(const Template &src, Template &dst) const
+    {
+      // assume we are using an 8-bit 1 channel image
+      GpuMat srcGpuMat, dstGpuMat;
+
+      // copy the data to the GPU
+      srcGpuMat.upload(src.m());
+      dstGpuMat.upload(src.m());
+
+      // call the kernel function
+      br::cuda::cudalbp_wrapper(srcGpuMat, dstGpuMat, lutGpuPtr);
+
+      // download the result
+      dstGpuMat.download(dst.m());
+    }
+};
+
+BR_REGISTER(Transform, CUDALBPTransform)
+
+} // namespace br
+
+#include "cuda/cudalbp.moc"
diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu
new file mode 100644
index 0000000..81007b6
--- /dev/null
+++ b/openbr/plugins/cuda/cudalbp.cu
@@ -0,0 +1,65 @@
+#include <opencv2/gpu/gpu.hpp>
+#include <stdio.h>
+
+using namespace cv;
+using namespace cv::gpu;
+
+#include "cudalbp.hpp"
+
+namespace br { namespace cuda {
+  __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, size_t srcStep, int rows, int cols) {
+    return (row >= rows || col >= cols) ? 0 : (srcPtr + row*srcStep)[col];
+  }
+
+  __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int rows, int cols, uint8_t* lut)
+  {
+    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
+    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
+    int radius = 1;
+
+    // don't do anything if the index is out of bounds
+    if (rowInd >= rows || colInd >= cols)
+      return;
+
+    const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, srcStep, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]);                      // center value
+    uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 128 : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 64  : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 32  : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 16  : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, srcStep, rows, cols) >= cval ? 8   : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, srcStep, rows, cols) >= cval ? 4   : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 2   : 0) |
+                      (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, srcStep, rows, cols) >= cval ? 1   : 0)];
+
+    // store calculated value away in the right place
+    uint8_t* dstRowPtr = dstPtr + rowInd*dstStep;
+    dstRowPtr[colInd] = val;
+  }
+
+  void cudalbp_wrapper(GpuMat& src, GpuMat& dst, uint8_t* lut)
+  {
+    // convert the GpuMats to pointers
+    uint8_t* srcPtr = (uint8_t*)src.data;
+    uint8_t* dstPtr = (uint8_t*)dst.data;
+
+    int imageWidth = src.cols;
+    int imageHeight = src.rows;
+
+    // make 8 * 8 = 64 square block
+    dim3 threadsPerBlock(8, 8);
+
+    dim3 numBlocks(imageWidth/threadsPerBlock.x + 1,
+                   imageHeight/threadsPerBlock.y + 1);
+
+    printf("Src Image Dimesions:\n\trows: %d\tcols: %d\n", src.rows, src.cols);
+    printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
+    printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
+
+    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageHeight, imageWidth, lut);
+  }
+
+  void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) {
+    cudaMalloc(lutGpuPtrPtr, 256*sizeof(uint8_t));
+    cudaMemcpy(*lutGpuPtrPtr, lut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice);
+  }
+}}
diff --git a/openbr/plugins/cuda/cudalbp.hpp b/openbr/plugins/cuda/cudalbp.hpp
new file mode 100644
index 0000000..f6569be
--- /dev/null
+++ b/openbr/plugins/cuda/cudalbp.hpp
@@ -0,0 +1,9 @@
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr);
+  void cudalbp_wrapper(GpuMat& src, GpuMat& dst, uint8_t* lut);
+}}