diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/CUDAL2.cpp
new file mode 100644
index 0000000..6b248d0
--- /dev/null
+++ b/openbr/plugins/cuda/CUDAL2.cpp
@@ -0,0 +1,58 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright 2012 The MITRE Corporation                                      *
+ *                                                                           *
+ * Licensed under the Apache License, Version 2.0 (the "License");           *
+ * you may not use this file except in compliance with the License.          *
+ * You may obtain a copy of the License at                                   *
+ *                                                                           *
+ *     http://www.apache.org/licenses/LICENSE-2.0                            *
+ *                                                                           *
+ * Unless required by applicable law or agreed to in writing, software       *
+ * distributed under the License is distributed on an "AS IS" BASIS,         *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
+ * See the License for the specific language governing permissions and       *
+ * limitations under the License.                                            *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <iostream>
+using namespace std;
+
+#include <openbr/plugins/openbr_internal.h>
+
+namespace br { namespace cuda { namespace L2{
+  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
+}}}
+
+namespace br
+{
+
+/*!
+ * \ingroup distances
+ * \brief L2 distance computed using eigen.
+ * \author Josh Klontz \cite jklontz
+ */
+class CUDAL2Distance : public UntrainableDistance
+{
+    Q_OBJECT
+
+    float compare(const cv::Mat &a, const cv::Mat &b) const
+    {
+      void* const* srcDataPtr = a.ptr<void*>();
+      float* cudaAPtr = (float*)srcDataPtr[0];
+      int rows = *((int*)srcDataPtr[1]);
+      int cols = *((int*)srcDataPtr[2]);
+
+      float* cudaBPtr = (float*)b.ptr<void*>()[0];
+
+      float out;
+      br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
+
+      return out;
+    }
+};
+
+BR_REGISTER(Distance, CUDAL2Distance)
+
+} // namespace br
+
+#include "cuda/CUDAL2.moc"
diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/CUDAL2.cu
new file mode 100644
index 0000000..8d52b35
--- /dev/null
+++ b/openbr/plugins/cuda/CUDAL2.cu
@@ -0,0 +1,68 @@
+#include <math.h>
+
+#include "cudadefines.hpp"
+
+namespace br { namespace cuda { namespace L2 {
+
+  __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) {
+    int index = blockIdx.x*blockDim.x+threadIdx.x;
+
+    if (index >= length) {
+      return;
+    }
+
+    // perform the subtraction in-place
+    // use b because it is the comparison
+    // image
+    workPtr[index] = aPtr[index] - bPtr[index];
+    workPtr[index] = workPtr[index] * workPtr[index];
+  }
+
+  __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) {
+    // make sure there is only one thread that we are calling
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+      return;
+    }
+
+    // sum up all the values
+    *outPtr = 0;
+    for (int i=0; i < length; i++) {
+      *outPtr = *outPtr + inPtr[i];
+    }
+
+    // take the square root
+    *outPtr = sqrtf(*outPtr);
+  }
+
+  void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) {
+    cudaError_t err;
+    float* cudaOutPtr;
+    CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
+
+    float* cudaWorkBufferPtr;
+    CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
+
+    // perform the subtraction
+    int threadsPerBlock = 64;
+    int numBlocks = length / threadsPerBlock + 1;
+    my_subtract_kernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
+    CUDA_KERNEL_ERR_CHK(&err);
+
+    // perform the collapse
+    collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
+    CUDA_KERNEL_ERR_CHK(&err);
+
+    // copy the single value back to the destinsion
+    CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
+
+    CUDA_SAFE_FREE(cudaOutPtr, &err);
+
+    // do not free aPtr which should be the reference library
+    // only free bPtr, which is the image we are comparing
+    CUDA_SAFE_FREE(cudaBPtr, &err);
+    CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
+  }
+}}}
+
+// 128CUDAEigenfaces on 6400 ATT: 54.367s
+// 128CUDAEigenfacesL2 on 6400 ATT: