diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp
new file mode 100644
index 0000000..211fcfb
--- /dev/null
+++ b/openbr/plugins/cuda/cudacvtfloat.cpp
@@ -0,0 +1,48 @@
+#include <iostream>
+#include <unistd.h>
+using namespace std;
+
+#include <opencv2/opencv.hpp>
+using namespace cv;
+
+#include <openbr/plugins/openbr_internal.h>
+
+#include "cudacvtfloat.hpp"
+
+namespace br
+{
+
+/*!
+ * \ingroup transforms
+ * \brief Converts an 8-bit single-channel matrix to 32-bit floating point on the GPU
+ * \author Colin Heinzmann \cite DepthDeluxe
+ * \note Any other input type is reported on stdout and \c dst is left untouched.
+ */
+class CUDACvtFloatTransform : public UntrainableTransform
+{
+    Q_OBJECT
+
+  public:
+    void project(const Template &src, Template &dst) const
+    {
+      // assume the image type is 256-monochrome
+      // TODO(colin): real exception handling
+      if (src.m().type() != CV_8UC1) {
+        cout << "ERR: Invalid memory format" << endl;
+        return;
+      }
+
+      // wrapper() reads rows*cols tightly packed bytes, so a matrix with padded
+      // rows (e.g. an ROI view) is first copied into contiguous storage
+      const Mat input = src.m().isContinuous() ? src.m() : src.m().clone();
+      dst = Mat(input.rows, input.cols, CV_32FC1);
+
+      br::cuda::cudacvtfloat::wrapper(input.ptr<unsigned char>(), dst.m().ptr<float>(), input.rows, input.cols);
+    }
+};
+
+BR_REGISTER(Transform, CUDACvtFloatTransform)
+
+} // namespace br
+
+#include "cuda/cudacvtfloat.moc"
diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu
new file mode 100644
index 0000000..4f675fb
--- /dev/null
+++ b/openbr/plugins/cuda/cudacvtfloat.cu
@@ -0,0 +1,82 @@
+#include <cstdio>
+
+namespace br { namespace cuda { namespace cudacvtfloat {
+
+  /*
+   * Converts every byte of a tightly packed rows x cols image to a float.
+   *
+   * Expects a 2D launch in which x spans the columns and y spans the rows;
+   * threads that land outside the image return without touching memory.
+   * src and dst are dereferenced on the device and are indexed row-major
+   * with a row stride of exactly cols elements.
+   */
+  __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
+    // get my index
+    int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
+    int colInd = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // bounds check
+    if (rowInd >= rows || colInd >= cols) {
+      return;
+    }
+
+    // widen before multiplying so rows*cols larger than INT_MAX cannot overflow
+    size_t index = (size_t)rowInd*cols + colInd;
+    dst[index] = (float)src[index];
+  }
+
+  // Reports a failed CUDA call on stderr; returns true only for cudaSuccess.
+  static bool succeeded(cudaError_t status, const char* what) {
+    if (status != cudaSuccess) {
+      fprintf(stderr, "ERR: cudacvtfloat: %s failed: %s\n", what, cudaGetErrorString(status));
+      return false;
+    }
+    return true;
+  }
+
+  /*
+   * Converts a host image of rows*cols bytes (src) into rows*cols floats (dst).
+   *
+   * Both pointers are host memory; the image is staged through temporary
+   * device buffers that are released before returning. A non-positive
+   * dimension is a no-op. The signature has no error channel, so a failing
+   * CUDA call is reported on stderr and dst may be left unconverted.
+   */
+  void wrapper(const unsigned char* src, float* dst, int rows, int cols) {
+    // nothing to convert, and a grid with a zero dimension is an invalid launch
+    if (rows <= 0 || cols <= 0) {
+      return;
+    }
+
+    const size_t count = (size_t)rows * (size_t)cols;
+    unsigned char* cudaSrc = NULL;
+    float* cudaDst = NULL;
+
+    // && short-circuits, so nothing after the first failure is attempted
+    const bool staged =
+      succeeded(cudaMalloc(&cudaSrc, count*sizeof(unsigned char)), "cudaMalloc(src)") &&
+      succeeded(cudaMalloc(&cudaDst, count*sizeof(float)), "cudaMalloc(dst)") &&
+      succeeded(cudaMemcpy(cudaSrc, src, count*sizeof(unsigned char), cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");
+
+    if (staged) {
+      dim3 threadsPerBlock(8, 8);
+      dim3 blocks(
+        (cols + threadsPerBlock.x - 1) / threadsPerBlock.x,
+        (rows + threadsPerBlock.y - 1) / threadsPerBlock.y
+      );
+
+      // execution configuration is <<<grid, block>>>: the grid size comes first
+      kernel<<<blocks, threadsPerBlock>>>(cudaSrc, cudaDst, rows, cols);
+
+      if (succeeded(cudaGetLastError(), "kernel launch")) {
+        // copy the data back to the destination
+        succeeded(cudaMemcpy(dst, cudaDst, count*sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
+      }
+    }
+
+    // runs on every path so the buffers never leak; cudaFree(NULL) is a no-op
+    cudaFree(cudaSrc);
+    cudaFree(cudaDst);
+  }
+
+}}}
diff --git a/openbr/plugins/cuda/cudacvtfloat.hpp b/openbr/plugins/cuda/cudacvtfloat.hpp
new file mode 100644
index 0000000..d2fe82a
--- /dev/null
+++ b/openbr/plugins/cuda/cudacvtfloat.hpp
@@ -0,0 +1,3 @@
+namespace br { namespace cuda { namespace cudacvtfloat {
+  void wrapper(const unsigned char* src, float* dst, int rows, int cols);
+}}}