diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index e7e5e0c..1e5845c 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * * limitations under the License. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -// CUDA includes -#include +#include +using namespace std; #include #include @@ -23,6 +23,8 @@ #include #include +#include "cudapca.hpp" + namespace br { /*! @@ -74,12 +76,12 @@ private: if (trainingSet.first().m().type() != CV_32FC1) qFatal("Requires single channel 32-bit floating point matrices."); - originalRows = trainingSet.first().m().rows; - int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; - const int instances = trainingSet.size(); + originalRows = trainingSet.first().m().rows; // get number of rows of first image + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image + const int instances = trainingSet.size(); // get the number of training set instances // Map into 64-bit Eigen matrix - Eigen::MatrixXd data(dimsIn, instances); + Eigen::MatrixXd data(dimsIn, instances); // create a mat for (int i=0; i(trainingSet[i].m().ptr(), dimsIn, 1).cast(); @@ -90,12 +92,16 @@ private: { dst = cv::Mat(1, keep, CV_32FC1); + // perform the operation on the graphics card + cuda::cudapca_projectwrapper((float*)src.m().ptr(), (float*)dst.m().ptr()); + // Map Eigen into OpenCV - Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); - Eigen::Map outMap(dst.m().ptr(), keep, 1); + //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); + //Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); + //Eigen::Map outMap(dst.m().ptr(), keep, 1); // Do projection - outMap = eVecs.transpose() * (inMap - mean); + //cpuOutMap = eVecs.transpose() * (inMap - mean); } void store(QDataStream &stream) const @@ -106,6 +112,41 
@@ private:

     void load(QDataStream &stream)
     {
         stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
+
+        cout << "Mean Dimensions" << endl;
+        cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
+        cout << "eVecs Dimensions" << endl;
+        cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
+        cout << "eVals Dimensions" << endl;
+        cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
+        cout << "Keep: " << keep << endl;
+
+        cout << "Mean first value: " << mean(0, 0) << endl;
+
+        // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
+        // serialize the eigenvectors
+        float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
+        for (int i=0; i < eVecs.rows(); i++) {
+            for (int j=0; j < eVecs.cols(); j++) {
+                evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
+            }
+        }
+
+        // serialize the mean
+        float* meanBuffer = new float[mean.rows() * mean.cols()];
+        for (int i=0; i < mean.rows(); i++) {
+            for (int j=0; j < mean.cols(); j++) {
+                meanBuffer[i*mean.cols() + j] = mean(i, j);
+            }
+        }
+
+        cout << "Meanbuffer first value: " << meanBuffer[0] << endl;
+
+        // call the wrapper function
+        cuda::cudapca_loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows(), mean.cols(), keep);
+
+        delete[] evBuffer;
+        delete[] meanBuffer;
     }

 protected:
diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu
new file mode 100644
index 0000000..bd6f7fb
--- /dev/null
+++ b/openbr/plugins/cuda/cudapca.cu
@@ -0,0 +1,201 @@
+#include <iostream>
+using namespace std;
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+#include "cudapca.hpp"
+
+namespace br { namespace cuda {
+  __global__ void calculateCovariance_kernel(float* trainingSet, float* cov, int numRows, int numCols) {
+    int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
+    int colInd = blockIdx.x*blockDim.x + threadIdx.x;
+
+    // this calculates trainingSet' * trainingSet
+    if (rowInd >= numRows ||
colInd >= numCols) { + return; + } + + // get a reference the value we wish to write + float& out = cov[rowInd*numRows + colInd]; + + // calculate the value of this position + out = 0; + for (int i=0; i= evCols) { + return; + } + + dst[colInd] = 0; + for (int i=0; i < evRows; i++) { + dst[colInd] += evPtr[evCols*i + colInd] * src[i]; + } + } + + __global__ void cudapca_project_subtractmean_kernel(float* out, float* mean, int cols) { + int colInd = blockIdx.x*blockDim.x+threadIdx.x; + + // perform bound checking + if (colInd >= cols) { + return; + } + + // subtract out the mean + out[colInd] -= mean[colInd]; + } + + float* cudaEvPtr; int _evRows; int _evCols; + float* cudaMeanPtr; int _meanRows; int _meanCols; + int _keep; + + void cudapca_initwrapper() { + + } + + void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep) { + _evRows = evRows; _evCols = evCols; + _meanRows = meanRows; _meanCols = meanCols; + _keep = keep; + + // copy the eigenvectors to the GPU + cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float)); + cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice); + + // copy the mean to the GPU + cudaMalloc(&cudaMeanPtr, meanRows*meanCols*sizeof(float)); + cudaMemcpy(cudaMeanPtr, meanPtr, meanRows*meanCols*sizeof(float), cudaMemcpyHostToDevice); + } + + void cudapca_trainwrapper() { + /* + if (trainingSet[0].type() != CV_32FC1) { + std::cout << "ERR: Requires single 32-bit floating point matrix!"; + return; + } + + cudaError_t status; + + const int numExamples = trainingSetSize; + int numPixels = trainingSet[0].rows * trainingSet[0].cols; + + // create a custom matrix + float* cudaDataPtr; + status = cudaMalloc(&cudaDataPtr, numPixels * numExamples * sizeof(float)); + if (status != cudaSuccess) { + std::cout << "ERR: Memory allocation" << std::endl; + return; + } + + // copy all the data to the graphics card + for (int i=0; i < numExamples; i++) { + status = 
cudaMemcpy(cudaDataPtr + i*numPixels, trainingSet[i].ptr(), numPixels*sizeof(float), cudaMemcpyHostToDevice); + if (status != cudaSuccess) { + std::cout << "ERR: Memcpy at index " << i << std::endl; + return; + } + } + + // start the core part of the algorithm + int numDimensions = numPixels; + const bool dominantEigenEstimation = (numDimensions > numExamples); + + // malloc and init mean + mean = new float[numDimensions]; + for (int i=0; i < numDimensions; i++) { + mean[i] = 0; + } + float* cudaMeanPtr; + status = cudaMalloc(&cudaMeanPtr, numDimensions*sizeof(float)); + if (status != cudaSuccess) { + std::cout << " ERR: Malloc of mean" << std::endl; + return; + } + + if (keep != 0) { + // compute the mean so we can subtract from data + for (int i=0; i < numExamples; i++) { + Mat& m = trainingSet[i]; + + for (int j=0; j < numDimensions; j++) { + mean[j] += m.ptr()[i*numDimensions + j]; + } + } + for (int i=0; i < numDimensions; i++) { + mean[i] = mean[i] / numExamples; + } + + // copy mean over to graphics card + cudaMemcpy(cudaMeanPtr, mean, numExamples*sizeof(float), cudaMemcpyHostToDevice); + if (status != cudaSuccess) { + std::cout << " ERR: Cpy of mean" << std::endl; + return; + } + + // set the thread dimensions and run the kernel + dim3 threadsPerBlock(64, 1); + dim3 numBlocks(numDimensions/threadsPerBlock.x + 1, + numExamples/threadsPerBlock.y + 1); + + subtractMean_kernel<<>>(cudaDataPtr, cudaMeanPtr, numExamples, numDimensions); + + // calculate the covariance matrix using kernel + // malloc location for covariance matrix + float* cudaCovPtr; + status = cudaMalloc(&cudaCovPtr, numExamples*numExamples*sizeof(float)); + if (status != cudaSuccess) h + std::cout << " ERR: Cpy of mean" << std::endl; + return; + } + + // calculate the covariance matrix + threadsPerBlock = dim3(8, 8); + numBlocks = dim3(numExamples/threadsPerBlock.x + 1, + numExamples/threadsPerBlock.y + 1); + calculateCovariance_kernel<<>>(cudaDataPtr, cudaCovPtr, numExamples, numDimensions); + 
+    // perform eigendecomposition
+    //std::cout << "Skipping eigendecomposition" << std::endl;
+    cusolverStatus_t cusolverStatus;
+    cusolverStatus = cusolverDnSgebrd(cusolverHandle,)
+    }
+    */
+  }
+
+  void cudapca_projectwrapper(float* src, float* dst) {
+    // copy the image to the GPU
+    float* cudaSrcPtr;
+    cudaMalloc(&cudaSrcPtr, _meanRows*_meanCols*sizeof(float));
+    cudaMemcpy(cudaSrcPtr, src, _meanRows*_meanCols*sizeof(float), cudaMemcpyHostToDevice);
+
+    float* cudaDstPtr;
+    cudaMalloc(&cudaDstPtr, _keep*sizeof(float));
+
+    // subtract out the mean of the image (mean is 1xpixels in size)
+    int threadsPerBlock = 64;
+    int numBlocks = (_meanRows*_meanCols + threadsPerBlock - 1) / threadsPerBlock;
+    cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaMeanPtr, _meanRows*_meanCols);
+
+    // perform the multiplication
+    threadsPerBlock = 64;
+    numBlocks = (_keep + threadsPerBlock - 1) / threadsPerBlock;
+    cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>(cudaSrcPtr, cudaDstPtr, cudaEvPtr, _evRows, _evCols);
+
+    // copy the data back to the CPU
+    cudaMemcpy(dst, cudaDstPtr, _keep*sizeof(float), cudaMemcpyDeviceToHost);
+
+    cudaFree(cudaSrcPtr);
+    cudaFree(cudaDstPtr);
+  }
+}}
diff --git a/openbr/plugins/cuda/cudapca.hpp b/openbr/plugins/cuda/cudapca.hpp
new file mode 100644
index 0000000..b655e3e
--- /dev/null
+++ b/openbr/plugins/cuda/cudapca.hpp
@@ -0,0 +1,14 @@
+#include <opencv2/core/core.hpp>
+#include <opencv2/gpu/gpu.hpp>
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  void cudapca_initwrapper();
+
+  void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanRows, int meanCols, int keep);
+  void cudapca_trainwrapper();
+
+  void cudapca_projectwrapper(float* src, float* dst);
+}}