diff --git a/openbr/plugins/cuda/GpuMatManager.cpp b/openbr/plugins/cuda/GpuMatManager.cpp
deleted file mode 100644
index 187a549..0000000
--- a/openbr/plugins/cuda/GpuMatManager.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <pthread.h>
-#include <semaphore.h>
-
-#include <opencv2/opencv.hpp>
-
-#include "GpuMatManager.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-namespace br { namespace cuda {
-  GpuMatManager::GpuMatManager(int num) {
-    _numMats = num;
-
-    // initialize the GpuMats
-    _mats = (GpuMat**)malloc(num * sizeof(GpuMat*));
-    _matTaken = (bool**)malloc(num * sizeof(bool*));
-    for (int i=0; i < num; i++) {
-      _mats[i] = new GpuMat();
-      _matTaken[i] = new bool;
-      (*_matTaken[i]) = false;
-    }
-
-    // initialize the locks
-    _matTakenLock = new pthread_mutex_t;
-    pthread_mutex_init(_matTakenLock, NULL);
-    _openCvOperationLock = new pthread_mutex_t;
-    pthread_mutex_init(_openCvOperationLock, NULL);
-
-    // initialize the semaphore
-    _matSemaphore = new sem_t;
-    sem_init(_matSemaphore, 0, _numMats);
-  }
-
-  GpuMat* GpuMatManager::reserve() {
-    GpuMat* reservedMat = NULL;
-
-    // get the reserved GpuMat
-    //sem_wait(_matSemaphore);
-    pthread_mutex_lock(_matTakenLock);
-    for (int i=0; i < _numMats; i++) {
-      if ( !(*_matTaken[i]) ) {
-        reservedMat = _mats[i];
-        *_matTaken[i] = true;
-        break;
-      }
-    }
-    pthread_mutex_unlock(_matTakenLock);
-
-    return reservedMat;
-  }
-
-  void GpuMatManager::upload(GpuMat* reservedMat, Mat& mat) {
-    // check the image Dimensions
-    if (reservedMat->size() != mat.size()) {
-      pthread_mutex_lock(_openCvOperationLock);
-      reservedMat->release();
-      reservedMat->create(mat.size(), mat.type());
-      pthread_mutex_unlock(_openCvOperationLock);
-    }
-
-    // upload the image
-    pthread_mutex_lock(_openCvOperationLock);
-    reservedMat->upload(mat);
-    pthread_mutex_unlock(_openCvOperationLock);
-    pthread_mutex_lock(_openCvOperationLock);
-    reservedMat->upload(mat);
-    pthread_mutex_unlock(_openCvOperationLock);
-  }
-
-  void GpuMatManager::matchDimensions(GpuMat* srcMat, GpuMat* dstMat) {
-    if (srcMat->size() != dstMat->size()) {
-      pthread_mutex_lock(_openCvOperationLock);
-      dstMat->release();
-      dstMat->create(srcMat->size(), srcMat->type());
-      pthread_mutex_unlock(_openCvOperationLock);
-    }
-  }
-
-  void GpuMatManager::download(GpuMat* reservedMat, Mat& dstMat) {
-    pthread_mutex_lock(_openCvOperationLock);
-    reservedMat->download(dstMat);
-    pthread_mutex_unlock(_openCvOperationLock);
-  }
-
-  void GpuMatManager::release(GpuMat* reservedMat) {
-    pthread_mutex_lock(_matTakenLock);
-    bool foundMatch = false;
-    for (int i=0; i < _numMats; i++) {
-      if (reservedMat == _mats[i]) {
-        *_matTaken[i] = false;
-        foundMatch = true;
-      }
-    }
-    pthread_mutex_unlock(_matTakenLock);
-
-    // return unconditionally if we didn't find a match
-    if (!foundMatch) {
-      return;
-    }
-
-    sem_post(_matSemaphore);
-  }
-
-  GpuMatManager::~GpuMatManager() {
-    // assume a single thread is destroying the manager
-    // TODO(colin): add the destroy code
-  }
-
-}}
diff --git a/openbr/plugins/cuda/MatManager.cu b/openbr/plugins/cuda/MatManager.cu
new file mode 100644
index 0000000..daca2c4
--- /dev/null
+++ b/openbr/plugins/cuda/MatManager.cu
@@ -0,0 +1,185 @@
+#include <pthread.h>
+#include <semaphore.h>
+
+#include <opencv2/opencv.hpp>
+
+#include "MatManager.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+  MatManager::MatManager(int num) {
+    _numMats = num;
+
+    // initialize the an array of Mats
+    _mats = (uint8_t**)malloc(num * sizeof(uint8_t*));
+    _matTaken = (bool**)malloc(num * sizeof(bool*));
+    _matsDimension = (int**) malloc(num * sizeof(int));
+
+    for (int i=0; i < num; i++) {
+      cudaMalloc(&_mats[i], 1 * sizeof(uint8_t));
+      //_mats[i] = new GpuMat();
+
+      _matTaken[i] = new bool;
+      (*_matTaken[i]) = false;
+
+      // initialize all mat dimensions to be 1
+      _matsDimension[i] = new int;
+      (*_matsDimension[i]) = 1;
+    }
+
+    // initialize the locks
+    _matTakenLock = new pthread_mutex_t;
+    pthread_mutex_init(_matTakenLock, NULL);
+    _matsDimensionLock = new pthread_mutex_t;
+    pthread_mutex_init(_matsDimensionLock, NULL);
+
+    // initialize the semaphore
+    _matSemaphore = new sem_t;
+    sem_init(_matSemaphore, 0, _numMats);
+  }
+
+  uint8_t* MatManager::reserve(Mat *mat) {
+    int reservedMatIndex = 0;
+    std::cout << "Reserving" << std::endl << std::flush;
+
+    sem_wait(_matSemaphore);
+    pthread_mutex_lock(_matTakenLock);
+    int i;
+    for (i=0; i < _numMats; i++) {
+      if ( !(*_matTaken[i]) ) {
+        *_matTaken[i] = true;
+        reservedMatIndex = i;
+        std::cout << "Taking " << i << std::endl << std::flush;
+        break;
+      }
+    }
+    if (i == _numMats) {
+      std::cout << "Cannot reserve a mat. Not enough GpuMat resourses\n" << std::endl << std::flush;
+    }
+
+    //printMats();
+    //printSemValue();
+    pthread_mutex_unlock(_matTakenLock);
+
+    // reallocate if size does not match
+    pthread_mutex_lock(_matsDimensionLock);
+    if (*_matsDimension[reservedMatIndex] != mat->rows * mat->cols) {
+      //printSizeChangingMat(reservedMat);
+      //reservedMat->release();
+      //reservedMat->create(mat->size(), mat->type());
+      std::cout << "Size mismatch" << std::endl << std::flush; 
+      // re malloc
+      cudaFree(_mats[reservedMatIndex]); // free the previous memory first
+      cudaMalloc(&_mats[reservedMatIndex], mat->rows * mat->cols * sizeof(uint8_t));
+      // change the dimension of that matrix
+      *_matsDimension[reservedMatIndex] = mat->rows * mat->cols; 
+
+    } 
+    pthread_mutex_unlock(_matsDimensionLock);
+    return _mats[reservedMatIndex];
+  }
+
+  void MatManager::upload(uint8_t* reservedMat, Mat& mat) {
+    // upload the image
+    /*
+    pthread_mutex_lock(_matsDimensionLock);
+    reservedMat->upload(mat);
+    pthread_mutex_unlock(_matsDimensionLock);
+    */
+
+    // copy the content of the Mat to GPU
+    cudaMemcpy(reservedMat, mat.ptr<uint8_t>(), mat.rows * mat.cols, cudaMemcpyHostToDevice);
+  }
+
+  void MatManager::download(uint8_t* reservedMat, Mat& dstMat) {
+    /*
+    pthread_mutex_lock(_matsDimensionLock);
+    reservedMat->download(dstMat);
+    pthread_mutex_unlock(_matsDimensionLock);
+    */
+    
+    // copy the mat data back
+    int dimension = dstMat.rows * dstMat.cols;
+    cudaMemcpy(dstMat.ptr<uint8_t>(), reservedMat, dimension, cudaMemcpyDeviceToHost);
+  }
+
+  void MatManager::release(uint8_t* reservedMat) {
+    pthread_mutex_lock(_matTakenLock);
+    bool foundMatch = false;
+    for (int i=0; i < _numMats; i++) {
+      if (reservedMat == _mats[i]) {
+        *_matTaken[i] = false;
+        foundMatch = true;
+      }
+    }
+    pthread_mutex_unlock(_matTakenLock);
+
+    // return unconditionally if we didn't find a match
+    if (!foundMatch) {
+      std::cout << "Reservedmat is not in the _mats array" << std::endl << std::flush;
+      return;
+    }
+    /*
+    printReleasingMat(reservedMat);
+    pthread_mutex_lock(_matsDimensionLock);
+    Size size = reservedMat->size();
+    int type = reservedMat->type();
+    reservedMat->release();
+    reservedMat->create(size, type);
+    
+
+    
+    pthread_mutex_unlock(_matsDimensionLock);
+    */
+
+    sem_post(_matSemaphore);
+  }
+
+  MatManager::~MatManager() {
+    // assume a single thread is destroying the manager
+    // TODO(colin): add the destroy code
+    std::cout << "Start to destroy.." << std::endl << std::flush;
+  }
+
+  /*
+  void MatManager::printMats() {
+    for (int i = 0; i < _numMats; i++) {
+      if ((*_matTaken[i]) == true) {
+        std::cout << i << ": Taken, " << _mats[i]->size() << std::endl << std::flush;
+      } else {
+        std::cout << i << ": Not taken, " << _mats[i]->size() << std::endl << std::flush;
+      }
+    }
+    std::cout << std::endl << std::flush;
+  }
+
+  void MatManager::printSemValue() {
+    int semValue;
+    sem_getvalue(_matSemaphore, &semValue);
+    std::cout << "Sem value: " << semValue << std::endl << std::flush;
+  }
+
+  void MatManager::printSizeChangingMat(GpuMat* gpuMat) {
+    for (int i=0; i < _numMats; i++) {
+      if (gpuMat == _mats[i]) {
+        std::cout << "changing is size of" << i << " at " << gpuMat <<  std::endl << std::flush;
+        return;
+      }
+    }
+    std::cout << "can't change size of mat at address: " << gpuMat << std::endl << std::flush;
+  }
+
+  void MatManager::printReleasingMat(GpuMat* gpuMat) {
+    for (int i=0; i < _numMats; i++) {
+      if (gpuMat == _mats[i]) {
+        std::cout << "releasing mat" << i << " at " << gpuMat <<  std::endl << std::flush;
+        return;
+      }
+    }
+    std::cout << "can't release mat at address: " << gpuMat << std::endl << std::flush;
+  }
+*/
+
+}}
diff --git a/openbr/plugins/cuda/GpuMatManager.hpp b/openbr/plugins/cuda/MatManager.hpp
index 86fb169..9302200 100644
--- a/openbr/plugins/cuda/GpuMatManager.hpp
+++ b/openbr/plugins/cuda/MatManager.hpp
@@ -8,25 +8,29 @@ using namespace cv;
 using namespace cv::gpu;
 
 namespace br { namespace cuda {
-  class GpuMatManager {
+  class MatManager {
   private:
     int _numMats;
-    GpuMat** _mats;         // holds all the mats
+    uint8_t** _mats;         // holds all the mats
     bool** _matTaken;       // holds whether or not they are taken
+    int** _matsDimension;   // holds the dimension of the Mats
 
     pthread_mutex_t* _matTakenLock;            // lock for matTaken table
-    pthread_mutex_t* _openCvOperationLock;     // lock for OpenCV upload/download/realloc operations
+    pthread_mutex_t* _matsDimensionLock;     // lock for OpenCV upload/download/realloc operations
     sem_t* _matSemaphore;
 
   public:
-    GpuMatManager(int num);
+    MatManager(int num);
 
-    GpuMat* reserve();
-    void upload(GpuMat* reservedMat, Mat& mat);
-    void matchDimensions(GpuMat* srcMat, GpuMat* dstMat);
-    void download(GpuMat* reservedMat, Mat& dstMat);
-    void release(GpuMat* mat);
+    uint8_t* reserve(Mat *mat);
+    void upload(uint8_t* reservedMat, Mat& mat);
+    void download(uint8_t* reservedMat, Mat& dstMat);
+    void release(uint8_t* mat);
 
-    ~GpuMatManager();
+    ~MatManager();
+    //void printMats();
+    //void printSemValue();
+    //void printSizeChangingMat(uint8_t* gpuMat);
+    //void printReleasingMat(uint8_t* gpuMat);
   };
 }}
diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp
index dca1b79..f5dc029 100644
--- a/openbr/plugins/cuda/cudalbp.cpp
+++ b/openbr/plugins/cuda/cudalbp.cpp
@@ -33,7 +33,7 @@
 #include <openbr/plugins/openbr_internal.h>
 
 #include "cudalbp.hpp"
-#include "GpuMatManager.hpp"
+#include "MatManager.hpp"
 
 using namespace cv;
 
@@ -90,7 +90,7 @@ class CUDALBPTransform : public UntrainableTransform
     uchar null;
 
 
-    cuda::GpuMatManager* matManager;
+    cuda::MatManager* matManager;
 
   public:
     /* Returns the number of 0->1 or 1->0 transitions in i */
@@ -143,7 +143,7 @@ class CUDALBPTransform : public UntrainableTransform
                 lut[i] = null; // Set to null id
 
         // init the mat manager for managing 10 mats
-        matManager = new cuda::GpuMatManager(10);
+        matManager = new cuda::MatManager(10);
 
         // copy lut over to the GPU
         br::cuda::cudalbp_init_wrapper(lut, &lutGpuPtr);
@@ -154,23 +154,29 @@ class CUDALBPTransform : public UntrainableTransform
     void project(const Template &src, Template &dst) const
     {
         Mat& m = (Mat&)src.m();
-
-        GpuMat* a;
-        GpuMat* b;
-        a = matManager->reserve();
+        uint8_t* a;
+        uint8_t* b;
+        a = matManager->reserve(&m);
+//        std::cout << "m: " << m.size() << ", " << m.type() << std::endl << std::flush;
+//        std::cout << "a: " << a->size() << ", " << a->type() << std::endl << std::flush;
         matManager->upload(a, m);
 
         // reserve the second mat and check the dimensiosn
-        b = matManager->reserve();
-        matManager->matchDimensions(b, a);
-
-        br::cuda::cudalbp_wrapper(*a, *b, lutGpuPtr);
-
+        b = matManager->reserve(&m);
+        //matManager->matchDimensions(b, a);
+        
+        //std::cout << "Coming to here" << std::endl << std::flush;
+        br::cuda::cudalbp_wrapper(a, b, lutGpuPtr, m.cols, m.rows, m.step1());
+        //std::cout << "Coming out of here" << std::endl << std::flush;
+
+        //std::cout << "Start to download" << std::endl << std::flush;
         matManager->download(b, dst);
+        //std::cout << "finish download" << std::endl << std::flush;
 
         // release both the mats
         matManager->release(a);
         matManager->release(b);
+        std::cout << "finish release" << std::endl << std::flush;
     }
 };
 
diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu
index 84644f0..23aec27 100644
--- a/openbr/plugins/cuda/cudalbp.cu
+++ b/openbr/plugins/cuda/cudalbp.cu
@@ -36,15 +36,8 @@ namespace br { namespace cuda {
     dstRowPtr[colInd] = val;
   }
 
-  void cudalbp_wrapper(GpuMat& src, GpuMat& dst, uint8_t* lut)
+  void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
   {
-    // convert the GpuMats to pointers
-    uint8_t* srcPtr = (uint8_t*)src.data;
-    uint8_t* dstPtr = (uint8_t*)dst.data;
-
-    int imageWidth = src.cols;
-    int imageHeight = src.rows;
-
     // make 8 * 8 = 64 square block
     dim3 threadsPerBlock(8, 8);
 
@@ -55,7 +48,7 @@ namespace br { namespace cuda {
     //printf("Dst Image Dimesions:\n\trows: %d\tcols: %d\n", dst.rows, dst.cols);
     //printf("Running CUDALBP\nBlock Dimensions:\n\tx: %d\ty: %d\n", numBlocks.x, numBlocks.y);
 
-    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageHeight, imageWidth, lut);
+    cudalbp_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, step, step, imageHeight, imageWidth, lut);
   }
 
   void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr) {
diff --git a/openbr/plugins/cuda/cudalbp.hpp b/openbr/plugins/cuda/cudalbp.hpp
index f6569be..7fa270c 100644
--- a/openbr/plugins/cuda/cudalbp.hpp
+++ b/openbr/plugins/cuda/cudalbp.hpp
@@ -5,5 +5,5 @@ using namespace cv::gpu;
 
 namespace br { namespace cuda {
   void cudalbp_init_wrapper(uint8_t* lut, uint8_t** lutGpuPtrPtr);
-  void cudalbp_wrapper(GpuMat& src, GpuMat& dst, uint8_t* lut);
+  void cudalbp_wrapper(uint8_t* src, uint8_t* dst, uint8_t* lut, int imageWidth, int imageHeight, size_t step);
 }}