Commit 4e59b2b58f7bd025265838a29835fbbfb5225c51

Authored by DepthDeluxe
1 parent d025a5da

full carryover support with proper memory management

* wrapped all CUDA memory functions in safety calls
* fixed CUDALBP invalid memory access which caused performance and stability issues
* carryover support for all plugins, only copy over memory once throughout the whole computation
openbr/plugins/cuda/copyfrom.cpp
@@ -24,7 +24,6 @@ private: @@ -24,7 +24,6 @@ private:
24 { 24 {
25 // pull the data back out of the Mat 25 // pull the data back out of the Mat
26 void* const* dataPtr = src.m().ptr<void*>(); 26 void* const* dataPtr = src.m().ptr<void*>();
27 - void* cudaMemPtr = dataPtr[0];  
28 int rows = *((int*)dataPtr[1]); 27 int rows = *((int*)dataPtr[1]);
29 int cols = *((int*)dataPtr[2]); 28 int cols = *((int*)dataPtr[2]);
30 int type = *((int*)dataPtr[3]); 29 int type = *((int*)dataPtr[3]);
@@ -32,10 +31,10 @@ private: @@ -32,10 +31,10 @@ private:
32 Mat dstMat = Mat(rows, cols, type); 31 Mat dstMat = Mat(rows, cols, type);
33 switch(type) { 32 switch(type) {
34 case CV_32FC1: 33 case CV_32FC1:
35 - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<float>(), rows, cols); 34 + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
36 break; 35 break;
37 case CV_8UC1: 36 case CV_8UC1:
38 - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<unsigned char>(), rows, cols); 37 + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
39 break; 38 break;
40 default: 39 default:
41 cout << "ERR: Invalid image format" << endl; 40 cout << "ERR: Invalid image format" << endl;
openbr/plugins/cuda/copyfrom.cu
  1 +#include "cudadefines.hpp"
  2 +
1 namespace br { namespace cuda { namespace cudacopyfrom { 3 namespace br { namespace cuda { namespace cudacopyfrom {
2 template <typename T> void wrapper(void* src, T* dst, int rows, int cols) { 4 template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
3 - cudaMemcpy(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost);  
4 - cudaFree(src); 5 + cudaError_t err;
  6 + CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
  7 + CUDA_SAFE_FREE(src, &err);
5 } 8 }
6 9
7 template void wrapper(void*, float*, int, int); 10 template void wrapper(void*, float*, int, int);
openbr/plugins/cuda/copyto.cpp
@@ -27,29 +27,28 @@ private: @@ -27,29 +27,28 @@ private:
27 const int rows = srcMat.rows; 27 const int rows = srcMat.rows;
28 const int cols = srcMat.cols; 28 const int cols = srcMat.cols;
29 29
  30 + // output will be a single pointer to graphics card memory
  31 + Mat dstMat = Mat(4, 1, DataType<void*>::type);
  32 + void** dstMatData = dstMat.ptr<void*>();
  33 +
  34 + // save cuda ptr, rows, cols, then type
  35 + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows;
  36 + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols;
  37 + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type();
  38 +
30 void* cudaMemPtr; 39 void* cudaMemPtr;
31 switch(srcMat.type()) { 40 switch(srcMat.type()) {
32 case CV_32FC1: 41 case CV_32FC1:
33 - br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &cudaMemPtr, rows, cols); 42 + br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
34 break; 43 break;
35 case CV_8UC1: 44 case CV_8UC1:
36 - br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &cudaMemPtr, rows, cols); 45 + br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
37 break; 46 break;
38 default: 47 default:
39 cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl; 48 cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl;
40 return; 49 return;
41 } 50 }
42 51
43 - // output will be a single pointer to graphics card memory  
44 - Mat dstMat = Mat(4, 1, DataType<void*>::type);  
45 - void** dstMatData = dstMat.ptr<void*>();  
46 -  
47 - // save cuda ptr, rows, cols, then type  
48 - dstMatData[0] = cudaMemPtr;  
49 - dstMatData[1] = new int; *((int*)dstMatData[1]) = rows;  
50 - dstMatData[2] = new int; *((int*)dstMatData[2]) = cols;  
51 - dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type();  
52 -  
53 dst = dstMat; 52 dst = dstMat;
54 } 53 }
55 }; 54 };
openbr/plugins/cuda/copyto.cu
  1 +#include "cudadefines.hpp"
  2 +
1 namespace br { namespace cuda { namespace cudacopyto { 3 namespace br { namespace cuda { namespace cudacopyto {
2 template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) { 4 template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
3 - cudaMalloc(out, rows*cols*sizeof(T));  
4 - cudaMemcpy(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice); 5 + cudaError_t err;
  6 + CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
  7 + CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err);
5 } 8 }
6 9
7 template void wrapper(const float* in, void** out, const int rows, const int cols); 10 template void wrapper(const float* in, void** out, const int rows, const int cols);
openbr/plugins/cuda/cudacvtfloat.cpp
1 #include <iostream> 1 #include <iostream>
2 -#include <unistd.h>  
3 using namespace std; 2 using namespace std;
  3 +#include <unistd.h>
4 4
5 #include <opencv2/opencv.hpp> 5 #include <opencv2/opencv.hpp>
6 using namespace cv; 6 using namespace cv;
7 7
8 #include <openbr/plugins/openbr_internal.h> 8 #include <openbr/plugins/openbr_internal.h>
9 9
10 -  
11 namespace br { namespace cuda { namespace cudacvtfloat { 10 namespace br { namespace cuda { namespace cudacvtfloat {
12 - void wrapper(const unsigned char* src, void** dst, int rows, int cols); 11 + void wrapper(void* src, void** dst, int rows, int cols);
13 }}} 12 }}}
14 13
15 namespace br 14 namespace br
@@ -28,7 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform @@ -28,7 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform
28 void project(const Template &src, Template &dst) const 27 void project(const Template &src, Template &dst) const
29 { 28 {
30 void* const* srcDataPtr = src.m().ptr<void*>(); 29 void* const* srcDataPtr = src.m().ptr<void*>();
31 - void* srcMemPtr = srcDataPtr[0];  
32 int rows = *((int*)srcDataPtr[1]); 30 int rows = *((int*)srcDataPtr[1]);
33 int cols = *((int*)srcDataPtr[2]); 31 int cols = *((int*)srcDataPtr[2]);
34 int type = *((int*)srcDataPtr[3]); 32 int type = *((int*)srcDataPtr[3]);
@@ -47,7 +45,7 @@ class CUDACvtFloatTransform : public UntrainableTransform @@ -47,7 +45,7 @@ class CUDACvtFloatTransform : public UntrainableTransform
47 dstDataPtr[2] = srcDataPtr[2]; 45 dstDataPtr[2] = srcDataPtr[2];
48 dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; 46 dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;
49 47
50 - br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols); 48 + br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
51 dst = dstMat; 49 dst = dstMat;
52 } 50 }
53 }; 51 };
openbr/plugins/cuda/cudacvtfloat.cu
  1 +#include <iostream>
  2 +using namespace std;
  3 +
  4 +#include "cudadefines.hpp"
  5 +
1 namespace br { namespace cuda { namespace cudacvtfloat { 6 namespace br { namespace cuda { namespace cudacvtfloat {
2 7
3 __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { 8 __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
@@ -14,15 +19,15 @@ namespace br { namespace cuda { namespace cudacvtfloat { @@ -14,15 +19,15 @@ namespace br { namespace cuda { namespace cudacvtfloat {
14 dst[index] = (float)src[index]; 19 dst[index] = (float)src[index];
15 } 20 }
16 21
17 - void wrapper(const unsigned char* src, void** dst, int rows, int cols) { 22 + void wrapper(void* src, void** dst, int rows, int cols) {
18 //unsigned char* cudaSrc; 23 //unsigned char* cudaSrc;
19 //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); 24 //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
20 //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); 25 //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
21 26
22 //float* cudaDst; 27 //float* cudaDst;
23 //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); 28 //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
24 -  
25 - cudaMalloc(dst, rows*cols*sizeof(float)); 29 + cudaError_t err;
  30 + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
26 31
27 dim3 threadsPerBlock(8, 8); 32 dim3 threadsPerBlock(8, 8);
28 dim3 blocks( 33 dim3 blocks(
@@ -30,10 +35,11 @@ namespace br { namespace cuda { namespace cudacvtfloat { @@ -30,10 +35,11 @@ namespace br { namespace cuda { namespace cudacvtfloat {
30 rows / threadsPerBlock.y + 1 35 rows / threadsPerBlock.y + 1
31 ); 36 );
32 37
33 - kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols); 38 + kernel<<<threadsPerBlock, blocks>>>((const unsigned char*)src, (float*)(*dst), rows, cols);
  39 + CUDA_KERNEL_ERR_CHK(&err);
34 40
35 // free the src memory since it is now in a newly allocated dst 41 // free the src memory since it is now in a newly allocated dst
36 - cudaFree((void*)src); 42 + CUDA_SAFE_FREE(src, &err);
37 } 43 }
38 44
39 }}} 45 }}}
openbr/plugins/cuda/cudadefines.hpp 0 โ†’ 100644
  1 +#include <iostream>
  2 +using namespace std;
  3 +#include <pthread.h>
  4 +
  5 +#define CUDA_SAFE_FREE(cudaPtr, errPtr) \
  6 + /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \
  7 + *errPtr = cudaFree(cudaPtr); \
  8 + if (*errPtr != cudaSuccess) { \
  9 + cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
  10 + throw 0; \
  11 + }
  12 +
  13 +#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \
  14 + *errPtr = cudaMalloc(cudaPtrPtr, size); \
  15 + if (*errPtr != cudaSuccess) { \
  16 + cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
  17 + throw 0; \
  18 + } \
  19 + /* cout << pthread_self() << ": CUDA Malloc: " << (void*)*(int**)cudaPtrPtr << endl; */;
  20 +
  21 +#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \
  22 + *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \
  23 + if (*errPtr != cudaSuccess) { \
  24 + cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
  25 + throw 0; \
  26 + }
  27 +
  28 +#define CUDA_KERNEL_ERR_CHK(errPtr) \
  29 + *errPtr = cudaPeekAtLastError(); \
  30 + if (*errPtr != cudaSuccess) { \
  31 + cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
  32 + throw 0; \
  33 + }
openbr/plugins/cuda/cudalbp.cpp
@@ -84,7 +84,7 @@ class CUDALBPTransform : public UntrainableTransform @@ -84,7 +84,7 @@ class CUDALBPTransform : public UntrainableTransform
84 uchar lut[256]; 84 uchar lut[256];
85 uchar null; 85 uchar null;
86 86
87 - cuda::MatManager* matManager; 87 + //cuda::MatManager* matManager;
88 88
89 public: 89 public:
90 /* Returns the number of 0->1 or 1->0 transitions in i */ 90 /* Returns the number of 0->1 or 1->0 transitions in i */
@@ -137,7 +137,7 @@ class CUDALBPTransform : public UntrainableTransform @@ -137,7 +137,7 @@ class CUDALBPTransform : public UntrainableTransform
137 lut[i] = null; // Set to null id 137 lut[i] = null; // Set to null id
138 138
139 // init the mat manager for managing 10 mats 139 // init the mat manager for managing 10 mats
140 - matManager = new cuda::MatManager(10); 140 + //matManager = new cuda::MatManager(10);
141 141
142 // copy lut over to the GPU 142 // copy lut over to the GPU
143 br::cuda::cudalbp_init_wrapper(lut); 143 br::cuda::cudalbp_init_wrapper(lut);
@@ -167,7 +167,6 @@ class CUDALBPTransform : public UntrainableTransform @@ -167,7 +167,6 @@ class CUDALBPTransform : public UntrainableTransform
167 //matManager->release(b); 167 //matManager->release(b);
168 168
169 void* const* srcDataPtr = src.m().ptr<void*>(); 169 void* const* srcDataPtr = src.m().ptr<void*>();
170 - void* cudaSrcPtr = srcDataPtr[0];  
171 int rows = *((int*)srcDataPtr[1]); 170 int rows = *((int*)srcDataPtr[1]);
172 int cols = *((int*)srcDataPtr[2]); 171 int cols = *((int*)srcDataPtr[2]);
173 int type = *((int*)srcDataPtr[3]); 172 int type = *((int*)srcDataPtr[3]);
@@ -178,7 +177,7 @@ class CUDALBPTransform : public UntrainableTransform @@ -178,7 +177,7 @@ class CUDALBPTransform : public UntrainableTransform
178 dstDataPtr[2] = srcDataPtr[2]; 177 dstDataPtr[2] = srcDataPtr[2];
179 dstDataPtr[3] = srcDataPtr[3]; 178 dstDataPtr[3] = srcDataPtr[3];
180 179
181 - br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); 180 + br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
182 dst = dstMat; 181 dst = dstMat;
183 } 182 }
184 }; 183 };
openbr/plugins/cuda/cudalbp.cu
  1 +#include <iostream>
  2 +using namespace std;
  3 +
1 #include <opencv2/gpu/gpu.hpp> 4 #include <opencv2/gpu/gpu.hpp>
2 #include <stdio.h> 5 #include <stdio.h>
3 6
  7 +#include "cudadefines.hpp"
  8 +
4 using namespace cv; 9 using namespace cv;
5 using namespace cv::gpu; 10 using namespace cv::gpu;
6 11
@@ -8,7 +13,7 @@ namespace br { namespace cuda { @@ -8,7 +13,7 @@ namespace br { namespace cuda {
8 uint8_t* lut; 13 uint8_t* lut;
9 14
10 __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { 15 __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
11 - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; 16 + return (srcPtr + row*cols)[col];
12 } 17 }
13 18
14 __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) 19 __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
@@ -17,9 +22,16 @@ namespace br { namespace cuda { @@ -17,9 +22,16 @@ namespace br { namespace cuda {
17 int colInd = blockIdx.x*blockDim.x+threadIdx.x; 22 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
18 int radius = 1; 23 int radius = 1;
19 24
  25 + int index = rowInd*cols + colInd;
  26 +
20 // don't do anything if the index is out of bounds 27 // don't do anything if the index is out of bounds
21 - if (rowInd >= rows || colInd >= cols) {  
22 - return; 28 + if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
  29 + if (rowInd >= rows || colInd >= cols) {
  30 + return;
  31 + } else {
  32 + dstPtr[index] = 0;
  33 + return;
  34 + }
23 } 35 }
24 36
25 const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value 37 const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
@@ -33,26 +45,29 @@ namespace br { namespace cuda { @@ -33,26 +45,29 @@ namespace br { namespace cuda {
33 (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; 45 (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];
34 46
35 // store calculated value away in the right place 47 // store calculated value away in the right place
36 - int index = rowInd*cols + colInd;  
37 dstPtr[index] = val; 48 dstPtr[index] = val;
38 } 49 }
39 50
40 //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) 51 //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
41 void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) 52 void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
42 { 53 {
  54 + cudaError_t err;
  55 +
43 // make 8 * 8 = 64 square block 56 // make 8 * 8 = 64 square block
44 dim3 threadsPerBlock(8, 8); 57 dim3 threadsPerBlock(8, 8);
45 dim3 numBlocks(cols/threadsPerBlock.x + 1, 58 dim3 numBlocks(cols/threadsPerBlock.x + 1,
46 rows/threadsPerBlock.y + 1); 59 rows/threadsPerBlock.y + 1);
47 60
48 - cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); 61 + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
49 cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); 62 cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  63 + CUDA_KERNEL_ERR_CHK(&err);
50 64
51 - cudaFree(srcPtr); 65 + CUDA_SAFE_FREE(srcPtr, &err);
52 } 66 }
53 67
54 void cudalbp_init_wrapper(uint8_t* cpuLut) { 68 void cudalbp_init_wrapper(uint8_t* cpuLut) {
55 - cudaMalloc(&lut, 256*sizeof(uint8_t));  
56 - cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); 69 + cudaError_t err;
  70 + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
  71 + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
57 } 72 }
58 }} 73 }}
openbr/plugins/cuda/cudapca.cpp
@@ -15,6 +15,7 @@ @@ -15,6 +15,7 @@
15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16 #include <iostream> 16 #include <iostream>
17 using namespace std; 17 using namespace std;
  18 +#include <unistd.h>
18 19
19 #include <QList> 20 #include <QList>
20 21
@@ -30,7 +31,7 @@ using namespace cv; @@ -30,7 +31,7 @@ using namespace cv;
30 31
31 namespace br { namespace cuda { 32 namespace br { namespace cuda {
32 void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); 33 void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
33 - void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols); 34 + void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols);
34 void cudapca_projectwrapper(void* src, void** dst); 35 void cudapca_projectwrapper(void* src, void** dst);
35 }} 36 }}
36 37
@@ -82,12 +83,13 @@ private: @@ -82,12 +83,13 @@ private:
82 83
83 void train(const TemplateList &cudaTrainingSet) 84 void train(const TemplateList &cudaTrainingSet)
84 { 85 {
  86 + // copy the data back from the graphics card so the training can be done on the CPU
85 const int instances = cudaTrainingSet.size(); // get the number of training set instances 87 const int instances = cudaTrainingSet.size(); // get the number of training set instances
86 QList<Template> trainingQlist; 88 QList<Template> trainingQlist;
87 for(int i=0; i<instances; i++) { 89 for(int i=0; i<instances; i++) {
88 Template currentTemplate = cudaTrainingSet[i]; 90 Template currentTemplate = cudaTrainingSet[i];
89 void* const* srcDataPtr = currentTemplate.m().ptr<void*>(); 91 void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
90 - const void* cudaMemPtr = srcDataPtr[0]; 92 + void* cudaMemPtr = srcDataPtr[0];
91 int rows = *((int*)srcDataPtr[1]); 93 int rows = *((int*)srcDataPtr[1]);
92 int cols = *((int*)srcDataPtr[2]); 94 int cols = *((int*)srcDataPtr[2]);
93 int type = *((int*)srcDataPtr[3]); 95 int type = *((int*)srcDataPtr[3]);
@@ -95,29 +97,30 @@ private: @@ -95,29 +97,30 @@ private:
95 Mat mat = Mat(rows, cols, type); 97 Mat mat = Mat(rows, cols, type);
96 br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); 98 br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
97 trainingQlist.append(Template(mat)); 99 trainingQlist.append(Template(mat));
98 - TemplateList trainingSet;  
99 } 100 }
  101 +
  102 + // assemble a TemplateList from the list of data
100 TemplateList trainingSet(trainingQlist); 103 TemplateList trainingSet(trainingQlist);
101 104
102 - if (trainingSet.first().m().type() != CV_32FC1)  
103 - qFatal("Requires single channel 32-bit floating point matrices."); 105 + if (trainingSet.first().m().type() != CV_32FC1) {
  106 + qFatal("Requires single channel 32-bit floating point matrices.");
  107 + }
104 108
105 originalRows = trainingSet.first().m().rows; // get number of rows of first image 109 originalRows = trainingSet.first().m().rows; // get number of rows of first image
106 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image 110 int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
107 111
108 // Map into 64-bit Eigen matrix 112 // Map into 64-bit Eigen matrix
109 Eigen::MatrixXd data(dimsIn, instances); // create a mat 113 Eigen::MatrixXd data(dimsIn, instances); // create a mat
110 - for (int i=0; i<instances; i++)  
111 - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); 114 + for (int i=0; i<instances; i++) {
  115 + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
  116 + }
112 117
113 trainCore(data); 118 trainCore(data);
114 } 119 }
115 120
116 void project(const Template &src, Template &dst) const 121 void project(const Template &src, Template &dst) const
117 { 122 {
118 -  
119 void* const* srcDataPtr = src.m().ptr<void*>(); 123 void* const* srcDataPtr = src.m().ptr<void*>();
120 - void* cudaMemPtr = srcDataPtr[0];  
121 int rows = *((int*)srcDataPtr[1]); 124 int rows = *((int*)srcDataPtr[1]);
122 int cols = *((int*)srcDataPtr[2]); 125 int cols = *((int*)srcDataPtr[2]);
123 int type = *((int*)srcDataPtr[3]); 126 int type = *((int*)srcDataPtr[3]);
@@ -133,7 +136,7 @@ private: @@ -133,7 +136,7 @@ private:
133 dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; 136 dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
134 dstDataPtr[3] = srcDataPtr[3]; 137 dstDataPtr[3] = srcDataPtr[3];
135 138
136 - br::cuda::cudapca_projectwrapper(cudaMemPtr, &dstDataPtr[0]); 139 + br::cuda::cudapca_projectwrapper(srcDataPtr[0], &dstDataPtr[0]);
137 140
138 dst = dstMat; 141 dst = dstMat;
139 142
openbr/plugins/cuda/cudapca.cu
@@ -4,6 +4,8 @@ using namespace std; @@ -4,6 +4,8 @@ using namespace std;
4 #include <opencv2/opencv.hpp> 4 #include <opencv2/opencv.hpp>
5 #include <opencv2/gpu/gpu.hpp> 5 #include <opencv2/gpu/gpu.hpp>
6 6
  7 +#include "cudadefines.hpp"
  8 +
7 using namespace cv; 9 using namespace cv;
8 using namespace cv::gpu; 10 using namespace cv::gpu;
9 11
@@ -63,39 +65,45 @@ namespace br { namespace cuda { @@ -63,39 +65,45 @@ namespace br { namespace cuda {
63 _evRows = evRows; _evCols = evCols; 65 _evRows = evRows; _evCols = evCols;
64 _meanElems = meanElems; 66 _meanElems = meanElems;
65 67
  68 + cudaError_t err;
  69 +
66 // copy the eigenvectors to the GPU 70 // copy the eigenvectors to the GPU
67 - cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float));  
68 - cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice); 71 + CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);
  72 + CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);
69 73
70 // copy the mean to the GPU 74 // copy the mean to the GPU
71 - cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float));  
72 - cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); 75 + CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
  76 + CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);
73 77
74 - cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float));  
75 - cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float)); 78 + CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err);
  79 + CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err);
76 } 80 }
77 81
78 - void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols) {  
79 - cudaMemcpy(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost); 82 + void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols) {
  83 + cudaError_t err;
  84 + CUDA_SAFE_MEMCPY(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
  85 + CUDA_SAFE_FREE(cudaDataPtr, &err);
80 } 86 }
81 87
82 void cudapca_projectwrapper(void* src, void** dst) { 88 void cudapca_projectwrapper(void* src, void** dst) {
83 // copy the image to the GPU 89 // copy the image to the GPU
84 //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); 90 //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice);
85 -  
86 - cudaMalloc(dst, _evRows*_evCols*sizeof(float)); 91 + cudaError_t err;
  92 + CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err);
87 93
88 // subtract out the mean of the image (mean is 1xpixels in size) 94 // subtract out the mean of the image (mean is 1xpixels in size)
89 int threadsPerBlock = 64; 95 int threadsPerBlock = 64;
90 int numBlocks = _meanElems / threadsPerBlock + 1; 96 int numBlocks = _meanElems / threadsPerBlock + 1;
91 cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); 97 cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
  98 + CUDA_KERNEL_ERR_CHK(&err);
92 99
93 // perform the multiplication 100 // perform the multiplication
94 threadsPerBlock = 64; 101 threadsPerBlock = 64;
95 numBlocks = _evCols / threadsPerBlock + 1; 102 numBlocks = _evCols / threadsPerBlock + 1;
96 cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols); 103 cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols);
  104 + CUDA_KERNEL_ERR_CHK(&err);
97 105
98 - //cudaFree(src); // TODO(colin): figure out why adding this free causes memory corruption... 106 + CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...
99 107
100 // copy the data back to the CPU 108 // copy the data back to the CPU
101 //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); 109 //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
openbr/plugins/distance/dist.cpp
@@ -14,6 +14,9 @@ @@ -14,6 +14,9 @@
14 * limitations under the License. * 14 * limitations under the License. *
15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 15 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
16 16
  17 +#include <iostream>
  18 +using namespace std;
  19 +
17 #include <opencv2/imgproc/imgproc.hpp> 20 #include <opencv2/imgproc/imgproc.hpp>
18 #include <openbr/plugins/openbr_internal.h> 21 #include <openbr/plugins/openbr_internal.h>
19 22
@@ -52,6 +55,15 @@ private: @@ -52,6 +55,15 @@ private:
52 55
53 float compare(const Mat &a, const Mat &b) const 56 float compare(const Mat &a, const Mat &b) const
54 { 57 {
  58 + /*
  59 + cout << "Mat A" << endl;
  60 + cout << "rows: " << a.rows << "\tcols: " << a.cols << endl;
  61 + cout << "a.ptr<float>()[0]: " << a.ptr<float>()[0] << endl;
  62 + cout << "Mat B" << endl;
  63 + cout << "rows: " << b.rows << "\tcols: " << b.cols << endl;
  64 + cout << "b.ptr<float>()[0]: " << b.ptr<float>()[0] << endl;
  65 + */
  66 +
55 if ((a.size != b.size) || 67 if ((a.size != b.size) ||
56 (a.type() != b.type())) 68 (a.type() != b.type()))
57 return -std::numeric_limits<float>::max(); 69 return -std::numeric_limits<float>::max();