From 4e59b2b58f7bd025265838a29835fbbfb5225c51 Mon Sep 17 00:00:00 2001 From: DepthDeluxe Date: Mon, 15 Feb 2016 18:37:28 -0500 Subject: [PATCH] full carryover support with proper memory management * wrapped all CUDA memory functions in safety calls * fixed CUDALBP invalid memory access which caused performance and stability issues * carryover support for all plugins, only copy over memory once throughout the whole computation --- openbr/plugins/cuda/copyfrom.cpp | 5 ++--- openbr/plugins/cuda/copyfrom.cu | 7 +++++-- openbr/plugins/cuda/copyto.cpp | 23 +++++++++++------------ openbr/plugins/cuda/copyto.cu | 7 +++++-- openbr/plugins/cuda/cudacvtfloat.cpp | 8 +++----- openbr/plugins/cuda/cudacvtfloat.cu | 16 +++++++++++----- openbr/plugins/cuda/cudadefines.hpp | 33 +++++++++++++++++++++++++++++++++ openbr/plugins/cuda/cudalbp.cpp | 7 +++---- openbr/plugins/cuda/cudalbp.cu | 31 +++++++++++++++++++++++-------- openbr/plugins/cuda/cudapca.cpp | 23 +++++++++++++---------- openbr/plugins/cuda/cudapca.cu | 30 +++++++++++++++++++----------- openbr/plugins/distance/dist.cpp | 12 ++++++++++++ 12 files changed, 140 insertions(+), 62 deletions(-) create mode 100644 openbr/plugins/cuda/cudadefines.hpp diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp index 846b7ba..8b0a88b 100644 --- a/openbr/plugins/cuda/copyfrom.cpp +++ b/openbr/plugins/cuda/copyfrom.cpp @@ -24,7 +24,6 @@ private: { // pull the data back out of the Mat void* const* dataPtr = src.m().ptr(); - void* cudaMemPtr = dataPtr[0]; int rows = *((int*)dataPtr[1]); int cols = *((int*)dataPtr[2]); int type = *((int*)dataPtr[3]); @@ -32,10 +31,10 @@ private: Mat dstMat = Mat(rows, cols, type); switch(type) { case CV_32FC1: - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr(), rows, cols); + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; case CV_8UC1: - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr(), rows, cols); + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; default: cout << "ERR: Invalid image format" << endl; diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu index 2a85f93..ddc77f8 100644 --- a/openbr/plugins/cuda/copyfrom.cu +++ b/openbr/plugins/cuda/copyfrom.cu @@ -1,7 +1,10 @@ +#include "cudadefines.hpp" + namespace br { namespace cuda { namespace cudacopyfrom { template void wrapper(void* src, T* dst, int rows, int cols) { - cudaMemcpy(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost); - cudaFree(src); + cudaError_t err; + CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); + CUDA_SAFE_FREE(src, &err); } template void wrapper(void*, float*, int, int); diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp index 7288553..902c335 100644 --- a/openbr/plugins/cuda/copyto.cpp +++ b/openbr/plugins/cuda/copyto.cpp @@ -27,29 +27,28 @@ private: const int rows = srcMat.rows; const int cols = srcMat.cols; + // output will be a single pointer to graphics card memory + Mat dstMat = Mat(4, 1, DataType::type); + void** dstMatData = dstMat.ptr(); + + // save cuda ptr, rows, cols, then type + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); + void* cudaMemPtr; switch(srcMat.type()) { case CV_32FC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &cudaMemPtr, rows, cols); + br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; case CV_8UC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &cudaMemPtr, rows, cols); + br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; default: cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl; return; } - // output will be a single pointer to graphics card memory - Mat dstMat = Mat(4, 1, DataType::type); - void** dstMatData = dstMat.ptr(); - - // save cuda ptr, rows, cols, then type - dstMatData[0] = cudaMemPtr; - dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; - dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; - dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); - dst = dstMat; } }; diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu index feecb83..0742ff7 100644 --- a/openbr/plugins/cuda/copyto.cu +++ b/openbr/plugins/cuda/copyto.cu @@ -1,7 +1,10 @@ +#include "cudadefines.hpp" + namespace br { namespace cuda { namespace cudacopyto { template void wrapper(const T* in, void** out, const int rows, const int cols) { - cudaMalloc(out, rows*cols*sizeof(T)); - cudaMemcpy(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice); + cudaError_t err; + CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); + CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err); } template void wrapper(const float* in, void** out, const int rows, const int cols); diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp index fada942..d774a05 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cpp +++ b/openbr/plugins/cuda/cudacvtfloat.cpp @@ -1,15 +1,14 @@ #include -#include using namespace std; +#include #include using namespace cv; #include - namespace br { namespace cuda { namespace cudacvtfloat { - void wrapper(const unsigned char* src, void** dst, int rows, int cols); + void wrapper(void* src, void** dst, int rows, int cols); }}} namespace br @@ -28,7 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform void project(const Template &src, Template &dst) const { void* const* srcDataPtr = src.m().ptr(); - void* srcMemPtr = srcDataPtr[0]; int rows = *((int*)srcDataPtr[1]); int cols = *((int*)srcDataPtr[2]); int type = *((int*)srcDataPtr[3]); @@ -47,7 +45,7 @@ class CUDACvtFloatTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; - br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols); + br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu index 93651e6..70bf2e4 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cu +++ b/openbr/plugins/cuda/cudacvtfloat.cu @@ -1,3 +1,8 @@ +#include +using namespace std; + +#include "cudadefines.hpp" + namespace br { namespace cuda { namespace cudacvtfloat { __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { @@ -14,15 +19,15 @@ namespace br { namespace cuda { namespace cudacvtfloat { dst[index] = (float)src[index]; } - void wrapper(const unsigned char* src, void** dst, int rows, int cols) { + void wrapper(void* src, void** dst, int rows, int cols) { //unsigned char* cudaSrc; //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); //float* cudaDst; //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); - - cudaMalloc(dst, rows*cols*sizeof(float)); + cudaError_t err; + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); dim3 threadsPerBlock(8, 8); dim3 blocks( @@ -30,10 +35,11 @@ namespace br { namespace cuda { namespace cudacvtfloat { rows / threadsPerBlock.y + 1 ); - kernel<<>>(src, (float*)(*dst), rows, cols); + kernel<<>>((const unsigned char*)src, (float*)(*dst), rows, cols); + CUDA_KERNEL_ERR_CHK(&err); // free the src memory since it is now in a newly allocated dst - cudaFree((void*)src); + CUDA_SAFE_FREE(src, &err); } }}} diff --git a/openbr/plugins/cuda/cudadefines.hpp b/openbr/plugins/cuda/cudadefines.hpp new file mode 100644 index 0000000..354be2b --- /dev/null +++ b/openbr/plugins/cuda/cudadefines.hpp @@ -0,0 +1,33 @@ +#include +using namespace std; +#include + +#define CUDA_SAFE_FREE(cudaPtr, errPtr) \ + /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \ + *errPtr = cudaFree(cudaPtr); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } + +#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \ + *errPtr = cudaMalloc(cudaPtrPtr, size); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } \ + /* cout << pthread_self() << ": CUDA Malloc: " << (void*)*(int**)cudaPtrPtr << endl; */; + +#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \ + *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } + +#define CUDA_KERNEL_ERR_CHK(errPtr) \ + *errPtr = cudaPeekAtLastError(); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp index 4cebf7c..2385c9b 100644 --- a/openbr/plugins/cuda/cudalbp.cpp +++ b/openbr/plugins/cuda/cudalbp.cpp @@ -84,7 +84,7 @@ class CUDALBPTransform : public UntrainableTransform uchar lut[256]; uchar null; - cuda::MatManager* matManager; + //cuda::MatManager* matManager; public: /* Returns the number of 0->1 or 1->0 transitions in i */ @@ -137,7 +137,7 @@ class CUDALBPTransform : public UntrainableTransform lut[i] = null; // Set to null id // init the mat manager for managing 10 mats - matManager = new cuda::MatManager(10); + //matManager = new cuda::MatManager(10); // copy lut over to the GPU br::cuda::cudalbp_init_wrapper(lut); @@ -167,7 +167,6 @@ class CUDALBPTransform : public UntrainableTransform //matManager->release(b); void* const* srcDataPtr = src.m().ptr(); - void* cudaSrcPtr = srcDataPtr[0]; int rows = *((int*)srcDataPtr[1]); int cols = *((int*)srcDataPtr[2]); int type = *((int*)srcDataPtr[3]); @@ -178,7 +177,7 @@ class CUDALBPTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; - br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); + br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu index af4464c..bb5be9a 100644 --- a/openbr/plugins/cuda/cudalbp.cu +++ b/openbr/plugins/cuda/cudalbp.cu @@ -1,6 +1,11 @@ +#include +using namespace std; + #include #include +#include "cudadefines.hpp" + using namespace cv; using namespace cv::gpu; @@ -8,7 +13,7 @@ namespace br { namespace cuda { uint8_t* lut; __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; + return (srcPtr + row*cols)[col]; } __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) @@ -17,9 +22,16 @@ namespace br { namespace cuda { int colInd = blockIdx.x*blockDim.x+threadIdx.x; int radius = 1; + int index = rowInd*cols + colInd; + // don't do anything if the index is out of bounds - if (rowInd >= rows || colInd >= cols) { - return; + if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) { + if (rowInd >= rows || colInd >= cols) { + return; + } else { + dstPtr[index] = 0; + return; + } } const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value @@ -33,26 +45,29 @@ namespace br { namespace cuda { (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; // store calculated value away in the right place - int index = rowInd*cols + colInd; dstPtr[index] = val; } //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { + cudaError_t err; + // make 8 * 8 = 64 square block dim3 threadsPerBlock(8, 8); dim3 numBlocks(cols/threadsPerBlock.x + 1, rows/threadsPerBlock.y + 1); - cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); cudalbp_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); + CUDA_KERNEL_ERR_CHK(&err); - cudaFree(srcPtr); + CUDA_SAFE_FREE(srcPtr, &err); } void cudalbp_init_wrapper(uint8_t* cpuLut) { - cudaMalloc(&lut, 256*sizeof(uint8_t)); - cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); + cudaError_t err; + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err); + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err); } }} diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index 1ede027..7f56070 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -15,6 +15,7 @@ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #include using namespace std; +#include #include @@ -30,7 +31,7 @@ using namespace cv; namespace br { namespace cuda { void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); - void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols); + void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols); void cudapca_projectwrapper(void* src, void** dst); }} @@ -82,12 +83,13 @@ private: void train(const TemplateList &cudaTrainingSet) { + // copy the data back from the graphics card so the training can be done on the CPU const int instances = cudaTrainingSet.size(); // get the number of training set instances QList