Commit 4e59b2b58f7bd025265838a29835fbbfb5225c51
1 parent
d025a5da
full carryover support with proper memory management
* wrapped all CUDA memory functions in safety calls * fixed CUDALBP invalid memory access which caused performance and stability issues * carryover support for all plugins, only copy over memory once throughout the whole computation
Showing
12 changed files
with
140 additions
and
62 deletions
openbr/plugins/cuda/copyfrom.cpp
| ... | ... | @@ -24,7 +24,6 @@ private: |
| 24 | 24 | { |
| 25 | 25 | // pull the data back out of the Mat |
| 26 | 26 | void* const* dataPtr = src.m().ptr<void*>(); |
| 27 | - void* cudaMemPtr = dataPtr[0]; | |
| 28 | 27 | int rows = *((int*)dataPtr[1]); |
| 29 | 28 | int cols = *((int*)dataPtr[2]); |
| 30 | 29 | int type = *((int*)dataPtr[3]); |
| ... | ... | @@ -32,10 +31,10 @@ private: |
| 32 | 31 | Mat dstMat = Mat(rows, cols, type); |
| 33 | 32 | switch(type) { |
| 34 | 33 | case CV_32FC1: |
| 35 | - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<float>(), rows, cols); | |
| 34 | + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols); | |
| 36 | 35 | break; |
| 37 | 36 | case CV_8UC1: |
| 38 | - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<unsigned char>(), rows, cols); | |
| 37 | + br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols); | |
| 39 | 38 | break; |
| 40 | 39 | default: |
| 41 | 40 | cout << "ERR: Invalid image format" << endl; | ... | ... |
openbr/plugins/cuda/copyfrom.cu
| 1 | +#include "cudadefines.hpp" | |
| 2 | + | |
| 1 | 3 | namespace br { namespace cuda { namespace cudacopyfrom { |
| 2 | 4 | template <typename T> void wrapper(void* src, T* dst, int rows, int cols) { |
| 3 | - cudaMemcpy(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost); | |
| 4 | - cudaFree(src); | |
| 5 | + cudaError_t err; | |
| 6 | + CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); | |
| 7 | + CUDA_SAFE_FREE(src, &err); | |
| 5 | 8 | } |
| 6 | 9 | |
| 7 | 10 | template void wrapper(void*, float*, int, int); | ... | ... |
openbr/plugins/cuda/copyto.cpp
| ... | ... | @@ -27,29 +27,28 @@ private: |
| 27 | 27 | const int rows = srcMat.rows; |
| 28 | 28 | const int cols = srcMat.cols; |
| 29 | 29 | |
| 30 | + // output will be a single pointer to graphics card memory | |
| 31 | + Mat dstMat = Mat(4, 1, DataType<void*>::type); | |
| 32 | + void** dstMatData = dstMat.ptr<void*>(); | |
| 33 | + | |
| 34 | + // save cuda ptr, rows, cols, then type | |
| 35 | + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; | |
| 36 | + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; | |
| 37 | + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); | |
| 38 | + | |
| 30 | 39 | void* cudaMemPtr; |
| 31 | 40 | switch(srcMat.type()) { |
| 32 | 41 | case CV_32FC1: |
| 33 | - br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &cudaMemPtr, rows, cols); | |
| 42 | + br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols); | |
| 34 | 43 | break; |
| 35 | 44 | case CV_8UC1: |
| 36 | - br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &cudaMemPtr, rows, cols); | |
| 45 | + br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols); | |
| 37 | 46 | break; |
| 38 | 47 | default: |
| 39 | 48 | cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl; |
| 40 | 49 | return; |
| 41 | 50 | } |
| 42 | 51 | |
| 43 | - // output will be a single pointer to graphics card memory | |
| 44 | - Mat dstMat = Mat(4, 1, DataType<void*>::type); | |
| 45 | - void** dstMatData = dstMat.ptr<void*>(); | |
| 46 | - | |
| 47 | - // save cuda ptr, rows, cols, then type | |
| 48 | - dstMatData[0] = cudaMemPtr; | |
| 49 | - dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; | |
| 50 | - dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; | |
| 51 | - dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); | |
| 52 | - | |
| 53 | 52 | dst = dstMat; |
| 54 | 53 | } |
| 55 | 54 | }; | ... | ... |
openbr/plugins/cuda/copyto.cu
| 1 | +#include "cudadefines.hpp" | |
| 2 | + | |
| 1 | 3 | namespace br { namespace cuda { namespace cudacopyto { |
| 2 | 4 | template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) { |
| 3 | - cudaMalloc(out, rows*cols*sizeof(T)); | |
| 4 | - cudaMemcpy(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice); | |
| 5 | + cudaError_t err; | |
| 6 | + CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); | |
| 7 | + CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err); | |
| 5 | 8 | } |
| 6 | 9 | |
| 7 | 10 | template void wrapper(const float* in, void** out, const int rows, const int cols); | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cpp
| 1 | 1 | #include <iostream> |
| 2 | -#include <unistd.h> | |
| 3 | 2 | using namespace std; |
| 3 | +#include <unistd.h> | |
| 4 | 4 | |
| 5 | 5 | #include <opencv2/opencv.hpp> |
| 6 | 6 | using namespace cv; |
| 7 | 7 | |
| 8 | 8 | #include <openbr/plugins/openbr_internal.h> |
| 9 | 9 | |
| 10 | - | |
| 11 | 10 | namespace br { namespace cuda { namespace cudacvtfloat { |
| 12 | - void wrapper(const unsigned char* src, void** dst, int rows, int cols); | |
| 11 | + void wrapper(void* src, void** dst, int rows, int cols); | |
| 13 | 12 | }}} |
| 14 | 13 | |
| 15 | 14 | namespace br |
| ... | ... | @@ -28,7 +27,6 @@ class CUDACvtFloatTransform : public UntrainableTransform |
| 28 | 27 | void project(const Template &src, Template &dst) const |
| 29 | 28 | { |
| 30 | 29 | void* const* srcDataPtr = src.m().ptr<void*>(); |
| 31 | - void* srcMemPtr = srcDataPtr[0]; | |
| 32 | 30 | int rows = *((int*)srcDataPtr[1]); |
| 33 | 31 | int cols = *((int*)srcDataPtr[2]); |
| 34 | 32 | int type = *((int*)srcDataPtr[3]); |
| ... | ... | @@ -47,7 +45,7 @@ class CUDACvtFloatTransform : public UntrainableTransform |
| 47 | 45 | dstDataPtr[2] = srcDataPtr[2]; |
| 48 | 46 | dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; |
| 49 | 47 | |
| 50 | - br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols); | |
| 48 | + br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); | |
| 51 | 49 | dst = dstMat; |
| 52 | 50 | } |
| 53 | 51 | }; | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cu
| 1 | +#include <iostream> | |
| 2 | +using namespace std; | |
| 3 | + | |
| 4 | +#include "cudadefines.hpp" | |
| 5 | + | |
| 1 | 6 | namespace br { namespace cuda { namespace cudacvtfloat { |
| 2 | 7 | |
| 3 | 8 | __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { |
| ... | ... | @@ -14,15 +19,15 @@ namespace br { namespace cuda { namespace cudacvtfloat { |
| 14 | 19 | dst[index] = (float)src[index]; |
| 15 | 20 | } |
| 16 | 21 | |
| 17 | - void wrapper(const unsigned char* src, void** dst, int rows, int cols) { | |
| 22 | + void wrapper(void* src, void** dst, int rows, int cols) { | |
| 18 | 23 | //unsigned char* cudaSrc; |
| 19 | 24 | //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); |
| 20 | 25 | //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); |
| 21 | 26 | |
| 22 | 27 | //float* cudaDst; |
| 23 | 28 | //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); |
| 24 | - | |
| 25 | - cudaMalloc(dst, rows*cols*sizeof(float)); | |
| 29 | + cudaError_t err; | |
| 30 | + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); | |
| 26 | 31 | |
| 27 | 32 | dim3 threadsPerBlock(8, 8); |
| 28 | 33 | dim3 blocks( |
| ... | ... | @@ -30,10 +35,11 @@ namespace br { namespace cuda { namespace cudacvtfloat { |
| 30 | 35 | rows / threadsPerBlock.y + 1 |
| 31 | 36 | ); |
| 32 | 37 | |
| 33 | - kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols); | |
| 38 | + kernel<<<threadsPerBlock, blocks>>>((const unsigned char*)src, (float*)(*dst), rows, cols); | |
| 39 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 34 | 40 | |
| 35 | 41 | // free the src memory since it is now in a newly allocated dst |
| 36 | - cudaFree((void*)src); | |
| 42 | + CUDA_SAFE_FREE(src, &err); | |
| 37 | 43 | } |
| 38 | 44 | |
| 39 | 45 | }}} | ... | ... |
openbr/plugins/cuda/cudadefines.hpp
0 โ 100644
| 1 | +#include <iostream> | |
| 2 | +using namespace std; | |
| 3 | +#include <pthread.h> | |
| 4 | + | |
| 5 | +#define CUDA_SAFE_FREE(cudaPtr, errPtr) \ | |
| 6 | + /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \ | |
| 7 | + *errPtr = cudaFree(cudaPtr); \ | |
| 8 | + if (*errPtr != cudaSuccess) { \ | |
| 9 | + cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ | |
| 10 | + throw 0; \ | |
| 11 | + } | |
| 12 | + | |
| 13 | +#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \ | |
| 14 | + *errPtr = cudaMalloc(cudaPtrPtr, size); \ | |
| 15 | + if (*errPtr != cudaSuccess) { \ | |
| 16 | + cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ | |
| 17 | + throw 0; \ | |
| 18 | + } \ | |
| 19 | + /* cout << pthread_self() << ": CUDA Malloc: " << (void*)*(int**)cudaPtrPtr << endl; */; | |
| 20 | + | |
| 21 | +#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \ | |
| 22 | + *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \ | |
| 23 | + if (*errPtr != cudaSuccess) { \ | |
| 24 | + cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ | |
| 25 | + throw 0; \ | |
| 26 | + } | |
| 27 | + | |
| 28 | +#define CUDA_KERNEL_ERR_CHK(errPtr) \ | |
| 29 | + *errPtr = cudaPeekAtLastError(); \ | |
| 30 | + if (*errPtr != cudaSuccess) { \ | |
| 31 | + cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ | |
| 32 | + throw 0; \ | |
| 33 | + } | ... | ... |
openbr/plugins/cuda/cudalbp.cpp
| ... | ... | @@ -84,7 +84,7 @@ class CUDALBPTransform : public UntrainableTransform |
| 84 | 84 | uchar lut[256]; |
| 85 | 85 | uchar null; |
| 86 | 86 | |
| 87 | - cuda::MatManager* matManager; | |
| 87 | + //cuda::MatManager* matManager; | |
| 88 | 88 | |
| 89 | 89 | public: |
| 90 | 90 | /* Returns the number of 0->1 or 1->0 transitions in i */ |
| ... | ... | @@ -137,7 +137,7 @@ class CUDALBPTransform : public UntrainableTransform |
| 137 | 137 | lut[i] = null; // Set to null id |
| 138 | 138 | |
| 139 | 139 | // init the mat manager for managing 10 mats |
| 140 | - matManager = new cuda::MatManager(10); | |
| 140 | + //matManager = new cuda::MatManager(10); | |
| 141 | 141 | |
| 142 | 142 | // copy lut over to the GPU |
| 143 | 143 | br::cuda::cudalbp_init_wrapper(lut); |
| ... | ... | @@ -167,7 +167,6 @@ class CUDALBPTransform : public UntrainableTransform |
| 167 | 167 | //matManager->release(b); |
| 168 | 168 | |
| 169 | 169 | void* const* srcDataPtr = src.m().ptr<void*>(); |
| 170 | - void* cudaSrcPtr = srcDataPtr[0]; | |
| 171 | 170 | int rows = *((int*)srcDataPtr[1]); |
| 172 | 171 | int cols = *((int*)srcDataPtr[2]); |
| 173 | 172 | int type = *((int*)srcDataPtr[3]); |
| ... | ... | @@ -178,7 +177,7 @@ class CUDALBPTransform : public UntrainableTransform |
| 178 | 177 | dstDataPtr[2] = srcDataPtr[2]; |
| 179 | 178 | dstDataPtr[3] = srcDataPtr[3]; |
| 180 | 179 | |
| 181 | - br::cuda::cudalbp_wrapper(cudaSrcPtr, &dstDataPtr[0], rows, cols); | |
| 180 | + br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); | |
| 182 | 181 | dst = dstMat; |
| 183 | 182 | } |
| 184 | 183 | }; | ... | ... |
openbr/plugins/cuda/cudalbp.cu
| 1 | +#include <iostream> | |
| 2 | +using namespace std; | |
| 3 | + | |
| 1 | 4 | #include <opencv2/gpu/gpu.hpp> |
| 2 | 5 | #include <stdio.h> |
| 3 | 6 | |
| 7 | +#include "cudadefines.hpp" | |
| 8 | + | |
| 4 | 9 | using namespace cv; |
| 5 | 10 | using namespace cv::gpu; |
| 6 | 11 | |
| ... | ... | @@ -8,7 +13,7 @@ namespace br { namespace cuda { |
| 8 | 13 | uint8_t* lut; |
| 9 | 14 | |
| 10 | 15 | __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { |
| 11 | - return (row >= rows || col >= cols) ? 0 : (srcPtr + row*cols)[col]; | |
| 16 | + return (srcPtr + row*cols)[col]; | |
| 12 | 17 | } |
| 13 | 18 | |
| 14 | 19 | __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) |
| ... | ... | @@ -17,9 +22,16 @@ namespace br { namespace cuda { |
| 17 | 22 | int colInd = blockIdx.x*blockDim.x+threadIdx.x; |
| 18 | 23 | int radius = 1; |
| 19 | 24 | |
| 25 | + int index = rowInd*cols + colInd; | |
| 26 | + | |
| 20 | 27 | // don't do anything if the index is out of bounds |
| 21 | - if (rowInd >= rows || colInd >= cols) { | |
| 22 | - return; | |
| 28 | + if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) { | |
| 29 | + if (rowInd >= rows || colInd >= cols) { | |
| 30 | + return; | |
| 31 | + } else { | |
| 32 | + dstPtr[index] = 0; | |
| 33 | + return; | |
| 34 | + } | |
| 23 | 35 | } |
| 24 | 36 | |
| 25 | 37 | const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value |
| ... | ... | @@ -33,26 +45,29 @@ namespace br { namespace cuda { |
| 33 | 45 | (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; |
| 34 | 46 | |
| 35 | 47 | // store calculated value away in the right place |
| 36 | - int index = rowInd*cols + colInd; | |
| 37 | 48 | dstPtr[index] = val; |
| 38 | 49 | } |
| 39 | 50 | |
| 40 | 51 | //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) |
| 41 | 52 | void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) |
| 42 | 53 | { |
| 54 | + cudaError_t err; | |
| 55 | + | |
| 43 | 56 | // make 8 * 8 = 64 square block |
| 44 | 57 | dim3 threadsPerBlock(8, 8); |
| 45 | 58 | dim3 numBlocks(cols/threadsPerBlock.x + 1, |
| 46 | 59 | rows/threadsPerBlock.y + 1); |
| 47 | 60 | |
| 48 | - cudaMalloc(dstPtr, rows*cols*sizeof(uint8_t)); | |
| 61 | + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); | |
| 49 | 62 | cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); |
| 63 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 50 | 64 | |
| 51 | - cudaFree(srcPtr); | |
| 65 | + CUDA_SAFE_FREE(srcPtr, &err); | |
| 52 | 66 | } |
| 53 | 67 | |
| 54 | 68 | void cudalbp_init_wrapper(uint8_t* cpuLut) { |
| 55 | - cudaMalloc(&lut, 256*sizeof(uint8_t)); | |
| 56 | - cudaMemcpy(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice); | |
| 69 | + cudaError_t err; | |
| 70 | + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err); | |
| 71 | + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err); | |
| 57 | 72 | } |
| 58 | 73 | }} | ... | ... |
openbr/plugins/cuda/cudapca.cpp
| ... | ... | @@ -15,6 +15,7 @@ |
| 15 | 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 16 | 16 | #include <iostream> |
| 17 | 17 | using namespace std; |
| 18 | +#include <unistd.h> | |
| 18 | 19 | |
| 19 | 20 | #include <QList> |
| 20 | 21 | |
| ... | ... | @@ -30,7 +31,7 @@ using namespace cv; |
| 30 | 31 | |
| 31 | 32 | namespace br { namespace cuda { |
| 32 | 33 | void cudapca_loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); |
| 33 | - void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols); | |
| 34 | + void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols); | |
| 34 | 35 | void cudapca_projectwrapper(void* src, void** dst); |
| 35 | 36 | }} |
| 36 | 37 | |
| ... | ... | @@ -82,12 +83,13 @@ private: |
| 82 | 83 | |
| 83 | 84 | void train(const TemplateList &cudaTrainingSet) |
| 84 | 85 | { |
| 86 | + // copy the data back from the graphics card so the training can be done on the CPU | |
| 85 | 87 | const int instances = cudaTrainingSet.size(); // get the number of training set instances |
| 86 | 88 | QList<Template> trainingQlist; |
| 87 | 89 | for(int i=0; i<instances; i++) { |
| 88 | 90 | Template currentTemplate = cudaTrainingSet[i]; |
| 89 | 91 | void* const* srcDataPtr = currentTemplate.m().ptr<void*>(); |
| 90 | - const void* cudaMemPtr = srcDataPtr[0]; | |
| 92 | + void* cudaMemPtr = srcDataPtr[0]; | |
| 91 | 93 | int rows = *((int*)srcDataPtr[1]); |
| 92 | 94 | int cols = *((int*)srcDataPtr[2]); |
| 93 | 95 | int type = *((int*)srcDataPtr[3]); |
| ... | ... | @@ -95,29 +97,30 @@ private: |
| 95 | 97 | Mat mat = Mat(rows, cols, type); |
| 96 | 98 | br::cuda::cudapca_trainwrapper(cudaMemPtr, mat.ptr<float>(), rows, cols); |
| 97 | 99 | trainingQlist.append(Template(mat)); |
| 98 | - TemplateList trainingSet; | |
| 99 | 100 | } |
| 101 | + | |
| 102 | + // assemble a TemplateList from the list of data | |
| 100 | 103 | TemplateList trainingSet(trainingQlist); |
| 101 | 104 | |
| 102 | - if (trainingSet.first().m().type() != CV_32FC1) | |
| 103 | - qFatal("Requires single channel 32-bit floating point matrices."); | |
| 105 | + if (trainingSet.first().m().type() != CV_32FC1) { | |
| 106 | + qFatal("Requires single channel 32-bit floating point matrices."); | |
| 107 | + } | |
| 104 | 108 | |
| 105 | 109 | originalRows = trainingSet.first().m().rows; // get number of rows of first image |
| 106 | 110 | int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image |
| 107 | 111 | |
| 108 | 112 | // Map into 64-bit Eigen matrix |
| 109 | 113 | Eigen::MatrixXd data(dimsIn, instances); // create a mat |
| 110 | - for (int i=0; i<instances; i++) | |
| 111 | - data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); | |
| 114 | + for (int i=0; i<instances; i++) { | |
| 115 | + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>(); | |
| 116 | + } | |
| 112 | 117 | |
| 113 | 118 | trainCore(data); |
| 114 | 119 | } |
| 115 | 120 | |
| 116 | 121 | void project(const Template &src, Template &dst) const |
| 117 | 122 | { |
| 118 | - | |
| 119 | 123 | void* const* srcDataPtr = src.m().ptr<void*>(); |
| 120 | - void* cudaMemPtr = srcDataPtr[0]; | |
| 121 | 124 | int rows = *((int*)srcDataPtr[1]); |
| 122 | 125 | int cols = *((int*)srcDataPtr[2]); |
| 123 | 126 | int type = *((int*)srcDataPtr[3]); |
| ... | ... | @@ -133,7 +136,7 @@ private: |
| 133 | 136 | dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; |
| 134 | 137 | dstDataPtr[3] = srcDataPtr[3]; |
| 135 | 138 | |
| 136 | - br::cuda::cudapca_projectwrapper(cudaMemPtr, &dstDataPtr[0]); | |
| 139 | + br::cuda::cudapca_projectwrapper(srcDataPtr[0], &dstDataPtr[0]); | |
| 137 | 140 | |
| 138 | 141 | dst = dstMat; |
| 139 | 142 | ... | ... |
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -4,6 +4,8 @@ using namespace std; |
| 4 | 4 | #include <opencv2/opencv.hpp> |
| 5 | 5 | #include <opencv2/gpu/gpu.hpp> |
| 6 | 6 | |
| 7 | +#include "cudadefines.hpp" | |
| 8 | + | |
| 7 | 9 | using namespace cv; |
| 8 | 10 | using namespace cv::gpu; |
| 9 | 11 | |
| ... | ... | @@ -63,39 +65,45 @@ namespace br { namespace cuda { |
| 63 | 65 | _evRows = evRows; _evCols = evCols; |
| 64 | 66 | _meanElems = meanElems; |
| 65 | 67 | |
| 68 | + cudaError_t err; | |
| 69 | + | |
| 66 | 70 | // copy the eigenvectors to the GPU |
| 67 | - cudaMalloc(&cudaEvPtr, evRows*evCols*sizeof(float)); | |
| 68 | - cudaMemcpy(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice); | |
| 71 | + CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err); | |
| 72 | + CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 69 | 73 | |
| 70 | 74 | // copy the mean to the GPU |
| 71 | - cudaMalloc(&cudaMeanPtr, meanElems*sizeof(float)); | |
| 72 | - cudaMemcpy(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice); | |
| 75 | + CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); | |
| 76 | + CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 73 | 77 | |
| 74 | - cudaMalloc(&_cudaSrcPtr, _meanElems*sizeof(float)); | |
| 75 | - cudaMalloc(&_cudaDstPtr, _evCols*sizeof(float)); | |
| 78 | + CUDA_SAFE_MALLOC(&_cudaSrcPtr, _meanElems*sizeof(float), &err); | |
| 79 | + CUDA_SAFE_MALLOC(&_cudaDstPtr, _evCols*sizeof(float), &err); | |
| 76 | 80 | } |
| 77 | 81 | |
| 78 | - void cudapca_trainwrapper(const void* cudaDataPtr, float* dataPtr, int rows, int cols) { | |
| 79 | - cudaMemcpy(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost); | |
| 82 | + void cudapca_trainwrapper(void* cudaDataPtr, float* dataPtr, int rows, int cols) { | |
| 83 | + cudaError_t err; | |
| 84 | + CUDA_SAFE_MEMCPY(dataPtr, cudaDataPtr, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err); | |
| 85 | + CUDA_SAFE_FREE(cudaDataPtr, &err); | |
| 80 | 86 | } |
| 81 | 87 | |
| 82 | 88 | void cudapca_projectwrapper(void* src, void** dst) { |
| 83 | 89 | // copy the image to the GPU |
| 84 | 90 | //cudaMemcpy(_cudaSrcPtr, src, _meanElems*sizeof(float), cudaMemcpyHostToDevice); |
| 85 | - | |
| 86 | - cudaMalloc(dst, _evRows*_evCols*sizeof(float)); | |
| 91 | + cudaError_t err; | |
| 92 | + CUDA_SAFE_MALLOC(dst, _evRows*_evCols*sizeof(float), &err); | |
| 87 | 93 | |
| 88 | 94 | // subtract out the mean of the image (mean is 1xpixels in size) |
| 89 | 95 | int threadsPerBlock = 64; |
| 90 | 96 | int numBlocks = _meanElems / threadsPerBlock + 1; |
| 91 | 97 | cudapca_project_subtractmean_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); |
| 98 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 92 | 99 | |
| 93 | 100 | // perform the multiplication |
| 94 | 101 | threadsPerBlock = 64; |
| 95 | 102 | numBlocks = _evCols / threadsPerBlock + 1; |
| 96 | 103 | cudapca_project_multiply_kernel<<<numBlocks, threadsPerBlock>>>((float*)src, (float*)(*dst), cudaEvPtr, _evRows, _evCols); |
| 104 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 97 | 105 | |
| 98 | - //cudaFree(src); // TODO(colin): figure out why adding this free causes memory corruption... | |
| 106 | + CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... | |
| 99 | 107 | |
| 100 | 108 | // copy the data back to the CPU |
| 101 | 109 | //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); | ... | ... |
openbr/plugins/distance/dist.cpp
| ... | ... | @@ -14,6 +14,9 @@ |
| 14 | 14 | * limitations under the License. * |
| 15 | 15 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ |
| 16 | 16 | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | + | |
| 17 | 20 | #include <opencv2/imgproc/imgproc.hpp> |
| 18 | 21 | #include <openbr/plugins/openbr_internal.h> |
| 19 | 22 | |
| ... | ... | @@ -52,6 +55,15 @@ private: |
| 52 | 55 | |
| 53 | 56 | float compare(const Mat &a, const Mat &b) const |
| 54 | 57 | { |
| 58 | + /* | |
| 59 | + cout << "Mat A" << endl; | |
| 60 | + cout << "rows: " << a.rows << "\tcols: " << a.cols << endl; | |
| 61 | + cout << "a.ptr<float>()[0]: " << a.ptr<float>()[0] << endl; | |
| 62 | + cout << "Mat B" << endl; | |
| 63 | + cout << "rows: " << b.rows << "\tcols: " << b.cols << endl; | |
| 64 | + cout << "b.ptr<float>()[0]: " << b.ptr<float>()[0] << endl; | |
| 65 | + */ | |
| 66 | + | |
| 55 | 67 | if ((a.size != b.size) || |
| 56 | 68 | (a.type() != b.type())) |
| 57 | 69 | return -std::numeric_limits<float>::max(); | ... | ... |