From 2f6387f4e6742f97ec5691d0799663f8340b147c Mon Sep 17 00:00:00 2001 From: Colin Heinzmann Date: Wed, 20 Apr 2016 16:53:34 -0400 Subject: [PATCH] made CUDAL2 compatible with assumption that comparison will be done after the fact --- openbr/plugins/cuda/cudal2.cpp | 20 +++++++------------- openbr/plugins/cuda/cudal2.cu | 57 +++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/openbr/plugins/cuda/cudal2.cpp b/openbr/plugins/cuda/cudal2.cpp index 982a0a5..18231f8 100644 --- a/openbr/plugins/cuda/cudal2.cpp +++ b/openbr/plugins/cuda/cudal2.cpp @@ -21,7 +21,7 @@ using namespace std; // definitions from the CUDA source file namespace br { namespace cuda { namespace L2 { - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr); }}} namespace br @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance float compare(const cv::Mat &a, const cv::Mat &b) const { - void* const* srcDataPtr = a.ptr<void*>(); - float* cudaAPtr = (float*)srcDataPtr[0]; - int rows = *((int*)srcDataPtr[1]); - int cols = *((int*)srcDataPtr[2]); - int srcType = *((int*)srcDataPtr[3]); - - void* const* dstDataPtr = b.ptr<void*>(); - float* cudaBPtr = (float*)dstDataPtr[0]; - int dstType = *((int*)dstDataPtr[3]); - - if (srcType != dstType) { + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) { cout << "ERR: Type mismatch" << endl; throw 0; } + if (a.rows*a.cols != b.rows*b.cols) { + cout << "ERR: Dimension mismatch" << endl; + throw 1; + } float out; - cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out); return out; } diff --git a/openbr/plugins/cuda/cudal2.cu b/openbr/plugins/cuda/cudal2.cu index 7a2baff..ca70e55 100644 --- a/openbr/plugins/cuda/cudal2.cu +++ b/openbr/plugins/cuda/cudal2.cu @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 { return; } - // 
perform the subtraction in-place - // use b because it is the comparison - // image - workPtr[index] = aPtr[index] - bPtr[index]; - workPtr[index] = workPtr[index] * workPtr[index]; + // perform the subtraction + float res = aPtr[index] - bPtr[index]; + res = res * res; + workPtr[index] = res; } __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 { } // sum up all the values - *outPtr = 0; + float acc = 0; for (int i=0; i < length; i++) { - *outPtr = *outPtr + inPtr[i]; + acc += inPtr[i]; } - // take the square root - *outPtr = sqrtf(*outPtr); + *outPtr = acc; } - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { + float* cudaAPtr = NULL; + float* cudaBPtr = NULL; + float* cudaWorkBufferPtr = NULL; + float* cudaOutPtr = NULL; + int bufferLen = 0; + + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) { cudaError_t err; - float* cudaOutPtr; - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); - float* cudaWorkBufferPtr; - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); + // allocate memory for the mats and copy data to graphics card + // only allocate if there is a mismatch in image size, otherwise + // use the existing allocated memory + if (length != bufferLen) { + if (cudaAPtr != NULL) { + CUDA_SAFE_FREE(cudaAPtr, &err); + CUDA_SAFE_FREE(cudaBPtr, &err); + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); + CUDA_SAFE_FREE(cudaOutPtr, &err); + } + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err); + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err); + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); + bufferLen = length; + } + + // copy data over from CPU + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); // 
perform the subtraction - int threadsPerBlock = 64; + int threadsPerBlock = 512; int numBlocks = length / threadsPerBlock + 1; subtractKernel<<<numBlocks, threadsPerBlock>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); CUDA_KERNEL_ERR_CHK(&err); @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 { // copy the single value back to the destinsion CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); - - CUDA_SAFE_FREE(cudaOutPtr, &err); - - // do not free aPtr which should be the reference library - // only free bPtr, which is the image we are comparing - CUDA_SAFE_FREE(cudaBPtr, &err); - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); } }}} -- libgit2 0.21.4