Commit 2f6387f4e6742f97ec5691d0799663f8340b147c
1 parent
2ba8a2a3
made CUDAL2 compatible with assumption that comparison will be done after the fact
Showing
2 changed files
with
42 additions
and
35 deletions
openbr/plugins/cuda/cudal2.cpp
| @@ -21,7 +21,7 @@ using namespace std; | @@ -21,7 +21,7 @@ using namespace std; | ||
| 21 | 21 | ||
| 22 | // definitions from the CUDA source file | 22 | // definitions from the CUDA source file |
| 23 | namespace br { namespace cuda { namespace L2 { | 23 | namespace br { namespace cuda { namespace L2 { |
| 24 | - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); | 24 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr); |
| 25 | }}} | 25 | }}} |
| 26 | 26 | ||
| 27 | namespace br | 27 | namespace br |
| @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance | @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance | ||
| 38 | 38 | ||
| 39 | float compare(const cv::Mat &a, const cv::Mat &b) const | 39 | float compare(const cv::Mat &a, const cv::Mat &b) const |
| 40 | { | 40 | { |
| 41 | - void* const* srcDataPtr = a.ptr<void*>(); | ||
| 42 | - float* cudaAPtr = (float*)srcDataPtr[0]; | ||
| 43 | - int rows = *((int*)srcDataPtr[1]); | ||
| 44 | - int cols = *((int*)srcDataPtr[2]); | ||
| 45 | - int srcType = *((int*)srcDataPtr[3]); | ||
| 46 | - | ||
| 47 | - void* const* dstDataPtr = b.ptr<void*>(); | ||
| 48 | - float* cudaBPtr = (float*)dstDataPtr[0]; | ||
| 49 | - int dstType = *((int*)dstDataPtr[3]); | ||
| 50 | - | ||
| 51 | - if (srcType != dstType) { | 41 | + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) { |
| 52 | cout << "ERR: Type mismatch" << endl; | 42 | cout << "ERR: Type mismatch" << endl; |
| 53 | throw 0; | 43 | throw 0; |
| 54 | } | 44 | } |
| 45 | + if (a.rows*a.cols != b.rows*b.cols) { | ||
| 46 | + cout << "ERR: Dimension mismatch" << endl; | ||
| 47 | + throw 1; | ||
| 48 | + } | ||
| 55 | 49 | ||
| 56 | float out; | 50 | float out; |
| 57 | - cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); | 51 | + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out); |
| 58 | 52 | ||
| 59 | return out; | 53 | return out; |
| 60 | } | 54 | } |
openbr/plugins/cuda/cudal2.cu
| @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 { | @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 { | ||
| 27 | return; | 27 | return; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | - // perform the subtraction in-place | ||
| 31 | - // use b because it is the comparison | ||
| 32 | - // image | ||
| 33 | - workPtr[index] = aPtr[index] - bPtr[index]; | ||
| 34 | - workPtr[index] = workPtr[index] * workPtr[index]; | 30 | + // perform the subtraction |
| 31 | + float res = aPtr[index] - bPtr[index]; | ||
| 32 | + res = res * res; | ||
| 33 | + workPtr[index] = res; | ||
| 35 | } | 34 | } |
| 36 | 35 | ||
| 37 | __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { | 36 | __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { |
| @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 { | @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 { | ||
| 41 | } | 40 | } |
| 42 | 41 | ||
| 43 | // sum up all the values | 42 | // sum up all the values |
| 44 | - *outPtr = 0; | 43 | + float acc = 0; |
| 45 | for (int i=0; i < length; i++) { | 44 | for (int i=0; i < length; i++) { |
| 46 | - *outPtr = *outPtr + inPtr[i]; | 45 | + acc += inPtr[i]; |
| 47 | } | 46 | } |
| 48 | 47 | ||
| 49 | - // take the square root | ||
| 50 | - *outPtr = sqrtf(*outPtr); | 48 | + *outPtr = acc; |
| 51 | } | 49 | } |
| 52 | 50 | ||
| 53 | - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { | 51 | + float* cudaAPtr = NULL; |
| 52 | + float* cudaBPtr = NULL; | ||
| 53 | + float* cudaWorkBufferPtr = NULL; | ||
| 54 | + float* cudaOutPtr = NULL; | ||
| 55 | + int bufferLen = 0; | ||
| 56 | + | ||
| 57 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) { | ||
| 54 | cudaError_t err; | 58 | cudaError_t err; |
| 55 | - float* cudaOutPtr; | ||
| 56 | - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); | ||
| 57 | 59 | ||
| 58 | - float* cudaWorkBufferPtr; | ||
| 59 | - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); | 60 | + // allocate memory for the mats and copy data to graphics card |
| 61 | + // only allocate if there is a mismatch in image size, otherwise | ||
| 62 | + // use the existing allocated memory | ||
| 63 | + if (length != bufferLen) { | ||
| 64 | + if (cudaAPtr != NULL) { | ||
| 65 | + CUDA_SAFE_FREE(cudaAPtr, &err); | ||
| 66 | + CUDA_SAFE_FREE(cudaBPtr, &err); | ||
| 67 | + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); | ||
| 68 | + CUDA_SAFE_FREE(cudaOutPtr, &err); | ||
| 69 | + } | ||
| 70 | + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err); | ||
| 71 | + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err); | ||
| 72 | + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); | ||
| 73 | + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); | ||
| 74 | + bufferLen = length; | ||
| 75 | + } | ||
| 76 | + | ||
| 77 | + // copy data over from CPU | ||
| 78 | + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | ||
| 79 | + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | ||
| 60 | 80 | ||
| 61 | // perform the subtraction | 81 | // perform the subtraction |
| 62 | - int threadsPerBlock = 64; | 82 | + int threadsPerBlock = 512; |
| 63 | int numBlocks = length / threadsPerBlock + 1; | 83 | int numBlocks = length / threadsPerBlock + 1; |
| 64 | subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); | 84 | subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); |
| 65 | CUDA_KERNEL_ERR_CHK(&err); | 85 | CUDA_KERNEL_ERR_CHK(&err); |
| @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 { | @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 { | ||
| 70 | 90 | ||
| 71 | // copy the single value back to the destinsion | 91 | // copy the single value back to the destinsion |
| 72 | CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); | 92 | CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); |
| 73 | - | ||
| 74 | - CUDA_SAFE_FREE(cudaOutPtr, &err); | ||
| 75 | - | ||
| 76 | - // do not free aPtr which should be the reference library | ||
| 77 | - // only free bPtr, which is the image we are comparing | ||
| 78 | - CUDA_SAFE_FREE(cudaBPtr, &err); | ||
| 79 | - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); | ||
| 80 | } | 93 | } |
| 81 | }}} | 94 | }}} |