Commit 2f6387f4e6742f97ec5691d0799663f8340b147c
1 parent
2ba8a2a3
made CUDAL2 compatible with assumption that comparison will be done after the fact
Showing
2 changed files
with
42 additions
and
35 deletions
openbr/plugins/cuda/cudal2.cpp
| ... | ... | @@ -21,7 +21,7 @@ using namespace std; |
| 21 | 21 | |
| 22 | 22 | // definitions from the CUDA source file |
| 23 | 23 | namespace br { namespace cuda { namespace L2 { |
| 24 | - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); | |
| 24 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr); | |
| 25 | 25 | }}} |
| 26 | 26 | |
| 27 | 27 | namespace br |
| ... | ... | @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance |
| 38 | 38 | |
| 39 | 39 | float compare(const cv::Mat &a, const cv::Mat &b) const |
| 40 | 40 | { |
| 41 | - void* const* srcDataPtr = a.ptr<void*>(); | |
| 42 | - float* cudaAPtr = (float*)srcDataPtr[0]; | |
| 43 | - int rows = *((int*)srcDataPtr[1]); | |
| 44 | - int cols = *((int*)srcDataPtr[2]); | |
| 45 | - int srcType = *((int*)srcDataPtr[3]); | |
| 46 | - | |
| 47 | - void* const* dstDataPtr = b.ptr<void*>(); | |
| 48 | - float* cudaBPtr = (float*)dstDataPtr[0]; | |
| 49 | - int dstType = *((int*)dstDataPtr[3]); | |
| 50 | - | |
| 51 | - if (srcType != dstType) { | |
| 41 | + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) { | |
| 52 | 42 | cout << "ERR: Type mismatch" << endl; |
| 53 | 43 | throw 0; |
| 54 | 44 | } |
| 45 | + if (a.rows*a.cols != b.rows*b.cols) { | |
| 46 | + cout << "ERR: Dimension mismatch" << endl; | |
| 47 | + throw 1; | |
| 48 | + } | |
| 55 | 49 | |
| 56 | 50 | float out; |
| 57 | - cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); | |
| 51 | + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out); | |
| 58 | 52 | |
| 59 | 53 | return out; |
| 60 | 54 | } | ... | ... |
openbr/plugins/cuda/cudal2.cu
| ... | ... | @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 { |
| 27 | 27 | return; |
| 28 | 28 | } |
| 29 | 29 | |
| 30 | - // perform the subtraction in-place | |
| 31 | - // use b because it is the comparison | |
| 32 | - // image | |
| 33 | - workPtr[index] = aPtr[index] - bPtr[index]; | |
| 34 | - workPtr[index] = workPtr[index] * workPtr[index]; | |
| 30 | + // perform the subtraction | |
| 31 | + float res = aPtr[index] - bPtr[index]; | |
| 32 | + res = res * res; | |
| 33 | + workPtr[index] = res; | |
| 35 | 34 | } |
| 36 | 35 | |
| 37 | 36 | __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { |
| ... | ... | @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 { |
| 41 | 40 | } |
| 42 | 41 | |
| 43 | 42 | // sum up all the values |
| 44 | - *outPtr = 0; | |
| 43 | + float acc = 0; | |
| 45 | 44 | for (int i=0; i < length; i++) { |
| 46 | - *outPtr = *outPtr + inPtr[i]; | |
| 45 | + acc += inPtr[i]; | |
| 47 | 46 | } |
| 48 | 47 | |
| 49 | - // take the square root | |
| 50 | - *outPtr = sqrtf(*outPtr); | |
| 48 | + *outPtr = acc; | |
| 51 | 49 | } |
| 52 | 50 | |
| 53 | - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { | |
| 51 | + float* cudaAPtr = NULL; | |
| 52 | + float* cudaBPtr = NULL; | |
| 53 | + float* cudaWorkBufferPtr = NULL; | |
| 54 | + float* cudaOutPtr = NULL; | |
| 55 | + int bufferLen = 0; | |
| 56 | + | |
| 57 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) { | |
| 54 | 58 | cudaError_t err; |
| 55 | - float* cudaOutPtr; | |
| 56 | - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); | |
| 57 | 59 | |
| 58 | - float* cudaWorkBufferPtr; | |
| 59 | - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); | |
| 60 | + // allocate memory for the mats and copy data to graphics card | |
| 61 | + // only allocate if there is a mismatch in image size, otherwise | |
| 62 | + // use the existing allocated memory | |
| 63 | + if (length != bufferLen) { | |
| 64 | + if (cudaAPtr != NULL) { | |
| 65 | + CUDA_SAFE_FREE(cudaAPtr, &err); | |
| 66 | + CUDA_SAFE_FREE(cudaBPtr, &err); | |
| 67 | + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); | |
| 68 | + CUDA_SAFE_FREE(cudaOutPtr, &err); | |
| 69 | + } | |
| 70 | + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err); | |
| 71 | + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err); | |
| 72 | + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); | |
| 73 | + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); | |
| 74 | + bufferLen = length; | |
| 75 | + } | |
| 76 | + | |
| 77 | + // copy data over from CPU | |
| 78 | + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 79 | + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 60 | 80 | |
| 61 | 81 | // perform the subtraction |
| 62 | - int threadsPerBlock = 64; | |
| 82 | + int threadsPerBlock = 512; | |
| 63 | 83 | int numBlocks = length / threadsPerBlock + 1; |
| 64 | 84 | subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); |
| 65 | 85 | CUDA_KERNEL_ERR_CHK(&err); |
| ... | ... | @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 { |
| 70 | 90 | |
| 71 | 91 | // copy the single value back to the destinsion |
| 72 | 92 | CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); |
| 73 | - | |
| 74 | - CUDA_SAFE_FREE(cudaOutPtr, &err); | |
| 75 | - | |
| 76 | - // do not free aPtr which should be the reference library | |
| 77 | - // only free bPtr, which is the image we are comparing | |
| 78 | - CUDA_SAFE_FREE(cudaBPtr, &err); | |
| 79 | - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); | |
| 80 | 93 | } |
| 81 | 94 | }}} | ... | ... |