Commit 2f6387f4e6742f97ec5691d0799663f8340b147c

Authored by Colin Heinzmann
1 parent 2ba8a2a3

made CUDAL2 compatible with assumption that comparison will be done after the fact

openbr/plugins/cuda/cudal2.cpp
... ... @@ -21,7 +21,7 @@ using namespace std;
21 21  
22 22 // definitions from the CUDA source file
23 23 namespace br { namespace cuda { namespace L2 {
24   - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
  24 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr);
25 25 }}}
26 26  
27 27 namespace br
... ... @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance
38 38  
39 39 float compare(const cv::Mat &a, const cv::Mat &b) const
40 40 {
41   - void* const* srcDataPtr = a.ptr<void*>();
42   - float* cudaAPtr = (float*)srcDataPtr[0];
43   - int rows = *((int*)srcDataPtr[1]);
44   - int cols = *((int*)srcDataPtr[2]);
45   - int srcType = *((int*)srcDataPtr[3]);
46   -
47   - void* const* dstDataPtr = b.ptr<void*>();
48   - float* cudaBPtr = (float*)dstDataPtr[0];
49   - int dstType = *((int*)dstDataPtr[3]);
50   -
51   - if (srcType != dstType) {
  41 + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) {
52 42 cout << "ERR: Type mismatch" << endl;
53 43 throw 0;
54 44 }
  45 + if (a.rows*a.cols != b.rows*b.cols) {
  46 + cout << "ERR: Dimension mismatch" << endl;
  47 + throw 1;
  48 + }
55 49  
56 50 float out;
57   - cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
  51 + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out);
58 52  
59 53 return out;
60 54 }
... ...
openbr/plugins/cuda/cudal2.cu
... ... @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 {
27 27 return;
28 28 }
29 29  
30   - // perform the subtraction in-place
31   - // use b because it is the comparison
32   - // image
33   - workPtr[index] = aPtr[index] - bPtr[index];
34   - workPtr[index] = workPtr[index] * workPtr[index];
  30 + // perform the subtraction
  31 + float res = aPtr[index] - bPtr[index];
  32 + res = res * res;
  33 + workPtr[index] = res;
35 34 }
36 35  
37 36 __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
... ... @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 {
41 40 }
42 41  
43 42 // sum up all the values
44   - *outPtr = 0;
  43 + float acc = 0;
45 44 for (int i=0; i < length; i++) {
46   - *outPtr = *outPtr + inPtr[i];
  45 + acc += inPtr[i];
47 46 }
48 47  
49   - // take the square root
50   - *outPtr = sqrtf(*outPtr);
  48 + *outPtr = acc;
51 49 }
52 50  
53   - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) {
  51 + float* cudaAPtr = NULL;
  52 + float* cudaBPtr = NULL;
  53 + float* cudaWorkBufferPtr = NULL;
  54 + float* cudaOutPtr = NULL;
  55 + int bufferLen = 0;
  56 +
  57 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) {
54 58 cudaError_t err;
55   - float* cudaOutPtr;
56   - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
57 59  
58   - float* cudaWorkBufferPtr;
59   - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
  60 + // allocate memory for the mats and copy data to graphics card
  61 + // only allocate if there is a mismatch in image size, otherwise
  62 + // use the existing allocated memory
  63 + if (length != bufferLen) {
  64 + if (cudaAPtr != NULL) {
  65 + CUDA_SAFE_FREE(cudaAPtr, &err);
  66 + CUDA_SAFE_FREE(cudaBPtr, &err);
  67 + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
  68 + CUDA_SAFE_FREE(cudaOutPtr, &err);
  69 + }
  70 + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err);
  71 + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err);
  72 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
  73 + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
  74 + bufferLen = length;
  75 + }
  76 +
  77 + // copy data over from CPU
  78 + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  79 + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
60 80  
61 81 // perform the subtraction
62   - int threadsPerBlock = 64;
  82 + int threadsPerBlock = 512;
63 83 int numBlocks = length / threadsPerBlock + 1;
64 84 subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
65 85 CUDA_KERNEL_ERR_CHK(&err);
... ... @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 {
70 90  
71 91 // copy the single value back to the destinsion
72 92 CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
73   -
74   - CUDA_SAFE_FREE(cudaOutPtr, &err);
75   -
76   - // do not free aPtr which should be the reference library
77   - // only free bPtr, which is the image we are comparing
78   - CUDA_SAFE_FREE(cudaBPtr, &err);
79   - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
80 93 }
81 94 }}}
... ...