Commit 2f6387f4e6742f97ec5691d0799663f8340b147c

Authored by Colin Heinzmann
1 parent 2ba8a2a3

made CUDAL2 compatible with assumption that comparison will be done after the fact

openbr/plugins/cuda/cudal2.cpp
@@ -21,7 +21,7 @@ using namespace std; @@ -21,7 +21,7 @@ using namespace std;
21 21
22 // definitions from the CUDA source file 22 // definitions from the CUDA source file
23 namespace br { namespace cuda { namespace L2 { 23 namespace br { namespace cuda { namespace L2 {
24 - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); 24 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr);
25 }}} 25 }}}
26 26
27 namespace br 27 namespace br
@@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance @@ -38,23 +38,17 @@ class CUDAL2Distance : public UntrainableDistance
38 38
39 float compare(const cv::Mat &a, const cv::Mat &b) const 39 float compare(const cv::Mat &a, const cv::Mat &b) const
40 { 40 {
41 - void* const* srcDataPtr = a.ptr<void*>();  
42 - float* cudaAPtr = (float*)srcDataPtr[0];  
43 - int rows = *((int*)srcDataPtr[1]);  
44 - int cols = *((int*)srcDataPtr[2]);  
45 - int srcType = *((int*)srcDataPtr[3]);  
46 -  
47 - void* const* dstDataPtr = b.ptr<void*>();  
48 - float* cudaBPtr = (float*)dstDataPtr[0];  
49 - int dstType = *((int*)dstDataPtr[3]);  
50 -  
51 - if (srcType != dstType) { 41 + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) {
52 cout << "ERR: Type mismatch" << endl; 42 cout << "ERR: Type mismatch" << endl;
53 throw 0; 43 throw 0;
54 } 44 }
  45 + if (a.rows*a.cols != b.rows*b.cols) {
  46 + cout << "ERR: Dimension mismatch" << endl;
  47 + throw 1;
  48 + }
55 49
56 float out; 50 float out;
57 - cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); 51 + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out);
58 52
59 return out; 53 return out;
60 } 54 }
openbr/plugins/cuda/cudal2.cu
@@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 { @@ -27,11 +27,10 @@ namespace br { namespace cuda { namespace L2 {
27 return; 27 return;
28 } 28 }
29 29
30 - // perform the subtraction in-place  
31 - // use b because it is the comparison  
32 - // image  
33 - workPtr[index] = aPtr[index] - bPtr[index];  
34 - workPtr[index] = workPtr[index] * workPtr[index]; 30 + // perform the subtraction
  31 + float res = aPtr[index] - bPtr[index];
  32 + res = res * res;
  33 + workPtr[index] = res;
35 } 34 }
36 35
37 __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { 36 __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
@@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 { @@ -41,25 +40,46 @@ namespace br { namespace cuda { namespace L2 {
41 } 40 }
42 41
43 // sum up all the values 42 // sum up all the values
44 - *outPtr = 0; 43 + float acc = 0;
45 for (int i=0; i < length; i++) { 44 for (int i=0; i < length; i++) {
46 - *outPtr = *outPtr + inPtr[i]; 45 + acc += inPtr[i];
47 } 46 }
48 47
49 - // take the square root  
50 - *outPtr = sqrtf(*outPtr); 48 + *outPtr = acc;
51 } 49 }
52 50
53 - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { 51 + float* cudaAPtr = NULL;
  52 + float* cudaBPtr = NULL;
  53 + float* cudaWorkBufferPtr = NULL;
  54 + float* cudaOutPtr = NULL;
  55 + int bufferLen = 0;
  56 +
  57 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) {
54 cudaError_t err; 58 cudaError_t err;
55 - float* cudaOutPtr;  
56 - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);  
57 59
58 - float* cudaWorkBufferPtr;  
59 - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); 60 + // allocate memory for the mats and copy data to graphics card
  61 + // only allocate if there is a mismatch in image size, otherwise
  62 + // use the existing allocated memory
  63 + if (length != bufferLen) {
  64 + if (cudaAPtr != NULL) {
  65 + CUDA_SAFE_FREE(cudaAPtr, &err);
  66 + CUDA_SAFE_FREE(cudaBPtr, &err);
  67 + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
  68 + CUDA_SAFE_FREE(cudaOutPtr, &err);
  69 + }
  70 + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err);
  71 + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err);
  72 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
  73 + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
  74 + bufferLen = length;
  75 + }
  76 +
  77 + // copy data over from CPU
  78 + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  79 + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
60 80
61 // perform the subtraction 81 // perform the subtraction
62 - int threadsPerBlock = 64; 82 + int threadsPerBlock = 512;
63 int numBlocks = length / threadsPerBlock + 1; 83 int numBlocks = length / threadsPerBlock + 1;
64 subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); 84 subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
65 CUDA_KERNEL_ERR_CHK(&err); 85 CUDA_KERNEL_ERR_CHK(&err);
@@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 { @@ -70,12 +90,5 @@ namespace br { namespace cuda { namespace L2 {
70 90
71 // copy the single value back to the destinsion 91 // copy the single value back to the destinsion
72 CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); 92 CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
73 -  
74 - CUDA_SAFE_FREE(cudaOutPtr, &err);  
75 -  
76 - // do not free aPtr which should be the reference library  
77 - // only free bPtr, which is the image we are comparing  
78 - CUDA_SAFE_FREE(cudaBPtr, &err);  
79 - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);  
80 } 93 }
81 }}} 94 }}}