diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/CUDAL2.cpp new file mode 100644 index 0000000..6b248d0 --- /dev/null +++ b/openbr/plugins/cuda/CUDAL2.cpp @@ -0,0 +1,58 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2012 The MITRE Corporation * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; + +#include + +namespace br { namespace cuda { namespace L2{ + void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); +}}} + +namespace br +{ + +/*! + * \ingroup distances + * \brief L2 distance computed using eigen. + * \author Josh Klontz \cite jklontz + */ +class CUDAL2Distance : public UntrainableDistance +{ + Q_OBJECT + + float compare(const cv::Mat &a, const cv::Mat &b) const + { + void* const* srcDataPtr = a.ptr(); + float* cudaAPtr = (float*)srcDataPtr[0]; + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + + float* cudaBPtr = (float*)b.ptr()[0]; + + float out; + br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); + + return out; + } +}; + +BR_REGISTER(Distance, CUDAL2Distance) + +} // namespace br + +#include "cuda/CUDAL2.moc" diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/CUDAL2.cu new file mode 100644 index 0000000..8d52b35 --- /dev/null +++ b/openbr/plugins/cuda/CUDAL2.cu @@ -0,0 +1,68 @@ +#include + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace L2 { + + __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) { + int index = blockIdx.x*blockDim.x+threadIdx.x; + + if (index >= length) { + return; + } + + // perform the subtraction in-place + // use b because it is the comparison + // image + workPtr[index] = aPtr[index] - bPtr[index]; + workPtr[index] = workPtr[index] * workPtr[index]; + } + + __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) { + // make sure there is only one thread that we are calling + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + // sum up all the values + *outPtr = 0; + for (int i=0; i < length; i++) { + *outPtr = *outPtr + inPtr[i]; + } + + // take the square root + *outPtr = sqrtf(*outPtr); + } + + void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { + cudaError_t err; + float* cudaOutPtr; + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); + + float* cudaWorkBufferPtr; + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); + + // perform the subtraction + int threadsPerBlock = 64; + int numBlocks = length / threadsPerBlock + 1; + my_subtract_kernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // perform the collapse + collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // copy the single value back to the destinsion + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); + + CUDA_SAFE_FREE(cudaOutPtr, &err); + + // do not free aPtr which should be the reference library + // only free bPtr, which is the image we are comparing + CUDA_SAFE_FREE(cudaBPtr, &err); + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); + } +}}} + +// 128CUDAEigenfaces on 6400 ATT: 54.367s +// 128CUDAEigenfacesL2 on 6400 ATT: