Commit c9a58b18b6db3f96d330f71fbbee507b108e637d
1 parent
4e59b2b5
added L2 distance norm plugin
Showing
2 changed files
with
126 additions
and
0 deletions
openbr/plugins/cuda/CUDAL2.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2012 The MITRE Corporation * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | + | |
| 20 | +#include <openbr/plugins/openbr_internal.h> | |
| 21 | + | |
| 22 | +namespace br { namespace cuda { namespace L2{ | |
| 23 | + void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); | |
| 24 | +}}} | |
| 25 | + | |
| 26 | +namespace br | |
| 27 | +{ | |
| 28 | + | |
| 29 | +/*! | |
| 30 | + * \ingroup distances | |
| 31 | + * \brief L2 distance computed using eigen. | |
| 32 | + * \author Josh Klontz \cite jklontz | |
| 33 | + */ | |
| 34 | +class CUDAL2Distance : public UntrainableDistance | |
| 35 | +{ | |
| 36 | + Q_OBJECT | |
| 37 | + | |
| 38 | + float compare(const cv::Mat &a, const cv::Mat &b) const | |
| 39 | + { | |
| 40 | + void* const* srcDataPtr = a.ptr<void*>(); | |
| 41 | + float* cudaAPtr = (float*)srcDataPtr[0]; | |
| 42 | + int rows = *((int*)srcDataPtr[1]); | |
| 43 | + int cols = *((int*)srcDataPtr[2]); | |
| 44 | + | |
| 45 | + float* cudaBPtr = (float*)b.ptr<void*>()[0]; | |
| 46 | + | |
| 47 | + float out; | |
| 48 | + br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); | |
| 49 | + | |
| 50 | + return out; | |
| 51 | + } | |
| 52 | +}; | |
| 53 | + | |
| 54 | +BR_REGISTER(Distance, CUDAL2Distance) | |
| 55 | + | |
| 56 | +} // namespace br | |
| 57 | + | |
| 58 | +#include "cuda/CUDAL2.moc" | ... | ... |
openbr/plugins/cuda/CUDAL2.cu
0 → 100644
| 1 | +#include <math.h> | |
| 2 | + | |
| 3 | +#include "cudadefines.hpp" | |
| 4 | + | |
| 5 | +namespace br { namespace cuda { namespace L2 { | |
| 6 | + | |
/*
 * Element-wise squared difference:
 *     workPtr[i] = (aPtr[i] - bPtr[i])^2   for every i in [0, length)
 *
 * Each thread handles the single element at the flat 1D index
 * blockIdx.x * blockDim.x + threadIdx.x; threads whose index is >= length do
 * nothing. The launch must therefore provide at least `length` threads in
 * total along x.
 *
 * aPtr and bPtr are only read; the result is written to workPtr, which needs
 * room for `length` floats.
 */
__global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) {
    const int element = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block(s) fall outside the data.
    if (element < length) {
        const float delta = aPtr[element] - bPtr[element];
        workPtr[element] = delta * delta;
    }
}
| 20 | + | |
/*
 * Serial reduction: *outPtr = sqrt(inPtr[0] + inPtr[1] + ... + inPtr[length-1]).
 *
 * Only thread 0 of block 0 does any work, so the kernel gives the same result
 * under any launch configuration. Elements are added in index order.
 * For length <= 0 the loop does not run and *outPtr is set to 0.
 *
 * inPtr  : `length` floats, read only.
 * outPtr : a single float that receives the result.
 */
__global__ void collapse_kernel(float* inPtr, float* outPtr, int length) {
    // Every thread except the very first one is idle.
    if (blockIdx.x != 0 || threadIdx.x != 0) {
        return;
    }

    // Accumulate in a local variable and store once at the end, instead of
    // reading and writing *outPtr in global memory for every element.
    float sum = 0.0f;
    for (int i = 0; i < length; i++) {
        sum += inPtr[i];
    }

    *outPtr = sqrtf(sum);
}
| 36 | + | |
/*
 * Host entry point: computes the L2 distance between two float buffers and
 * writes it to *outPtr.
 *
 * cudaAPtr : `length` floats, passed to the kernels (so it must be
 *            device-accessible); left allocated.
 * cudaBPtr : `length` floats, passed to the kernels; FREED by this function
 *            before it returns, so the caller must not use it afterwards.
 * length   : number of elements in each buffer.
 * outPtr   : host location that receives the resulting distance.
 *
 * Two temporary device allocations (one float for the result, `length` floats
 * of scratch space) are created and released on every call.
 * Error handling is delegated to the CUDA_SAFE_* / CUDA_KERNEL_ERR_CHK macros
 * from cudadefines.hpp (NOTE(review): their behaviour on failure is not
 * visible from this file).
 */
void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) {
    cudaError_t err;
    float* cudaOutPtr;
    CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);

    float* cudaWorkBufferPtr;
    CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);

    // Squared differences, one thread per element.
    // `length / threadsPerBlock + 1` always yields at least one block, so the
    // launch stays valid for length == 0; any surplus threads are rejected by
    // the bounds check inside the kernel.
    int threadsPerBlock = 64;
    int numBlocks = length / threadsPerBlock + 1;
    // Execution configuration order is <<<grid size, block size>>>. Keeping
    // the block size fixed at threadsPerBlock stops it from growing with
    // `length` and running past the per-block thread limit on large inputs.
    my_subtract_kernel<<<numBlocks, threadsPerBlock>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
    CUDA_KERNEL_ERR_CHK(&err);

    // Sum the squared differences and take the square root (single thread).
    collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
    CUDA_KERNEL_ERR_CHK(&err);

    // Copy the single result value back to the destination on the host.
    CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);

    CUDA_SAFE_FREE(cudaOutPtr, &err);

    // Do not free cudaAPtr, which should be the reference library entry;
    // only free cudaBPtr, which is the image being compared.
    CUDA_SAFE_FREE(cudaBPtr, &err);
    CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
}
| 65 | +}}} | |
| 66 | + | |
| 67 | +// 128CUDAEigenfaces on 6400 ATT: 54.367s | |
| 68 | +// 128CUDAEigenfacesL2 on 6400 ATT: | ... | ... |