From 1dabf42dad2e33b7a9e3d11b4142a98fd53c5f9d Mon Sep 17 00:00:00 2001 From: DepthDeluxe Date: Wed, 30 Mar 2016 15:07:21 -0400 Subject: [PATCH] code refactoring for pull request --- openbr/core/cuda/MatManager.cu | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ openbr/core/cuda/MatManager.hpp | 39 +++++++++++++++++++++++++++++++++++++++ openbr/plugins/cuda/CUDAL2.cpp | 58 ---------------------------------------------------------- openbr/plugins/cuda/CUDAL2.cu | 68 -------------------------------------------------------------------- openbr/plugins/cuda/MatManager.cu | 116 -------------------------------------------------------------------------------------------------------------------- openbr/plugins/cuda/MatManager.hpp | 39 --------------------------------------- openbr/plugins/cuda/copyfrom.cpp | 24 ++++++++++++------------ openbr/plugins/cuda/copyfrom.cu | 2 +- openbr/plugins/cuda/copyto.cpp | 22 ++++++++++++++-------- openbr/plugins/cuda/copyto.cu | 4 +++- openbr/plugins/cuda/cudaaffine.cpp | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------------------------------------------------------------------------- openbr/plugins/cuda/cudaaffine.cu | 55 +++++++++++++++++++++++++++---------------------------- openbr/plugins/cuda/cudacvtfloat.cpp | 7 ++++--- openbr/plugins/cuda/cudacvtfloat.cu | 8 +------- openbr/plugins/cuda/cudal2.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ openbr/plugins/cuda/cudal2.cu | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ openbr/plugins/cuda/cudalbp.cpp | 69 +++++++++++---------------------------------------------------------- openbr/plugins/cuda/cudalbp.cu | 36 ++++++++++++++++++++---------------- openbr/plugins/cuda/cudapca.cpp | 32 +++++--------------------------- openbr/plugins/cuda/cudapca.cu | 10 +++++----- openbr/plugins/cuda/cudargb2grayscale.cpp | 15 ++++++++------- openbr/plugins/cuda/cudargb2grayscale.cu | 12 ++++++------ openbr/plugins/cuda/passthrough.cpp | 46 ---------------------------------------------- openbr/plugins/cuda/passthrough.cu | 43 ------------------------------------------- openbr/plugins/cuda/passthrough.hpp | 3 --- 25 files changed, 542 insertions(+), 699 deletions(-) create mode 100644 openbr/core/cuda/MatManager.cu create mode 100644 openbr/core/cuda/MatManager.hpp delete mode 100644 openbr/plugins/cuda/CUDAL2.cpp delete mode 100644 openbr/plugins/cuda/CUDAL2.cu delete mode 100644 openbr/plugins/cuda/MatManager.cu delete mode 100644 openbr/plugins/cuda/MatManager.hpp create mode 100644 openbr/plugins/cuda/cudal2.cpp create mode 100644 openbr/plugins/cuda/cudal2.cu delete mode 100644 openbr/plugins/cuda/passthrough.cpp delete mode 100644 openbr/plugins/cuda/passthrough.cu delete mode 100644 openbr/plugins/cuda/passthrough.hpp diff --git a/openbr/core/cuda/MatManager.cu b/openbr/core/cuda/MatManager.cu new file mode 100644 index 0000000..4823249 --- /dev/null +++ b/openbr/core/cuda/MatManager.cu @@ -0,0 +1,116 @@ +#include +#include + +#include + +#include "MatManager.hpp" + +using namespace cv; +using namespace cv::gpu; + +namespace br { namespace cuda { + MatManager::MatManager(int num) { + _numMats = num; + + // initialize the an array of Mats + _mats = (uint8_t**)malloc(num * sizeof(uint8_t*)); + _matTaken = (bool*)malloc(num * sizeof(bool)); + _matsDimension = (int*)malloc(num * sizeof(int)); + + for (int i=0; i < num; i++) { + cudaMalloc(&_mats[i], 1 * sizeof(uint8_t)); + + // initialize matTaken + _matTaken[i] = false; + + // initialize all mat dimensions to be 1 + _matsDimension[i] = 1; + } + + // initialize the locks + _matTakenLock = new pthread_mutex_t; + pthread_mutex_init(_matTakenLock, NULL); + _matsDimensionLock = new pthread_mutex_t; + pthread_mutex_init(_matsDimensionLock, NULL); + + // initialize the semaphore + _matSemaphore = new sem_t; + sem_init(_matSemaphore, 0, _numMats); + } + + MatManager::matindex MatManager::reserve(Mat &mat) { + int reservedMatIndex = 0; + + sem_wait(_matSemaphore); + pthread_mutex_lock(_matTakenLock); + int i; + for (i=0; i < _numMats; i++) { + if ( !_matTaken[i] ) { + _matTaken[i] = true; + reservedMatIndex = i; + break; + } + } + if (i == _numMats) { + std::cout << "Cannot reserve a mat. Not enough GpuMat resourses\n" << std::endl << std::flush; + } + + pthread_mutex_unlock(_matTakenLock); + + // reallocate if size does not match + pthread_mutex_lock(_matsDimensionLock); + if (_matsDimension[reservedMatIndex] != mat.rows * mat.cols) { + cudaFree(_mats[reservedMatIndex]); // free the previous memory first + cudaMalloc(&_mats[reservedMatIndex], mat.rows * mat.cols * sizeof(uint8_t)); + // change the dimension of that matrix + _matsDimension[reservedMatIndex] = mat.rows * mat.cols; + + } + pthread_mutex_unlock(_matsDimensionLock); + return reservedMatIndex; + } + + void MatManager::upload(MatManager::matindex reservedMatIndex, Mat& mat) { + // copy the content of the Mat to GPU + uint8_t* reservedMat = _mats[reservedMatIndex]; + cudaMemcpy(reservedMat, mat.ptr(), mat.rows * mat.cols, cudaMemcpyHostToDevice); + } + + void MatManager::download(MatManager::matindex reservedMatIndex, Mat& dstMat) { + // copy the mat data back + int dimension = dstMat.rows * dstMat.cols; + uint8_t* reservedMat = _mats[reservedMatIndex]; + cudaMemcpy(dstMat.ptr(), reservedMat, dimension, cudaMemcpyDeviceToHost); + } + + void MatManager::release(MatManager::matindex reservedMatIndex) { + uint8_t* reservedMat = _mats[reservedMatIndex]; + pthread_mutex_lock(_matTakenLock); + bool foundMatch = false; + for (int i=0; i < _numMats; i++) { + if (reservedMat == _mats[i]) { + _matTaken[i] = false; + foundMatch = true; + } + } + pthread_mutex_unlock(_matTakenLock); + + // return unconditionally if we didn't find a match + if (!foundMatch) { + std::cout << "Reservedmat is not in the _mats array" << std::endl << std::flush; + return; + } + sem_post(_matSemaphore); + } + + MatManager::~MatManager() { + // assume a single thread is destroying the manager + // TODO(colin): add the destroy code + //std::cout << "Start to destroy.." << std::endl << std::flush; + } + + uint8_t* MatManager::get_mat_pointer_from_index(MatManager::matindex matIndex) { + return _mats[matIndex]; + } + +}} diff --git a/openbr/core/cuda/MatManager.hpp b/openbr/core/cuda/MatManager.hpp new file mode 100644 index 0000000..e58ad42 --- /dev/null +++ b/openbr/core/cuda/MatManager.hpp @@ -0,0 +1,39 @@ +/* +NOTES +Mat reservations should return a handle instead of a pointer +*/ + +#include +#include + +#include +#include + +using namespace cv; +using namespace cv::gpu; + +namespace br { namespace cuda { + class MatManager { + private: + int _numMats; + uint8_t** _mats; // holds all the mats + bool* _matTaken; // holds whether or not they are taken + int* _matsDimension; // holds the dimension of the Mats + + pthread_mutex_t* _matTakenLock; // lock for matTaken table + pthread_mutex_t* _matsDimensionLock; // lock for _matsDimension table and _mats table + sem_t* _matSemaphore; + + public: + typedef int matindex; + MatManager(int num); + + int reserve(Mat &mat); + void upload(matindex reservedMatIndex, Mat& mat); + void download(matindex reservedMatIndex, Mat& dstMat); + void release(matindex matIndex); + uint8_t* get_mat_pointer_from_index(matindex matIndex); + + ~MatManager(); + }; +}} diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/CUDAL2.cpp deleted file mode 100644 index 6b248d0..0000000 --- a/openbr/plugins/cuda/CUDAL2.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - * Copyright 2012 The MITRE Corporation * - * * - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -#include -using namespace std; - -#include - -namespace br { namespace cuda { namespace L2{ - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); -}}} - -namespace br -{ - -/*! - * \ingroup distances - * \brief L2 distance computed using eigen. - * \author Josh Klontz \cite jklontz - */ -class CUDAL2Distance : public UntrainableDistance -{ - Q_OBJECT - - float compare(const cv::Mat &a, const cv::Mat &b) const - { - void* const* srcDataPtr = a.ptr(); - float* cudaAPtr = (float*)srcDataPtr[0]; - int rows = *((int*)srcDataPtr[1]); - int cols = *((int*)srcDataPtr[2]); - - float* cudaBPtr = (float*)b.ptr()[0]; - - float out; - br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); - - return out; - } -}; - -BR_REGISTER(Distance, CUDAL2Distance) - -} // namespace br - -#include "cuda/CUDAL2.moc" diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/CUDAL2.cu deleted file mode 100644 index 8d52b35..0000000 --- a/openbr/plugins/cuda/CUDAL2.cu +++ /dev/null @@ -1,68 +0,0 @@ -#include - -#include "cudadefines.hpp" - -namespace br { namespace cuda { namespace L2 { - - __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) { - int index = blockIdx.x*blockDim.x+threadIdx.x; - - if (index >= length) { - return; - } - - // perform the subtraction in-place - // use b because it is the comparison - // image - workPtr[index] = aPtr[index] - bPtr[index]; - workPtr[index] = workPtr[index] * workPtr[index]; - } - - __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) { - // make sure there is only one thread that we are calling - if (blockIdx.x != 0 || threadIdx.x != 0) { - return; - } - - // sum up all the values - *outPtr = 0; - for (int i=0; i < length; i++) { - *outPtr = *outPtr + inPtr[i]; - } - - // take the square root - *outPtr = sqrtf(*outPtr); - } - - void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { - cudaError_t err; - float* cudaOutPtr; - CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); - - float* cudaWorkBufferPtr; - CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); - - // perform the subtraction - int threadsPerBlock = 64; - int numBlocks = length / threadsPerBlock + 1; - my_subtract_kernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); - CUDA_KERNEL_ERR_CHK(&err); - - // perform the collapse - collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); - CUDA_KERNEL_ERR_CHK(&err); - - // copy the single value back to the destinsion - CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); - - CUDA_SAFE_FREE(cudaOutPtr, &err); - - // do not free aPtr which should be the reference library - // only free bPtr, which is the image we are comparing - CUDA_SAFE_FREE(cudaBPtr, &err); - CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); - } -}}} - -// 128CUDAEigenfaces on 6400 ATT: 54.367s -// 128CUDAEigenfacesL2 on 6400 ATT: diff --git a/openbr/plugins/cuda/MatManager.cu b/openbr/plugins/cuda/MatManager.cu deleted file mode 100644 index 4823249..0000000 --- a/openbr/plugins/cuda/MatManager.cu +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - -#include - -#include "MatManager.hpp" - -using namespace cv; -using namespace cv::gpu; - -namespace br { namespace cuda { - MatManager::MatManager(int num) { - _numMats = num; - - // initialize the an array of Mats - _mats = (uint8_t**)malloc(num * sizeof(uint8_t*)); - _matTaken = (bool*)malloc(num * sizeof(bool)); - _matsDimension = (int*)malloc(num * sizeof(int)); - - for (int i=0; i < num; i++) { - cudaMalloc(&_mats[i], 1 * sizeof(uint8_t)); - - // initialize matTaken - _matTaken[i] = false; - - // initialize all mat dimensions to be 1 - _matsDimension[i] = 1; - } - - // initialize the locks - _matTakenLock = new pthread_mutex_t; - pthread_mutex_init(_matTakenLock, NULL); - _matsDimensionLock = new pthread_mutex_t; - pthread_mutex_init(_matsDimensionLock, NULL); - - // initialize the semaphore - _matSemaphore = new sem_t; - sem_init(_matSemaphore, 0, _numMats); - } - - MatManager::matindex MatManager::reserve(Mat &mat) { - int reservedMatIndex = 0; - - sem_wait(_matSemaphore); - pthread_mutex_lock(_matTakenLock); - int i; - for (i=0; i < _numMats; i++) { - if ( !_matTaken[i] ) { - _matTaken[i] = true; - reservedMatIndex = i; - break; - } - } - if (i == _numMats) { - std::cout << "Cannot reserve a mat. Not enough GpuMat resourses\n" << std::endl << std::flush; - } - - pthread_mutex_unlock(_matTakenLock); - - // reallocate if size does not match - pthread_mutex_lock(_matsDimensionLock); - if (_matsDimension[reservedMatIndex] != mat.rows * mat.cols) { - cudaFree(_mats[reservedMatIndex]); // free the previous memory first - cudaMalloc(&_mats[reservedMatIndex], mat.rows * mat.cols * sizeof(uint8_t)); - // change the dimension of that matrix - _matsDimension[reservedMatIndex] = mat.rows * mat.cols; - - } - pthread_mutex_unlock(_matsDimensionLock); - return reservedMatIndex; - } - - void MatManager::upload(MatManager::matindex reservedMatIndex, Mat& mat) { - // copy the content of the Mat to GPU - uint8_t* reservedMat = _mats[reservedMatIndex]; - cudaMemcpy(reservedMat, mat.ptr(), mat.rows * mat.cols, cudaMemcpyHostToDevice); - } - - void MatManager::download(MatManager::matindex reservedMatIndex, Mat& dstMat) { - // copy the mat data back - int dimension = dstMat.rows * dstMat.cols; - uint8_t* reservedMat = _mats[reservedMatIndex]; - cudaMemcpy(dstMat.ptr(), reservedMat, dimension, cudaMemcpyDeviceToHost); - } - - void MatManager::release(MatManager::matindex reservedMatIndex) { - uint8_t* reservedMat = _mats[reservedMatIndex]; - pthread_mutex_lock(_matTakenLock); - bool foundMatch = false; - for (int i=0; i < _numMats; i++) { - if (reservedMat == _mats[i]) { - _matTaken[i] = false; - foundMatch = true; - } - } - pthread_mutex_unlock(_matTakenLock); - - // return unconditionally if we didn't find a match - if (!foundMatch) { - std::cout << "Reservedmat is not in the _mats array" << std::endl << std::flush; - return; - } - sem_post(_matSemaphore); - } - - MatManager::~MatManager() { - // assume a single thread is destroying the manager - // TODO(colin): add the destroy code - //std::cout << "Start to destroy.." << std::endl << std::flush; - } - - uint8_t* MatManager::get_mat_pointer_from_index(MatManager::matindex matIndex) { - return _mats[matIndex]; - } - -}} diff --git a/openbr/plugins/cuda/MatManager.hpp b/openbr/plugins/cuda/MatManager.hpp deleted file mode 100644 index e58ad42..0000000 --- a/openbr/plugins/cuda/MatManager.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* -NOTES -Mat reservations should return a handle instead of a pointer -*/ - -#include -#include - -#include -#include - -using namespace cv; -using namespace cv::gpu; - -namespace br { namespace cuda { - class MatManager { - private: - int _numMats; - uint8_t** _mats; // holds all the mats - bool* _matTaken; // holds whether or not they are taken - int* _matsDimension; // holds the dimension of the Mats - - pthread_mutex_t* _matTakenLock; // lock for matTaken table - pthread_mutex_t* _matsDimensionLock; // lock for _matsDimension table and _mats table - sem_t* _matSemaphore; - - public: - typedef int matindex; - MatManager(int num); - - int reserve(Mat &mat); - void upload(matindex reservedMatIndex, Mat& mat); - void download(matindex reservedMatIndex, Mat& dstMat); - void release(matindex matIndex); - uint8_t* get_mat_pointer_from_index(matindex matIndex); - - ~MatManager(); - }; -}} diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp index 4bfc0bf..287b3b9 100644 --- a/openbr/plugins/cuda/copyfrom.cpp +++ b/openbr/plugins/cuda/copyfrom.cpp @@ -2,21 +2,25 @@ #include -//#include - #include using namespace std; using namespace cv; -// extern CUDA declaration -namespace br { namespace cuda { namespace cudacopyfrom { +// CUDA functions for this plugin +namespace br { namespace cuda { namespace copyfrom { template void wrapper(void* src, T* out, int rows, int cols); }}} namespace br { + /*! + * \ingroup transforms + * \brief Copies a transform from the GPU to the CPU. + * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ class CUDACopyFrom : public UntrainableTransform { Q_OBJECT @@ -24,8 +28,6 @@ namespace br private: void project(const Template &src, Template &dst) const { -// ProfilerStart("PROFILEME.log"); - // pull the data back out of the Mat void* const* dataPtr = src.m().ptr(); int rows = *((int*)dataPtr[1]); @@ -35,21 +37,19 @@ private: Mat dstMat = Mat(rows, cols, type); switch(type) { case CV_32FC1: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; case CV_8UC1: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; case CV_8UC3: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols * 3); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols * 3); break; default: - cout << "ERR: Invalid image format" << endl; + cout << "ERR: Invalid image type (" << type << ")" << endl; break; } dst = dstMat; - -// ProfilerStop(); } }; diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu index ddc77f8..5574ea6 100644 --- a/openbr/plugins/cuda/copyfrom.cu +++ b/openbr/plugins/cuda/copyfrom.cu @@ -1,6 +1,6 @@ #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacopyfrom { +namespace br { namespace cuda { namespace copyfrom { template void wrapper(void* src, T* dst, int rows, int cols) { cudaError_t err; CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp index cc63e58..0fad607 100644 --- a/openbr/plugins/cuda/copyto.cpp +++ b/openbr/plugins/cuda/copyto.cpp @@ -8,14 +8,20 @@ using namespace std; using namespace cv; -extern string type2str(int type); - -namespace br { namespace cuda { namespace cudacopyto { +// definitions from the CUDA source file +namespace br { namespace cuda { namespace copyto { template void wrapper(const T* in, void** out, const int rows, const int cols); }}} namespace br { + + /*! + * \ingroup transforms + * \brief Copies a transform to the GPU. + * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ class CUDACopyTo : public UntrainableTransform { Q_OBJECT @@ -25,7 +31,7 @@ private: { const Mat& srcMat = src.m(); const int rows = srcMat.rows; - const int cols = srcMat.cols; + const int cols = srcMat.cols; // output will be a single pointer to graphics card memory Mat dstMat = Mat(4, 1, DataType::type); @@ -39,16 +45,16 @@ private: void* cudaMemPtr; switch(srcMat.type()) { case CV_32FC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; case CV_8UC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; case CV_8UC3: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, 3*cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, 3*cols); break; default: - cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl; + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl; return; } diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu index 0742ff7..4c66605 100644 --- a/openbr/plugins/cuda/copyto.cu +++ b/openbr/plugins/cuda/copyto.cu @@ -1,6 +1,7 @@ #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacopyto { +namespace br { namespace cuda { namespace copyto { + template void wrapper(const T* in, void** out, const int rows, const int cols) { cudaError_t err; CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); @@ -9,4 +10,5 @@ namespace br { namespace cuda { namespace cudacopyto { template void wrapper(const float* in, void** out, const int rows, const int cols); template void wrapper(const unsigned char* in, void** out, const int rows, const int cols); + }}} diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp index 46dd71f..eec10f4 100644 --- a/openbr/plugins/cuda/cudaaffine.cpp +++ b/openbr/plugins/cuda/cudaaffine.cpp @@ -33,159 +33,158 @@ using namespace std; #include #include -#include "MatManager.hpp" - using namespace cv; -namespace br { namespace cuda { - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace affine { + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); +}}} namespace br { -/*! - * \ingroup transforms - * \brief Performs a two or three point registration. - * \author Josh Klontz \cite jklontz - * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. - */ -class CUDAAffineTransform : public UntrainableTransform -{ - Q_OBJECT - Q_ENUMS(Method) - -public: - /*!< */ - enum Method { Near = INTER_NEAREST, - Area = INTER_AREA, - Bilin = INTER_LINEAR, - Cubic = INTER_CUBIC, - Lanczo = INTER_LANCZOS4}; - -private: - Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) - Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) - Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) - Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) - Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false) - Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false) - Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false) - Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false) - Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false) - Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false) - Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false) - BR_PROPERTY(int, width, 64) - BR_PROPERTY(int, height, 64) - BR_PROPERTY(float, x1, 0) - BR_PROPERTY(float, y1, 0) - BR_PROPERTY(float, x2, -1) - BR_PROPERTY(float, y2, -1) - BR_PROPERTY(float, x3, -1) - BR_PROPERTY(float, y3, -1) - BR_PROPERTY(Method, method, Bilin) - BR_PROPERTY(bool, storeAffine, false) - BR_PROPERTY(bool, warpPoints, false) - - static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) - { - float dx = b.x - a.x; - float dy = b.y - a.y; - return Point2f(a.x - dy, a.y + dx); - } - - void project(const Template &src, Template &dst) const - { - const bool twoPoints = ((x3 == -1) || (y3 == -1)); - - Point2f dstPoints[3]; - dstPoints[0] = Point2f(x1*width, y1*height); - dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height); - if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); - else dstPoints[2] = Point2f(x3*width, y3*height); - - Point2f srcPoints[3]; - if (src.file.contains("Affine_0") && - src.file.contains("Affine_1") && - (src.file.contains("Affine_2") || twoPoints)) { - srcPoints[0] = OpenCVUtils::toPoint(src.file.get("Affine_0")); - srcPoints[1] = OpenCVUtils::toPoint(src.file.get("Affine_1")); - if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get("Affine_2")); - } else { - const QList landmarks = OpenCVUtils::toPoints(src.file.points()); - - if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) { - resize(src, dst, Size(width, height)); - return; - } else { - srcPoints[0] = landmarks[0]; - srcPoints[1] = landmarks[1]; - if (!twoPoints) srcPoints[2] = landmarks[2]; - } - } - if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); - - // Code section being altered (original) - // - // Mat affineTransform = getAffineTransform(srcPoints, dstPoints); - // warpAffine(src, dst, affineTransform, Size(width, height), method); - // - // end original - - Mat affineTransform = getAffineTransform(srcPoints, dstPoints); - - void* const* srcDataPtr = src.m().ptr(); - int rows = *((int*)srcDataPtr[1]); - int cols = *((int*)srcDataPtr[2]); - int type = *((int*)srcDataPtr[3]); - - Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); - void** dstDataPtr = dstMat.ptr(); - - dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols - dstDataPtr[3] = srcDataPtr[3]; - - // Print the transform - //for(int x = 0; x < affineTransform.rows; x++){ - //for(int y = 0; y < affineTransform.cols; y++){ - //printf("%8.3f\t", affineTransform.at(x, y)); - //} - //printf("\n"); - //} - - br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); - - // end altered code - - if (warpPoints) { - QList points = src.file.points(); - QList rotatedPoints; - for (int i=0; i(0,0)+ - points.at(i).y()*affineTransform.at(0,1)+ - affineTransform.at(0,2), - points.at(i).x()*affineTransform.at(1,0)+ - points.at(i).y()*affineTransform.at(1,1)+ - affineTransform.at(1,2))); - } - - dst.file.setPoints(rotatedPoints); - } - - if (storeAffine) { - QList affineParams; - for (int i = 0 ; i < 2; i++) - for (int j = 0; j < 3; j++) - affineParams.append(affineTransform.at(i, j)); - dst.file.setList("affineParameters", affineParams); - } - - dst = dstMat; - } -}; - -BR_REGISTER(Transform, CUDAAffineTransform) + /*! + * \ingroup transforms + * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation + * \author Greg Schrock \cite gls022 + * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. + */ + class CUDAAffineTransform : public UntrainableTransform + { + Q_OBJECT + Q_ENUMS(Method) + + public: + /*!< */ + enum Method { Near = INTER_NEAREST, + Area = INTER_AREA, + Bilin = INTER_LINEAR, + Cubic = INTER_CUBIC, + Lanczo = INTER_LANCZOS4}; + + private: + Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) + Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) + Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) + Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) + Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false) + Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false) + Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false) + Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false) + Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false) + Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false) + Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false) + BR_PROPERTY(int, width, 64) + BR_PROPERTY(int, height, 64) + BR_PROPERTY(float, x1, 0) + BR_PROPERTY(float, y1, 0) + BR_PROPERTY(float, x2, -1) + BR_PROPERTY(float, y2, -1) + BR_PROPERTY(float, x3, -1) + BR_PROPERTY(float, y3, -1) + BR_PROPERTY(Method, method, Bilin) + BR_PROPERTY(bool, storeAffine, false) + BR_PROPERTY(bool, warpPoints, false) + + static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) + { + float dx = b.x - a.x; + float dy = b.y - a.y; + return Point2f(a.x - dy, a.y + dx); + } + + void project(const Template &src, Template &dst) const + { + const bool twoPoints = ((x3 == -1) || (y3 == -1)); + + Point2f dstPoints[3]; + dstPoints[0] = Point2f(x1*width, y1*height); + dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height); + if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); + else dstPoints[2] = Point2f(x3*width, y3*height); + + Point2f srcPoints[3]; + if (src.file.contains("Affine_0") && + src.file.contains("Affine_1") && + (src.file.contains("Affine_2") || twoPoints)) { + srcPoints[0] = OpenCVUtils::toPoint(src.file.get("Affine_0")); + srcPoints[1] = OpenCVUtils::toPoint(src.file.get("Affine_1")); + if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get("Affine_2")); + } else { + const QList landmarks = OpenCVUtils::toPoints(src.file.points()); + + if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) { + resize(src, dst, Size(width, height)); + return; + } else { + srcPoints[0] = landmarks[0]; + srcPoints[1] = landmarks[1]; + if (!twoPoints) srcPoints[2] = landmarks[2]; + } + } + if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); + + // Code section being altered (original) + // + // Mat affineTransform = getAffineTransform(srcPoints, dstPoints); + // warpAffine(src, dst, affineTransform, Size(width, height), method); + // + // end original + + Mat affineTransform = getAffineTransform(srcPoints, dstPoints); + + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols + dstDataPtr[3] = srcDataPtr[3]; + + // Print the transform + //for(int x = 0; x < affineTransform.rows; x++){ + //for(int y = 0; y < affineTransform.cols; y++){ + //printf("%8.3f\t", affineTransform.at(x, y)); + //} + //printf("\n"); + //} + + cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); + + // end altered code + + if (warpPoints) { + QList points = src.file.points(); + QList rotatedPoints; + for (int i=0; i(0,0)+ + points.at(i).y()*affineTransform.at(0,1)+ + affineTransform.at(0,2), + points.at(i).x()*affineTransform.at(1,0)+ + points.at(i).y()*affineTransform.at(1,1)+ + affineTransform.at(1,2))); + } + + dst.file.setPoints(rotatedPoints); + } + + if (storeAffine) { + QList affineParams; + for (int i = 0 ; i < 2; i++) + for (int j = 0; j < 3; j++) + affineParams.append(affineTransform.at(i, j)); + dst.file.setList("affineParameters", affineParams); + } + + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDAAffineTransform) } // namespace br diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu index 0c19dde..79cc244 100644 --- a/openbr/plugins/cuda/cudaaffine.cu +++ b/openbr/plugins/cuda/cudaaffine.cu @@ -11,30 +11,30 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br { namespace cuda { +namespace br { namespace cuda { namespace affine { - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) { if (row < 0 || row > rows || col < 0 || col > cols) { if (row > rows || col > cols) { return 0; } else{ - return 0; + return 0; } } return (srcPtr + row*cols)[col]; } - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_bilinear_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { // don't do anything if the index is out of bounds if (row < 0 || row > rows || col < 0 || col > cols) { if (row > rows || col > cols) { return 0; } else{ - return 0; + return 0; } } - + // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf // Bilinear Transformation // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy) @@ -48,22 +48,22 @@ namespace br { namespace cuda { double d_row = row - row1; double d_col = col - col1; - int Q11 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols); - int Q21 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols); - int Q12 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols); - int Q22 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols); + int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col)); return ((uint8_t) round(val)); } - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_distance_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { // don't do anything if the index is out of bounds if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) { if (row >= rows || col >= cols) { return 0; } else{ - return 0; + return 0; } } @@ -90,10 +90,10 @@ namespace br { namespace cuda { double w3 = d3/sum; double w4 = d4/sum; - uint8_t v1 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols); - uint8_t v2 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols); - uint8_t v3 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols); - uint8_t v4 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols); + uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); return round(w1*v1 + w2*v2 + w3*v3 + w4*v4); } @@ -105,16 +105,16 @@ namespace br { namespace cuda { * src_row - The computed source pixel row (mapping from this row) * src_col - The computed source pixel column (mapping from this col) */ - __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ + __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6]; *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7]; //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]); } - - __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ + + __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; int dstIndex = dstRowInd*dst_cols + dstColInd; @@ -134,15 +134,15 @@ namespace br { namespace cuda { } } - cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); - //const uint8_t cval = cudaaffine_kernel_get_distance_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value - const uint8_t cval = cudaaffine_kernel_get_bilinear_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value - //const uint8_t cval = cudaaffine_kernel_get_pixel_value(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value + getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); + //const uint8_t cval = getDistancePixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value + const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value + //const uint8_t cval = getPixelValueDevice(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value dstPtr[dstIndex] = cval; } - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) { + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) { cudaError_t err; double* gpuInverse; @@ -152,7 +152,7 @@ namespace br { namespace cuda { //************************************************************************ // Input affine is a 2x3 Mat whose transpose is used in the computations - // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] + // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] // See "Digital Image Warping" by George Wolburg (p. 50) //************************************************************************ @@ -210,7 +210,7 @@ namespace br { namespace cuda { CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err); - cudaaffine_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols); + affineKernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); @@ -225,5 +225,4 @@ namespace br { namespace cuda { // } // printf("\n"); } -} // end cuda -} // end br +}}} diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp index d774a05..ced2b9c 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cpp +++ b/openbr/plugins/cuda/cudacvtfloat.cpp @@ -7,7 +7,8 @@ using namespace cv; #include -namespace br { namespace cuda { namespace cudacvtfloat { +// definitions from the CUDA source file +namespace br { namespace cuda { namespace cvtfloat { void wrapper(void* src, void** dst, int rows, int cols); }}} @@ -16,7 +17,7 @@ namespace br /*! * \ingroup transforms - * \brief Converts byte to floating point + * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent. * \author Colin Heinzmann \cite DepthDeluxe */ class CUDACvtFloatTransform : public UntrainableTransform @@ -45,7 +46,7 @@ class CUDACvtFloatTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; - br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu index 70bf2e4..bcc4665 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cu +++ b/openbr/plugins/cuda/cudacvtfloat.cu @@ -3,7 +3,7 @@ using namespace std; #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacvtfloat { +namespace br { namespace cuda { namespace cvtfloat { __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { // get my index @@ -20,12 +20,6 @@ namespace br { namespace cuda { namespace cudacvtfloat { } void wrapper(void* src, void** dst, int rows, int cols) { - //unsigned char* cudaSrc; - //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); - //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); - - //float* cudaDst; - //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); cudaError_t err; CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); diff --git a/openbr/plugins/cuda/cudal2.cpp b/openbr/plugins/cuda/cudal2.cpp new file mode 100644 index 0000000..8cbc7c6 --- /dev/null +++ b/openbr/plugins/cuda/cudal2.cpp @@ -0,0 +1,59 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2012 The MITRE Corporation * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; + +#include + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace L2 { + void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); +}}} + +namespace br +{ + +/*! + * \ingroup distances + * \brief L2 distance computed using eigen. + * \author Colin Heinzmann \cite DepthDeluxe + */ +class CUDAL2Distance : public UntrainableDistance +{ + Q_OBJECT + + float compare(const cv::Mat &a, const cv::Mat &b) const + { + void* const* srcDataPtr = a.ptr(); + float* cudaAPtr = (float*)srcDataPtr[0]; + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + + float* cudaBPtr = (float*)b.ptr()[0]; + + float out; + cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); + + return out; + } +}; + +BR_REGISTER(Distance, CUDAL2Distance) + +} // namespace br + +#include "cuda/cudal2.moc" diff --git a/openbr/plugins/cuda/cudal2.cu b/openbr/plugins/cuda/cudal2.cu new file mode 100644 index 0000000..23db28d --- /dev/null +++ b/openbr/plugins/cuda/cudal2.cu @@ -0,0 +1,65 @@ +#include + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace L2 { + + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) { + int index = blockIdx.x*blockDim.x+threadIdx.x; + + if (index >= length) { + return; + } + + // perform the subtraction in-place + // use b because it is the comparison + // image + workPtr[index] = aPtr[index] - bPtr[index]; + workPtr[index] = workPtr[index] * workPtr[index]; + } + + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { + // make sure there is only one thread that we are calling + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + // sum up all the values + *outPtr = 0; + for (int i=0; i < length; i++) { + *outPtr = *outPtr + inPtr[i]; + } + + // take the square root + *outPtr = sqrtf(*outPtr); + } + + void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr) { + cudaError_t err; + float* cudaOutPtr; + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); + + float* cudaWorkBufferPtr; + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); + + // perform the subtraction + int threadsPerBlock = 64; + int numBlocks = length / threadsPerBlock + 1; + subtractKernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // perform the collapse + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // copy the single value back to the destinsion + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); + + CUDA_SAFE_FREE(cudaOutPtr, &err); + + // do not free aPtr which should be the reference library + // only free bPtr, which is the image we are comparing + CUDA_SAFE_FREE(cudaBPtr, &err); + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); + } +}}} diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp index 2385c9b..f2f2d9f 100644 --- a/openbr/plugins/cuda/cudalbp.cpp +++ b/openbr/plugins/cuda/cudalbp.cpp @@ -31,44 +31,21 @@ using namespace std; #include -#include "MatManager.hpp" - using namespace cv; -string type2str(int type) { - string r; - - uchar depth = type & CV_MAT_DEPTH_MASK; - uchar chans = 1 + (type >> CV_CN_SHIFT); - - switch ( depth ) { - case CV_8U: r = "8U"; break; - case CV_8S: r = "8S"; break; - case CV_16U: r = "16U"; break; - case CV_16S: r = "16S"; break; - case CV_32S: r = "32S"; break; - case CV_32F: r = "32F"; break; - case CV_64F: r = "64F"; break; - default: r = "User"; break; - } - - r += "C"; - r += (chans+'0'); - - return r; -} - -namespace br { namespace cuda { - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols); - void cudalbp_init_wrapper(uint8_t* lut); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace lbp { + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols); + void initializeWrapper(uint8_t* lut); +}}} namespace br { /*! * \ingroup transforms - * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA - * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin. + * \author Colin Heinzmann \cite DepthDeluxe + * \author Li Li \cite booli */ class CUDALBPTransform : public UntrainableTransform { @@ -84,8 +61,6 @@ class CUDALBPTransform : public UntrainableTransform uchar lut[256]; uchar null; - //cuda::MatManager* matManager; - public: /* Returns the number of 0->1 or 1->0 transitions in i */ static int numTransitions(int i) @@ -136,36 +111,14 @@ class CUDALBPTransform : public UntrainableTransform if (!set[i]) lut[i] = null; // Set to null id - // init the mat manager for managing 10 mats - //matManager = new cuda::MatManager(10); - // copy lut over to the GPU - br::cuda::cudalbp_init_wrapper(lut); + cuda::lbp::initializeWrapper(lut); std::cout << "Initialized CUDALBP" << std::endl; } void project(const Template &src, Template &dst) const { - //Mat& m = (Mat&)src.m(); - //cuda::MatManager::matindex a; - //cuda::MatManager::matindex b; - //a = matManager->reserve(m); - //matManager->upload(a, m); - - // reserve the second mat and check the dimensiosn - //b = matManager->reserve(m); - - //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); - //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); - //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); - - //matManager->download(b, dst); - - // release both the mats - //matManager->release(a); - //matManager->release(b); - void* const* srcDataPtr = src.m().ptr(); int rows = *((int*)srcDataPtr[1]); int cols = *((int*)srcDataPtr[2]); @@ -177,13 +130,13 @@ class CUDALBPTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; - br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; BR_REGISTER(Transform, CUDALBPTransform) -} // namespace br +} #include "cuda/cudalbp.moc" diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu index bb5be9a..61548df 100644 --- a/openbr/plugins/cuda/cudalbp.cu +++ b/openbr/plugins/cuda/cudalbp.cu @@ -9,14 +9,18 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br { namespace cuda { +/* + * These are the CUDA functions for CUDALBP. See cudapca.cpp for more details + */ + +namespace br { namespace cuda { namespace lbp { uint8_t* lut; - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) { return (srcPtr + row*cols)[col]; } - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) + __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) { int rowInd = blockIdx.y*blockDim.y+threadIdx.y; int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -34,22 +38,22 @@ namespace br { namespace cuda { } } - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; + const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value + uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | + (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | + (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | + (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | + (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; // store calculated value away in the right place dstPtr[index] = val; } //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { cudaError_t err; @@ -59,15 +63,15 @@ namespace br { namespace cuda { rows/threadsPerBlock.y + 1); CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); - cudalbp_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); + lutKernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); } - void cudalbp_init_wrapper(uint8_t* cpuLut) { + void initializeWrapper(uint8_t* cpuLut) { cudaError_t err; CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err); CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err); } -}} +}}} diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index fbee173..c1677c8 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -29,8 +29,9 @@ using namespace cv; #include #include +// definitions from the CUDA source file namespace br { namespace cuda { namespace pca { - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); void wrapper(void* src, void** dst); }}} @@ -38,9 +39,7 @@ namespace br { /*! * \ingroup transforms - * \brief Projects input into learned Principal Component Analysis subspace using CUDA. - * \author Brendan Klare \cite bklare - * \author Josh Klontz \cite jklontz + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin. * \author Colin Heinzmann \cite DepthDeluxe * * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95. @@ -134,22 +133,9 @@ private: dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; dstDataPtr[3] = srcDataPtr[3]; - br::cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]); + cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]); dst = dstMat; - - //dst = cv::Mat(1, keep, CV_32FC1); - - // perform the operation on the graphics card - //cuda::cudapca_projectwrapper((float*)src.m().ptr(), (float*)dst.m().ptr()); - - // Map Eigen into OpenCV - //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); - //Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); - //Eigen::Map outMap(cpuDst.ptr(), keep, 1); - - // Do projection - //outMap = eVecs.transpose() * (inMap - mean); } void store(QDataStream &stream) const @@ -161,14 +147,6 @@ private: { stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; - cout << "Mean Dimensions" << endl; - cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; - cout << "eVecs Dimensions" << endl; - cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl; - cout << "eVals Dimensions" << endl; - cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; - cout << "Keep: " << keep << endl; - // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data // serialize the eigenvectors float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; @@ -187,7 +165,7 @@ private: } // call the wrapper function - br::cuda::pca::loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); delete evBuffer; delete meanBuffer; diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index f6fd3a1..001dd49 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -9,6 +9,10 @@ using namespace std; using namespace cv; using namespace cv::gpu; +/* + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details + */ + namespace br { namespace cuda { namespace pca { __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -68,7 +72,7 @@ namespace br { namespace cuda { namespace pca { int _numSteps; int _stepSize; float* intermediaryBuffer; - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { _evRows = evRows; _evCols = evCols; _meanElems = meanElems; @@ -95,7 +99,6 @@ namespace br { namespace cuda { namespace pca { cudaError_t err; CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); - // subtract out the mean of the image (mean is 1xpixels in size) int threadsPerBlock = 64; int numBlocks = _meanElems / threadsPerBlock + 1; @@ -114,8 +117,5 @@ namespace br { namespace cuda { namespace pca { CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... - - // copy the data back to the CPU - //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); } }}} diff --git a/openbr/plugins/cuda/cudargb2grayscale.cpp b/openbr/plugins/cuda/cudargb2grayscale.cpp index 6f72728..1db7cca 100644 --- a/openbr/plugins/cuda/cudargb2grayscale.cpp +++ b/openbr/plugins/cuda/cudargb2grayscale.cpp @@ -25,17 +25,18 @@ using namespace cv; -namespace br { namespace cuda{ - void cudargb2grayscale_wrapper(void* srcPtr, void**dstPtr, int rows, int cols); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace rgb2grayscale { + void wrapper(void* srcPtr, void**dstPtr, int rows, int cols); +}}} namespace br { /*! * \ingroup transforms - * \brief Colorspace conversion. - * \author Li Li \cite Josh Klontz \cite jklontz + * \brief Converts 3-channel images to grayscale + * \author Li Li \cite booli */ class CUDARGB2GrayScaleTransform : public UntrainableTransform { @@ -57,8 +58,8 @@ private: dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same - - br::cuda::cudargb2grayscale_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + + cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; /* diff --git a/openbr/plugins/cuda/cudargb2grayscale.cu b/openbr/plugins/cuda/cudargb2grayscale.cu index 266389c..8efaafa 100644 --- a/openbr/plugins/cuda/cudargb2grayscale.cu +++ b/openbr/plugins/cuda/cudargb2grayscale.cu @@ -12,9 +12,9 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br{ namespace cuda { +namespace br { namespace cuda { namespace rgb2grayscale { - __global__ void cudargb2grayscale_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols) + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols) { int rowInd = blockIdx.y*blockDim.y+threadIdx.y; int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -31,7 +31,7 @@ namespace br{ namespace cuda { return; } - void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { cudaError_t err; dim3 threadsPerBlock(9, 9); @@ -39,9 +39,9 @@ namespace br{ namespace cuda { rows/threadsPerBlock.y + 1); CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); - cudargb2grayscale_kernel<<>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols); + kernel<<>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); - } + } -}} +}}} diff --git a/openbr/plugins/cuda/passthrough.cpp b/openbr/plugins/cuda/passthrough.cpp deleted file mode 100644 index f7018ea..0000000 --- a/openbr/plugins/cuda/passthrough.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include - -#include -#include - -using namespace cv; -using namespace cv::gpu; - -#include "passthrough.hpp" - -#include - - -namespace br -{ - class CUDAPassthroughTransform : public UntrainableTransform - { - Q_OBJECT - -private: - void project(const Template &src, Template &dst) const - { - // note: if you convert the image to grayscale, you get 8UC1 - - // upload the src mat to the GPU - GpuMat srcGpuMat, dstGpuMat; - srcGpuMat.upload(src.m()); - dstGpuMat.upload(src.m()); - - br::cuda::passthrough_wrapper(srcGpuMat, dstGpuMat); - - dstGpuMat.download(dst.m()); - - // TODO(colin): add delete code - srcGpuMat.release(); - dstGpuMat.release(); - - printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty()); - printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty()); - } - }; - - BR_REGISTER(Transform, CUDAPassthroughTransform); -} - -#include "cuda/passthrough.moc" diff --git a/openbr/plugins/cuda/passthrough.cu b/openbr/plugins/cuda/passthrough.cu deleted file mode 100644 index 49f94e9..0000000 --- a/openbr/plugins/cuda/passthrough.cu +++ /dev/null @@ -1,43 +0,0 @@ -// note: Using 8-bit unsigned 1 channel images - -#include - -using namespace cv; -using namespace cv::gpu; - -#include "passthrough.hpp" - -namespace br { namespace cuda { - __global__ void passthrough_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int cols, int rows) { - int rowInd = blockIdx.y*blockDim.y+threadIdx.y; - int colInd = blockIdx.x*blockDim.x+threadIdx.x; - - // don't do anything if we are outside the allowable positions - if (rowInd >= rows || colInd >= cols) - return; - - uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd]; - uint8_t* rowDstPtr = dstPtr + rowInd*dstStep; - - rowDstPtr[colInd] = srcVal; - } - - void passthrough_wrapper(GpuMat& src, GpuMat& dst) { - // convert the GpuMats to pointers - uint8_t* srcPtr = (uint8_t*)src.data; - uint8_t* dstPtr = (uint8_t*)dst.data; - - int imageWidth = src.cols; - int imageHeight = src.rows; - - // make 8 * 8 = 64 square block - dim3 threadsPerBlock(8, 8); - dim3 numBlocks(imageWidth / threadsPerBlock.x + 1, - imageHeight / threadsPerBlock.y + 1); - - passthrough_kernel<<>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight); - } -}} - - -// read http://stackoverflow.com/questions/31927297/array-of-ptrstepszgpumat-to-a-c-cuda-kernel diff --git a/openbr/plugins/cuda/passthrough.hpp b/openbr/plugins/cuda/passthrough.hpp deleted file mode 100644 index 8749684..0000000 --- a/openbr/plugins/cuda/passthrough.hpp +++ /dev/null @@ -1,3 +0,0 @@ -namespace br { namespace cuda { - void passthrough_wrapper(GpuMat& src, GpuMat& dst); -}} -- libgit2 0.21.4