diff --git a/openbr/plugins/cuda/MatManager.cu b/openbr/core/cuda/MatManager.cu index 4823249..4823249 100644 --- a/openbr/plugins/cuda/MatManager.cu +++ b/openbr/core/cuda/MatManager.cu diff --git a/openbr/plugins/cuda/MatManager.hpp b/openbr/core/cuda/MatManager.hpp index e58ad42..e58ad42 100644 --- a/openbr/plugins/cuda/MatManager.hpp +++ b/openbr/core/cuda/MatManager.hpp diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp index 4bfc0bf..287b3b9 100644 --- a/openbr/plugins/cuda/copyfrom.cpp +++ b/openbr/plugins/cuda/copyfrom.cpp @@ -2,21 +2,25 @@ #include -//#include - #include using namespace std; using namespace cv; -// extern CUDA declaration -namespace br { namespace cuda { namespace cudacopyfrom { +// CUDA functions for this plugin +namespace br { namespace cuda { namespace copyfrom { template void wrapper(void* src, T* out, int rows, int cols); }}} namespace br { + /*! + * \ingroup transforms + * \brief Copies a transform from the GPU to the CPU. + * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ class CUDACopyFrom : public UntrainableTransform { Q_OBJECT @@ -24,8 +28,6 @@ namespace br private: void project(const Template &src, Template &dst) const { -// ProfilerStart("PROFILEME.log"); - // pull the data back out of the Mat void* const* dataPtr = src.m().ptr(); int rows = *((int*)dataPtr[1]); @@ -35,21 +37,19 @@ private: Mat dstMat = Mat(rows, cols, type); switch(type) { case CV_32FC1: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; case CV_8UC1: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); break; case CV_8UC3: - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols * 3); + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols * 3); break; default: - cout << "ERR: Invalid image format" << endl; + cout << "ERR: Invalid image type (" << type << ")" << endl; break; } dst = dstMat; - -// ProfilerStop(); } }; diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu index ddc77f8..5574ea6 100644 --- a/openbr/plugins/cuda/copyfrom.cu +++ b/openbr/plugins/cuda/copyfrom.cu @@ -1,6 +1,6 @@ #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacopyfrom { +namespace br { namespace cuda { namespace copyfrom { template void wrapper(void* src, T* dst, int rows, int cols) { cudaError_t err; CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp index cc63e58..0fad607 100644 --- a/openbr/plugins/cuda/copyto.cpp +++ b/openbr/plugins/cuda/copyto.cpp @@ -8,14 +8,20 @@ using namespace std; using namespace cv; -extern string type2str(int type); - -namespace br { namespace cuda { namespace cudacopyto { +// definitions from the CUDA source file +namespace br { namespace cuda { namespace copyto { template void wrapper(const T* in, void** out, const int rows, const int cols); }}} namespace br { + + /*! + * \ingroup transforms + * \brief Copies a transform to the GPU. + * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ class CUDACopyTo : public UntrainableTransform { Q_OBJECT @@ -25,7 +31,7 @@ private: { const Mat& srcMat = src.m(); const int rows = srcMat.rows; - const int cols = srcMat.cols; + const int cols = srcMat.cols; // output will be a single pointer to graphics card memory Mat dstMat = Mat(4, 1, DataType::type); @@ -39,16 +45,16 @@ private: void* cudaMemPtr; switch(srcMat.type()) { case CV_32FC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; case CV_8UC1: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); break; case CV_8UC3: - br::cuda::cudacopyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, 3*cols); + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, 3*cols); break; default: - cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl; + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl; return; } diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu index 0742ff7..4c66605 100644 --- a/openbr/plugins/cuda/copyto.cu +++ b/openbr/plugins/cuda/copyto.cu @@ -1,6 +1,7 @@ #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacopyto { +namespace br { namespace cuda { namespace copyto { + template void wrapper(const T* in, void** out, const int rows, const int cols) { cudaError_t err; CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); @@ -9,4 +10,5 @@ namespace br { namespace cuda { namespace cudacopyto { template void wrapper(const float* in, void** out, const int rows, const int cols); template void wrapper(const unsigned char* in, void** out, const int rows, const int cols); + }}} diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp index 46dd71f..eec10f4 100644 --- a/openbr/plugins/cuda/cudaaffine.cpp +++ b/openbr/plugins/cuda/cudaaffine.cpp @@ -33,159 +33,158 @@ using namespace std; #include #include -#include "MatManager.hpp" - using namespace cv; -namespace br { namespace cuda { - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace affine { + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); +}}} namespace br { -/*! - * \ingroup transforms - * \brief Performs a two or three point registration. - * \author Josh Klontz \cite jklontz - * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. - */ -class CUDAAffineTransform : public UntrainableTransform -{ - Q_OBJECT - Q_ENUMS(Method) - -public: - /*!< */ - enum Method { Near = INTER_NEAREST, - Area = INTER_AREA, - Bilin = INTER_LINEAR, - Cubic = INTER_CUBIC, - Lanczo = INTER_LANCZOS4}; - -private: - Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) - Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) - Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) - Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) - Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false) - Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false) - Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false) - Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false) - Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false) - Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false) - Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false) - BR_PROPERTY(int, width, 64) - BR_PROPERTY(int, height, 64) - BR_PROPERTY(float, x1, 0) - BR_PROPERTY(float, y1, 0) - BR_PROPERTY(float, x2, -1) - BR_PROPERTY(float, y2, -1) - BR_PROPERTY(float, x3, -1) - BR_PROPERTY(float, y3, -1) - BR_PROPERTY(Method, method, Bilin) - BR_PROPERTY(bool, storeAffine, false) - BR_PROPERTY(bool, warpPoints, false) - - static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) - { - float dx = b.x - a.x; - float dy = b.y - a.y; - return Point2f(a.x - dy, a.y + dx); - } - - void project(const Template &src, Template &dst) const - { - const bool twoPoints = ((x3 == -1) || (y3 == -1)); - - Point2f dstPoints[3]; - dstPoints[0] = Point2f(x1*width, y1*height); - dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height); - if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); - else dstPoints[2] = Point2f(x3*width, y3*height); - - Point2f srcPoints[3]; - if (src.file.contains("Affine_0") && - src.file.contains("Affine_1") && - (src.file.contains("Affine_2") || twoPoints)) { - srcPoints[0] = OpenCVUtils::toPoint(src.file.get("Affine_0")); - srcPoints[1] = OpenCVUtils::toPoint(src.file.get("Affine_1")); - if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get("Affine_2")); - } else { - const QList landmarks = OpenCVUtils::toPoints(src.file.points()); - - if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) { - resize(src, dst, Size(width, height)); - return; - } else { - srcPoints[0] = landmarks[0]; - srcPoints[1] = landmarks[1]; - if (!twoPoints) srcPoints[2] = landmarks[2]; - } - } - if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); - - // Code section being altered (original) - // - // Mat affineTransform = getAffineTransform(srcPoints, dstPoints); - // warpAffine(src, dst, affineTransform, Size(width, height), method); - // - // end original - - Mat affineTransform = getAffineTransform(srcPoints, dstPoints); - - void* const* srcDataPtr = src.m().ptr(); - int rows = *((int*)srcDataPtr[1]); - int cols = *((int*)srcDataPtr[2]); - int type = *((int*)srcDataPtr[3]); - - Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); - void** dstDataPtr = dstMat.ptr(); - - dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols - dstDataPtr[3] = srcDataPtr[3]; - - // Print the transform - //for(int x = 0; x < affineTransform.rows; x++){ - //for(int y = 0; y < affineTransform.cols; y++){ - //printf("%8.3f\t", affineTransform.at(x, y)); - //} - //printf("\n"); - //} - - br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); - - // end altered code - - if (warpPoints) { - QList points = src.file.points(); - QList rotatedPoints; - for (int i=0; i(0,0)+ - points.at(i).y()*affineTransform.at(0,1)+ - affineTransform.at(0,2), - points.at(i).x()*affineTransform.at(1,0)+ - points.at(i).y()*affineTransform.at(1,1)+ - affineTransform.at(1,2))); - } - - dst.file.setPoints(rotatedPoints); - } - - if (storeAffine) { - QList affineParams; - for (int i = 0 ; i < 2; i++) - for (int j = 0; j < 3; j++) - affineParams.append(affineTransform.at(i, j)); - dst.file.setList("affineParameters", affineParams); - } - - dst = dstMat; - } -}; - -BR_REGISTER(Transform, CUDAAffineTransform) + /*! + * \ingroup transforms + * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation + * \author Greg Schrock \cite gls022 + * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. + */ + class CUDAAffineTransform : public UntrainableTransform + { + Q_OBJECT + Q_ENUMS(Method) + + public: + /*!< */ + enum Method { Near = INTER_NEAREST, + Area = INTER_AREA, + Bilin = INTER_LINEAR, + Cubic = INTER_CUBIC, + Lanczo = INTER_LANCZOS4}; + + private: + Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) + Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) + Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) + Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) + Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false) + Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false) + Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false) + Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false) + Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false) + Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false) + Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false) + BR_PROPERTY(int, width, 64) + BR_PROPERTY(int, height, 64) + BR_PROPERTY(float, x1, 0) + BR_PROPERTY(float, y1, 0) + BR_PROPERTY(float, x2, -1) + BR_PROPERTY(float, y2, -1) + BR_PROPERTY(float, x3, -1) + BR_PROPERTY(float, y3, -1) + BR_PROPERTY(Method, method, Bilin) + BR_PROPERTY(bool, storeAffine, false) + BR_PROPERTY(bool, warpPoints, false) + + static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) + { + float dx = b.x - a.x; + float dy = b.y - a.y; + return Point2f(a.x - dy, a.y + dx); + } + + void project(const Template &src, Template &dst) const + { + const bool twoPoints = ((x3 == -1) || (y3 == -1)); + + Point2f dstPoints[3]; + dstPoints[0] = Point2f(x1*width, y1*height); + dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height); + if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); + else dstPoints[2] = Point2f(x3*width, y3*height); + + Point2f srcPoints[3]; + if (src.file.contains("Affine_0") && + src.file.contains("Affine_1") && + (src.file.contains("Affine_2") || twoPoints)) { + srcPoints[0] = OpenCVUtils::toPoint(src.file.get("Affine_0")); + srcPoints[1] = OpenCVUtils::toPoint(src.file.get("Affine_1")); + if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get("Affine_2")); + } else { + const QList landmarks = OpenCVUtils::toPoints(src.file.points()); + + if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) { + resize(src, dst, Size(width, height)); + return; + } else { + srcPoints[0] = landmarks[0]; + srcPoints[1] = landmarks[1]; + if (!twoPoints) srcPoints[2] = landmarks[2]; + } + } + if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); + + // Code section being altered (original) + // + // Mat affineTransform = getAffineTransform(srcPoints, dstPoints); + // warpAffine(src, dst, affineTransform, Size(width, height), method); + // + // end original + + Mat affineTransform = getAffineTransform(srcPoints, dstPoints); + + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols + dstDataPtr[3] = srcDataPtr[3]; + + // Print the transform + //for(int x = 0; x < affineTransform.rows; x++){ + //for(int y = 0; y < affineTransform.cols; y++){ + //printf("%8.3f\t", affineTransform.at(x, y)); + //} + //printf("\n"); + //} + + cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); + + // end altered code + + if (warpPoints) { + QList points = src.file.points(); + QList rotatedPoints; + for (int i=0; i(0,0)+ + points.at(i).y()*affineTransform.at(0,1)+ + affineTransform.at(0,2), + points.at(i).x()*affineTransform.at(1,0)+ + points.at(i).y()*affineTransform.at(1,1)+ + affineTransform.at(1,2))); + } + + dst.file.setPoints(rotatedPoints); + } + + if (storeAffine) { + QList affineParams; + for (int i = 0 ; i < 2; i++) + for (int j = 0; j < 3; j++) + affineParams.append(affineTransform.at(i, j)); + dst.file.setList("affineParameters", affineParams); + } + + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDAAffineTransform) } // namespace br diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu index 0c19dde..79cc244 100644 --- a/openbr/plugins/cuda/cudaaffine.cu +++ b/openbr/plugins/cuda/cudaaffine.cu @@ -11,30 +11,30 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br { namespace cuda { +namespace br { namespace cuda { namespace affine { - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) { if (row < 0 || row > rows || col < 0 || col > cols) { if (row > rows || col > cols) { return 0; } else{ - return 0; + return 0; } } return (srcPtr + row*cols)[col]; } - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_bilinear_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { // don't do anything if the index is out of bounds if (row < 0 || row > rows || col < 0 || col > cols) { if (row > rows || col > cols) { return 0; } else{ - return 0; + return 0; } } - + // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf // Bilinear Transformation // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy) @@ -48,22 +48,22 @@ namespace br { namespace cuda { double d_row = row - row1; double d_col = col - col1; - int Q11 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols); - int Q21 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols); - int Q12 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols); - int Q22 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols); + int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col)); return ((uint8_t) round(val)); } - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_distance_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { // don't do anything if the index is out of bounds if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) { if (row >= rows || col >= cols) { return 0; } else{ - return 0; + return 0; } } @@ -90,10 +90,10 @@ namespace br { namespace cuda { double w3 = d3/sum; double w4 = d4/sum; - uint8_t v1 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols); - uint8_t v2 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols); - uint8_t v3 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols); - uint8_t v4 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols); + uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); return round(w1*v1 + w2*v2 + w3*v3 + w4*v4); } @@ -105,16 +105,16 @@ namespace br { namespace cuda { * src_row - The computed source pixel row (mapping from this row) * src_col - The computed source pixel column (mapping from this col) */ - __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ + __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6]; *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7]; //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]); } - - __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ + + __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; int dstIndex = dstRowInd*dst_cols + dstColInd; @@ -134,15 +134,15 @@ namespace br { namespace cuda { } } - cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); - //const uint8_t cval = cudaaffine_kernel_get_distance_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value - const uint8_t cval = cudaaffine_kernel_get_bilinear_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value - //const uint8_t cval = cudaaffine_kernel_get_pixel_value(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value + getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); + //const uint8_t cval = getDistancePixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value + const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value + //const uint8_t cval = getPixelValueDevice(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value dstPtr[dstIndex] = cval; } - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) { + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) { cudaError_t err; double* gpuInverse; @@ -152,7 +152,7 @@ namespace br { namespace cuda { //************************************************************************ // Input affine is a 2x3 Mat whose transpose is used in the computations - // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] + // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] // See "Digital Image Warping" by George Wolburg (p. 50) //************************************************************************ @@ -210,7 +210,7 @@ namespace br { namespace cuda { CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err); - cudaaffine_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols); + affineKernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); @@ -225,5 +225,4 @@ namespace br { namespace cuda { // } // printf("\n"); } -} // end cuda -} // end br +}}} diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp index d774a05..ced2b9c 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cpp +++ b/openbr/plugins/cuda/cudacvtfloat.cpp @@ -7,7 +7,8 @@ using namespace cv; #include -namespace br { namespace cuda { namespace cudacvtfloat { +// definitions from the CUDA source file +namespace br { namespace cuda { namespace cvtfloat { void wrapper(void* src, void** dst, int rows, int cols); }}} @@ -16,7 +17,7 @@ namespace br /*! * \ingroup transforms - * \brief Converts byte to floating point + * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent. * \author Colin Heinzmann \cite DepthDeluxe */ class CUDACvtFloatTransform : public UntrainableTransform @@ -45,7 +46,7 @@ class CUDACvtFloatTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; - br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu index 70bf2e4..bcc4665 100644 --- a/openbr/plugins/cuda/cudacvtfloat.cu +++ b/openbr/plugins/cuda/cudacvtfloat.cu @@ -3,7 +3,7 @@ using namespace std; #include "cudadefines.hpp" -namespace br { namespace cuda { namespace cudacvtfloat { +namespace br { namespace cuda { namespace cvtfloat { __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { // get my index @@ -20,12 +20,6 @@ namespace br { namespace cuda { namespace cudacvtfloat { } void wrapper(void* src, void** dst, int rows, int cols) { - //unsigned char* cudaSrc; - //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); - //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); - - //float* cudaDst; - //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); cudaError_t err; CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); diff --git a/openbr/plugins/cuda/CUDAL2.cpp b/openbr/plugins/cuda/cudal2.cpp index 6b248d0..8cbc7c6 100644 --- a/openbr/plugins/cuda/CUDAL2.cpp +++ b/openbr/plugins/cuda/cudal2.cpp @@ -19,7 +19,8 @@ using namespace std; #include -namespace br { namespace cuda { namespace L2{ +// definitions from the CUDA source file +namespace br { namespace cuda { namespace L2 { void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr); }}} @@ -29,7 +30,7 @@ namespace br /*! * \ingroup distances * \brief L2 distance computed using eigen. - * \author Josh Klontz \cite jklontz + * \author Colin Heinzmann \cite DepthDeluxe */ class CUDAL2Distance : public UntrainableDistance { @@ -45,7 +46,7 @@ class CUDAL2Distance : public UntrainableDistance float* cudaBPtr = (float*)b.ptr()[0]; float out; - br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); + cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out); return out; } @@ -55,4 +56,4 @@ BR_REGISTER(Distance, CUDAL2Distance) } // namespace br -#include "cuda/CUDAL2.moc" +#include "cuda/cudal2.moc" diff --git a/openbr/plugins/cuda/CUDAL2.cu b/openbr/plugins/cuda/cudal2.cu index 8d52b35..23db28d 100644 --- a/openbr/plugins/cuda/CUDAL2.cu +++ b/openbr/plugins/cuda/cudal2.cu @@ -4,7 +4,7 @@ namespace br { namespace cuda { namespace L2 { - __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) { + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) { int index = blockIdx.x*blockDim.x+threadIdx.x; if (index >= length) { @@ -18,7 +18,7 @@ namespace br { namespace cuda { namespace L2 { workPtr[index] = workPtr[index] * workPtr[index]; } - __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) { + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { // make sure there is only one thread that we are calling if (blockIdx.x != 0 || threadIdx.x != 0) { return; @@ -45,11 +45,11 @@ namespace br { namespace cuda { namespace L2 { // perform the subtraction int threadsPerBlock = 64; int numBlocks = length / threadsPerBlock + 1; - my_subtract_kernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); + subtractKernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); CUDA_KERNEL_ERR_CHK(&err); // perform the collapse - collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); CUDA_KERNEL_ERR_CHK(&err); // copy the single value back to the destinsion @@ -63,6 +63,3 @@ namespace br { namespace cuda { namespace L2 { CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); } }}} - -// 128CUDAEigenfaces on 6400 ATT: 54.367s -// 128CUDAEigenfacesL2 on 6400 ATT: diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp index 2385c9b..f2f2d9f 100644 --- a/openbr/plugins/cuda/cudalbp.cpp +++ b/openbr/plugins/cuda/cudalbp.cpp @@ -31,44 +31,21 @@ using namespace std; #include -#include "MatManager.hpp" - using namespace cv; -string type2str(int type) { - string r; - - uchar depth = type & CV_MAT_DEPTH_MASK; - uchar chans = 1 + (type >> CV_CN_SHIFT); - - switch ( depth ) { - case CV_8U: r = "8U"; break; - case CV_8S: r = "8S"; break; - case CV_16U: r = "16U"; break; - case CV_16S: r = "16S"; break; - case CV_32S: r = "32S"; break; - case CV_32F: r = "32F"; break; - case CV_64F: r = "64F"; break; - default: r = "User"; break; - } - - r += "C"; - r += (chans+'0'); - - return r; -} - -namespace br { namespace cuda { - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols); - void cudalbp_init_wrapper(uint8_t* lut); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace lbp { + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols); + void initializeWrapper(uint8_t* lut); +}}} namespace br { /*! * \ingroup transforms - * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA - * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin. + * \author Colin Heinzmann \cite DepthDeluxe + * \author Li Li \cite booli */ class CUDALBPTransform : public UntrainableTransform { @@ -84,8 +61,6 @@ class CUDALBPTransform : public UntrainableTransform uchar lut[256]; uchar null; - //cuda::MatManager* matManager; - public: /* Returns the number of 0->1 or 1->0 transitions in i */ static int numTransitions(int i) @@ -136,36 +111,14 @@ class CUDALBPTransform : public UntrainableTransform if (!set[i]) lut[i] = null; // Set to null id - // init the mat manager for managing 10 mats - //matManager = new cuda::MatManager(10); - // copy lut over to the GPU - br::cuda::cudalbp_init_wrapper(lut); + cuda::lbp::initializeWrapper(lut); std::cout << "Initialized CUDALBP" << std::endl; } void project(const Template &src, Template &dst) const { - //Mat& m = (Mat&)src.m(); - //cuda::MatManager::matindex a; - //cuda::MatManager::matindex b; - //a = matManager->reserve(m); - //matManager->upload(a, m); - - // reserve the second mat and check the dimensiosn - //b = matManager->reserve(m); - - //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a); - //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b); - //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1()); - - //matManager->download(b, dst); - - // release both the mats - //matManager->release(a); - //matManager->release(b); - void* const* srcDataPtr = src.m().ptr(); int rows = *((int*)srcDataPtr[1]); int cols = *((int*)srcDataPtr[2]); @@ -177,13 +130,13 @@ class CUDALBPTransform : public UntrainableTransform dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; - br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; } }; BR_REGISTER(Transform, CUDALBPTransform) -} // namespace br +} #include "cuda/cudalbp.moc" diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu index bb5be9a..61548df 100644 --- a/openbr/plugins/cuda/cudalbp.cu +++ b/openbr/plugins/cuda/cudalbp.cu @@ -9,14 +9,18 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br { namespace cuda { +/* + * These are the CUDA functions for CUDALBP. See cudapca.cpp for more details + */ + +namespace br { namespace cuda { namespace lbp { uint8_t* lut; - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) { + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) { return (srcPtr + row*cols)[col]; } - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) + __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut) { int rowInd = blockIdx.y*blockDim.y+threadIdx.y; int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -34,22 +38,22 @@ namespace br { namespace cuda { } } - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; + const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value + uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) | + (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) | + (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) | + (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) | + (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) | + (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)]; // store calculated value away in the right place dstPtr[index] = val; } //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { cudaError_t err; @@ -59,15 +63,15 @@ namespace br { namespace cuda { rows/threadsPerBlock.y + 1); CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); - cudalbp_kernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); + lutKernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); } - void cudalbp_init_wrapper(uint8_t* cpuLut) { + void initializeWrapper(uint8_t* cpuLut) { cudaError_t err; CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err); CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err); } -}} +}}} diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index fbee173..c1677c8 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -29,8 +29,9 @@ using namespace cv; #include #include +// definitions from the CUDA source file namespace br { namespace cuda { namespace pca { - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); void wrapper(void* src, void** dst); }}} @@ -38,9 +39,7 @@ namespace br { /*! * \ingroup transforms - * \brief Projects input into learned Principal Component Analysis subspace using CUDA. - * \author Brendan Klare \cite bklare - * \author Josh Klontz \cite jklontz + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin. * \author Colin Heinzmann \cite DepthDeluxe * * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95. @@ -134,22 +133,9 @@ private: dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep; dstDataPtr[3] = srcDataPtr[3]; - br::cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]); + cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]); dst = dstMat; - - //dst = cv::Mat(1, keep, CV_32FC1); - - // perform the operation on the graphics card - //cuda::cudapca_projectwrapper((float*)src.m().ptr(), (float*)dst.m().ptr()); - - // Map Eigen into OpenCV - //Mat cpuDst = cv::Mat(1, keep, CV_32FC1); - //Eigen::Map inMap(src.m().ptr(), src.m().rows*src.m().cols, 1); - //Eigen::Map outMap(cpuDst.ptr(), keep, 1); - - // Do projection - //outMap = eVecs.transpose() * (inMap - mean); } void store(QDataStream &stream) const @@ -161,14 +147,6 @@ private: { stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; - cout << "Mean Dimensions" << endl; - cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; - cout << "eVecs Dimensions" << endl; - cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl; - cout << "eVals Dimensions" << endl; - cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl; - cout << "Keep: " << keep << endl; - // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data // serialize the eigenvectors float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; @@ -187,7 +165,7 @@ private: } // call the wrapper function - br::cuda::pca::loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); delete evBuffer; delete meanBuffer; diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index f6fd3a1..001dd49 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -9,6 +9,10 @@ using namespace std; using namespace cv; using namespace cv::gpu; +/* + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details + */ + namespace br { namespace cuda { namespace pca { __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) { int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -68,7 +72,7 @@ namespace br { namespace cuda { namespace pca { int _numSteps; int _stepSize; float* intermediaryBuffer; - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { _evRows = evRows; _evCols = evCols; _meanElems = meanElems; @@ -95,7 +99,6 @@ namespace br { namespace cuda { namespace pca { cudaError_t err; CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); - // subtract out the mean of the image (mean is 1xpixels in size) int threadsPerBlock = 64; int numBlocks = _meanElems / threadsPerBlock + 1; @@ -114,8 +117,5 @@ namespace br { namespace cuda { namespace pca { CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption... - - // copy the data back to the CPU - //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost); } }}} diff --git a/openbr/plugins/cuda/cudargb2grayscale.cpp b/openbr/plugins/cuda/cudargb2grayscale.cpp index 6f72728..1db7cca 100644 --- a/openbr/plugins/cuda/cudargb2grayscale.cpp +++ b/openbr/plugins/cuda/cudargb2grayscale.cpp @@ -25,17 +25,18 @@ using namespace cv; -namespace br { namespace cuda{ - void cudargb2grayscale_wrapper(void* srcPtr, void**dstPtr, int rows, int cols); -}} +// definitions from the CUDA source file +namespace br { namespace cuda { namespace rgb2grayscale { + void wrapper(void* srcPtr, void**dstPtr, int rows, int cols); +}}} namespace br { /*! * \ingroup transforms - * \brief Colorspace conversion. - * \author Li Li \cite Josh Klontz \cite jklontz + * \brief Converts 3-channel images to grayscale + * \author Li Li \cite booli */ class CUDARGB2GrayScaleTransform : public UntrainableTransform { @@ -57,8 +58,8 @@ private: dstDataPtr[2] = srcDataPtr[2]; dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same - - br::cuda::cudargb2grayscale_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + + cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); dst = dstMat; /* diff --git a/openbr/plugins/cuda/cudargb2grayscale.cu b/openbr/plugins/cuda/cudargb2grayscale.cu index 266389c..8efaafa 100644 --- a/openbr/plugins/cuda/cudargb2grayscale.cu +++ b/openbr/plugins/cuda/cudargb2grayscale.cu @@ -12,9 +12,9 @@ using namespace std; using namespace cv; using namespace cv::gpu; -namespace br{ namespace cuda { +namespace br { namespace cuda { namespace rgb2grayscale { - __global__ void cudargb2grayscale_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols) + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols) { int rowInd = blockIdx.y*blockDim.y+threadIdx.y; int colInd = blockIdx.x*blockDim.x+threadIdx.x; @@ -31,7 +31,7 @@ namespace br{ namespace cuda { return; } - void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols) + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) { cudaError_t err; dim3 threadsPerBlock(9, 9); @@ -39,9 +39,9 @@ namespace br{ namespace cuda { rows/threadsPerBlock.y + 1); CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); - cudargb2grayscale_kernel<<>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols); + kernel<<>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols); CUDA_KERNEL_ERR_CHK(&err); CUDA_SAFE_FREE(srcPtr, &err); - } + } -}} +}}} diff --git a/openbr/plugins/cuda/passthrough.cpp b/openbr/plugins/cuda/passthrough.cpp deleted file mode 100644 index f7018ea..0000000 --- a/openbr/plugins/cuda/passthrough.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include - -#include -#include - -using namespace cv; -using namespace cv::gpu; - -#include "passthrough.hpp" - -#include - - -namespace br -{ - class CUDAPassthroughTransform : public UntrainableTransform - { - Q_OBJECT - -private: - void project(const Template &src, Template &dst) const - { - // note: if you convert the image to grayscale, you get 8UC1 - - // upload the src mat to the GPU - GpuMat srcGpuMat, dstGpuMat; - srcGpuMat.upload(src.m()); - dstGpuMat.upload(src.m()); - - br::cuda::passthrough_wrapper(srcGpuMat, dstGpuMat); - - dstGpuMat.download(dst.m()); - - // TODO(colin): add delete code - srcGpuMat.release(); - dstGpuMat.release(); - - printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty()); - printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty()); - } - }; - - BR_REGISTER(Transform, CUDAPassthroughTransform); -} - -#include "cuda/passthrough.moc" diff --git a/openbr/plugins/cuda/passthrough.cu b/openbr/plugins/cuda/passthrough.cu deleted file mode 100644 index 49f94e9..0000000 --- a/openbr/plugins/cuda/passthrough.cu +++ /dev/null @@ -1,43 +0,0 @@ -// note: Using 8-bit unsigned 1 channel images - -#include - -using namespace cv; -using namespace cv::gpu; - -#include "passthrough.hpp" - -namespace br { namespace cuda { - __global__ void passthrough_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int cols, int rows) { - int rowInd = blockIdx.y*blockDim.y+threadIdx.y; - int colInd = blockIdx.x*blockDim.x+threadIdx.x; - - // don't do anything if we are outside the allowable positions - if (rowInd >= rows || colInd >= cols) - return; - - uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd]; - uint8_t* rowDstPtr = dstPtr + rowInd*dstStep; - - rowDstPtr[colInd] = srcVal; - } - - void passthrough_wrapper(GpuMat& src, GpuMat& dst) { - // convert the GpuMats to pointers - uint8_t* srcPtr = (uint8_t*)src.data; - uint8_t* dstPtr = (uint8_t*)dst.data; - - int imageWidth = src.cols; - int imageHeight = src.rows; - - // make 8 * 8 = 64 square block - dim3 threadsPerBlock(8, 8); - dim3 numBlocks(imageWidth / threadsPerBlock.x + 1, - imageHeight / threadsPerBlock.y + 1); - - passthrough_kernel<<>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight); - } -}} - - -// read http://stackoverflow.com/questions/31927297/array-of-ptrstepszgpumat-to-a-c-cuda-kernel diff --git a/openbr/plugins/cuda/passthrough.hpp b/openbr/plugins/cuda/passthrough.hpp deleted file mode 100644 index 8749684..0000000 --- a/openbr/plugins/cuda/passthrough.hpp +++ /dev/null @@ -1,3 +0,0 @@ -namespace br { namespace cuda { - void passthrough_wrapper(GpuMat& src, GpuMat& dst); -}}