diff --git a/openbr/plugins/cuda/README.md b/openbr/plugins/cuda/README.md new file mode 100644 index 0000000..45db537 --- /dev/null +++ b/openbr/plugins/cuda/README.md @@ -0,0 +1,122 @@ +# CUDA Plugins +CUDA plugins are very similar to normal plugins. A single plugin is split into +two files: the `.cpp` file with the BR standard plugin definition and the `.cu` +file with your kernel and wrapper functions. + +## The `.cpp` file +Every main plugin file must have the names of the kernel wrapper functions +defined at the top of the program. Once the definitions are there, just call +the CUDA functions as you need them. + +## The `.cu` file +All functions within the CUDA file must be declared inside their own namespace +under `br::cuda`. For example the plugin `passthrough` must have all functions +inside it declared under the namespace `br::cuda::passthrough`. + +## CPU Template object format +Like any other BR Transform, the plugin must return an object for the next +plugin to consume. For performance reasons, we don't copy data to and from +the graphics card for every transform. Instead, we use this space to transfer +data about how to access the image data and its type. The Mat is an array of data type `void*`. + +Index | Item Name | Type | Description +--------|-------------|-----------|------------ +0 | GpuData | void* | Pointer to the graphics card data +1 | rows | int | Number of rows in the Mat +2 | cols | int | Number of columns in the Mat +3 | type | int | OpenCV mat data type code (i.e. `mat.type()`) + +It is expected that the wrapper function does the proper GPU memory handling +to make sure that the GpuData pointer in the output mat is pointing to the +data that the plugin is outputting. + +## Example: Passthrough +This example plugin takes in input data and passes it straight to the output. +The BR transform calls the wrapper function which exists in the CUDA file which +in turn calls the kernel routine to copy the data in the GPU. 
+ +**Note**: This program assumes that a previous Transform, namely `CUDACopyTo` has +copied the data to the GPU. + +### **passthrough.cpp** +```c++ +#include +#include + +// wrapper function within the CUDA file +namespace br { namespace cuda { namespace passthrough { + void wrapper(char* srcPtr, char** dstPtr, int rows, int cols, int type); +}}}; + +#include +namespace br +{ + class CUDAPassthroughTransform : public UntrainableTransform + { + Q_OBJECT + + void project(const Template &src, Template &dst) { + // extract the parameters out of the Mat passed from the previous plugin + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + // generate a new Mat to be passed to the next plugin + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + dstDataPtr[1] = srcDataPtr[1]; + dstDataPtr[2] = srcDataPtr[2]; + dstDataPtr[3] = srcDataPtr[3]; + + // call the wrapper and set the dst output to the newly created Mat + br::cuda::passthrough::wrapper((char*)srcDataPtr[0], (char**)&dstDataPtr[0], rows, cols, type); + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDAPassthroughTransform); +} + +#include "cuda/passthrough.moc" +``` + +### **passthrough.cu** +```c++ +#include + +namespace br { namespace cuda { namespace passthrough { + __global__ void kernel(char* srcPtr, char* dstPtr, int rows, int cols) { + // get the current index + int rowInd = blockIdx.y*blockDim.y+threadIdx.y; + int colInd = blockIdx.x*blockDim.x+threadIdx.x; + + // don't do anything if we are outside the allowable positions + if (rowInd >= rows || colInd >= cols) + return; + + // write the input to the output + dstPtr[rowInd*cols + colInd] = srcPtr[rowInd*cols + colInd]; + } + + void wrapper(char* srcPtr, char** dstPtr, int rows, int cols, int type) { + // verify the proper image type + if (type != CV_8UC1) { + cout << "Error: image type not supported" << endl; + return; + } + + cudaMalloc((void**)dstPtr, rows*cols*sizeof(char)); 
+ + dim3 threadsPerBlock(8, 8); + dim3 numBlocks(cols / threadsPerBlock.x + 1, + rows / threadsPerBlock.y + 1); + + // run the kernel function + kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, *dstPtr, rows, cols); + + // free the memory as it isn't used anymore + cudaFree(srcPtr); + } +}}} +``` diff --git a/openbr/plugins/cuda/copyfrom.cpp b/openbr/plugins/cuda/copyfrom.cpp new file mode 100644 index 0000000..6d913e0 --- /dev/null +++ b/openbr/plugins/cuda/copyfrom.cpp @@ -0,0 +1,75 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +#include + +#include + +using namespace std; + +using namespace cv; + +// CUDA functions for this plugin +namespace br { namespace cuda { namespace copyfrom { + template <typename T> void wrapper(void* src, T* out, int rows, int cols); +}}} + +namespace br +{ + /*! + * \ingroup transforms + * \brief Copies image data from the GPU to the CPU. 
+ * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ + class CUDACopyFrom : public UntrainableTransform + { + Q_OBJECT + +private: + void project(const Template &src, Template &dst) const + { + // pull the data back out of the Mat + void* const* dataPtr = src.m().ptr(); + int rows = *((int*)dataPtr[1]); + int cols = *((int*)dataPtr[2]); + int type = *((int*)dataPtr[3]); + + Mat dstMat = Mat(rows, cols, type); + switch(type) { + case CV_32FC1: + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + break; + case CV_8UC1: + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols); + break; + case CV_8UC3: + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr(), rows, cols * 3); + break; + default: + cout << "ERR: Invalid image type (" << type << ")" << endl; + break; + } + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDACopyFrom); +} + +#include "cuda/copyfrom.moc" diff --git a/openbr/plugins/cuda/copyfrom.cu b/openbr/plugins/cuda/copyfrom.cu new file mode 100644 index 0000000..6d145f6 --- /dev/null +++ b/openbr/plugins/cuda/copyfrom.cu @@ -0,0 +1,28 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace copyfrom { + template void wrapper(void* src, T* dst, int rows, int cols) { + cudaError_t err; + CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); + CUDA_SAFE_FREE(src, &err); + } + + template void wrapper(void*, float*, int, int); + template void wrapper(void*, unsigned char*, int, int); +}}} diff --git a/openbr/plugins/cuda/copyto.cpp b/openbr/plugins/cuda/copyto.cpp new file mode 100644 index 0000000..a52bd94 --- /dev/null +++ b/openbr/plugins/cuda/copyto.cpp @@ -0,0 +1,84 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +#include + +#include + +using namespace std; + +using namespace cv; + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace copyto { + template void wrapper(const T* in, void** out, const int rows, const int cols); +}}} + +namespace br +{ + + /*! + * \ingroup transforms + * \brief Copies a transform to the GPU. 
+ * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel + */ + class CUDACopyTo : public UntrainableTransform + { + Q_OBJECT + +private: + void project(const Template &src, Template &dst) const + { + const Mat& srcMat = src.m(); + const int rows = srcMat.rows; + const int cols = srcMat.cols; + + // output will be a single pointer to graphics card memory + Mat dstMat = Mat(4, 1, DataType::type); + void** dstMatData = dstMat.ptr(); + + // save cuda ptr, rows, cols, then type + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); + + void* cudaMemPtr; + switch(srcMat.type()) { + case CV_32FC1: + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + break; + case CV_8UC1: + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, cols); + break; + case CV_8UC3: + cuda::copyto::wrapper(srcMat.ptr(), &dstMatData[0], rows, 3*cols); + break; + default: + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl; + return; + } + + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDACopyTo); +} + +#include "cuda/copyto.moc" diff --git a/openbr/plugins/cuda/copyto.cu b/openbr/plugins/cuda/copyto.cu new file mode 100644 index 0000000..6c60742 --- /dev/null +++ b/openbr/plugins/cuda/copyto.cu @@ -0,0 +1,30 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. 
* + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace copyto { + + template void wrapper(const T* in, void** out, const int rows, const int cols) { + cudaError_t err; + CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); + CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err); + } + + template void wrapper(const float* in, void** out, const int rows, const int cols); + template void wrapper(const unsigned char* in, void** out, const int rows, const int cols); + +}}} diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp new file mode 100644 index 0000000..22e9315 --- /dev/null +++ b/openbr/plugins/cuda/cudaaffine.cpp @@ -0,0 +1,150 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Greg Shrock, Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + + + +#include +using namespace std; + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace cv; + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace affine { + void resizeWrapper(void* srcPtr, void** dstPtr, int src_rows, int src_cols, int dst_rows, int dst_cols); + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); +}}} + +namespace br +{ + + /*! + * \ingroup transforms + * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation. Only supports single-point input bilinear transformation. + * \author Greg Schrock \cite gls022 + * \author Colin Heinzmann \cite DepthDeluxe + * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. 
+ */ + class CUDAAffineTransform : public UntrainableTransform + { + Q_OBJECT + + private: + Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) + Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) + Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) + Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) + BR_PROPERTY(int, width, 64) + BR_PROPERTY(int, height, 64) + BR_PROPERTY(float, x1, 0) + BR_PROPERTY(float, y1, 0) + + static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) + { + float dx = b.x - a.x; + float dy = b.y - a.y; + return Point2f(a.x - dy, a.y + dx); + } + + void project(const Template &src, Template &dst) const + { + Point2f dstPoints[3]; + dstPoints[0] = Point2f(x1*width, y1*height); + dstPoints[1] = Point2f((1-x1)*width, (1-y1)*height); + dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); + + Point2f srcPoints[3]; + if (src.file.contains("Affine_0") && + src.file.contains("Affine_1") && + src.file.contains("Affine_2")) { + srcPoints[0] = OpenCVUtils::toPoint(src.file.get("Affine_0")); + srcPoints[1] = OpenCVUtils::toPoint(src.file.get("Affine_1")); + } else { + const QList landmarks = OpenCVUtils::toPoints(src.file.points()); + + if (landmarks.size() < 2) { + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + if (type != CV_8UC1) { + cout << "ERR: Invalid image format!" 
<< endl; + return; + } + + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols + dstDataPtr[3] = srcDataPtr[3]; + + cuda::affine::resizeWrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, height, width); + dst = dstMat; + return; + } else { + srcPoints[0] = landmarks[0]; + srcPoints[1] = landmarks[1]; + } + } + srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); + + Mat affineTransform = getAffineTransform(srcPoints, dstPoints); + + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + if (type != CV_8UC1) { + cout << "ERR: Invalid image format!" << endl; + return; + } + + + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols + dstDataPtr[3] = srcDataPtr[3]; + + cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); + + dst = dstMat; + } + }; + + BR_REGISTER(Transform, CUDAAffineTransform) + +} // namespace br + +#include "cuda/cudaaffine.moc" diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu new file mode 100644 index 0000000..6fa707b --- /dev/null +++ b/openbr/plugins/cuda/cudaaffine.cu @@ -0,0 +1,232 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. 
* + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +using namespace std; + +#include +#include +#include +#include + +#include "cudadefines.hpp" + +using namespace cv; +using namespace cv::gpu; + +namespace br { namespace cuda { namespace affine { + + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) { + return (srcPtr + row*cols)[col]; + } + + + __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { + // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf + // Bilinear Transformation + // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy) + + int row1 = floor(row); + int row2 = row1+1; + + int col1 = floor(col); + int col2 = col1+1; + + double d_row = row - row1; + double d_col = col - col1; + + int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); + + double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col)); + return ((uint8_t) round(val)); + } + + __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { + int row1 = floor(row); + int row2 = row1+1; + + int col1 = floor(col); + 
int col2 = col1+1; + + double m1 = row2 - row; + double m12 = m1*m1; + + double m2 = col - col1; + double m22 = m2*m2; + + double d1 = sqrt(m12 - 2*m1 + 1 + m22); + double d2 = sqrt(m12 + m22); + double d3 = sqrt(m12 - 2*m1 + 1 + m22 - 2*m2 + 1); + double d4 = sqrt(m12 + m22 - 2*m2 + 1); + double sum = d1 + d2 + d3 + d4; + + double w1 = d1/sum; + double w2 = d2/sum; + double w3 = d3/sum; + double w4 = d4/sum; + + uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); + uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); + uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); + uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); + + return round(w1*v1 + w2*v2 + w3*v3 + w4*v4); + } + + /* + * trans_inv - A pointer to a one-dimensional representation of the inverse of the transform matrix 3x3 + * dst_row - The destination row (mapping to this row) + * dst_col - The destination column (mapping to this column) + * src_row - The computed source pixel row (mapping from this row) + * src_col - The computed source pixel column (mapping from this col) + */ + __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ + *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6]; + *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7]; + } + + __global__ void bilinearKernel(uint8_t* srcPtr, uint8_t* dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) { + int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; + int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; + int dstIndex = dstRowInd*dstCols+dstColInd; + + // destination boundary checking + if (dstRowInd >= dstRows || dstColInd >= dstCols) { + return; + } + + // get the reference indices and relative amounts + float exactSrcRowInd = (float)dstRowInd / (float)dstRows * (float)srcRows; + int minSrcRowInd = (int)exactSrcRowInd; + int maxSrcRowInd = 
minSrcRowInd+1; + float relSrcRowInd = 1.-(exactSrcRowInd-(float)minSrcRowInd); + + // get the reference indices and relative amounts + double exactSrcColInd = (double)dstColInd / (double)dstCols * (double)srcCols; + int minSrcColInd = (int)exactSrcColInd; + int maxSrcColInd = minSrcColInd+1; + float relSrcColInd = 1.-(exactSrcColInd-(float)minSrcColInd); + + // perform boundary checking + if (minSrcRowInd < 0 || maxSrcRowInd >= srcRows || minSrcColInd < 0 || maxSrcColInd >= srcCols) { + dstPtr[dstIndex] = 0; + return; + } + + // get each of the pixel values + float topLeft = srcPtr[minSrcRowInd*srcCols+minSrcColInd]; + float topRight = srcPtr[minSrcRowInd*srcCols+maxSrcColInd]; + float bottomLeft = srcPtr[maxSrcRowInd*srcCols+minSrcColInd]; + float bottomRight = srcPtr[maxSrcRowInd*srcCols+maxSrcColInd]; + + float out = relSrcRowInd*relSrcColInd*topLeft + relSrcRowInd*(1.-relSrcColInd)*topRight + (1.-relSrcRowInd)*relSrcColInd*bottomLeft + (1.-relSrcRowInd)*(1.-relSrcColInd)*bottomRight; + + dstPtr[dstIndex] = (int)out; + } + + __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ + int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; + int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; + int dstIndex = dstRowInd*dst_cols + dstColInd; + + double srcRowPnt; + double srcColPnt; + + // don't do anything if the index is out of bounds + if (dstRowInd >= dst_rows || dstColInd >= dst_cols) { + return; + } + if (dstRowInd == 0 || dstRowInd == dst_rows-1 || dstColInd ==0 || dstColInd == dst_cols-1) { + dstPtr[dstIndex] = 0; + return; + } + + getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); + const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value + + dstPtr[dstIndex] = cval; + } + + void resizeWrapper(void* srcPtr, void** dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) { + // perform 
bilinear filtering + + // allocate space for destination + cudaError_t err; + CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err); + + // call the bilinear kernel function + dim3 threadsPerBlock(32, 16); + dim3 numBlocks(dstCols/threadsPerBlock.x + 1, + dstRows/threadsPerBlock.y + 1); + + bilinearKernel<<>>((uint8_t*)srcPtr, (uint8_t*)*dstPtr, srcRows, srcCols, dstRows, dstCols); + CUDA_KERNEL_ERR_CHK(&err); + + CUDA_SAFE_FREE(srcPtr, &err); + } + + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) { + cudaError_t err; + double* gpuInverse; + + dim3 threadsPerBlock(32, 16); + dim3 numBlocks(dst_cols/threadsPerBlock.x + 1, + dst_rows/threadsPerBlock.y + 1); + + //************************************************************************ + // Input affine is a 2x3 Mat whose transpose is used in the computations + // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] + // See "Digital Image Warping" by George Wolburg (p. 
50) + //************************************************************************ + + // get new transform elements + double a11 = affineTransform.at(0, 0); + double a12 = affineTransform.at(1, 0); + double a21 = affineTransform.at(0, 1); + double a22 = affineTransform.at(1, 1); + double a31 = affineTransform.at(0, 2); + double a32 = affineTransform.at(1, 2); + + // compute transform inverse + double det = 1 / (a11*a22 - a21*a12); + + double affineInverse[9]; + affineInverse[0] = a22 * det; + affineInverse[1] = -a12 * det; + affineInverse[2] = 0; + affineInverse[3] = -a21 * det; + affineInverse[4] = a11 * det; + affineInverse[5] = 0; + affineInverse[6] = (a21*a32 - a31*a22) * det; + affineInverse[7] = (a31*a12 - a11*a32) * det; + affineInverse[8] = (a11*a22 - a21*a12) * det; + + CUDA_SAFE_MALLOC(dstPtr, dst_rows*dst_cols*sizeof(uint8_t), &err); + CUDA_SAFE_MALLOC(&gpuInverse, 3*3*sizeof(double), &err); + + CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err); + + affineKernel<<>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols); + CUDA_KERNEL_ERR_CHK(&err); + + CUDA_SAFE_FREE(srcPtr, &err); + CUDA_SAFE_FREE(gpuInverse, &err); + } +}}} diff --git a/openbr/plugins/cuda/cudacvtfloat.cpp b/openbr/plugins/cuda/cudacvtfloat.cpp new file mode 100644 index 0000000..9d6f37e --- /dev/null +++ b/openbr/plugins/cuda/cudacvtfloat.cpp @@ -0,0 +1,74 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Li Li, Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. 
* + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; +#include + +#include +using namespace cv; + +#include + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace cvtfloat { + void wrapper(void* src, void** dst, int rows, int cols); +}}} + +namespace br +{ + +/*! + * \ingroup transforms + * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent. + * \author Colin Heinzmann \cite DepthDeluxe + */ +class CUDACvtFloatTransform : public UntrainableTransform +{ + Q_OBJECT + + public: + void project(const Template &src, Template &dst) const + { + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + // assume the image type is 256-monochrome + // TODO(colin): real exception handling + if (type != CV_8UC1) { + cout << "ERR: Invalid memory format" << endl; + return; + } + + // build the destination mat + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + dstDataPtr[1] = srcDataPtr[1]; + dstDataPtr[2] = srcDataPtr[2]; + dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; + + cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + dst = dstMat; + } +}; + +BR_REGISTER(Transform, CUDACvtFloatTransform) + +} // namespace br + +#include "cuda/cudacvtfloat.moc" diff --git a/openbr/plugins/cuda/cudacvtfloat.cu b/openbr/plugins/cuda/cudacvtfloat.cu 
new file mode 100644 index 0000000..e93cbd7 --- /dev/null +++ b/openbr/plugins/cuda/cudacvtfloat.cu @@ -0,0 +1,56 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Li Li, Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +using namespace std; + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace cvtfloat { + + __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { + // get my index + int rowInd = blockIdx.y*blockDim.y + threadIdx.y; + int colInd = blockIdx.x*blockDim.x + threadIdx.x; + + // bounds check + if (rowInd >= rows || colInd >= cols) { + return; + } + + int index = rowInd*cols + colInd; + dst[index] = (float)src[index]; + } + + void wrapper(void* src, void** dst, int rows, int cols) { + cudaError_t err; + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); + + dim3 threadsPerBlock(8, 8); + dim3 numBlocks( + cols / threadsPerBlock.x + 1, + rows / threadsPerBlock.y + 1 + ); + + kernel<<>>((const unsigned char*)src, (float*)(*dst), rows, cols); + CUDA_KERNEL_ERR_CHK(&err); + + // free the src memory since it is now in a newly allocated dst + CUDA_SAFE_FREE(src, &err); + } + +}}} diff --git a/openbr/plugins/cuda/cudadefines.hpp b/openbr/plugins/cuda/cudadefines.hpp new file mode 100644 index 0000000..4f78b00 --- 
/dev/null +++ b/openbr/plugins/cuda/cudadefines.hpp @@ -0,0 +1,50 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include + +using namespace std; +#include + +#define CUDA_SAFE_FREE(cudaPtr, errPtr) \ + /*cout << pthread_self() << ": CUDA Free: " << cudaPtr << endl;*/ \ + *errPtr = cudaFree(cudaPtr); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } + +#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \ + *errPtr = cudaMalloc(cudaPtrPtr, size); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } \ + //cout << pthread_self() << ": CUDA Malloc: " << (void*)*(int**)cudaPtrPtr << endl; + +#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \ + *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } + +#define CUDA_KERNEL_ERR_CHK(errPtr) \ + *errPtr = cudaPeekAtLastError(); \ + if (*errPtr != cudaSuccess) { \ + cout << pthread_self() << ": 
Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \ + throw 0; \ + } diff --git a/openbr/plugins/cuda/cudal2.cpp b/openbr/plugins/cuda/cudal2.cpp new file mode 100644 index 0000000..18231f8 --- /dev/null +++ b/openbr/plugins/cuda/cudal2.cpp @@ -0,0 +1,61 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; + +#include + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace L2 { + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr); +}}} + +namespace br +{ + +/*! + * \ingroup distances + * \brief L2 distance computed using eigen. 
+ * \author Colin Heinzmann \cite DepthDeluxe + */ +class CUDAL2Distance : public UntrainableDistance +{ + Q_OBJECT + + float compare(const cv::Mat &a, const cv::Mat &b) const + { + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) { + cout << "ERR: Type mismatch" << endl; + throw 0; + } + if (a.rows*a.cols != b.rows*b.cols) { + cout << "ERR: Dimension mismatch" << endl; + throw 1; + } + + float out; + cuda::L2::wrapper(a.ptr(), b.ptr(), a.rows*a.cols, &out); + + return out; + } +}; + +BR_REGISTER(Distance, CUDAL2Distance) + +} // namespace br + +#include "cuda/cudal2.moc" diff --git a/openbr/plugins/cuda/cudal2.cu b/openbr/plugins/cuda/cudal2.cu new file mode 100644 index 0000000..ca70e55 --- /dev/null +++ b/openbr/plugins/cuda/cudal2.cu @@ -0,0 +1,94 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +#include + + +#include "cudadefines.hpp" + +namespace br { namespace cuda { namespace L2 { + + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) { + int index = blockIdx.x*blockDim.x+threadIdx.x; + + if (index >= length) { + return; + } + + // perform the subtraction + float res = aPtr[index] - bPtr[index]; + res = res * res; + workPtr[index] = res; + } + + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { + // make sure there is only one thread that we are calling + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + // sum up all the values + float acc = 0; + for (int i=0; i < length; i++) { + acc += inPtr[i]; + } + + *outPtr = acc; + } + + float* cudaAPtr = NULL; + float* cudaBPtr = NULL; + float* cudaWorkBufferPtr = NULL; + float* cudaOutPtr = NULL; + int bufferLen = 0; + + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) { + cudaError_t err; + + // allocate memory for the mats and copy data to graphics card + // only allocate if there is a mismatch in image size, otherwise + // use the existing allocated memory + if (length != bufferLen) { + if (cudaAPtr != NULL) { + CUDA_SAFE_FREE(cudaAPtr, &err); + CUDA_SAFE_FREE(cudaBPtr, &err); + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); + CUDA_SAFE_FREE(cudaOutPtr, &err); + } + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err); + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err); + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); + bufferLen = length; + } + + // copy data over from CPU + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); + + // perform the subtraction + int threadsPerBlock = 512; + int numBlocks = length / threadsPerBlock + 1; 
+ subtractKernel<<>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // perform the collapse + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); + CUDA_KERNEL_ERR_CHK(&err); + + // copy the single value back to the destinsion + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); + } +}}} diff --git a/openbr/plugins/cuda/cudalbp.cpp b/openbr/plugins/cuda/cudalbp.cpp new file mode 100644 index 0000000..a3459dd --- /dev/null +++ b/openbr/plugins/cuda/cudalbp.cpp @@ -0,0 +1,142 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Li Li, Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace cv; + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace lbp { + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols); + void initializeWrapper(uint8_t* lut); +}}} + +namespace br +{ +/*! + * \ingroup transforms + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin. 
+ * \author Colin Heinzmann \cite DepthDeluxe + * \author Li Li \cite booli + */ +class CUDALBPTransform : public UntrainableTransform +{ + Q_OBJECT + Q_PROPERTY(int radius READ get_radius WRITE set_radius RESET reset_radius STORED false) + Q_PROPERTY(int maxTransitions READ get_maxTransitions WRITE set_maxTransitions RESET reset_maxTransitions STORED false) + Q_PROPERTY(bool rotationInvariant READ get_rotationInvariant WRITE set_rotationInvariant RESET reset_rotationInvariant STORED false) + BR_PROPERTY(int, radius, 1) + BR_PROPERTY(int, maxTransitions, 8) + BR_PROPERTY(bool, rotationInvariant, false) + + private: + uchar lut[256]; + uchar null; + + public: + /* Returns the number of 0->1 or 1->0 transitions in i */ + static int numTransitions(int i) + { + int transitions = 0; + int curParity = i%2; + for (int j=1; j<=8; j++) { + int parity = (i>>(j%8)) % 2; + if (parity != curParity) transitions++; + curParity = parity; + } + return transitions; + } + + static int rotationInvariantEquivalent(int i) + { + int min = std::numeric_limits::max(); + for (int j=0; j<8; j++) { + bool parity = i % 2; + i = i >> 1; + if (parity) i+=128; + min = std::min(min, i); + } + return min; + } + + void init() + { + bool set[256]; + uchar uid = 0; + for (int i=0; i<256; i++) { + if (numTransitions(i) <= maxTransitions) { + int id; + if (rotationInvariant) { + int rie = rotationInvariantEquivalent(i); + if (i == rie) id = uid++; + else id = lut[rie]; + } else id = uid++; + lut[i] = id; + set[i] = true; + } else { + set[i] = false; + } + } + + null = uid; + for (int i=0; i<256; i++) + if (!set[i]) + lut[i] = null; // Set to null id + + // copy lut over to the GPU + cuda::lbp::initializeWrapper(lut); + + std::cout << "Initialized CUDALBP" << std::endl; + } + + void project(const Template &src, Template &dst) const + { + void* const* srcDataPtr = src.m().ptr(); + int rows = *((int*)srcDataPtr[1]); + int cols = *((int*)srcDataPtr[2]); + int type = *((int*)srcDataPtr[3]); + + Mat dstMat = 
Mat(src.m().rows, src.m().cols, src.m().type()); + void** dstDataPtr = dstMat.ptr(); + dstDataPtr[1] = srcDataPtr[1]; + dstDataPtr[2] = srcDataPtr[2]; + dstDataPtr[3] = srcDataPtr[3]; + + cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); + dst = dstMat; + } +}; + +BR_REGISTER(Transform, CUDALBPTransform) + +} + +#include "cuda/cudalbp.moc" diff --git a/openbr/plugins/cuda/cudalbp.cu b/openbr/plugins/cuda/cudalbp.cu new file mode 100644 index 0000000..f1e7118 --- /dev/null +++ b/openbr/plugins/cuda/cudalbp.cu @@ -0,0 +1,93 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright 2016 Li Li, Colin Heinzmann * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +#include + +using namespace std; + +#include +#include + +#include "cudadefines.hpp" + +using namespace cv; +using namespace cv::gpu; + +/* + * These are the CUDA functions for CUDALBP. 
See cudapca.cpp for more details
+ */
+
+namespace br { namespace cuda { namespace lbp {
+  // Device-resident copy of the 256-entry lookup table, set once by
+  // initializeWrapper() before the first wrapper() call.
+  uint8_t* lut;
+
+  // Row-major pixel fetch from a single-channel 8-bit device image.
+  __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+    return (srcPtr + row*cols)[col];
+  }
+
+  // Computes the 8-neighbor LBP code of each interior pixel and maps it
+  // through lut; border pixels are written as 0.  One thread per pixel.
+  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
+  {
+    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
+    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
+    int radius = 1;
+
+    int index = rowInd*cols + colInd;
+
+    // don't do anything if the index is out of bounds
+    if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
+      if (rowInd >= rows || colInd >= cols) {
+        return;               // outside the image entirely
+      } else {
+        dstPtr[index] = 0;    // 1-pixel border: no full neighborhood
        return;
+      }
+    }
+
+    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols); // center value
+    // pack the 8 neighbor comparisons (clockwise from top-left) into one byte
+    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64  : 0) |
+                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32  : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16  : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4   : 0) |
+                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2   : 0) |
+                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1   : 0)];
+
+    // store calculated value away in the right place
+    dstPtr[index] = val;
+  }
+
+  // Host wrapper: allocates the destination GPU buffer, runs the LBP kernel
+  // over the source GPU buffer, then frees the consumed source buffer.
+  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
+  {
+    cudaError_t err;
+
+    // make 8 * 8 = 64 square block
+    dim3 threadsPerBlock(8, 8);
+    dim3 numBlocks(cols/threadsPerBlock.x + 1,
+                   rows/threadsPerBlock.y + 1);
+
+    CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
+    // NOTE(review): launch configuration was stripped to "<<>>" in the
+    // paste; restored from the dims computed above.
+    lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
+    CUDA_KERNEL_ERR_CHK(&err);
+
+    CUDA_SAFE_FREE(srcPtr, &err);
+  }
+
+  // Uploads the CPU-built lookup table to the GPU.  Intended to be called
+  // once from CUDALBPTransform::init(); a second call would leak the first
+  // allocation -- NOTE(review): confirm single-initialization.
+  void initializeWrapper(uint8_t* cpuLut) {
+    cudaError_t err;
+    CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
+    CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
+  }
+}}}
diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp
new file mode 100644
index 0000000..dd51a81
--- /dev/null
+++ b/openbr/plugins/cuda/cudapca.cpp
@@ -0,0 +1,271 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright 2016 Colin Heinzmann                                            *
+ *                                                                           *
+ * Licensed under the Apache License, Version 2.0 (the "License");           *
+ * you may not use this file except in compliance with the License.          *
+ * You may obtain a copy of the License at                                   *
+ *                                                                           *
+ *     http://www.apache.org/licenses/LICENSE-2.0                            *
+ *                                                                           *
+ * Unless required by applicable law or agreed to in writing, software       *
+ * distributed under the License is distributed on an "AS IS" BASIS,         *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
+ * See the License for the specific language governing permissions and       *
+ * limitations under the License.
* + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#include +using namespace std; +#include + +#include + +#include + +#include +using namespace cv; + +#include +#include +#include +#include + +// definitions from the CUDA source file +namespace br { namespace cuda { namespace pca { + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); + void trainWrapper(void* cudaSrc, float* dst, int rows, int cols); + void wrapper(void* src, void** dst, int imgRows, int imgCols); +}}} + +namespace br +{ +/*! + * \ingroup transforms + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin. + * \author Colin Heinzmann \cite DepthDeluxe + * + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95. + * \br_property int drop The number of leading eigen-dimensions to drop. 
+ * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm) + */ +class CUDAPCATransform : public Transform +{ + Q_OBJECT + +protected: + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false) + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false) + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false) + + BR_PROPERTY(float, keep, 0.95) + BR_PROPERTY(int, drop, 0) + BR_PROPERTY(bool, whiten, false) + + Eigen::VectorXf mean, eVals; + Eigen::MatrixXf eVecs; + + int originalRows; + +public: + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {} + +private: + double residualReconstructionError(const Template &src) const + { + Template proj; + project(src, proj); + + Eigen::Map srcMap(src.m().ptr(), src.m().rows*src.m().cols); + Eigen::Map projMap(proj.m().ptr(), keep); + + return (srcMap - mean).squaredNorm() - projMap.squaredNorm(); + } + + void train(const TemplateList &cudaTrainingSet) + { + // copy the data back from the graphics card so the training can be done on the CPU + const int instances = cudaTrainingSet.size(); // get the number of training set instances + QList