Commit 53d01e939b2fc292f9a1143cfa24e62d007fc27f
Merge pull request #464 from CGLG/master
CUDA-accelerated plugin support
Showing
20 changed files
with
1892 additions
and
1 deletions
openbr/plugins/cuda/README.md
0 → 100644
| 1 | +# CUDA Plugins | |
| 2 | +CUDA plugins are very similar to normal plugins. A single plugin is split into | |
| 3 | +two files: the `.cpp` file with the BR standard plugin definition and the `.cu` | |
| 4 | +file with your kernel and wrapper functions. | |
| 5 | + | |
| 6 | +## The `.cpp` file | |
| 7 | +Every main plugin file must have the names of the kernel wrapper functions | |
| 8 | +defined at the top of the program. Once the definitions are there, just call | |
| 9 | +the CUDA functions as you need them | |
| 10 | + | |
| 11 | +## The `.cu` file | |
| 12 | +All functions within the CUDA file must be declared inside their own namespace | |
| 13 | +under `br::cuda`. For example the plugin `passthrough` must have all functions | |
| 14 | +inside it declared under the namespace `br::cuda::passthrough`. | |
| 15 | + | |
| 16 | +## CPU Template object format | |
| 17 | +Like any other BR Transform, the plugin must return an object for the next | |
| 18 | +plugin to consume. For performance reasons, we don't copy data to and from | |
| 19 | +the graphics card for every transform. Instead, we use this space to transfer | |
| 20 | +data about how to access the image data and its type. The Mat is an array of data type `void*`. | |
| 21 | + | |
| 22 | +Index | Item Name | Type | Description | |
| 23 | +--------|-------------|-----------|------------ | |
| 24 | +0 | GpuData | void* | Pointer to the graphics card data | |
| 25 | +1 | rows | int | Number of rows in the Mat | |
| 26 | +2 | cols | int | Number of columns in the Mat | |
| 27 | +3 | type | int | OpenCV mat data type code (i.e. `mat.type()`) | |
| 28 | + | |
| 29 | +It is expected that the wrapper function does the proper GPU memory handling | |
| 30 | +to make sure that the GpuData pointer in the output mat is pointing to the | |
| 31 | +data that the plugin is outputting. | |
| 32 | + | |
| 33 | +## Example: Passthrough | |
| 34 | +This example plugin takes in input data and passes it straight to the output. | |
| 35 | +The BR transform calls the wrapper function which exists in the CUDA file which | |
| 36 | +in turn calls the kernel routine to copy the data in the GPU. | |
| 37 | + | |
| 38 | +**Note**: This program assumes that a previous Transform, namely `CUDACopyTo`, has | |
| 39 | +copied the data to the GPU. | |
| 40 | + | |
| 41 | +### **passthrough.cpp** | |
| 42 | +```c++ | |
| 43 | +#include <openbr/plugins/openbr_internal.h> | |
| 44 | +#include <opencv2/opencv.hpp> | |
| 45 | + | |
| 46 | +// wrapper function within the CUDA file | |
| 47 | +namespace br { namespace cuda { namespace passthrough { | |
| 48 | +  void wrapper(void* srcGpuData, void** dstGpuData, int rows, int cols, int type); | |
| 49 | +}}}; | |
| 50 | + | |
| 51 | +#include <iostream> | |
| 52 | +namespace br | |
| 53 | +{ | |
| 54 | + class CUDAPassthroughTransform : public UntrainableTransform | |
| 55 | + { | |
| 56 | + Q_OBJECT | |
| 57 | + | |
| 58 | + void project(const Template &src, Template &dst) { | |
| 59 | + // extract the parameters out of the Mat passed from the previous plugin | |
| 60 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 61 | + int rows = *((int*)srcDataPtr[1]); | |
| 62 | + int cols = *((int*)srcDataPtr[2]); | |
| 63 | + int type = *((int*)srcDataPtr[3]); | |
| 64 | + | |
| 65 | + // generate a new Mat to be passed to the next plugin | |
| 66 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 67 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 68 | + dstDataPtr[1] = srcDataPtr[1]; | |
| 69 | + dstDataPtr[2] = srcDataPtr[2]; | |
| 70 | + dstDataPtr[3] = srcDataPtr[3]; | |
| 71 | + | |
| 72 | + // call the wrapper and set the dst output to the newly created Mat | |
| 73 | +      br::cuda::passthrough::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, type); | |
| 74 | + dst = dstMat; | |
| 75 | + } | |
| 76 | + }; | |
| 77 | + | |
| 78 | + BR_REGISTER(Transform, CUDAPassthroughTransform); | |
| 79 | +} | |
| 80 | + | |
| 81 | +#include "cuda/passthrough.moc" | |
| 82 | +``` | |
| 83 | + | |
| 84 | +### **passthrough.cu** | |
| 85 | +```c++ | |
| 86 | +#include <opencv2/opencv.hpp> | |
| 87 | + | |
| 88 | +namespace br { namespace cuda { namespace passthrough { | |
| 89 | + __global__ void kernel(char* srcPtr, char* dstPtr, int rows, int cols) { | |
| 90 | + // get the current index | |
| 91 | + int rowInd = blockIdx.y*blockDim.y+threadIdx.y; | |
| 92 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 93 | + | |
| 94 | + // don't do anything if we are outside the allowable positions | |
| 95 | + if (rowInd >= rows || colInd >= cols) | |
| 96 | + return; | |
| 97 | + | |
| 98 | + // write the input to the output | |
| 99 | +    dstPtr[rowInd*cols + colInd] = srcPtr[rowInd*cols + colInd]; | |
| 100 | + } | |
| 101 | + | |
| 102 | +  void wrapper(void* srcPtr, void** dstPtr, int rows, int cols, int type) { | |
| 103 | + // verify the proper image type | |
| 104 | + if (type != CV_8UC1) { | |
| 105 | +      std::cout << "Error: image type not supported" << std::endl; | |
| 106 | + return; | |
| 107 | + } | |
| 108 | + | |
| 109 | +    cudaMalloc(dstPtr, rows*cols*sizeof(char)); | |
| 110 | + | |
| 111 | + dim3 threadsPerBlock(8, 8); | |
| 112 | +    dim3 numBlocks(cols / threadsPerBlock.x + 1, | |
| 113 | +                   rows / threadsPerBlock.y + 1); | |
| 114 | + | |
| 115 | + // run the kernel function | |
| 116 | +    kernel<<<numBlocks, threadsPerBlock>>>((char*)srcPtr, (char*)(*dstPtr), rows, cols); | |
| 117 | + | |
| 118 | + // free the memory as it isn't used anymore | |
| 119 | + cudaFree(srcPtr); | |
| 120 | + } | |
| 121 | +}}} | |
| 122 | +``` | ... | ... |
openbr/plugins/cuda/copyfrom.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | + | |
| 19 | +#include <opencv2/opencv.hpp> | |
| 20 | + | |
| 21 | +#include <openbr/plugins/openbr_internal.h> | |
| 22 | + | |
| 23 | +using namespace std; | |
| 24 | + | |
| 25 | +using namespace cv; | |
| 26 | + | |
| 27 | +// CUDA functions for this plugin | |
| 28 | +namespace br { namespace cuda { namespace copyfrom { | |
| 29 | + template <typename T> void wrapper(void* src, T* out, int rows, int cols); | |
| 30 | +}}} | |
| 31 | + | |
| 32 | +namespace br | |
| 33 | +{ | |
| 34 | + /*! | |
| 35 | + * \ingroup transforms | |
| 36 | + * \brief Copies a transform from the GPU to the CPU. | |
| 37 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 38 | + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel | |
| 39 | + */ | |
| 40 | + class CUDACopyFrom : public UntrainableTransform | |
| 41 | + { | |
| 42 | + Q_OBJECT | |
| 43 | + | |
| 44 | +private: | |
| 45 | + void project(const Template &src, Template &dst) const | |
| 46 | + { | |
| 47 | + // pull the data back out of the Mat | |
| 48 | + void* const* dataPtr = src.m().ptr<void*>(); | |
| 49 | + int rows = *((int*)dataPtr[1]); | |
| 50 | + int cols = *((int*)dataPtr[2]); | |
| 51 | + int type = *((int*)dataPtr[3]); | |
| 52 | + | |
| 53 | + Mat dstMat = Mat(rows, cols, type); | |
| 54 | + switch(type) { | |
| 55 | + case CV_32FC1: | |
| 56 | + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols); | |
| 57 | + break; | |
| 58 | + case CV_8UC1: | |
| 59 | + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols); | |
| 60 | + break; | |
| 61 | + case CV_8UC3: | |
| 62 | + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3); | |
| 63 | + break; | |
| 64 | + default: | |
| 65 | + cout << "ERR: Invalid image type (" << type << ")" << endl; | |
| 66 | + break; | |
| 67 | + } | |
| 68 | + dst = dstMat; | |
| 69 | + } | |
| 70 | + }; | |
| 71 | + | |
| 72 | + BR_REGISTER(Transform, CUDACopyFrom); | |
| 73 | +} | |
| 74 | + | |
| 75 | +#include "cuda/copyfrom.moc" | ... | ... |
openbr/plugins/cuda/copyfrom.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include "cudadefines.hpp" | |
| 18 | + | |
| 19 | +namespace br { namespace cuda { namespace copyfrom { | |
| 20 | + template <typename T> void wrapper(void* src, T* dst, int rows, int cols) { | |
| 21 | + cudaError_t err; | |
| 22 | + CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err); | |
| 23 | + CUDA_SAFE_FREE(src, &err); | |
| 24 | + } | |
| 25 | + | |
| 26 | + template void wrapper(void*, float*, int, int); | |
| 27 | + template void wrapper(void*, unsigned char*, int, int); | |
| 28 | +}}} | ... | ... |
openbr/plugins/cuda/copyto.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | + | |
| 19 | +#include <opencv2/opencv.hpp> | |
| 20 | + | |
| 21 | +#include <openbr/plugins/openbr_internal.h> | |
| 22 | + | |
| 23 | +using namespace std; | |
| 24 | + | |
| 25 | +using namespace cv; | |
| 26 | + | |
| 27 | +// definitions from the CUDA source file | |
| 28 | +namespace br { namespace cuda { namespace copyto { | |
| 29 | + template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols); | |
| 30 | +}}} | |
| 31 | + | |
| 32 | +namespace br | |
| 33 | +{ | |
| 34 | + | |
| 35 | + /*! | |
| 36 | + * \ingroup transforms | |
| 37 | + * \brief Copies a transform to the GPU. | |
| 38 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 39 | + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel | |
| 40 | + */ | |
| 41 | + class CUDACopyTo : public UntrainableTransform | |
| 42 | + { | |
| 43 | + Q_OBJECT | |
| 44 | + | |
| 45 | +private: | |
| 46 | + void project(const Template &src, Template &dst) const | |
| 47 | + { | |
| 48 | + const Mat& srcMat = src.m(); | |
| 49 | + const int rows = srcMat.rows; | |
| 50 | + const int cols = srcMat.cols; | |
| 51 | + | |
| 52 | + // output will be a single pointer to graphics card memory | |
| 53 | + Mat dstMat = Mat(4, 1, DataType<void*>::type); | |
| 54 | + void** dstMatData = dstMat.ptr<void*>(); | |
| 55 | + | |
| 56 | + // save cuda ptr, rows, cols, then type | |
| 57 | + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows; | |
| 58 | + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols; | |
| 59 | + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type(); | |
| 60 | + | |
| 61 | + void* cudaMemPtr; | |
| 62 | + switch(srcMat.type()) { | |
| 63 | + case CV_32FC1: | |
| 64 | + cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols); | |
| 65 | + break; | |
| 66 | + case CV_8UC1: | |
| 67 | + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols); | |
| 68 | + break; | |
| 69 | + case CV_8UC3: | |
| 70 | + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols); | |
| 71 | + break; | |
| 72 | + default: | |
| 73 | + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl; | |
| 74 | + return; | |
| 75 | + } | |
| 76 | + | |
| 77 | + dst = dstMat; | |
| 78 | + } | |
| 79 | + }; | |
| 80 | + | |
| 81 | + BR_REGISTER(Transform, CUDACopyTo); | |
| 82 | +} | |
| 83 | + | |
| 84 | +#include "cuda/copyto.moc" | ... | ... |
openbr/plugins/cuda/copyto.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include "cudadefines.hpp" | |
| 18 | + | |
| 19 | +namespace br { namespace cuda { namespace copyto { | |
| 20 | + | |
| 21 | + template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) { | |
| 22 | + cudaError_t err; | |
| 23 | + CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err); | |
| 24 | + CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err); | |
| 25 | + } | |
| 26 | + | |
| 27 | + template void wrapper(const float* in, void** out, const int rows, const int cols); | |
| 28 | + template void wrapper(const unsigned char* in, void** out, const int rows, const int cols); | |
| 29 | + | |
| 30 | +}}} | ... | ... |
openbr/plugins/cuda/cudaaffine.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Greg Shrock, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | + | |
| 18 | + | |
| 19 | +#include <iostream> | |
| 20 | +using namespace std; | |
| 21 | + | |
| 22 | +#include <sys/types.h> | |
| 23 | +#include <unistd.h> | |
| 24 | + | |
| 25 | +#include <pthread.h> | |
| 26 | + | |
| 27 | +#include <opencv2/opencv.hpp> | |
| 28 | +#include <opencv2/imgproc/imgproc.hpp> | |
| 29 | +#include <opencv2/imgproc/imgproc_c.h> | |
| 30 | +#include <opencv2/highgui/highgui.hpp> | |
| 31 | +#include <opencv2/highgui/highgui_c.h> | |
| 32 | +#include <limits> | |
| 33 | + | |
| 34 | +#include <openbr/plugins/openbr_internal.h> | |
| 35 | +#include <openbr/core/opencvutils.h> | |
| 36 | + | |
| 37 | +using namespace cv; | |
| 38 | + | |
| 39 | +// definitions from the CUDA source file | |
| 40 | +namespace br { namespace cuda { namespace affine { | |
| 41 | + void resizeWrapper(void* srcPtr, void** dstPtr, int src_rows, int src_cols, int dst_rows, int dst_cols); | |
| 42 | + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols); | |
| 43 | +}}} | |
| 44 | + | |
| 45 | +namespace br | |
| 46 | +{ | |
| 47 | + | |
| 48 | + /*! | |
| 49 | + * \ingroup transforms | |
| 50 | + * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation. Only supports single-point input bilinear transformation. | |
| 51 | + * \author Greg Schrock \cite gls022 | |
| 52 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 53 | + * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement. | |
| 54 | + */ | |
| 55 | + class CUDAAffineTransform : public UntrainableTransform | |
| 56 | + { | |
| 57 | + Q_OBJECT | |
| 58 | + | |
| 59 | + private: | |
| 60 | + Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false) | |
| 61 | + Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false) | |
| 62 | + Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false) | |
| 63 | + Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false) | |
| 64 | + BR_PROPERTY(int, width, 64) | |
| 65 | + BR_PROPERTY(int, height, 64) | |
| 66 | + BR_PROPERTY(float, x1, 0) | |
| 67 | + BR_PROPERTY(float, y1, 0) | |
| 68 | + | |
| 69 | + static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b) | |
| 70 | + { | |
| 71 | + float dx = b.x - a.x; | |
| 72 | + float dy = b.y - a.y; | |
| 73 | + return Point2f(a.x - dy, a.y + dx); | |
| 74 | + } | |
| 75 | + | |
| 76 | + void project(const Template &src, Template &dst) const | |
| 77 | + { | |
| 78 | + Point2f dstPoints[3]; | |
| 79 | + dstPoints[0] = Point2f(x1*width, y1*height); | |
| 80 | + dstPoints[1] = Point2f((1-x1)*width, (1-y1)*height); | |
| 81 | + dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]); | |
| 82 | + | |
| 83 | + Point2f srcPoints[3]; | |
| 84 | + if (src.file.contains("Affine_0") && | |
| 85 | + src.file.contains("Affine_1") && | |
| 86 | + src.file.contains("Affine_2")) { | |
| 87 | + srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0")); | |
| 88 | + srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1")); | |
| 89 | + } else { | |
| 90 | + const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points()); | |
| 91 | + | |
| 92 | + if (landmarks.size() < 2) { | |
| 93 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 94 | + int rows = *((int*)srcDataPtr[1]); | |
| 95 | + int cols = *((int*)srcDataPtr[2]); | |
| 96 | + int type = *((int*)srcDataPtr[3]); | |
| 97 | + | |
| 98 | + if (type != CV_8UC1) { | |
| 99 | + cout << "ERR: Invalid image format!" << endl; | |
| 100 | + return; | |
| 101 | + } | |
| 102 | + | |
| 103 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 104 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 105 | + | |
| 106 | + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows | |
| 107 | + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols | |
| 108 | + dstDataPtr[3] = srcDataPtr[3]; | |
| 109 | + | |
| 110 | + cuda::affine::resizeWrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, height, width); | |
| 111 | + dst = dstMat; | |
| 112 | + return; | |
| 113 | + } else { | |
| 114 | + srcPoints[0] = landmarks[0]; | |
| 115 | + srcPoints[1] = landmarks[1]; | |
| 116 | + } | |
| 117 | + } | |
| 118 | + srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]); | |
| 119 | + | |
| 120 | + Mat affineTransform = getAffineTransform(srcPoints, dstPoints); | |
| 121 | + | |
| 122 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 123 | + int rows = *((int*)srcDataPtr[1]); | |
| 124 | + int cols = *((int*)srcDataPtr[2]); | |
| 125 | + int type = *((int*)srcDataPtr[3]); | |
| 126 | + | |
| 127 | + if (type != CV_8UC1) { | |
| 128 | + cout << "ERR: Invalid image format!" << endl; | |
| 129 | + return; | |
| 130 | + } | |
| 131 | + | |
| 132 | + | |
| 133 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 134 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 135 | + | |
| 136 | + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows | |
| 137 | + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols | |
| 138 | + dstDataPtr[3] = srcDataPtr[3]; | |
| 139 | + | |
| 140 | + cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width); | |
| 141 | + | |
| 142 | + dst = dstMat; | |
| 143 | + } | |
| 144 | + }; | |
| 145 | + | |
| 146 | + BR_REGISTER(Transform, CUDAAffineTransform) | |
| 147 | + | |
| 148 | +} // namespace br | |
| 149 | + | |
| 150 | +#include "cuda/cudaaffine.moc" | ... | ... |
openbr/plugins/cuda/cudaaffine.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | + | |
| 19 | +using namespace std; | |
| 20 | + | |
| 21 | +#include <opencv2/gpu/gpu.hpp> | |
| 22 | +#include <opencv2/opencv.hpp> | |
| 23 | +#include <stdio.h> | |
| 24 | +#include <math.h> | |
| 25 | + | |
| 26 | +#include "cudadefines.hpp" | |
| 27 | + | |
| 28 | +using namespace cv; | |
| 29 | +using namespace cv::gpu; | |
| 30 | + | |
| 31 | +namespace br { namespace cuda { namespace affine { | |
| 32 | + | |
| 33 | + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) { | |
| 34 | + return (srcPtr + row*cols)[col]; | |
| 35 | + } | |
| 36 | + | |
| 37 | + | |
| 38 | + __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { | |
| 39 | + // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf | |
| 40 | + // Bilinear Transformation | |
| 41 | + // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy) | |
| 42 | + | |
| 43 | + int row1 = floor(row); | |
| 44 | + int row2 = row1+1; | |
| 45 | + | |
| 46 | + int col1 = floor(col); | |
| 47 | + int col2 = col1+1; | |
| 48 | + | |
| 49 | + double d_row = row - row1; | |
| 50 | + double d_col = col - col1; | |
| 51 | + | |
| 52 | + int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); | |
| 53 | + int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); | |
| 54 | + int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); | |
| 55 | + int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); | |
| 56 | + | |
| 57 | + double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col)); | |
| 58 | + return ((uint8_t) round(val)); | |
| 59 | + } | |
| 60 | + | |
| 61 | + __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) { | |
| 62 | + int row1 = floor(row); | |
| 63 | + int row2 = row1+1; | |
| 64 | + | |
| 65 | + int col1 = floor(col); | |
| 66 | + int col2 = col1+1; | |
| 67 | + | |
| 68 | + double m1 = row2 - row; | |
| 69 | + double m12 = m1*m1; | |
| 70 | + | |
| 71 | + double m2 = col - col1; | |
| 72 | + double m22 = m2*m2; | |
| 73 | + | |
| 74 | + double d1 = sqrt(m12 - 2*m1 + 1 + m22); | |
| 75 | + double d2 = sqrt(m12 + m22); | |
| 76 | + double d3 = sqrt(m12 - 2*m1 + 1 + m22 - 2*m2 + 1); | |
| 77 | + double d4 = sqrt(m12 + m22 - 2*m2 + 1); | |
| 78 | + double sum = d1 + d2 + d3 + d4; | |
| 79 | + | |
| 80 | + double w1 = d1/sum; | |
| 81 | + double w2 = d2/sum; | |
| 82 | + double w3 = d3/sum; | |
| 83 | + double w4 = d4/sum; | |
| 84 | + | |
| 85 | + uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols); | |
| 86 | + uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols); | |
| 87 | + uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols); | |
| 88 | + uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols); | |
| 89 | + | |
| 90 | + return round(w1*v1 + w2*v2 + w3*v3 + w4*v4); | |
| 91 | + } | |
| 92 | + | |
| 93 | + /* | |
| 94 | + * trans_inv - A pointer to a one-dimensional representation of the inverse of the transform matrix 3x3 | |
| 95 | + * dst_row - The destination row (mapping to this row) | |
| 96 | + * dst_col - The destination column (mapping to this column) | |
| 97 | + * src_row - The computed source pixel row (mapping from this row) | |
| 98 | + * src_col - The computed source pixel column (mapping from this col) | |
| 99 | + */ | |
| 100 | + __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){ | |
| 101 | + *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6]; | |
| 102 | + *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7]; | |
| 103 | + } | |
| 104 | + | |
| 105 | + __global__ void bilinearKernel(uint8_t* srcPtr, uint8_t* dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) { | |
| 106 | + int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; | |
| 107 | + int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 108 | + int dstIndex = dstRowInd*dstCols+dstColInd; | |
| 109 | + | |
| 110 | + // destination boundary checking | |
| 111 | + if (dstRowInd >= dstRows || dstColInd >= dstCols) { | |
| 112 | + return; | |
| 113 | + } | |
| 114 | + | |
| 115 | + // get the reference indices and relative amounts | |
| 116 | + float exactSrcRowInd = (float)dstRowInd / (float)dstRows * (float)srcRows; | |
| 117 | + int minSrcRowInd = (int)exactSrcRowInd; | |
| 118 | + int maxSrcRowInd = minSrcRowInd+1; | |
| 119 | + float relSrcRowInd = 1.-(exactSrcRowInd-(float)minSrcRowInd); | |
| 120 | + | |
| 121 | + // get the reference indices and relative amounts | |
| 122 | + double exactSrcColInd = (double)dstColInd / (double)dstCols * (double)srcCols; | |
| 123 | + int minSrcColInd = (int)exactSrcColInd; | |
| 124 | + int maxSrcColInd = minSrcColInd+1; | |
| 125 | + float relSrcColInd = 1.-(exactSrcColInd-(float)minSrcColInd); | |
| 126 | + | |
| 127 | + // perform boundary checking | |
| 128 | + if (minSrcRowInd < 0 || maxSrcRowInd >= srcRows || minSrcColInd < 0 || maxSrcColInd >= srcCols) { | |
| 129 | + dstPtr[dstIndex] = 0; | |
| 130 | + return; | |
| 131 | + } | |
| 132 | + | |
| 133 | + // get each of the pixel values | |
| 134 | + float topLeft = srcPtr[minSrcRowInd*srcCols+minSrcColInd]; | |
| 135 | + float topRight = srcPtr[minSrcRowInd*srcCols+maxSrcColInd]; | |
| 136 | + float bottomLeft = srcPtr[maxSrcRowInd*srcCols+minSrcColInd]; | |
| 137 | + float bottomRight = srcPtr[maxSrcRowInd*srcCols+maxSrcColInd]; | |
| 138 | + | |
| 139 | + float out = relSrcRowInd*relSrcColInd*topLeft + relSrcRowInd*(1.-relSrcColInd)*topRight + (1.-relSrcRowInd)*relSrcColInd*bottomLeft + (1.-relSrcRowInd)*(1.-relSrcColInd)*bottomRight; | |
| 140 | + | |
| 141 | + dstPtr[dstIndex] = (int)out; | |
| 142 | + } | |
| 143 | + | |
| 144 | + __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){ | |
| 145 | + int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y; | |
| 146 | + int dstColInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 147 | + int dstIndex = dstRowInd*dst_cols + dstColInd; | |
| 148 | + | |
| 149 | + double srcRowPnt; | |
| 150 | + double srcColPnt; | |
| 151 | + | |
| 152 | + // don't do anything if the index is out of bounds | |
| 153 | + if (dstRowInd >= dst_rows || dstColInd >= dst_cols) { | |
| 154 | + return; | |
| 155 | + } | |
| 156 | + if (dstRowInd == 0 || dstRowInd == dst_rows-1 || dstColInd ==0 || dstColInd == dst_cols-1) { | |
| 157 | + dstPtr[dstIndex] = 0; | |
| 158 | + return; | |
| 159 | + } | |
| 160 | + | |
| 161 | + getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt); | |
| 162 | + const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value | |
| 163 | + | |
| 164 | + dstPtr[dstIndex] = cval; | |
| 165 | + } | |
| 166 | + | |
| 167 | + void resizeWrapper(void* srcPtr, void** dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) { | |
| 168 | + // perform bilinear filtering | |
| 169 | + | |
| 170 | + // allocate space for destination | |
| 171 | + cudaError_t err; | |
| 172 | + CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err); | |
| 173 | + | |
| 174 | + // call the bilinear kernel function | |
| 175 | + dim3 threadsPerBlock(32, 16); | |
| 176 | + dim3 numBlocks(dstCols/threadsPerBlock.x + 1, | |
| 177 | + dstRows/threadsPerBlock.y + 1); | |
| 178 | + | |
| 179 | + bilinearKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)*dstPtr, srcRows, srcCols, dstRows, dstCols); | |
| 180 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 181 | + | |
| 182 | + CUDA_SAFE_FREE(srcPtr, &err); | |
| 183 | + } | |
| 184 | + | |
  // Warps a single-channel 8-bit GPU image with the inverse of the given 2x3
  // OpenCV affine transform.  The 3x3 inverse is computed on the host,
  // uploaded to the device, and affineKernel back-projects each destination
  // pixel and samples the source bilinearly.  srcPtr is freed on completion,
  // so the source image is consumed by this call.
  void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
    cudaError_t err;
    double* gpuInverse;   // device copy of the 3x3 inverse transform (row-major)

    dim3 threadsPerBlock(32, 16);
    dim3 numBlocks(dst_cols/threadsPerBlock.x + 1,   // +1 block per axis covers the remainder
                   dst_rows/threadsPerBlock.y + 1);

    //************************************************************************
    // Input affine is a 2x3 Mat whose transpose is used in the computations
    // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
    // See "Digital Image Warping" by George Wolburg (p. 50)
    //************************************************************************

    // get new transform elements (transposed access: a12 reads Mat(1,0), etc.)
    double a11 = affineTransform.at<double>(0, 0);
    double a12 = affineTransform.at<double>(1, 0);
    double a21 = affineTransform.at<double>(0, 1);
    double a22 = affineTransform.at<double>(1, 1);
    double a31 = affineTransform.at<double>(0, 2);
    double a32 = affineTransform.at<double>(1, 2);

    // compute transform inverse
    // NOTE(review): `det` actually holds 1/determinant, and this divides by
    // zero for a singular transform — assumed non-degenerate upstream; confirm.
    double det = 1 / (a11*a22 - a21*a12);

    // adjugate scaled by 1/det; last column fixed to [0 0 1]^T
    double affineInverse[9];
    affineInverse[0] = a22 * det;
    affineInverse[1] = -a12 * det;
    affineInverse[2] = 0;
    affineInverse[3] = -a21 * det;
    affineInverse[4] = a11 * det;
    affineInverse[5] = 0;
    affineInverse[6] = (a21*a32 - a31*a22) * det;
    affineInverse[7] = (a31*a12 - a11*a32) * det;
    affineInverse[8] = (a11*a22 - a21*a12) * det;   // always 1 by construction

    CUDA_SAFE_MALLOC(dstPtr, dst_rows*dst_cols*sizeof(uint8_t), &err);
    CUDA_SAFE_MALLOC(&gpuInverse, 3*3*sizeof(double), &err);

    CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);

    affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
    CUDA_KERNEL_ERR_CHK(&err);

    // free the consumed source image and the temporary transform matrix
    CUDA_SAFE_FREE(srcPtr, &err);
    CUDA_SAFE_FREE(gpuInverse, &err);
  }
| 232 | +}}} | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | +#include <unistd.h> | |
| 20 | + | |
| 21 | +#include <opencv2/opencv.hpp> | |
| 22 | +using namespace cv; | |
| 23 | + | |
| 24 | +#include <openbr/plugins/openbr_internal.h> | |
| 25 | + | |
| 26 | +// definitions from the CUDA source file | |
| 27 | +namespace br { namespace cuda { namespace cvtfloat { | |
| 28 | + void wrapper(void* src, void** dst, int rows, int cols); | |
| 29 | +}}} | |
| 30 | + | |
| 31 | +namespace br | |
| 32 | +{ | |
| 33 | + | |
| 34 | +/*! | |
| 35 | + * \ingroup transforms | |
| 36 | + * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent. | |
| 37 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 38 | + */ | |
class CUDACvtFloatTransform : public UntrainableTransform
{
    Q_OBJECT

    public:
    // Converts a GPU-resident CV_8UC1 image to CV_32FC1.  src.m() holds the
    // 4-pointer GPU metadata block described in the plugin README:
    //   [0] device data pointer, [1] rows*, [2] cols*, [3] type*.
    void project(const Template &src, Template &dst) const
    {
      void* const* srcDataPtr = src.m().ptr<void*>();
      int rows = *((int*)srcDataPtr[1]);
      int cols = *((int*)srcDataPtr[2]);
      int type = *((int*)srcDataPtr[3]);

      // assume the image type is 256-monochrome
      // TODO(colin): real exception handling
      if (type != CV_8UC1) {
        cout << "ERR: Invalid memory format" << endl;
        return;
      }

      // build the destination mat, reusing the source's metadata pointers.
      // NOTE(review): writing CV_32FC1 below goes through the shared pointer,
      // so the source template's type field is updated too — presumably fine
      // because the wrapper frees the source GPU buffer, but confirm no other
      // template aliases this metadata.
      Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
      void** dstDataPtr = dstMat.ptr<void*>();
      dstDataPtr[1] = srcDataPtr[1];
      dstDataPtr[2] = srcDataPtr[2];
      dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;

      // performs the conversion and frees the source device buffer
      cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
      dst = dstMat;
    }
};
| 69 | + | |
| 70 | +BR_REGISTER(Transform, CUDACvtFloatTransform) | |
| 71 | + | |
| 72 | +} // namespace br | |
| 73 | + | |
| 74 | +#include "cuda/cudacvtfloat.moc" | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | + | |
| 19 | +using namespace std; | |
| 20 | + | |
| 21 | +#include "cudadefines.hpp" | |
| 22 | + | |
| 23 | +namespace br { namespace cuda { namespace cvtfloat { | |
| 24 | + | |
| 25 | + __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) { | |
| 26 | + // get my index | |
| 27 | + int rowInd = blockIdx.y*blockDim.y + threadIdx.y; | |
| 28 | + int colInd = blockIdx.x*blockDim.x + threadIdx.x; | |
| 29 | + | |
| 30 | + // bounds check | |
| 31 | + if (rowInd >= rows || colInd >= cols) { | |
| 32 | + return; | |
| 33 | + } | |
| 34 | + | |
| 35 | + int index = rowInd*cols + colInd; | |
| 36 | + dst[index] = (float)src[index]; | |
| 37 | + } | |
| 38 | + | |
| 39 | + void wrapper(void* src, void** dst, int rows, int cols) { | |
| 40 | + cudaError_t err; | |
| 41 | + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err); | |
| 42 | + | |
| 43 | + dim3 threadsPerBlock(8, 8); | |
| 44 | + dim3 numBlocks( | |
| 45 | + cols / threadsPerBlock.x + 1, | |
| 46 | + rows / threadsPerBlock.y + 1 | |
| 47 | + ); | |
| 48 | + | |
| 49 | + kernel<<<numBlocks, threadsPerBlock>>>((const unsigned char*)src, (float*)(*dst), rows, cols); | |
| 50 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 51 | + | |
| 52 | + // free the src memory since it is now in a newly allocated dst | |
| 53 | + CUDA_SAFE_FREE(src, &err); | |
| 54 | + } | |
| 55 | + | |
| 56 | +}}} | ... | ... |
openbr/plugins/cuda/cudadefines.hpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | + | |
| 19 | +using namespace std; | |
| 20 | +#include <pthread.h> | |
| 21 | + | |
// Error-checking wrappers around the CUDA runtime API.  Each macro stores the
// raw cudaError_t through errPtr, and on failure reports the error (tagged
// with the calling thread id) and throws.
//
// FIX: the bodies are now wrapped in do { } while (0) so each macro expands
// to exactly one statement.  The previous bare multi-statement expansion
// broke silently inside unbraced `if`/`else` (only the first statement was
// conditional) and left a dangling `;` ambiguity at every call site.

#define CUDA_SAFE_FREE(cudaPtr, errPtr) \
  do { \
    *errPtr = cudaFree(cudaPtr); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \
  do { \
    *errPtr = cudaMalloc(cudaPtrPtr, size); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \
  do { \
    *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// NOTE: cudaPeekAtLastError() reads the sticky error without clearing it and
// only catches launch-configuration errors, not asynchronous kernel faults.
#define CUDA_KERNEL_ERR_CHK(errPtr) \
  do { \
    *errPtr = cudaPeekAtLastError(); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)
openbr/plugins/cuda/cudal2.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | + | |
| 20 | +#include <openbr/plugins/openbr_internal.h> | |
| 21 | + | |
| 22 | +// definitions from the CUDA source file | |
| 23 | +namespace br { namespace cuda { namespace L2 { | |
| 24 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr); | |
| 25 | +}}} | |
| 26 | + | |
| 27 | +namespace br | |
| 28 | +{ | |
| 29 | + | |
| 30 | +/*! | |
| 31 | + * \ingroup distances | |
| 32 | + * \brief L2 distance computed using eigen. | |
| 33 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 34 | + */ | |
| 35 | +class CUDAL2Distance : public UntrainableDistance | |
| 36 | +{ | |
| 37 | + Q_OBJECT | |
| 38 | + | |
| 39 | + float compare(const cv::Mat &a, const cv::Mat &b) const | |
| 40 | + { | |
| 41 | + if (a.type() != CV_32FC1 || b.type() != CV_32FC1) { | |
| 42 | + cout << "ERR: Type mismatch" << endl; | |
| 43 | + throw 0; | |
| 44 | + } | |
| 45 | + if (a.rows*a.cols != b.rows*b.cols) { | |
| 46 | + cout << "ERR: Dimension mismatch" << endl; | |
| 47 | + throw 1; | |
| 48 | + } | |
| 49 | + | |
| 50 | + float out; | |
| 51 | + cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out); | |
| 52 | + | |
| 53 | + return out; | |
| 54 | + } | |
| 55 | +}; | |
| 56 | + | |
| 57 | +BR_REGISTER(Distance, CUDAL2Distance) | |
| 58 | + | |
| 59 | +} // namespace br | |
| 60 | + | |
| 61 | +#include "cuda/cudal2.moc" | ... | ... |
openbr/plugins/cuda/cudal2.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | +#include <math.h> | |
| 17 | + | |
| 18 | + | |
| 19 | +#include "cudadefines.hpp" | |
| 20 | + | |
| 21 | +namespace br { namespace cuda { namespace L2 { | |
| 22 | + | |
| 23 | + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) { | |
| 24 | + int index = blockIdx.x*blockDim.x+threadIdx.x; | |
| 25 | + | |
| 26 | + if (index >= length) { | |
| 27 | + return; | |
| 28 | + } | |
| 29 | + | |
| 30 | + // perform the subtraction | |
| 31 | + float res = aPtr[index] - bPtr[index]; | |
| 32 | + res = res * res; | |
| 33 | + workPtr[index] = res; | |
| 34 | + } | |
| 35 | + | |
| 36 | + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) { | |
| 37 | + // make sure there is only one thread that we are calling | |
| 38 | + if (blockIdx.x != 0 || threadIdx.x != 0) { | |
| 39 | + return; | |
| 40 | + } | |
| 41 | + | |
| 42 | + // sum up all the values | |
| 43 | + float acc = 0; | |
| 44 | + for (int i=0; i < length; i++) { | |
| 45 | + acc += inPtr[i]; | |
| 46 | + } | |
| 47 | + | |
| 48 | + *outPtr = acc; | |
| 49 | + } | |
| 50 | + | |
  // Device-side buffers cached between calls so repeated comparisons of
  // same-sized vectors skip reallocation.  NOTE(review): shared mutable
  // state — not safe if compare() runs concurrently on multiple threads;
  // confirm the framework serializes distance calls.
  float* cudaAPtr = NULL;
  float* cudaBPtr = NULL;
  float* cudaWorkBufferPtr = NULL;
  float* cudaOutPtr = NULL;
  int bufferLen = 0;   // capacity (in floats) of the cached buffers above
| 56 | + | |
| 57 | + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) { | |
| 58 | + cudaError_t err; | |
| 59 | + | |
| 60 | + // allocate memory for the mats and copy data to graphics card | |
| 61 | + // only allocate if there is a mismatch in image size, otherwise | |
| 62 | + // use the existing allocated memory | |
| 63 | + if (length != bufferLen) { | |
| 64 | + if (cudaAPtr != NULL) { | |
| 65 | + CUDA_SAFE_FREE(cudaAPtr, &err); | |
| 66 | + CUDA_SAFE_FREE(cudaBPtr, &err); | |
| 67 | + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err); | |
| 68 | + CUDA_SAFE_FREE(cudaOutPtr, &err); | |
| 69 | + } | |
| 70 | + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err); | |
| 71 | + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err); | |
| 72 | + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err); | |
| 73 | + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err); | |
| 74 | + bufferLen = length; | |
| 75 | + } | |
| 76 | + | |
| 77 | + // copy data over from CPU | |
| 78 | + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 79 | + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 80 | + | |
| 81 | + // perform the subtraction | |
| 82 | + int threadsPerBlock = 512; | |
| 83 | + int numBlocks = length / threadsPerBlock + 1; | |
| 84 | + subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length); | |
| 85 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 86 | + | |
| 87 | + // perform the collapse | |
| 88 | + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length); | |
| 89 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 90 | + | |
| 91 | + // copy the single value back to the destinsion | |
| 92 | + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err); | |
| 93 | + } | |
| 94 | +}}} | ... | ... |
openbr/plugins/cuda/cudalbp.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | + | |
| 20 | +#include <sys/types.h> | |
| 21 | +#include <unistd.h> | |
| 22 | + | |
| 23 | +#include <pthread.h> | |
| 24 | + | |
| 25 | +#include <opencv2/opencv.hpp> | |
| 26 | +#include <opencv2/imgproc/imgproc.hpp> | |
| 27 | +#include <opencv2/imgproc/imgproc_c.h> | |
| 28 | +#include <opencv2/highgui/highgui.hpp> | |
| 29 | +#include <opencv2/highgui/highgui_c.h> | |
| 30 | +#include <limits> | |
| 31 | + | |
| 32 | +#include <openbr/plugins/openbr_internal.h> | |
| 33 | + | |
| 34 | +using namespace cv; | |
| 35 | + | |
| 36 | +// definitions from the CUDA source file | |
| 37 | +namespace br { namespace cuda { namespace lbp { | |
| 38 | + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols); | |
| 39 | + void initializeWrapper(uint8_t* lut); | |
| 40 | +}}} | |
| 41 | + | |
| 42 | +namespace br | |
| 43 | +{ | |
| 44 | +/*! | |
| 45 | + * \ingroup transforms | |
| 46 | + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin. | |
| 47 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 48 | + * \author Li Li \cite booli | |
| 49 | + */ | |
class CUDALBPTransform : public UntrainableTransform
{
    Q_OBJECT
    Q_PROPERTY(int radius READ get_radius WRITE set_radius RESET reset_radius STORED false)
    Q_PROPERTY(int maxTransitions READ get_maxTransitions WRITE set_maxTransitions RESET reset_maxTransitions STORED false)
    Q_PROPERTY(bool rotationInvariant READ get_rotationInvariant WRITE set_rotationInvariant RESET reset_rotationInvariant STORED false)
    BR_PROPERTY(int, radius, 1)
    BR_PROPERTY(int, maxTransitions, 8)
    BR_PROPERTY(bool, rotationInvariant, false)

    private:
        // lut maps each of the 256 possible 8-neighbor patterns to a compact
        // id; patterns with more than maxTransitions 0<->1 transitions all
        // share the single `null` id.
        uchar lut[256];
        uchar null;

    public:
        /* Returns the number of 0->1 or 1->0 transitions in i */
        static int numTransitions(int i)
        {
            int transitions = 0;
            int curParity = i%2;
            // walk the 8 bits circularly (j%8 wraps bit 8 back to bit 0)
            for (int j=1; j<=8; j++) {
                int parity = (i>>(j%8)) % 2;
                if (parity != curParity) transitions++;
                curParity = parity;
            }
            return transitions;
        }

        // Smallest value reachable from i by 8-bit rotation; lets rotated
        // patterns share one id when rotationInvariant is set.
        static int rotationInvariantEquivalent(int i)
        {
            int min = std::numeric_limits<int>::max();
            for (int j=0; j<8; j++) {
                bool parity = i % 2;
                i = i >> 1;
                if (parity) i+=128;   // rotate the low bit into bit 7
                min = std::min(min, i);
            }
            return min;
        }

        // Builds the 256-entry pattern->id lookup table and uploads it to the
        // GPU.  (When rotationInvariant is set, lut[rie] is only read for
        // rie <= i, which the ascending loop has already assigned.)
        void init()
        {
            bool set[256];
            uchar uid = 0;
            for (int i=0; i<256; i++) {
                if (numTransitions(i) <= maxTransitions) {
                    int id;
                    if (rotationInvariant) {
                        int rie = rotationInvariantEquivalent(i);
                        if (i == rie) id = uid++;
                        else id = lut[rie];
                    } else id = uid++;
                    lut[i] = id;
                    set[i] = true;
                } else {
                    set[i] = false;
                }
            }

            null = uid;
            for (int i=0; i<256; i++)
                if (!set[i])
                    lut[i] = null; // Set to null id

            // copy lut over to the GPU
            cuda::lbp::initializeWrapper(lut);

            std::cout << "Initialized CUDALBP" << std::endl;
        }

        // Runs the LBP kernel on a GPU-resident image.  src.m() holds the
        // 4-pointer GPU metadata block ([0] device data, [1] rows*, [2] cols*,
        // [3] type*); the metadata pointers are shared with dst.
        // NOTE(review): `type` is read but not validated, and the radius
        // property is not forwarded to the kernel (which hard-codes radius 1)
        // — confirm both are intended.
        void project(const Template &src, Template &dst) const
        {
            void* const* srcDataPtr = src.m().ptr<void*>();
            int rows = *((int*)srcDataPtr[1]);
            int cols = *((int*)srcDataPtr[2]);
            int type = *((int*)srcDataPtr[3]);

            Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
            void** dstDataPtr = dstMat.ptr<void*>();
            dstDataPtr[1] = srcDataPtr[1];
            dstDataPtr[2] = srcDataPtr[2];
            dstDataPtr[3] = srcDataPtr[3];

            // launches the kernel and frees the source device buffer
            cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
            dst = dstMat;
        }
};
| 137 | + | |
| 138 | +BR_REGISTER(Transform, CUDALBPTransform) | |
| 139 | + | |
| 140 | +} | |
| 141 | + | |
| 142 | +#include "cuda/cudalbp.moc" | ... | ... |
openbr/plugins/cuda/cudalbp.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | +#include <iostream> | |
| 17 | + | |
| 18 | +using namespace std; | |
| 19 | + | |
| 20 | +#include <opencv2/gpu/gpu.hpp> | |
| 21 | +#include <stdio.h> | |
| 22 | + | |
| 23 | +#include "cudadefines.hpp" | |
| 24 | + | |
| 25 | +using namespace cv; | |
| 26 | +using namespace cv::gpu; | |
| 27 | + | |
| 28 | +/* | |
| 29 | + * These are the CUDA functions for CUDALBP. See cudapca.cpp for more details | |
| 30 | + */ | |
| 31 | + | |
| 32 | +namespace br { namespace cuda { namespace lbp { | |
| 33 | + uint8_t* lut; | |
| 34 | + | |
| 35 | + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) { | |
| 36 | + return (srcPtr + row*cols)[col]; | |
| 37 | + } | |
| 38 | + | |
  // Computes the 8-neighbor (radius 1) local binary pattern for each pixel
  // and maps it through the id lookup table.  Border pixels are written as 0.
  // NOTE(review): radius is fixed at 1 here even though the CPU-side plugin
  // exposes a radius property — confirm the property is meant to be ignored.
  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
  {
    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
    int radius = 1;

    int index = rowInd*cols + colInd;

    // threads fully outside the image do nothing; pixels on the 1-pixel
    // border cannot read all 8 neighbors, so their output is zeroed instead
    if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
      if (rowInd >= rows || colInd >= cols) {
        return;
      } else {
        dstPtr[index] = 0;
        return;
      }
    }

    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols); // center pixel value
    // Build the 8-bit pattern: each neighbor >= center contributes one bit,
    // starting at the top-left neighbor (bit 128) and proceeding clockwise,
    // then translate the pattern through the lookup table.
    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];

    // store calculated value away in the right place
    dstPtr[index] = val;
  }
| 70 | + | |
| 71 | + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step) | |
| 72 | + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) | |
| 73 | + { | |
| 74 | + cudaError_t err; | |
| 75 | + | |
| 76 | + // make 8 * 8 = 64 square block | |
| 77 | + dim3 threadsPerBlock(8, 8); | |
| 78 | + dim3 numBlocks(cols/threadsPerBlock.x + 1, | |
| 79 | + rows/threadsPerBlock.y + 1); | |
| 80 | + | |
| 81 | + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); | |
| 82 | + lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut); | |
| 83 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 84 | + | |
| 85 | + CUDA_SAFE_FREE(srcPtr, &err); | |
| 86 | + } | |
| 87 | + | |
| 88 | + void initializeWrapper(uint8_t* cpuLut) { | |
| 89 | + cudaError_t err; | |
| 90 | + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err); | |
| 91 | + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err); | |
| 92 | + } | |
| 93 | +}}} | ... | ... |
openbr/plugins/cuda/cudapca.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | +#include <unistd.h> | |
| 20 | + | |
| 21 | +#include <QList> | |
| 22 | + | |
| 23 | +#include <Eigen/Dense> | |
| 24 | + | |
| 25 | +#include <opencv2/opencv.hpp> | |
| 26 | +using namespace cv; | |
| 27 | + | |
| 28 | +#include <openbr/plugins/openbr_internal.h> | |
| 29 | +#include <openbr/core/common.h> | |
| 30 | +#include <openbr/core/eigenutils.h> | |
| 31 | +#include <openbr/core/opencvutils.h> | |
| 32 | + | |
| 33 | +// definitions from the CUDA source file | |
| 34 | +namespace br { namespace cuda { namespace pca { | |
| 35 | + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems); | |
| 36 | + void trainWrapper(void* cudaSrc, float* dst, int rows, int cols); | |
| 37 | + void wrapper(void* src, void** dst, int imgRows, int imgCols); | |
| 38 | +}}} | |
| 39 | + | |
| 40 | +namespace br | |
| 41 | +{ | |
| 42 | +/*! | |
| 43 | + * \ingroup transforms | |
| 44 | + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin. | |
| 45 | + * \author Colin Heinzmann \cite DepthDeluxe | |
| 46 | + * | |
| 47 | + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95. | |
| 48 | + * \br_property int drop The number of leading eigen-dimensions to drop. | |
| 49 | + * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm) | |
| 50 | + */ | |
| 51 | +class CUDAPCATransform : public Transform | |
| 52 | +{ | |
| 53 | + Q_OBJECT | |
| 54 | + | |
| 55 | +protected: | |
| 56 | + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false) | |
| 57 | + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false) | |
| 58 | + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false) | |
| 59 | + | |
| 60 | + BR_PROPERTY(float, keep, 0.95) | |
| 61 | + BR_PROPERTY(int, drop, 0) | |
| 62 | + BR_PROPERTY(bool, whiten, false) | |
| 63 | + | |
| 64 | + Eigen::VectorXf mean, eVals; | |
| 65 | + Eigen::MatrixXf eVecs; | |
| 66 | + | |
| 67 | + int originalRows; | |
| 68 | + | |
| 69 | +public: | |
| 70 | + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {} | |
| 71 | + | |
| 72 | +private: | |
    // Residual reconstruction error of src under the learned subspace:
    // ||src - mean||^2 minus the energy retained by the projection.
    // NOTE(review): maps proj using `keep` (a float) as the element count,
    // and reads src through a host-side float pointer even though project()
    // expects the GPU metadata format — confirm this is only called with
    // CPU-format templates and keep >= 1 (a dimension count).
    double residualReconstructionError(const Template &src) const
    {
        Template proj;
        project(src, proj);

        Eigen::Map<const Eigen::VectorXf> srcMap(src.m().ptr<float>(), src.m().rows*src.m().cols);
        Eigen::Map<Eigen::VectorXf> projMap(proj.m().ptr<float>(), keep);

        return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
    }
| 83 | + | |
    // Trains PCA on the CPU: downloads every GPU-resident training image into
    // a host Mat via trainWrapper, packs them column-wise into a
    // double-precision Eigen matrix, and hands off to trainCore().
    void train(const TemplateList &cudaTrainingSet)
    {
        // copy the data back from the graphics card so the training can be done on the CPU
        const int instances = cudaTrainingSet.size(); // get the number of training set instances
        QList<Template> trainingQlist;
        for(int i=0; i<instances; i++) {
            Template currentTemplate = cudaTrainingSet[i];
            // unpack the 4-pointer GPU metadata block:
            // [0] device data, [1] rows*, [2] cols*, [3] type*
            void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
            void* cudaMemPtr = srcDataPtr[0];
            int rows = *((int*)srcDataPtr[1]);
            int cols = *((int*)srcDataPtr[2]);
            int type = *((int*)srcDataPtr[3]);

            if (type != CV_32FC1) {
                qFatal("Requires single channel 32-bit floating point matrices.");
            }

            // download the device image into a host-side Mat
            Mat mat = Mat(rows, cols, type);
            br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
            trainingQlist.append(Template(mat));
        }

        // assemble a TemplateList from the list of data
        TemplateList trainingSet(trainingQlist);


        originalRows = trainingSet.first().m().rows; // get number of rows of first image
        int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // pixels per image

        // Map into 64-bit Eigen matrix, one flattened image per column
        Eigen::MatrixXd data(dimsIn, instances); // create a mat
        for (int i=0; i<instances; i++) {
            data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
        }

        trainCore(data);
    }
| 121 | + | |
    // Project one device-resident image onto the trained eigenbasis on the
    // GPU. src and dst use the CUDA descriptor format: [0]=device data
    // pointer, [1]=&rows, [2]=&cols, [3]=&type.
    void project(const Template &src, Template &dst) const
    {
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*)srcDataPtr[1]);
        int cols = *((int*)srcDataPtr[2]);
        int type = *((int*)srcDataPtr[3]);

        if (type != CV_32FC1) {
            cout << "ERR: Invalid image type" << endl;
            throw 0;
        }

        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        // NOTE(review): the metadata slots are shared (aliased) with the
        // source descriptor, so the writes below also clobber src's
        // rows/cols metadata — confirm nothing reuses src after this point.
        // The projected output is a 1 x keep row vector.
        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
        // NOTE(review): `keep` is a float truncated to int here; assumes
        // trainCore()/load() already resolved it to an eigenvector count.
        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
        dstDataPtr[3] = srcDataPtr[3];

        // Frees src's device buffer and stores a new device allocation of
        // keep floats into dstDataPtr[0].
        cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);

        dst = dstMat;
    }
| 144 | + | |
    // Serialize the trained PCA model. Field order must match load().
    void store(QDataStream &stream) const
    {
        stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
    }
| 149 | + | |
| 150 | + void load(QDataStream &stream) | |
| 151 | + { | |
| 152 | + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; | |
| 153 | + | |
| 154 | + // serialize the eigenvectors | |
| 155 | + float* evBuffer = new float[eVecs.rows() * eVecs.cols()]; | |
| 156 | + for (int i=0; i < eVecs.rows(); i++) { | |
| 157 | + for (int j=0; j < eVecs.cols(); j++) { | |
| 158 | + evBuffer[i*eVecs.cols() + j] = eVecs(i, j); | |
| 159 | + } | |
| 160 | + } | |
| 161 | + | |
| 162 | + // serialize the mean | |
| 163 | + float* meanBuffer = new float[mean.rows() * mean.cols()]; | |
| 164 | + for (int i=0; i < mean.rows(); i++) { | |
| 165 | + for (int j=0; j < mean.cols(); j++) { | |
| 166 | + meanBuffer[i*mean.cols() + j] = mean(i, j); | |
| 167 | + } | |
| 168 | + } | |
| 169 | + | |
| 170 | + // call the wrapper function | |
| 171 | + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols()); | |
| 172 | + | |
| 173 | + delete evBuffer; | |
| 174 | + delete meanBuffer; | |
| 175 | + } | |
| 176 | + | |
protected:
    // Core CPU-side PCA: computes the training mean and the eigenvectors/
    // eigenvalues of the covariance, and resolves `keep` from an energy
    // fraction / sentinel value into a concrete kept-eigenvector count.
    //   data - dimsIn x instances matrix, one flattened image per column;
    //          modified in place (the mean is subtracted from each row).
    void trainCore(Eigen::MatrixXd data)
    {
        int dimsIn = data.rows();
        int instances = data.cols();
        // With fewer samples than pixels, eigendecompose the smaller
        // instances x instances matrix instead (snapshot method).
        const bool dominantEigenEstimation = (dimsIn > instances);

        Eigen::MatrixXd allEVals, allEVecs;
        if (keep != 0) {
            // Compute and remove mean
            mean = Eigen::VectorXf(dimsIn);
            for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
            for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);

            // Calculate covariance matrix
            Eigen::MatrixXd cov;
            if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
            else cov = data * data.transpose() / (instances-1.0);

            // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
            Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
            allEVals = eSolver.eigenvalues();
            allEVecs = eSolver.eigenvectors();
            // Map snapshot-space eigenvectors back into image space
            // (normalized to unit length further below).
            if (dominantEigenEstimation) allEVecs = data * allEVecs;
        } else {
            // Null case: identity basis, i.e. no dimensionality reduction.
            mean = Eigen::VectorXf::Zero(dimsIn);
            allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
            allEVals = Eigen::VectorXd::Ones(dimsIn);
        }

        // Resolve `keep` into an eigenvector count.
        if (keep <= 0) {
            keep = dimsIn - drop;
        } else if (keep < 1) {
            // Keep eigenvectors that retain a certain energy percentage.
            const double totalEnergy = allEVals.sum();
            if (totalEnergy == 0) {
                keep = 0;
            } else {
                double currentEnergy = 0;
                int i=0;
                // Walk eigenvalues from largest (stored last) to smallest.
                while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
                    currentEnergy += allEVals(allEVals.rows()-(i+1));
                    i++;
                }
                keep = i - drop;
            }
        } else {
            if (keep + drop > allEVals.rows()) {
                qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
                keep = allEVals.rows() - drop;
            }
        }

        // Keep highest energy vectors, unit-normalized, skipping the top
        // `drop`; optionally whitened by 1/sqrt(eigenvalue).
        eVals = Eigen::VectorXf((int)keep, 1);
        eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
        for (int i=0; i<keep; i++) {
            int index = allEVals.rows()-(i+drop+1);
            eVals(i) = allEVals(index);
            eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
            if (whiten) eVecs.col(i) /= sqrt(eVals(i));
        }

        // Debug output
        if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
    }
| 244 | + | |
    // Debug helper: reshapes the mean image and a sample of eigenvectors
    // back into images for inspection. The saveImage calls are commented
    // out, so as written this only fills local buffers.
    void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
    {
        // Recover the image width from the flattened length and known height.
        const int originalCols = mean.rows() / originalRows;

        { // Write out mean image
            cv::Mat out(originalRows, originalCols, CV_32FC1);
            Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);
            outMap = mean.col(0);
            // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");
        }

        // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.
        // (Eigenvalues are stored in increasing order, so low indices are
        // the low-energy vectors and the tail holds the high-energy ones.)
        for (int k=0; k<(int)allEVals.size(); k++) {
            if ((k < 8) || (k >= (int)allEVals.size()-16)) {
                cv::Mat out(originalRows, originalCols, CV_64FC1);
                Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);
                outMap = allEVecs.col(k);
                // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");
            }
        }
    }
| 266 | +}; | |
| 267 | + | |
| 268 | +BR_REGISTER(Transform, CUDAPCATransform) | |
| 269 | +} // namespace br | |
| 270 | + | |
| 271 | +#include "cuda/cudapca.moc" | ... | ... |
openbr/plugins/cuda/cudapca.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +using namespace std; | |
| 19 | + | |
| 20 | +#include <opencv2/opencv.hpp> | |
| 21 | +#include <opencv2/gpu/gpu.hpp> | |
| 22 | + | |
| 23 | +#include "cudadefines.hpp" | |
| 24 | + | |
| 25 | +using namespace cv; | |
| 26 | +using namespace cv::gpu; | |
| 27 | + | |
| 28 | +/* | |
| 29 | + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details | |
| 30 | + */ | |
| 31 | + | |
| 32 | +namespace br { namespace cuda { namespace pca { | |
| 33 | + __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) { | |
| 34 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 35 | + int stepIdx = blockIdx.y*blockDim.y+threadIdx.y; | |
| 36 | + | |
| 37 | + if (evIdx >= numEigenvectors || stepIdx >= numSteps) { | |
| 38 | + return; | |
| 39 | + } | |
| 40 | + | |
| 41 | + float acc = 0; | |
| 42 | + int startIdx = stepSize*stepIdx; | |
| 43 | + int stopIdx = startIdx+stepSize; | |
| 44 | + if (startIdx >= numPixels) { | |
| 45 | + return; | |
| 46 | + } | |
| 47 | + if (stopIdx >= numPixels) { | |
| 48 | + stopIdx = numPixels; | |
| 49 | + } | |
| 50 | + for(int i=startIdx; i < stopIdx; i++) { | |
| 51 | + acc += src[i]*evPtr[i*numEigenvectors + evIdx]; | |
| 52 | + } | |
| 53 | + | |
| 54 | + intermediaryBuffer[stepIdx*stepSize + evIdx] = acc; | |
| 55 | + } | |
| 56 | + | |
| 57 | + __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) { | |
| 58 | + int evIdx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 59 | + if (evIdx >= numEigenvectors) { | |
| 60 | + return; | |
| 61 | + } | |
| 62 | + | |
| 63 | + if (numSteps*stepSize+evIdx >= numEigenvectors) { | |
| 64 | + numSteps--; | |
| 65 | + } | |
| 66 | + | |
| 67 | + float acc = 0; | |
| 68 | + for (int i=0; i < numSteps; i++) { | |
| 69 | + int ibIdx = i*stepSize + evIdx; | |
| 70 | + acc += intermediaryBuffer[ibIdx]; | |
| 71 | + } | |
| 72 | + | |
| 73 | + out[evIdx] = acc; | |
| 74 | + } | |
| 75 | + | |
| 76 | + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) { | |
| 77 | + int idx = blockIdx.x*blockDim.x+threadIdx.x; | |
| 78 | + | |
| 79 | + // perform bound checking | |
| 80 | + if (idx >= numElems) { | |
| 81 | + return; | |
| 82 | + } | |
| 83 | + | |
| 84 | + // subtract out the mean | |
| 85 | + out[idx] -= mean[idx]; | |
| 86 | + } | |
| 87 | + | |
  // Device-resident PCA model state, populated once by initializeWrapper():
  //   cudaEvPtr   - eigenvector matrix on the GPU, pixel-major
  //                 (_evRows x _evCols: pixels x eigenvectors)
  //   _evRows     - the number of pixels in the trained images
  //   _evCols     - the number of eigenvectors
  //   cudaMeanPtr - mean image on the GPU
  //   _meanElems  - the number of pixels in an image
  //   _stepSize   - the number of pixels handled by a single multiply step
  //   _numSteps   - the number of steps required to cover an image
  //   intermediaryBuffer - per-step partial dot products used between
  //                 multiplyKernel and multiplyJoinKernel
  //                 (_numSteps*_stepSize floats)
  float* cudaEvPtr; int _evRows; int _evCols;
  float* cudaMeanPtr; int _meanElems;
  int _numSteps; int _stepSize;
  float* intermediaryBuffer;
| 97 | + | |
| 98 | + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) { | |
| 99 | + _evRows = evRows; _evCols = evCols; | |
| 100 | + _meanElems = meanElems; | |
| 101 | + | |
| 102 | + cudaError_t err; | |
| 103 | + | |
| 104 | + // copy the eigenvectors to the GPU | |
| 105 | + CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err); | |
| 106 | + CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 107 | + | |
| 108 | + // copy the mean to the GPU | |
| 109 | + CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err); | |
| 110 | + CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err); | |
| 111 | + | |
| 112 | + // initialize the intermediary working space, | |
| 113 | + _stepSize = 2048; | |
| 114 | + _numSteps = _evRows / _stepSize + 1; | |
| 115 | + CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err); | |
| 116 | + } | |
| 117 | + | |
| 118 | + void trainWrapper(void* cudaSrc, float* data, int rows, int cols) { | |
| 119 | + cudaError_t err; | |
| 120 | + CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err); | |
| 121 | + } | |
| 122 | + | |
| 123 | + void wrapper(void* src, void** dst, int imgRows, int imgCols) { | |
| 124 | + cudaError_t err; | |
| 125 | + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err); | |
| 126 | + | |
| 127 | + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) { | |
| 128 | + cout << "ERR: Image dimension mismatch!" << endl; | |
| 129 | + throw 0; | |
| 130 | + } | |
| 131 | + | |
| 132 | + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src) | |
| 133 | + int threadsPerBlock = 512; | |
| 134 | + int numBlocks = _meanElems / threadsPerBlock + 1; | |
| 135 | + subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems); | |
| 136 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 137 | + | |
| 138 | + // perform matrix multiplication | |
| 139 | + dim3 threadsPerBlock2d(512, 1); | |
| 140 | + dim3 numBlocks2d( | |
| 141 | + _evCols / threadsPerBlock2d.x + 1, | |
| 142 | + _numSteps / threadsPerBlock2d.y + 1); | |
| 143 | + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems); | |
| 144 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 145 | + | |
| 146 | + threadsPerBlock = 512; | |
| 147 | + numBlocks = _evCols / threadsPerBlock + 1; | |
| 148 | + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize); | |
| 149 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 150 | + | |
| 151 | + // free the src memory | |
| 152 | + CUDA_SAFE_FREE(src, &err); | |
| 153 | + } | |
| 154 | +}}} | ... | ... |
openbr/plugins/cuda/cudargb2grayscale.cpp
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <iostream> | |
| 18 | +#include <opencv2/imgproc/imgproc.hpp> | |
| 19 | + | |
| 20 | +#include <openbr/plugins/openbr_internal.h> | |
| 21 | +#include <openbr/core/opencvutils.h> | |
| 22 | +#include <thrust/host_vector.h> | |
| 23 | +#include <thrust/device_vector.h> | |
| 24 | + | |
| 25 | + | |
| 26 | +using namespace cv; | |
| 27 | + | |
| 28 | +// definitions from the CUDA source file | |
| 29 | +namespace br { namespace cuda { namespace rgb2grayscale { | |
| 30 | + void wrapper(void* srcPtr, void**dstPtr, int rows, int cols); | |
| 31 | +}}} | |
| 32 | + | |
| 33 | +namespace br | |
| 34 | +{ | |
| 35 | + | |
| 36 | +/*! | |
| 37 | + * \ingroup transforms | |
| 38 | + * \brief Converts 3-channel images to grayscale | |
| 39 | + * \author Li Li \cite booli | |
| 40 | + */ | |
| 41 | +class CUDARGB2GrayScaleTransform : public UntrainableTransform | |
| 42 | +{ | |
| 43 | + Q_OBJECT | |
| 44 | + | |
| 45 | +public: | |
| 46 | + | |
| 47 | +private: | |
| 48 | + void project(const Template &src, Template &dst) const | |
| 49 | + { | |
| 50 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 51 | + int rows = *((int*) srcDataPtr[1]); | |
| 52 | + int cols = *((int*) srcDataPtr[2]); | |
| 53 | + int type = *((int*) srcDataPtr[3]); | |
| 54 | + | |
| 55 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 56 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 57 | + dstDataPtr[1] = srcDataPtr[1]; | |
| 58 | + dstDataPtr[2] = srcDataPtr[2]; | |
| 59 | + dstDataPtr[3] = srcDataPtr[3]; | |
| 60 | + *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same | |
| 61 | + | |
| 62 | + cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols); | |
| 63 | + dst = dstMat; | |
| 64 | + } | |
| 65 | +}; | |
| 66 | + | |
| 67 | +BR_REGISTER(Transform, CUDARGB2GrayScaleTransform) | |
| 68 | + | |
| 69 | +} // namespace br | |
| 70 | + | |
| 71 | +#include "imgproc/cudargb2grayscale.moc" | ... | ... |
openbr/plugins/cuda/cudargb2grayscale.cu
0 → 100644
| 1 | +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * | |
| 2 | + * Copyright 2016 Li Li, Colin Heinzmann * | |
| 3 | + * * | |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); * | |
| 5 | + * you may not use this file except in compliance with the License. * | |
| 6 | + * You may obtain a copy of the License at * | |
| 7 | + * * | |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 * | |
| 9 | + * * | |
| 10 | + * Unless required by applicable law or agreed to in writing, software * | |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, * | |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * | |
| 13 | + * See the License for the specific language governing permissions and * | |
| 14 | + * limitations under the License. * | |
| 15 | + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ | |
| 16 | + | |
| 17 | +#include <stdio.h> | |
| 18 | +#include <iostream> | |
| 19 | +#include <opencv2/opencv.hpp> | |
| 20 | +#include <opencv2/gpu/gpu.hpp> | |
| 21 | +#include "cudadefines.hpp" | |
| 22 | +#include <opencv2/imgproc/imgproc.hpp> | |
| 23 | +#include <opencv2/imgproc/imgproc_c.h> | |
| 24 | +#include <opencv2/highgui/highgui.hpp> | |
| 25 | +#include <opencv2/highgui/highgui_c.h> | |
| 26 | + | |
| 27 | +using namespace std; | |
| 28 | +using namespace cv; | |
| 29 | +using namespace cv::gpu; | |
| 30 | + | |
| 31 | +namespace br { namespace cuda { namespace rgb2grayscale { | |
| 32 | + | |
| 33 | + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols) | |
| 34 | + { | |
| 35 | + int rowInd = blockIdx.y*blockDim.y+threadIdx.y; | |
| 36 | + int colInd = blockIdx.x*blockDim.x+threadIdx.x; | |
| 37 | + int index = rowInd*cols + colInd; | |
| 38 | + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) { | |
| 39 | + return; | |
| 40 | + } | |
| 41 | + int new_index = 3 * index; | |
| 42 | + float g = (float) srcPtr[new_index]; | |
| 43 | + float b = (float) srcPtr[new_index+1]; | |
| 44 | + float r = (float) srcPtr[new_index+2]; | |
| 45 | + | |
| 46 | + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r); | |
| 47 | + return; | |
| 48 | + } | |
| 49 | + | |
| 50 | + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) | |
| 51 | + { | |
| 52 | + cudaError_t err; | |
| 53 | + dim3 threadsPerBlock(9, 9); | |
| 54 | + dim3 numBlocks(cols/threadsPerBlock.x + 1, | |
| 55 | + rows/threadsPerBlock.y + 1); | |
| 56 | + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err); | |
| 57 | + | |
| 58 | + kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols); | |
| 59 | + CUDA_KERNEL_ERR_CHK(&err); | |
| 60 | + CUDA_SAFE_FREE(srcPtr, &err); | |
| 61 | + } | |
| 62 | + | |
| 63 | +}}} | ... | ... |
openbr/plugins/cuda/module.cmake
0 → 100644
# add WITH_CUDA option (off by default; CUDA plugins are opt-in)
option(BR_WITH_CUDA "Build CUDA-accelerated plugins." OFF)
# Target GPU architecture forwarded to nvcc via -arch (e.g. sm_20, sm_30).
set(BR_CUDA_ARCH "sm_20" CACHE STRING "CUDA Architecture")

# only build this module if explicitly OK'ed
if(BR_WITH_CUDA)
  message(STATUS "Building with CUDA Support")
  find_package(CUDA REQUIRED)

  set(CUDA_SRC_DIR ${PROJECT_SOURCE_DIR}/openbr/plugins/cuda)

  # configure the compiler, need -fPIC for shared library
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=${BR_CUDA_ARCH} --compiler-options -fPIC)
  include_directories(${CUDA_INCLUDE_DIRS})

  # glob sources: .cu files are compiled by nvcc, .cpp files by the host compiler
  file(GLOB CUDA_CU_SRC ${CUDA_SRC_DIR}/*.cu)
  file(GLOB CUDA_CPP_SRC ${CUDA_SRC_DIR}/*.cpp)

  # compile each of the object files and append to CUDA_CU_OBJ
  foreach(FILE ${CUDA_CU_SRC})
    cuda_compile(FILE_O ${FILE})
    set(CUDA_CU_OBJ ${CUDA_CU_OBJ} ${FILE_O})
  endforeach()

  # ensure add_library knows these are external object file
  set_source_files_properties(${CUDA_CU_OBJ} PROPERTIES EXTERNAL_OBJECT true)

  # add the compiled source and libs into the build system
  set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
  set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES})

endif()
openbr/plugins/plugins.cmake
| ... | ... | @@ -12,7 +12,15 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${BR_THIRDPARTY_PLUGINS_DIR}) |
| 12 | 12 | file(GLOB SUBFILES plugins/*) |
| 13 | 13 | foreach(FILE ${SUBFILES}) |
| 14 | 14 | if(IS_DIRECTORY ${FILE}) |
| 15 | - set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE}) | |
| 15 | + # check to see if there is a cmake file for the folder, if there is, then that | |
| 16 | + # file should be used to build the contents of the directory | |
| 17 | + if (EXISTS ${FILE}/module.cmake) | |
| 18 | + message(STATUS "importing ${FILE}/module.cmake") | |
| 19 | + include(${FILE}/module.cmake) | |
| 20 | + else() | |
| 21 | + message(STATUS "adding ${FILE}") | |
| 22 | + set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE}) | |
| 23 | + endif() | |
| 16 | 24 | endif() |
| 17 | 25 | endforeach() |
| 18 | 26 | set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} plugins/) # Remove this when finished with reorg | ... | ... |