Commit 53d01e939b2fc292f9a1143cfa24e62d007fc27f

Authored by Josh Klontz
2 parents 2849f3b9 552e99f9

Merge pull request #464 from CGLG/master

CUDA-accelerated plugin support
openbr/plugins/cuda/README.md 0 → 100644
  1 +# CUDA Plugins
  2 +CUDA plugins are very similar to normal plugins. A single plugin is split into
  3 +two files: the `.cpp` file with the BR standard plugin definition and the `.cu`
  4 +file with your kernel and wrapper functions.
  5 +
  6 +## The `.cpp` file
  7 +Every main plugin file must have the names of the kernel wrapper functions
  8 +defined at the top of the program. Once the definitions are there, just call
  9 +the CUDA functions as you need them
  10 +
  11 +## The `.cu` file
  12 +All functions within the CUDA file must be declared inside their own namespace
  13 +under `br::cuda`. For example the plugin `passthrough` must have all functions
  14 +inside it declared under the namespace `br::cuda::passthrough`.
  15 +
  16 +## CPU Template object format
  17 +Like any other BR Transform, the plugin must return an object for the next
  18 +plugin to consume. For performance reasons, we don't copy data to and from
  19 +the graphics card for every transform. Instead, we use this space to transfer
  20 +data about how to access the image data and its type. The Mat is an array of data type `void*`.
  21 +
  22 +Index | Item Name | Type | Description
  23 +--------|-------------|-----------|------------
  24 +0 | GpuData | void* | Pointer to the graphics card data
  25 +1 | rows | int | Number of rows in the Mat
  26 +2 | cols | int | Number of columns in the Mat
  27 +3 | type | int | OpenCV mat data type code (i.e. `mat.type()`)
  28 +
  29 +It is expected that the wrapper function does the proper GPU memory handling
  30 +to make sure that the GpuData pointer in the output mat is pointing to the
  31 +data that the plugin is outputting.
  32 +
  33 +## Example: Passthrough
  34 +This example plugin takes in input data and passes it straight to the output.
  35 +The BR transform calls the wrapper function which exists in the CUDA file which
  36 +in turn calls the kernel routine to copy the data in the GPU.
  37 +
  38 +**Note**: This program assumes that a previous Transform, namely `CUDACopyTo` has
  39 +copied the data to the GPU.
  40 +
  41 +### **passthrough.cpp**
  42 +```c++
  43 +#include <openbr/plugins/openbr_internal.h>
  44 +#include <opencv2/opencv.hpp>
  45 +
  46 +// wrapper function within the CUDA file
  47 +namespace br { namespace cuda { namespace passthrough {
  48 + void wrapper(void* srcGpuData, void** dstGpuData, int rows, int cols);
  49 +}}}
  50 +
  51 +#include <iostream>
  52 +namespace br
  53 +{
  54 + class CUDAPassthroughTransform : public UntrainableTransform
  55 + {
  56 + Q_OBJECT
  57 +
  58 + void project(const Template &src, Template &dst) {
  59 + // extract the parameters out of the Mat passed from the previous plugin
  60 + void* const* srcDataPtr = src.m().ptr<void*>();
  61 + int rows = *((int*)srcDataPtr[1]);
  62 + int cols = *((int*)srcDataPtr[2]);
  63 + int type = *((int*)srcDataPtr[3]);
  64 +
  65 + // generate a new Mat to be passed to the next plugin
  66 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  67 + void** dstDataPtr = dstMat.ptr<void*>();
  68 + dstDataPtr[1] = srcDataPtr[1];
  69 + dstDataPtr[2] = srcDataPtr[2];
  70 + dstDataPtr[3] = srcDataPtr[3];
  71 +
  72 + // call the wrapper and set the dst output to the newly created Mat
  73 + br::cuda::passthrough::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  74 + dst = dstMat;
  75 + }
  76 + };
  77 +
  78 + BR_REGISTER(Transform, CUDAPassthroughTransform);
  79 +}
  80 +
  81 +#include "cuda/passthrough.moc"
  82 +```
  83 +
  84 +### **passthrough.cu**
  85 +```c++
  86 +#include <opencv2/opencv.hpp>
  87 +
  88 +namespace br { namespace cuda { namespace passthrough {
  89 + __global__ void kernel(char* srcPtr, char* dstPtr, int rows, int cols) {
  90 + // get the current index
  91 + int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
  92 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  93 +
  94 + // don't do anything if we are outside the allowable positions
  95 + if (rowInd >= rows || colInd >= cols)
  96 + return;
  97 +
  98 + // write the input to the output
  99 + dstPtr[rowInd*cols + colInd] = srcPtr[rowInd*cols + colInd];
  100 + }
  101 +
  102 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols) {
  103 + // allocate GPU memory to hold the output image
  104 + cudaMalloc(dstPtr, rows*cols*sizeof(char));
  105 +
  106 + dim3 threadsPerBlock(8, 8);
  107 + dim3 numBlocks(cols / threadsPerBlock.x + 1,
  108 + rows / threadsPerBlock.y + 1);
  109 +
  110 + // run the kernel function
  111 + kernel<<<numBlocks, threadsPerBlock>>>((char*)srcPtr, (char*)*dstPtr, rows, cols);
  112 +
  113 + // free the source memory as it isn't used anymore
  114 + cudaFree(srcPtr);
  115 + }
  116 +}}}
  122 +```
... ...
openbr/plugins/cuda/copyfrom.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +#include <opencv2/opencv.hpp>
  20 +
  21 +#include <openbr/plugins/openbr_internal.h>
  22 +
  23 +using namespace std;
  24 +
  25 +using namespace cv;
  26 +
  27 +// CUDA functions for this plugin
  28 +namespace br { namespace cuda { namespace copyfrom {
  29 + template <typename T> void wrapper(void* src, T* out, int rows, int cols);
  30 +}}}
  31 +
  32 +namespace br
  33 +{
  34 + /*!
  35 + * \ingroup transforms
  36 + * \brief Copies a transform from the GPU to the CPU.
  37 + * \author Colin Heinzmann \cite DepthDeluxe
  38 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  39 + */
  40 + class CUDACopyFrom : public UntrainableTransform
  41 + {
  42 + Q_OBJECT
  43 +
  44 +private:
  45 + void project(const Template &src, Template &dst) const
  46 + {
  47 + // pull the data back out of the Mat
  48 + void* const* dataPtr = src.m().ptr<void*>();
  49 + int rows = *((int*)dataPtr[1]);
  50 + int cols = *((int*)dataPtr[2]);
  51 + int type = *((int*)dataPtr[3]);
  52 +
  53 + Mat dstMat = Mat(rows, cols, type);
  54 + switch(type) {
  55 + case CV_32FC1:
  56 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
  57 + break;
  58 + case CV_8UC1:
  59 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
  60 + break;
  61 + case CV_8UC3:
  62 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
  63 + break;
  64 + default:
  65 + cout << "ERR: Invalid image type (" << type << ")" << endl;
  66 + break;
  67 + }
  68 + dst = dstMat;
  69 + }
  70 + };
  71 +
  72 + BR_REGISTER(Transform, CUDACopyFrom);
  73 +}
  74 +
  75 +#include "cuda/copyfrom.moc"
... ...
openbr/plugins/cuda/copyfrom.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include "cudadefines.hpp"
  18 +
namespace br { namespace cuda { namespace copyfrom {
  // Copies a GPU image buffer back to host memory, then releases the device
  // buffer.
  //   src        - device pointer holding rows*cols elements of type T
  //   dst        - pre-allocated host destination (e.g. a cv::Mat's data)
  //   rows, cols - image dimensions; callers fold the channel count into
  //                cols for multi-channel images
  // NOTE(review): frees `src` after the copy, so the device buffer must not
  // be reused by any later transform.
  template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
    cudaError_t err;
    CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
    CUDA_SAFE_FREE(src, &err);
  }

  // explicit instantiations for the types used by the CPU-side plugin
  template void wrapper(void*, float*, int, int);
  template void wrapper(void*, unsigned char*, int, int);
}}}
... ...
openbr/plugins/cuda/copyto.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +#include <opencv2/opencv.hpp>
  20 +
  21 +#include <openbr/plugins/openbr_internal.h>
  22 +
  23 +using namespace std;
  24 +
  25 +using namespace cv;
  26 +
  27 +// definitions from the CUDA source file
  28 +namespace br { namespace cuda { namespace copyto {
  29 + template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols);
  30 +}}}
  31 +
  32 +namespace br
  33 +{
  34 +
  35 + /*!
  36 + * \ingroup transforms
  37 + * \brief Copies a transform to the GPU.
  38 + * \author Colin Heinzmann \cite DepthDeluxe
  39 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  40 + */
  41 + class CUDACopyTo : public UntrainableTransform
  42 + {
  43 + Q_OBJECT
  44 +
  45 +private:
  46 + void project(const Template &src, Template &dst) const
  47 + {
  48 + const Mat& srcMat = src.m();
  49 + const int rows = srcMat.rows;
  50 + const int cols = srcMat.cols;
  51 +
  52 + // output will be a single pointer to graphics card memory
  53 + Mat dstMat = Mat(4, 1, DataType<void*>::type);
  54 + void** dstMatData = dstMat.ptr<void*>();
  55 +
  56 + // save cuda ptr, rows, cols, then type
  57 + dstMatData[1] = new int; *((int*)dstMatData[1]) = rows;
  58 + dstMatData[2] = new int; *((int*)dstMatData[2]) = cols;
  59 + dstMatData[3] = new int; *((int*)dstMatData[3]) = srcMat.type();
  60 +
  61 + void* cudaMemPtr;
  62 + switch(srcMat.type()) {
  63 + case CV_32FC1:
  64 + cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
  65 + break;
  66 + case CV_8UC1:
  67 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
  68 + break;
  69 + case CV_8UC3:
  70 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
  71 + break;
  72 + default:
  73 + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl;
  74 + return;
  75 + }
  76 +
  77 + dst = dstMat;
  78 + }
  79 + };
  80 +
  81 + BR_REGISTER(Transform, CUDACopyTo);
  82 +}
  83 +
  84 +#include "cuda/copyto.moc"
... ...
openbr/plugins/cuda/copyto.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include "cudadefines.hpp"
  18 +
namespace br { namespace cuda { namespace copyto {

  // Copies a host image buffer into newly allocated GPU memory.
  //   in         - host source buffer of rows*cols elements of type T
  //   out        - receives the freshly cudaMalloc'd device pointer
  //   rows, cols - image dimensions; callers fold the channel count into
  //                cols for multi-channel images
  // NOTE(review): the device allocation is owned by whichever downstream
  // transform eventually frees it (e.g. copyfrom's wrapper).
  template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
    cudaError_t err;
    CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
    CUDA_SAFE_MEMCPY(*out, in, rows*cols*sizeof(T), cudaMemcpyHostToDevice, &err);
  }

  // explicit instantiations for the types used by the CPU-side plugin
  template void wrapper(const float* in, void** out, const int rows, const int cols);
  template void wrapper(const unsigned char* in, void** out, const int rows, const int cols);

}}}
... ...
openbr/plugins/cuda/cudaaffine.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Greg Shrock, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +
  18 +
  19 +#include <iostream>
  20 +using namespace std;
  21 +
  22 +#include <sys/types.h>
  23 +#include <unistd.h>
  24 +
  25 +#include <pthread.h>
  26 +
  27 +#include <opencv2/opencv.hpp>
  28 +#include <opencv2/imgproc/imgproc.hpp>
  29 +#include <opencv2/imgproc/imgproc_c.h>
  30 +#include <opencv2/highgui/highgui.hpp>
  31 +#include <opencv2/highgui/highgui_c.h>
  32 +#include <limits>
  33 +
  34 +#include <openbr/plugins/openbr_internal.h>
  35 +#include <openbr/core/opencvutils.h>
  36 +
  37 +using namespace cv;
  38 +
  39 +// definitions from the CUDA source file
  40 +namespace br { namespace cuda { namespace affine {
  41 + void resizeWrapper(void* srcPtr, void** dstPtr, int src_rows, int src_cols, int dst_rows, int dst_cols);
  42 + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
  43 +}}}
  44 +
  45 +namespace br
  46 +{
  47 +
 /*!
  * \ingroup transforms
  * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation. Only supports single-point input bilinear transformation.
  * \author Greg Schrock \cite gls022
  * \author Colin Heinzmann \cite DepthDeluxe
  * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargement, Bilin for fast enlargement.
  * \note Expects the incoming Mat to be CUDA metadata ([0]=device ptr,
  *       [1]=rows, [2]=cols, [3]=type) as produced by CUDACopyTo; only
  *       CV_8UC1 images are accepted.
  */
 class CUDAAffineTransform : public UntrainableTransform
 {
     Q_OBJECT

     private:
     // output size and normalized registration-point placement
     Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
     Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
     Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
     Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
     BR_PROPERTY(int, width, 64)
     BR_PROPERTY(int, height, 64)
     BR_PROPERTY(float, x1, 0)
     BR_PROPERTY(float, y1, 0)

     // Completes a triangle from two points: returns b rotated +90 degrees
     // about a, giving a third non-collinear point for getAffineTransform.
     static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
     {
         float dx = b.x - a.x;
         float dy = b.y - a.y;
         return Point2f(a.x - dy, a.y + dx);
     }

     void project(const Template &src, Template &dst) const
     {
         // destination registration points derived from the output geometry
         Point2f dstPoints[3];
         dstPoints[0] = Point2f(x1*width, y1*height);
         dstPoints[1] = Point2f((1-x1)*width, (1-y1)*height);
         dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);

         Point2f srcPoints[3];
         // NOTE(review): Affine_2 is required to be present but is never
         // read below -- confirm whether that check is intentional
         if (src.file.contains("Affine_0") &&
             src.file.contains("Affine_1") &&
             src.file.contains("Affine_2")) {
             srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
             srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
         } else {
             const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());

             if (landmarks.size() < 2) {
                 // Fewer than two landmarks: fall back to a plain GPU resize
                 // to the requested width x height.
                 void* const* srcDataPtr = src.m().ptr<void*>();
                 int rows = *((int*)srcDataPtr[1]);
                 int cols = *((int*)srcDataPtr[2]);
                 int type = *((int*)srcDataPtr[3]);

                 if (type != CV_8UC1) {
                     cout << "ERR: Invalid image format!" << endl;
                     return;
                 }

                 Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
                 void** dstDataPtr = dstMat.ptr<void*>();

                 // NOTE(review): the rows/cols slots are SHARED with src and
                 // then overwritten, so this also rewrites the source
                 // Template's metadata in place -- confirm nothing re-reads
                 // src after this transform
                 dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
                 dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
                 dstDataPtr[3] = srcDataPtr[3];

                 cuda::affine::resizeWrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, height, width);
                 dst = dstMat;
                 return;
             } else {
                 srcPoints[0] = landmarks[0];
                 srcPoints[1] = landmarks[1];
             }
         }
         srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);

         // 2x3 affine map srcPoints -> dstPoints, computed on the CPU
         Mat affineTransform = getAffineTransform(srcPoints, dstPoints);

         // unpack the GPU metadata: [0]=device ptr, [1]=rows, [2]=cols, [3]=type
         void* const* srcDataPtr = src.m().ptr<void*>();
         int rows = *((int*)srcDataPtr[1]);
         int cols = *((int*)srcDataPtr[2]);
         int type = *((int*)srcDataPtr[3]);

         if (type != CV_8UC1) {
             cout << "ERR: Invalid image format!" << endl;
             return;
         }


         Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
         void** dstDataPtr = dstMat.ptr<void*>();

         // NOTE(review): same shared-slot overwrite as in the resize branch
         dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
         dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
         dstDataPtr[3] = srcDataPtr[3];

         cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);

         dst = dstMat;
     }
 };

 BR_REGISTER(Transform, CUDAAffineTransform)
  147 +
  148 +} // namespace br
  149 +
  150 +#include "cuda/cudaaffine.moc"
... ...
openbr/plugins/cuda/cudaaffine.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +
  21 +#include <opencv2/gpu/gpu.hpp>
  22 +#include <opencv2/opencv.hpp>
  23 +#include <stdio.h>
  24 +#include <math.h>
  25 +
  26 +#include "cudadefines.hpp"
  27 +
  28 +using namespace cv;
  29 +using namespace cv::gpu;
  30 +
  31 +namespace br { namespace cuda { namespace affine {
  32 +
  33 + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  34 + return (srcPtr + row*cols)[col];
  35 + }
  36 +
  37 +
  // Samples the image at a fractional (row, col) position using standard
  // bilinear interpolation over the four surrounding pixels.
  __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
    // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf
    // Bilinear Transformation
    // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy)

    // integer pixel coordinates bracketing the sample point
    int row1 = floor(row);
    int row2 = row1+1;

    int col1 = floor(col);
    int col2 = col1+1;

    // fractional offsets within the 2x2 cell
    double d_row = row - row1;
    double d_col = col - col1;

    // NOTE(review): no bounds check -- callers must guarantee that all four
    // bracketing pixels lie inside the image
    int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
    int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
    int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
    int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);

    // blend the four neighbors by their relative proximity
    double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col));
    return ((uint8_t) round(val));
  }
  60 +
  // Samples the image at a fractional (row, col) position by blending the
  // four surrounding pixels with distance-derived weights.
  // NOTE(review): the weights are d_i/sum, i.e. PROPORTIONAL to distance, so
  // farther neighbors receive larger weights -- verify this is intended.
  // Also appears unreferenced by the kernels in this file.
  __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
    // bracketing integer coordinates
    int row1 = floor(row);
    int row2 = row1+1;

    int col1 = floor(col);
    int col2 = col1+1;

    // squared fractional offsets used in the distance terms below
    double m1 = row2 - row;
    double m12 = m1*m1;

    double m2 = col - col1;
    double m22 = m2*m2;

    // distances from the sample point to each of the four neighbors
    double d1 = sqrt(m12 - 2*m1 + 1 + m22);
    double d2 = sqrt(m12 + m22);
    double d3 = sqrt(m12 - 2*m1 + 1 + m22 - 2*m2 + 1);
    double d4 = sqrt(m12 + m22 - 2*m2 + 1);
    double sum = d1 + d2 + d3 + d4;

    double w1 = d1/sum;
    double w2 = d2/sum;
    double w3 = d3/sum;
    double w4 = d4/sum;

    uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
    uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
    uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
    uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);

    return round(w1*v1 + w2*v2 + w3*v3 + w4*v4);
  }
  92 +
  /*
   * Maps a destination pixel back to its source location by applying the
   * inverse affine transform.
   *
   * trans_inv - A pointer to a one-dimensional representation of the inverse of the transform matrix 3x3
   * dst_row - The destination row (mapping to this row)
   * dst_col - The destination column (mapping to this column)
   * src_row - The computed source pixel row (mapping from this row)
   * src_col - The computed source pixel column (mapping from this col)
   */
  __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
    // row-vector convention: [x, y, 1] = [u, v, 1] * M, so elements
    // 0/3/6 feed the column coordinate and 1/4/7 feed the row coordinate
    *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6];
    *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
  }
  104 +
  // Resizes srcPtr (srcRows x srcCols) into dstPtr (dstRows x dstCols) using
  // bilinear interpolation; one thread per destination pixel. Pixels whose
  // 2x2 source neighborhood would fall outside the image are written as 0.
  __global__ void bilinearKernel(uint8_t* srcPtr, uint8_t* dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
    int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
    int dstIndex = dstRowInd*dstCols+dstColInd;

    // destination boundary checking
    if (dstRowInd >= dstRows || dstColInd >= dstCols) {
      return;
    }

    // get the reference indices and relative amounts
    float exactSrcRowInd = (float)dstRowInd / (float)dstRows * (float)srcRows;
    int minSrcRowInd = (int)exactSrcRowInd;
    int maxSrcRowInd = minSrcRowInd+1;
    float relSrcRowInd = 1.-(exactSrcRowInd-(float)minSrcRowInd);

    // get the reference indices and relative amounts
    // NOTE(review): column math uses double while row math uses float --
    // presumably an unintentional asymmetry; confirm before unifying
    double exactSrcColInd = (double)dstColInd / (double)dstCols * (double)srcCols;
    int minSrcColInd = (int)exactSrcColInd;
    int maxSrcColInd = minSrcColInd+1;
    float relSrcColInd = 1.-(exactSrcColInd-(float)minSrcColInd);

    // perform boundary checking
    if (minSrcRowInd < 0 || maxSrcRowInd >= srcRows || minSrcColInd < 0 || maxSrcColInd >= srcCols) {
      dstPtr[dstIndex] = 0;
      return;
    }

    // get each of the pixel values
    float topLeft = srcPtr[minSrcRowInd*srcCols+minSrcColInd];
    float topRight = srcPtr[minSrcRowInd*srcCols+maxSrcColInd];
    float bottomLeft = srcPtr[maxSrcRowInd*srcCols+minSrcColInd];
    float bottomRight = srcPtr[maxSrcRowInd*srcCols+maxSrcColInd];

    // blend the four neighbors by their relative proximity
    float out = relSrcRowInd*relSrcColInd*topLeft + relSrcRowInd*(1.-relSrcColInd)*topRight + (1.-relSrcRowInd)*relSrcColInd*bottomLeft + (1.-relSrcRowInd)*(1.-relSrcColInd)*bottomRight;

    dstPtr[dstIndex] = (int)out;
  }
  143 +
  // Fills each destination pixel by mapping it through the inverse affine
  // transform back into the source image and sampling bilinearly. The
  // one-pixel border of the destination is forced to 0.
  __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
    int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
    int dstIndex = dstRowInd*dst_cols + dstColInd;

    double srcRowPnt;
    double srcColPnt;

    // don't do anything if the index is out of bounds
    if (dstRowInd >= dst_rows || dstColInd >= dst_cols) {
      return;
    }
    // zero the outermost border of the output
    if (dstRowInd == 0 || dstRowInd == dst_rows-1 || dstColInd ==0 || dstColInd == dst_cols-1) {
      dstPtr[dstIndex] = 0;
      return;
    }

    getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
    // NOTE(review): the computed source coordinate is not clamped to the
    // image bounds; a transform mapping outside the source will read out of
    // bounds -- confirm upstream guarantees
    const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value

    dstPtr[dstIndex] = cval;
  }
  166 +
  // Host-side entry point: allocates the GPU output buffer, launches
  // bilinearKernel to resize the image, then frees the source device buffer.
  //   srcPtr - device pointer to the source image (consumed and freed here)
  //   dstPtr - receives the freshly allocated device output pointer
  void resizeWrapper(void* srcPtr, void** dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
    // perform bilinear filtering

    // allocate space for destination
    cudaError_t err;
    CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err);

    // call the bilinear kernel function; +1 block covers the remainder when
    // the dimensions are not multiples of the block size
    dim3 threadsPerBlock(32, 16);
    dim3 numBlocks(dstCols/threadsPerBlock.x + 1,
                   dstRows/threadsPerBlock.y + 1);

    bilinearKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)*dstPtr, srcRows, srcCols, dstRows, dstCols);
    CUDA_KERNEL_ERR_CHK(&err);

    CUDA_SAFE_FREE(srcPtr, &err);
  }
  184 +
  // Host-side entry point: inverts the 2x3 affine transform on the CPU,
  // uploads the inverse to the GPU, and launches affineKernel. The source
  // device buffer and the uploaded matrix are freed afterwards.
  //   srcPtr - device pointer to the CV_8UC1 source image (consumed here)
  //   dstPtr - receives the freshly allocated device output pointer
  void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
    cudaError_t err;
    double* gpuInverse;

    // +1 block covers the remainder for non-multiple dimensions
    dim3 threadsPerBlock(32, 16);
    dim3 numBlocks(dst_cols/threadsPerBlock.x + 1,
                   dst_rows/threadsPerBlock.y + 1);

    //************************************************************************
    // Input affine is a 2x3 Mat whose transpose is used in the computations
    // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
    // See "Digital Image Warping" by George Wolburg (p. 50)
    //************************************************************************

    // get new transform elements
    double a11 = affineTransform.at<double>(0, 0);
    double a12 = affineTransform.at<double>(1, 0);
    double a21 = affineTransform.at<double>(0, 1);
    double a22 = affineTransform.at<double>(1, 1);
    double a31 = affineTransform.at<double>(0, 2);
    double a32 = affineTransform.at<double>(1, 2);

    // compute transform inverse
    // NOTE(review): `det` actually holds the RECIPROCAL of the determinant
    double det = 1 / (a11*a22 - a21*a12);

    double affineInverse[9];
    affineInverse[0] = a22 * det;
    affineInverse[1] = -a12 * det;
    affineInverse[2] = 0;
    affineInverse[3] = -a21 * det;
    affineInverse[4] = a11 * det;
    affineInverse[5] = 0;
    affineInverse[6] = (a21*a32 - a31*a22) * det;
    affineInverse[7] = (a31*a12 - a11*a32) * det;
    affineInverse[8] = (a11*a22 - a21*a12) * det; // == 1 by construction

    // device buffers: output image and the 3x3 inverse matrix
    CUDA_SAFE_MALLOC(dstPtr, dst_rows*dst_cols*sizeof(uint8_t), &err);
    CUDA_SAFE_MALLOC(&gpuInverse, 3*3*sizeof(double), &err);

    CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);

    affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
    CUDA_KERNEL_ERR_CHK(&err);

    // the input image and the uploaded matrix are no longer needed
    CUDA_SAFE_FREE(srcPtr, &err);
    CUDA_SAFE_FREE(gpuInverse, &err);
  }
  232 +}}}
... ...
openbr/plugins/cuda/cudacvtfloat.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +#include <unistd.h>
  20 +
  21 +#include <opencv2/opencv.hpp>
  22 +using namespace cv;
  23 +
  24 +#include <openbr/plugins/openbr_internal.h>
  25 +
  26 +// definitions from the CUDA source file
  27 +namespace br { namespace cuda { namespace cvtfloat {
  28 + void wrapper(void* src, void** dst, int rows, int cols);
  29 +}}}
  30 +
  31 +namespace br
  32 +{
  33 +
/*!
 * \ingroup transforms
 * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent.
 * \author Colin Heinzmann \cite DepthDeluxe
 */
class CUDACvtFloatTransform : public UntrainableTransform
{
    Q_OBJECT

    public:
    void project(const Template &src, Template &dst) const
    {
        // The incoming Mat is not pixel data: it is the CUDA metadata array
        // described in cuda/README.md — [0] device data pointer, [1] rows*,
        // [2] cols*, [3] OpenCV type* (each stored as a void*).
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*)srcDataPtr[1]);
        int cols = *((int*)srcDataPtr[2]);
        int type = *((int*)srcDataPtr[3]);

        // assume the image type is 256-monochrome
        // TODO(colin): real exception handling
        if (type != CV_8UC1) {
            cout << "ERR: Invalid memory format" << endl;
            return;
        }

        // Build the destination metadata Mat.  The rows/cols/type pointers
        // are SHARED with the source (not copied), so assigning CV_32FC1
        // below also rewrites the source's type field.  The source is
        // consumed by this transform (the wrapper frees its device buffer),
        // so the aliasing is presumably intentional — TODO confirm.
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;

        // Allocates the float buffer on the device, converts, and frees the
        // 8-bit source buffer; dstDataPtr[0] receives the new allocation.
        cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  69 +
  70 +BR_REGISTER(Transform, CUDACvtFloatTransform)
  71 +
  72 +} // namespace br
  73 +
  74 +#include "cuda/cudacvtfloat.moc"
... ...
openbr/plugins/cuda/cudacvtfloat.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +
  21 +#include "cudadefines.hpp"
  22 +
  23 +namespace br { namespace cuda { namespace cvtfloat {
  24 +
  25 + __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
  26 + // get my index
  27 + int rowInd = blockIdx.y*blockDim.y + threadIdx.y;
  28 + int colInd = blockIdx.x*blockDim.x + threadIdx.x;
  29 +
  30 + // bounds check
  31 + if (rowInd >= rows || colInd >= cols) {
  32 + return;
  33 + }
  34 +
  35 + int index = rowInd*cols + colInd;
  36 + dst[index] = (float)src[index];
  37 + }
  38 +
  39 + void wrapper(void* src, void** dst, int rows, int cols) {
  40 + cudaError_t err;
  41 + CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
  42 +
  43 + dim3 threadsPerBlock(8, 8);
  44 + dim3 numBlocks(
  45 + cols / threadsPerBlock.x + 1,
  46 + rows / threadsPerBlock.y + 1
  47 + );
  48 +
  49 + kernel<<<numBlocks, threadsPerBlock>>>((const unsigned char*)src, (float*)(*dst), rows, cols);
  50 + CUDA_KERNEL_ERR_CHK(&err);
  51 +
  52 + // free the src memory since it is now in a newly allocated dst
  53 + CUDA_SAFE_FREE(src, &err);
  54 + }
  55 +
  56 +}}}
... ...
openbr/plugins/cuda/cudadefines.hpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +
  19 +using namespace std;
  20 +#include <pthread.h>
  21 +
// Error-checking wrappers around the CUDA runtime API.  Each macro stores
// the status in *errPtr, reports failures (tagged with the calling thread
// id) to stdout, and aborts by throwing.
//
// Every body is wrapped in do { ... } while (0) so each macro expands to a
// single statement.  Without this, `if (cond) CUDA_SAFE_FREE(p, &err);`
// would compile but leave the error check outside the `if` — a classic
// multi-statement macro bug.

// Frees device memory at cudaPtr.
#define CUDA_SAFE_FREE(cudaPtr, errPtr) \
  do { \
    *errPtr = cudaFree(cudaPtr); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Free Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Allocates `size` bytes of device memory into *cudaPtrPtr.
#define CUDA_SAFE_MALLOC(cudaPtrPtr, size, errPtr) \
  do { \
    *errPtr = cudaMalloc(cudaPtrPtr, size); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Malloc Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Copies `count` bytes in the direction given by `kind`.
#define CUDA_SAFE_MEMCPY(dstPtr, srcPtr, count, kind, errPtr) \
  do { \
    *errPtr = cudaMemcpy(dstPtr, srcPtr, count, kind); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": CUDA Memcpy Error(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)

// Checks for a failed kernel launch (launch-configuration errors only;
// asynchronous execution errors surface on a later synchronizing call).
#define CUDA_KERNEL_ERR_CHK(errPtr) \
  do { \
    *errPtr = cudaPeekAtLastError(); \
    if (*errPtr != cudaSuccess) { \
      cout << pthread_self() << ": Kernel Call Err(" << *errPtr << "): " << cudaGetErrorString(*errPtr) << endl; \
      throw 0; \
    } \
  } while (0)
... ...
openbr/plugins/cuda/cudal2.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <openbr/plugins/openbr_internal.h>
  21 +
  22 +// definitions from the CUDA source file
  23 +namespace br { namespace cuda { namespace L2 {
  24 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr);
  25 +}}}
  26 +
  27 +namespace br
  28 +{
  29 +
/*!
 * \ingroup distances
 * \brief L2 distance computed on the GPU with CUDA.
 * \author Colin Heinzmann \cite DepthDeluxe
 */
class CUDAL2Distance : public UntrainableDistance
{
    Q_OBJECT

    // Compares two single-channel 32-bit float matrices with the same total
    // element count.  Both inputs live in host memory here; the wrapper
    // uploads them to the device.
    //
    // NOTE(review): the CUDA wrapper returns the SUM of squared differences
    // — no square root is taken, so this is the squared L2 distance.  That
    // is rank-equivalent to true L2; confirm callers do not expect the
    // Euclidean norm itself.
    float compare(const cv::Mat &a, const cv::Mat &b) const
    {
        if (a.type() != CV_32FC1 || b.type() != CV_32FC1) {
            cout << "ERR: Type mismatch" << endl;
            throw 0;
        }
        if (a.rows*a.cols != b.rows*b.cols) {
            cout << "ERR: Dimension mismatch" << endl;
            throw 1;
        }

        float out;
        cuda::L2::wrapper(a.ptr<float>(), b.ptr<float>(), a.rows*a.cols, &out);

        return out;
    }
};
  56 +
  57 +BR_REGISTER(Distance, CUDAL2Distance)
  58 +
  59 +} // namespace br
  60 +
  61 +#include "cuda/cudal2.moc"
... ...
openbr/plugins/cuda/cudal2.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +#include <math.h>
  17 +
  18 +
  19 +#include "cudadefines.hpp"
  20 +
  21 +namespace br { namespace cuda { namespace L2 {
  22 +
  23 + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) {
  24 + int index = blockIdx.x*blockDim.x+threadIdx.x;
  25 +
  26 + if (index >= length) {
  27 + return;
  28 + }
  29 +
  30 + // perform the subtraction
  31 + float res = aPtr[index] - bPtr[index];
  32 + res = res * res;
  33 + workPtr[index] = res;
  34 + }
  35 +
  36 + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
  37 + // make sure there is only one thread that we are calling
  38 + if (blockIdx.x != 0 || threadIdx.x != 0) {
  39 + return;
  40 + }
  41 +
  42 + // sum up all the values
  43 + float acc = 0;
  44 + for (int i=0; i < length; i++) {
  45 + acc += inPtr[i];
  46 + }
  47 +
  48 + *outPtr = acc;
  49 + }
  50 +
  51 + float* cudaAPtr = NULL;
  52 + float* cudaBPtr = NULL;
  53 + float* cudaWorkBufferPtr = NULL;
  54 + float* cudaOutPtr = NULL;
  55 + int bufferLen = 0;
  56 +
  57 + void wrapper(float const* aPtr, float const* bPtr, int length, float* outPtr) {
  58 + cudaError_t err;
  59 +
  60 + // allocate memory for the mats and copy data to graphics card
  61 + // only allocate if there is a mismatch in image size, otherwise
  62 + // use the existing allocated memory
  63 + if (length != bufferLen) {
  64 + if (cudaAPtr != NULL) {
  65 + CUDA_SAFE_FREE(cudaAPtr, &err);
  66 + CUDA_SAFE_FREE(cudaBPtr, &err);
  67 + CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
  68 + CUDA_SAFE_FREE(cudaOutPtr, &err);
  69 + }
  70 + CUDA_SAFE_MALLOC(&cudaAPtr, length*sizeof(float), &err);
  71 + CUDA_SAFE_MALLOC(&cudaBPtr, length*sizeof(float), &err);
  72 + CUDA_SAFE_MALLOC(&cudaWorkBufferPtr, sizeof(float)*length, &err);
  73 + CUDA_SAFE_MALLOC(&cudaOutPtr, sizeof(float), &err);
  74 + bufferLen = length;
  75 + }
  76 +
  77 + // copy data over from CPU
  78 + CUDA_SAFE_MEMCPY(cudaAPtr, aPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  79 + CUDA_SAFE_MEMCPY(cudaBPtr, bPtr, length*sizeof(float), cudaMemcpyHostToDevice, &err);
  80 +
  81 + // perform the subtraction
  82 + int threadsPerBlock = 512;
  83 + int numBlocks = length / threadsPerBlock + 1;
  84 + subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
  85 + CUDA_KERNEL_ERR_CHK(&err);
  86 +
  87 + // perform the collapse
  88 + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
  89 + CUDA_KERNEL_ERR_CHK(&err);
  90 +
  91 + // copy the single value back to the destinsion
  92 + CUDA_SAFE_MEMCPY(outPtr, cudaOutPtr, sizeof(float), cudaMemcpyDeviceToHost, &err);
  93 + }
  94 +}}}
... ...
openbr/plugins/cuda/cudalbp.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <sys/types.h>
  21 +#include <unistd.h>
  22 +
  23 +#include <pthread.h>
  24 +
  25 +#include <opencv2/opencv.hpp>
  26 +#include <opencv2/imgproc/imgproc.hpp>
  27 +#include <opencv2/imgproc/imgproc_c.h>
  28 +#include <opencv2/highgui/highgui.hpp>
  29 +#include <opencv2/highgui/highgui_c.h>
  30 +#include <limits>
  31 +
  32 +#include <openbr/plugins/openbr_internal.h>
  33 +
  34 +using namespace cv;
  35 +
  36 +// definitions from the CUDA source file
  37 +namespace br { namespace cuda { namespace lbp {
  38 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
  39 + void initializeWrapper(uint8_t* lut);
  40 +}}}
  41 +
  42 +namespace br
  43 +{
/*!
 * \ingroup transforms
 * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin.
 * \author Colin Heinzmann \cite DepthDeluxe
 * \author Li Li \cite booli
 */
class CUDALBPTransform : public UntrainableTransform
{
    Q_OBJECT
    Q_PROPERTY(int radius READ get_radius WRITE set_radius RESET reset_radius STORED false)
    Q_PROPERTY(int maxTransitions READ get_maxTransitions WRITE set_maxTransitions RESET reset_maxTransitions STORED false)
    Q_PROPERTY(bool rotationInvariant READ get_rotationInvariant WRITE set_rotationInvariant RESET reset_rotationInvariant STORED false)
    BR_PROPERTY(int, radius, 1)
    BR_PROPERTY(int, maxTransitions, 8)
    BR_PROPERTY(bool, rotationInvariant, false)

    private:
    // lut maps each raw 8-neighbor bit pattern to a compact id; patterns
    // with too many transitions all map to the single `null` id.
    uchar lut[256];
    uchar null;

    public:
    /* Returns the number of 0->1 or 1->0 transitions in i */
    static int numTransitions(int i)
    {
        int transitions = 0;
        int curParity = i%2;
        // walk the 8 bits circularly (j%8 wraps bit 8 back to bit 0)
        for (int j=1; j<=8; j++) {
            int parity = (i>>(j%8)) % 2;
            if (parity != curParity) transitions++;
            curParity = parity;
        }
        return transitions;
    }

    // Returns the smallest value reachable by circularly rotating the 8-bit
    // pattern i.  Because the minimum is taken over all rotations (including
    // the identity), the result is always <= i.
    static int rotationInvariantEquivalent(int i)
    {
        int min = std::numeric_limits<int>::max();
        for (int j=0; j<8; j++) {
            bool parity = i % 2;
            i = i >> 1;
            if (parity) i+=128; // rotate the low bit around to bit 7
            min = std::min(min, i);
        }
        return min;
    }

    // Builds the 256-entry pattern->id lookup table and uploads it to the
    // GPU once, at plugin initialization.
    void init()
    {
        bool set[256];
        uchar uid = 0;
        for (int i=0; i<256; i++) {
            if (numTransitions(i) <= maxTransitions) {
                int id;
                if (rotationInvariant) {
                    int rie = rotationInvariantEquivalent(i);
                    // rie <= i, so lut[rie] was already assigned on an
                    // earlier iteration when i != rie
                    if (i == rie) id = uid++;
                    else id = lut[rie];
                } else id = uid++;
                lut[i] = id;
                set[i] = true;
            } else {
                set[i] = false;
            }
        }

        null = uid;
        for (int i=0; i<256; i++)
            if (!set[i])
                lut[i] = null; // Set to null id

        // copy lut over to the GPU
        cuda::lbp::initializeWrapper(lut);

        std::cout << "Initialized CUDALBP" << std::endl;
    }

    void project(const Template &src, Template &dst) const
    {
        // GPU metadata Mat (see cuda/README.md): [0] device data pointer,
        // [1] rows*, [2] cols*, [3] OpenCV type* — all stored as void*.
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*)srcDataPtr[1]);
        int cols = *((int*)srcDataPtr[2]);
        int type = *((int*)srcDataPtr[3]);

        // The metadata pointers are shared with (not copied from) the
        // source; the source is consumed by the wrapper below, which frees
        // its device buffer and allocates the output into dstDataPtr[0].
        // NOTE(review): `type` is read but never validated here, unlike the
        // other CUDA transforms — presumably CV_8UC1 is expected; confirm.
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3];

        cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  137 +
  138 +BR_REGISTER(Transform, CUDALBPTransform)
  139 +
  140 +}
  141 +
  142 +#include "cuda/cudalbp.moc"
... ...
openbr/plugins/cuda/cudalbp.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +#include <iostream>
  17 +
  18 +using namespace std;
  19 +
  20 +#include <opencv2/gpu/gpu.hpp>
  21 +#include <stdio.h>
  22 +
  23 +#include "cudadefines.hpp"
  24 +
  25 +using namespace cv;
  26 +using namespace cv::gpu;
  27 +
  28 +/*
  29 + * These are the CUDA functions for CUDALBP. See cudapca.cpp for more details
  30 + */
  31 +
  32 +namespace br { namespace cuda { namespace lbp {
  33 + uint8_t* lut;
  34 +
  35 + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  36 + return (srcPtr + row*cols)[col];
  37 + }
  38 +
  // Computes the LBP code of one pixel per thread: each of the 8 neighbors
  // is compared against the center value, the results are packed into a
  // byte, and that byte is mapped through the lookup table.
  //
  // NOTE(review): radius is hard-coded to 1 here — the plugin's `radius`
  // property on the CPU side is not forwarded; confirm this is intended.
  __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
  {
    int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
    int colInd = blockIdx.x*blockDim.x+threadIdx.x;
    int radius = 1;

    int index = rowInd*cols + colInd;

    // Threads fully outside the image exit; pixels on the 1-pixel border
    // (which lack a full neighborhood) are written as 0.
    if (rowInd < 1 || rowInd >= rows-1 || colInd < 1 || colInd >= cols-1) {
      if (rowInd >= rows || colInd >= cols) {
        return;
      } else {
        dstPtr[index] = 0;
        return;
      }
    }

    // Bit weights by neighbor position, starting at the top-left and moving
    // clockwise: 128, 64, 32, 16, 8, 4, 2, 1.  A neighbor contributes its
    // weight when it is >= the center value.
    const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols); // center value
    uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
                      (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
                      (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
                      (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];

    // store calculated value away in the right place
    dstPtr[index] = val;
  }
  70 +
  71 + //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
  72 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  73 + {
  74 + cudaError_t err;
  75 +
  76 + // make 8 * 8 = 64 square block
  77 + dim3 threadsPerBlock(8, 8);
  78 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  79 + rows/threadsPerBlock.y + 1);
  80 +
  81 + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
  82 + lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  83 + CUDA_KERNEL_ERR_CHK(&err);
  84 +
  85 + CUDA_SAFE_FREE(srcPtr, &err);
  86 + }
  87 +
  88 + void initializeWrapper(uint8_t* cpuLut) {
  89 + cudaError_t err;
  90 + CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
  91 + CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
  92 + }
  93 +}}}
... ...
openbr/plugins/cuda/cudapca.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +#include <unistd.h>
  20 +
  21 +#include <QList>
  22 +
  23 +#include <Eigen/Dense>
  24 +
  25 +#include <opencv2/opencv.hpp>
  26 +using namespace cv;
  27 +
  28 +#include <openbr/plugins/openbr_internal.h>
  29 +#include <openbr/core/common.h>
  30 +#include <openbr/core/eigenutils.h>
  31 +#include <openbr/core/opencvutils.h>
  32 +
  33 +// definitions from the CUDA source file
  34 +namespace br { namespace cuda { namespace pca {
  35 + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
  36 + void trainWrapper(void* cudaSrc, float* dst, int rows, int cols);
  37 + void wrapper(void* src, void** dst, int imgRows, int imgCols);
  38 +}}}
  39 +
  40 +namespace br
  41 +{
  42 +/*!
  43 + * \ingroup transforms
  44 + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
  45 + * \author Colin Heinzmann \cite DepthDeluxe
  46 + *
  47 + * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
  48 + * \br_property int drop The number of leading eigen-dimensions to drop.
  49 + * \br_property bool whiten Whether or not to perform PCA whitening (i.e., normalize variance of each dimension to unit norm)
  50 + */
  51 +class CUDAPCATransform : public Transform
  52 +{
  53 + Q_OBJECT
  54 +
  55 +protected:
  56 + Q_PROPERTY(float keep READ get_keep WRITE set_keep RESET reset_keep STORED false)
  57 + Q_PROPERTY(int drop READ get_drop WRITE set_drop RESET reset_drop STORED false)
  58 + Q_PROPERTY(bool whiten READ get_whiten WRITE set_whiten RESET reset_whiten STORED false)
  59 +
  60 + BR_PROPERTY(float, keep, 0.95)
  61 + BR_PROPERTY(int, drop, 0)
  62 + BR_PROPERTY(bool, whiten, false)
  63 +
  64 + Eigen::VectorXf mean, eVals;
  65 + Eigen::MatrixXf eVecs;
  66 +
  67 + int originalRows;
  68 +
  69 +public:
  70 + CUDAPCATransform() : keep(0.95), drop(0), whiten(false) {}
  71 +
  72 +private:
  73 + double residualReconstructionError(const Template &src) const
  74 + {
  75 + Template proj;
  76 + project(src, proj);
  77 +
  78 + Eigen::Map<const Eigen::VectorXf> srcMap(src.m().ptr<float>(), src.m().rows*src.m().cols);
  79 + Eigen::Map<Eigen::VectorXf> projMap(proj.m().ptr<float>(), keep);
  80 +
  81 + return (srcMap - mean).squaredNorm() - projMap.squaredNorm();
  82 + }
  83 +
  84 + void train(const TemplateList &cudaTrainingSet)
  85 + {
  86 + // copy the data back from the graphics card so the training can be done on the CPU
  87 + const int instances = cudaTrainingSet.size(); // get the number of training set instances
  88 + QList<Template> trainingQlist;
  89 + for(int i=0; i<instances; i++) {
  90 + Template currentTemplate = cudaTrainingSet[i];
  91 + void* const* srcDataPtr = currentTemplate.m().ptr<void*>();
  92 + void* cudaMemPtr = srcDataPtr[0];
  93 + int rows = *((int*)srcDataPtr[1]);
  94 + int cols = *((int*)srcDataPtr[2]);
  95 + int type = *((int*)srcDataPtr[3]);
  96 +
  97 + if (type != CV_32FC1) {
  98 + qFatal("Requires single channel 32-bit floating point matrices.");
  99 + }
  100 +
  101 + Mat mat = Mat(rows, cols, type);
  102 + br::cuda::pca::trainWrapper(cudaMemPtr, mat.ptr<float>(), rows, cols);
  103 + trainingQlist.append(Template(mat));
  104 + }
  105 +
  106 + // assemble a TemplateList from the list of data
  107 + TemplateList trainingSet(trainingQlist);
  108 +
  109 +
  110 + originalRows = trainingSet.first().m().rows; // get number of rows of first image
  111 + int dimsIn = trainingSet.first().m().rows * trainingSet.first().m().cols; // get the size of the first image
  112 +
  113 + // Map into 64-bit Eigen matrix
  114 + Eigen::MatrixXd data(dimsIn, instances); // create a mat
  115 + for (int i=0; i<instances; i++) {
  116 + data.col(i) = Eigen::Map<const Eigen::MatrixXf>(trainingSet[i].m().ptr<float>(), dimsIn, 1).cast<double>();
  117 + }
  118 +
  119 + trainCore(data);
  120 + }
  121 +
  122 + void project(const Template &src, Template &dst) const
  123 + {
  124 + void* const* srcDataPtr = src.m().ptr<void*>();
  125 + int rows = *((int*)srcDataPtr[1]);
  126 + int cols = *((int*)srcDataPtr[2]);
  127 + int type = *((int*)srcDataPtr[3]);
  128 +
  129 + if (type != CV_32FC1) {
  130 + cout << "ERR: Invalid image type" << endl;
  131 + throw 0;
  132 + }
  133 +
  134 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  135 + void** dstDataPtr = dstMat.ptr<void*>();
  136 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = 1;
  137 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
  138 + dstDataPtr[3] = srcDataPtr[3];
  139 +
  140 + cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  141 +
  142 + dst = dstMat;
  143 + }
  144 +
  145 + void store(QDataStream &stream) const
  146 + {
  147 + stream << keep << drop << whiten << originalRows << mean << eVals << eVecs;
  148 + }
  149 +
  150 + void load(QDataStream &stream)
  151 + {
  152 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
  153 +
  154 + // serialize the eigenvectors
  155 + float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
  156 + for (int i=0; i < eVecs.rows(); i++) {
  157 + for (int j=0; j < eVecs.cols(); j++) {
  158 + evBuffer[i*eVecs.cols() + j] = eVecs(i, j);
  159 + }
  160 + }
  161 +
  162 + // serialize the mean
  163 + float* meanBuffer = new float[mean.rows() * mean.cols()];
  164 + for (int i=0; i < mean.rows(); i++) {
  165 + for (int j=0; j < mean.cols(); j++) {
  166 + meanBuffer[i*mean.cols() + j] = mean(i, j);
  167 + }
  168 + }
  169 +
  170 + // call the wrapper function
  171 + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
  172 +
  173 + delete evBuffer;
  174 + delete meanBuffer;
  175 + }
  176 +
  177 +protected:
  178 + void trainCore(Eigen::MatrixXd data)
  179 + {
  180 + int dimsIn = data.rows();
  181 + int instances = data.cols();
  182 + const bool dominantEigenEstimation = (dimsIn > instances);
  183 +
  184 + Eigen::MatrixXd allEVals, allEVecs;
  185 + if (keep != 0) {
  186 + // Compute and remove mean
  187 + mean = Eigen::VectorXf(dimsIn);
  188 + for (int i=0; i<dimsIn; i++) mean(i) = data.row(i).sum() / (float)instances;
  189 + for (int i=0; i<dimsIn; i++) data.row(i).array() -= mean(i);
  190 +
  191 + // Calculate covariance matrix
  192 + Eigen::MatrixXd cov;
  193 + if (dominantEigenEstimation) cov = data.transpose() * data / (instances-1.0);
  194 + else cov = data * data.transpose() / (instances-1.0);
  195 +
  196 + // Compute eigendecomposition. Returns eigenvectors/eigenvalues in increasing order by eigenvalue.
  197 + Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eSolver(cov);
  198 + allEVals = eSolver.eigenvalues();
  199 + allEVecs = eSolver.eigenvectors();
  200 + if (dominantEigenEstimation) allEVecs = data * allEVecs;
  201 + } else {
  202 + // Null case
  203 + mean = Eigen::VectorXf::Zero(dimsIn);
  204 + allEVecs = Eigen::MatrixXd::Identity(dimsIn, dimsIn);
  205 + allEVals = Eigen::VectorXd::Ones(dimsIn);
  206 + }
  207 +
  208 + if (keep <= 0) {
  209 + keep = dimsIn - drop;
  210 + } else if (keep < 1) {
  211 + // Keep eigenvectors that retain a certain energy percentage.
  212 + const double totalEnergy = allEVals.sum();
  213 + if (totalEnergy == 0) {
  214 + keep = 0;
  215 + } else {
  216 + double currentEnergy = 0;
  217 + int i=0;
  218 + while ((currentEnergy / totalEnergy < keep) && (i < allEVals.rows())) {
  219 + currentEnergy += allEVals(allEVals.rows()-(i+1));
  220 + i++;
  221 + }
  222 + keep = i - drop;
  223 + }
  224 + } else {
  225 + if (keep + drop > allEVals.rows()) {
  226 + qWarning("Insufficient samples, needed at least %d but only got %d.", (int)keep + drop, (int)allEVals.rows());
  227 + keep = allEVals.rows() - drop;
  228 + }
  229 + }
  230 +
  231 + // Keep highest energy vectors
  232 + eVals = Eigen::VectorXf((int)keep, 1);
  233 + eVecs = Eigen::MatrixXf(allEVecs.rows(), (int)keep);
  234 + for (int i=0; i<keep; i++) {
  235 + int index = allEVals.rows()-(i+drop+1);
  236 + eVals(i) = allEVals(index);
  237 + eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm();
  238 + if (whiten) eVecs.col(i) /= sqrt(eVals(i));
  239 + }
  240 +
  241 + // Debug output
  242 + if (Globals->verbose) qDebug() << "PCA Training:\n\tDimsIn =" << dimsIn << "\n\tKeep =" << keep;
  243 + }
  244 +
  245 + void writeEigenVectors(const Eigen::MatrixXd &allEVals, const Eigen::MatrixXd &allEVecs) const
  246 + {
  247 + const int originalCols = mean.rows() / originalRows;
  248 +
  249 + { // Write out mean image
  250 + cv::Mat out(originalRows, originalCols, CV_32FC1);
  251 + Eigen::Map<Eigen::MatrixXf> outMap(out.ptr<float>(), mean.rows(), 1);
  252 + outMap = mean.col(0);
  253 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/mean.png");
  254 + }
  255 +
  256 + // Write out sample eigen vectors (16 highest, 8 lowest), filename = eigenvalue.
  257 + for (int k=0; k<(int)allEVals.size(); k++) {
  258 + if ((k < 8) || (k >= (int)allEVals.size()-16)) {
  259 + cv::Mat out(originalRows, originalCols, CV_64FC1);
  260 + Eigen::Map<Eigen::MatrixXd> outMap(out.ptr<double>(), mean.rows(), 1);
  261 + outMap = allEVecs.col(k);
  262 + // OpenCVUtils::saveImage(out, Globals->Debug+"/PCA/eigenVectors/"+QString::number(allEVals(k),'f',0)+".png");
  263 + }
  264 + }
  265 + }
  266 +};
  267 +
  268 +BR_REGISTER(Transform, CUDAPCATransform)
  269 +} // namespace br
  270 +
  271 +#include "cuda/cudapca.moc"
... ...
openbr/plugins/cuda/cudapca.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +using namespace std;
  19 +
  20 +#include <opencv2/opencv.hpp>
  21 +#include <opencv2/gpu/gpu.hpp>
  22 +
  23 +#include "cudadefines.hpp"
  24 +
  25 +using namespace cv;
  26 +using namespace cv::gpu;
  27 +
  28 +/*
  29 + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details
  30 + */
  31 +
  32 +namespace br { namespace cuda { namespace pca {
  // Partial projection of the image onto the eigenvector basis: each thread
  // accumulates the dot-product contribution of one (eigenvector, pixel-step)
  // pair over `stepSize` consecutive pixels.  multiplyJoinKernel then sums
  // the per-step partials into the final coefficient for each eigenvector.
  //
  // NOTE(review): the intermediary buffer is written with stride `stepSize`
  // but holds `numEigenvectors` values per step — if numEigenvectors can
  // exceed stepSize these writes would collide.  The read side
  // (multiplyJoinKernel) uses the same layout, so this is consistent; confirm
  // the host code guarantees stepSize >= numEigenvectors.
  __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int numEigenvectors, int numSteps, int stepSize, int numPixels) {
    int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
    int stepIdx = blockIdx.y*blockDim.y+threadIdx.y;

    if (evIdx >= numEigenvectors || stepIdx >= numSteps) {
      return;
    }

    float acc = 0;
    int startIdx = stepSize*stepIdx;
    int stopIdx = startIdx+stepSize;
    if (startIdx >= numPixels) {
      return;
    }
    if (stopIdx >= numPixels) {
      stopIdx = numPixels; // clamp the final (partial) step
    }
    // evPtr is laid out row-major with numEigenvectors columns
    for (int i=startIdx; i < stopIdx; i++) {
      acc += src[i]*evPtr[i*numEigenvectors + evIdx];
    }

    intermediaryBuffer[stepIdx*stepSize + evIdx] = acc;
  }
  56 +
  // Reduces the per-step partial sums written by multiplyKernel into the
  // final projection vector: out[ev] = sum over all steps of that
  // eigenvector's partial dot products. One thread per eigenvector.
  // NOTE(review): the guard below compares a pixel offset (numSteps*stepSize)
  // against numEigenvectors; it appears meant to drop an *empty* trailing
  // step (whose scratch slot multiplyKernel never wrote) and should compare
  // against the pixel count — as written it can also discard a valid partial
  // step. Confirm against the caller in cudapca.cpp.
  __global__ void multiplyJoinKernel(float* intermediaryBuffer, float* out, int numEigenvectors, int numSteps, int stepSize) {
    int evIdx = blockIdx.x*blockDim.x+threadIdx.x;
    if (evIdx >= numEigenvectors) {
      return;
    }

    if (numSteps*stepSize+evIdx >= numEigenvectors) {
      numSteps--;
    }

    float acc = 0;
    for (int i=0; i < numSteps; i++) {
      int ibIdx = i*stepSize + evIdx;  // same layout multiplyKernel wrote
      acc += intermediaryBuffer[ibIdx];
    }

    out[evIdx] = acc;
  }
  75 +
  76 + __global__ void subtractMeanKernel(float* out, float* mean, int numElems) {
  77 + int idx = blockIdx.x*blockDim.x+threadIdx.x;
  78 +
  79 + // perform bound checking
  80 + if (idx >= numElems) {
  81 + return;
  82 + }
  83 +
  84 + // subtract out the mean
  85 + out[idx] -= mean[idx];
  86 + }
  87 +
  // Module-level GPU state, populated once by initializeWrapper:
  //   cudaEvPtr:          eigenvector matrix on the device (_evRows x _evCols)
  //   _evRows:            the number of pixels in the trained images
  //   _evCols:            the number of eigenvectors
  //   cudaMeanPtr:        mean image on the device
  //   _meanElems:         the number of pixels in an image (length of the mean)
  //   _stepSize:          the number of pixels in a single reduction step
  //   _numSteps:          the number of steps required to cover one image
  //   intermediaryBuffer: device scratch shared by the two multiply kernels
  float* cudaEvPtr; int _evRows; int _evCols;
  float* cudaMeanPtr; int _meanElems;
  int _numSteps; int _stepSize;
  float* intermediaryBuffer;
  97 +
  // Uploads the trained PCA model (eigenvectors and mean) to the GPU and
  // allocates the scratch buffer shared by the projection kernels.
  //   evPtr, evRows, evCols: host eigenvector matrix (pixels x eigenvectors)
  //   meanPtr, meanElems:    host mean image, one float per pixel
  // NOTE(review): calling this a second time leaks the previous device
  // buffers — nothing frees cudaEvPtr/cudaMeanPtr/intermediaryBuffer first.
  void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
    _evRows = evRows; _evCols = evCols;
    _meanElems = meanElems;

    cudaError_t err;

    // copy the eigenvectors to the GPU
    CUDA_SAFE_MALLOC(&cudaEvPtr, evRows*evCols*sizeof(float), &err);
    CUDA_SAFE_MEMCPY(cudaEvPtr, evPtr, evRows*evCols*sizeof(float), cudaMemcpyHostToDevice, &err);

    // copy the mean to the GPU
    CUDA_SAFE_MALLOC(&cudaMeanPtr, meanElems*sizeof(float), &err);
    CUDA_SAFE_MEMCPY(cudaMeanPtr, meanPtr, meanElems*sizeof(float), cudaMemcpyHostToDevice, &err);

    // initialize the intermediary working space
    // NOTE(review): when _evRows is an exact multiple of _stepSize this
    // over-counts by one (empty) step; the sizing also assumes
    // _evCols <= _stepSize given multiplyKernel's indexing — confirm.
    _stepSize = 2048;
    _numSteps = _evRows / _stepSize + 1;
    CUDA_SAFE_MALLOC(&intermediaryBuffer, _numSteps*_stepSize*sizeof(float), &err);
  }
  117 +
  118 + void trainWrapper(void* cudaSrc, float* data, int rows, int cols) {
  119 + cudaError_t err;
  120 + CUDA_SAFE_MEMCPY(data, cudaSrc, rows*cols*sizeof(float), cudaMemcpyDeviceToHost, &err);
  121 + }
  122 +
  123 + void wrapper(void* src, void** dst, int imgRows, int imgCols) {
  124 + cudaError_t err;
  125 + CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
  126 +
  127 + if (imgRows*imgCols != _evRows || imgRows*imgCols != _meanElems) {
  128 + cout << "ERR: Image dimension mismatch!" << endl;
  129 + throw 0;
  130 + }
  131 +
  132 + // subtract out the mean of the image (mean is 1xpixels in size), perform in place (in src)
  133 + int threadsPerBlock = 512;
  134 + int numBlocks = _meanElems / threadsPerBlock + 1;
  135 + subtractMeanKernel<<<numBlocks, threadsPerBlock>>>((float*)src, cudaMeanPtr, _meanElems);
  136 + CUDA_KERNEL_ERR_CHK(&err);
  137 +
  138 + // perform matrix multiplication
  139 + dim3 threadsPerBlock2d(512, 1);
  140 + dim3 numBlocks2d(
  141 + _evCols / threadsPerBlock2d.x + 1,
  142 + _numSteps / threadsPerBlock2d.y + 1);
  143 + multiplyKernel<<<numBlocks2d, threadsPerBlock2d>>>((float*)src, intermediaryBuffer, cudaEvPtr, _evCols, _numSteps, _stepSize, _meanElems);
  144 + CUDA_KERNEL_ERR_CHK(&err);
  145 +
  146 + threadsPerBlock = 512;
  147 + numBlocks = _evCols / threadsPerBlock + 1;
  148 + multiplyJoinKernel<<<numBlocks, threadsPerBlock>>>(intermediaryBuffer, (float*)*dst, _evCols, _numSteps, _stepSize);
  149 + CUDA_KERNEL_ERR_CHK(&err);
  150 +
  151 + // free the src memory
  152 + CUDA_SAFE_FREE(src, &err);
  153 + }
  154 +}}}
... ...
openbr/plugins/cuda/cudargb2grayscale.cpp 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <iostream>
  18 +#include <opencv2/imgproc/imgproc.hpp>
  19 +
  20 +#include <openbr/plugins/openbr_internal.h>
  21 +#include <openbr/core/opencvutils.h>
  22 +#include <thrust/host_vector.h>
  23 +#include <thrust/device_vector.h>
  24 +
  25 +
  26 +using namespace cv;
  27 +
// Forward declaration of the kernel wrapper implemented in
// cudargb2grayscale.cu (all CUDA code lives in the .cu file).
namespace br { namespace cuda { namespace rgb2grayscale {
  void wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
}}}
  32 +
  33 +namespace br
  34 +{
  35 +
  36 +/*!
  37 + * \ingroup transforms
  38 + * \brief Converts 3-channel images to grayscale
  39 + * \author Li Li \cite booli
  40 + */
class CUDARGB2GrayScaleTransform : public UntrainableTransform
{
    Q_OBJECT

public:

private:
    // Converts a 3-channel device image to single-channel grayscale.
    //
    // The incoming Mat follows the CUDA-plugin descriptor convention (see
    // plugins/cuda/README.md): it is an array of void* where
    //   [0] = device pointer to the pixel data
    //   [1] = pointer to the row count (int)
    //   [2] = pointer to the column count (int)
    //   [3] = pointer to the OpenCV type code (int)
    void project(const Template &src, Template &dst) const
    {
        void* const* srcDataPtr = src.m().ptr<void*>();
        int rows = *((int*) srcDataPtr[1]);
        int cols = *((int*) srcDataPtr[2]);
        // NOTE(review): read but never checked — presumably the input is
        // expected to be CV_8UC3; confirm.
        int type = *((int*) srcDataPtr[3]);

        // build the output descriptor; the metadata pointers are shared with
        // the source descriptor rather than copied
        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
        void** dstDataPtr = dstMat.ptr<void*>();
        dstDataPtr[1] = srcDataPtr[1];
        dstDataPtr[2] = srcDataPtr[2];
        dstDataPtr[3] = srcDataPtr[3];
        // NOTE(review): because dstDataPtr[3] aliases srcDataPtr[3], this
        // write also rewrites the *source* descriptor's type code — confirm
        // that is intended.
        *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same

        // the wrapper allocates the grayscale device buffer into
        // dstDataPtr[0] and frees the source device buffer
        cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
        dst = dstMat;
    }
};
  66 +
  67 +BR_REGISTER(Transform, CUDARGB2GrayScaleTransform)
  68 +
  69 +} // namespace br
  70 +
  71 +#include "imgproc/cudargb2grayscale.moc"
... ...
openbr/plugins/cuda/cudargb2grayscale.cu 0 → 100644
  1 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2 + * Copyright 2016 Li Li, Colin Heinzmann *
  3 + * *
  4 + * Licensed under the Apache License, Version 2.0 (the "License"); *
  5 + * you may not use this file except in compliance with the License. *
  6 + * You may obtain a copy of the License at *
  7 + * *
  8 + * http://www.apache.org/licenses/LICENSE-2.0 *
  9 + * *
  10 + * Unless required by applicable law or agreed to in writing, software *
  11 + * distributed under the License is distributed on an "AS IS" BASIS, *
  12 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  13 + * See the License for the specific language governing permissions and *
  14 + * limitations under the License. *
  15 + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  16 +
  17 +#include <stdio.h>
  18 +#include <iostream>
  19 +#include <opencv2/opencv.hpp>
  20 +#include <opencv2/gpu/gpu.hpp>
  21 +#include "cudadefines.hpp"
  22 +#include <opencv2/imgproc/imgproc.hpp>
  23 +#include <opencv2/imgproc/imgproc_c.h>
  24 +#include <opencv2/highgui/highgui.hpp>
  25 +#include <opencv2/highgui/highgui_c.h>
  26 +
  27 +using namespace std;
  28 +using namespace cv;
  29 +using namespace cv::gpu;
  30 +
  31 +namespace br { namespace cuda { namespace rgb2grayscale {
  32 +
  33 + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
  34 + {
  35 + int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
  36 + int colInd = blockIdx.x*blockDim.x+threadIdx.x;
  37 + int index = rowInd*cols + colInd;
  38 + if (rowInd < 0 || rowInd >= rows || colInd < 0 || colInd >= cols) {
  39 + return;
  40 + }
  41 + int new_index = 3 * index;
  42 + float g = (float) srcPtr[new_index];
  43 + float b = (float) srcPtr[new_index+1];
  44 + float r = (float) srcPtr[new_index+2];
  45 +
  46 + dstPtr[index] = (uint8_t) (0.299f * g + 0.587f * b + 0.114f * r);
  47 + return;
  48 + }
  49 +
  50 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  51 + {
  52 + cudaError_t err;
  53 + dim3 threadsPerBlock(9, 9);
  54 + dim3 numBlocks(cols/threadsPerBlock.x + 1,
  55 + rows/threadsPerBlock.y + 1);
  56 + CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
  57 +
  58 + kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
  59 + CUDA_KERNEL_ERR_CHK(&err);
  60 + CUDA_SAFE_FREE(srcPtr, &err);
  61 + }
  62 +
  63 +}}}
... ...
openbr/plugins/cuda/module.cmake 0 → 100644
# Optional CUDA plugin module: when BR_WITH_CUDA is ON, compiles every .cu
# file under openbr/plugins/cuda with nvcc and feeds the resulting objects
# (plus the plain .cpp plugin sources) into the main OpenBR build.
option(BR_WITH_CUDA "Build CUDA-accelerated plugins." OFF)
set(BR_CUDA_ARCH "sm_20" CACHE STRING "CUDA Architecture")

# only build this module if explicitly OK'ed
if(BR_WITH_CUDA)
  message(STATUS "Building with CUDA Support")
  find_package(CUDA REQUIRED)

  set(CUDA_SRC_DIR ${PROJECT_SOURCE_DIR}/openbr/plugins/cuda)

  # configure the compiler; -fPIC is needed because the objects end up in a
  # shared library
  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -arch=${BR_CUDA_ARCH} --compiler-options -fPIC)
  include_directories(${CUDA_INCLUDE_DIRS})

  # glob sources
  file(GLOB CUDA_CU_SRC ${CUDA_SRC_DIR}/*.cu)
  file(GLOB CUDA_CPP_SRC ${CUDA_SRC_DIR}/*.cpp)

  # compile each .cu to an object file (via FindCUDA's cuda_compile) and
  # accumulate the generated objects in CUDA_CU_OBJ
  foreach(FILE ${CUDA_CU_SRC})
    cuda_compile(FILE_O ${FILE})
    set(CUDA_CU_OBJ ${CUDA_CU_OBJ} ${FILE_O})
  endforeach()

  # ensure add_library knows these are external object files
  set_source_files_properties(${CUDA_CU_OBJ} PROPERTIES EXTERNAL_OBJECT true)

  # add the compiled sources and libs into the build system
  set(BR_THIRDPARTY_SRC ${BR_THIRDPARTY_SRC} ${CUDA_CPP_SRC} ${CUDA_CU_OBJ})
  set(BR_THIRDPARTY_LIBS ${BR_THIRDPARTY_LIBS} ${CUDA_LIBRARIES})

endif()
... ...
openbr/plugins/plugins.cmake
... ... @@ -12,7 +12,15 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${BR_THIRDPARTY_PLUGINS_DIR})
12 12 file(GLOB SUBFILES plugins/*)
13 13 foreach(FILE ${SUBFILES})
14 14 if(IS_DIRECTORY ${FILE})
15   - set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE})
  15 + # check to see if there is a cmake file for the folder, if there is, then that
  16 + # file should be used to build the contents of the directory
  17 + if (EXISTS ${FILE}/module.cmake)
  18 + message(STATUS "importing ${FILE}/module.cmake")
  19 + include(${FILE}/module.cmake)
  20 + else()
  21 + message(STATUS "adding ${FILE}")
  22 + set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} ${FILE})
  23 + endif()
16 24 endif()
17 25 endforeach()
18 26 set(BR_PLUGINS_DIR ${BR_PLUGINS_DIR} plugins/) # Remove this when finished with reorg
... ...