Commit 1dabf42dad2e33b7a9e3d11b4142a98fd53c5f9d

Authored by DepthDeluxe
1 parent e600a994

code refactoring for pull request

openbr/plugins/cuda/MatManager.cu renamed to openbr/core/cuda/MatManager.cu
openbr/plugins/cuda/MatManager.hpp renamed to openbr/core/cuda/MatManager.hpp
openbr/plugins/cuda/copyfrom.cpp
... ... @@ -2,21 +2,25 @@
2 2  
3 3 #include <opencv2/opencv.hpp>
4 4  
5   -//#include <gperftools/profiler.h>
6   -
7 5 #include <openbr/plugins/openbr_internal.h>
8 6  
9 7 using namespace std;
10 8  
11 9 using namespace cv;
12 10  
13   -// extern CUDA declaration
14   -namespace br { namespace cuda { namespace cudacopyfrom {
  11 +// CUDA functions for this plugin
  12 +namespace br { namespace cuda { namespace copyfrom {
15 13 template <typename T> void wrapper(void* src, T* out, int rows, int cols);
16 14 }}}
17 15  
18 16 namespace br
19 17 {
  18 + /*!
  19 + * \ingroup transforms
  20 + * \brief Copies a transform from the GPU to the CPU.
  21 + * \author Colin Heinzmann \cite DepthDeluxe
  22 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  23 + */
20 24 class CUDACopyFrom : public UntrainableTransform
21 25 {
22 26 Q_OBJECT
... ... @@ -24,8 +28,6 @@ namespace br
24 28 private:
25 29 void project(const Template &src, Template &dst) const
26 30 {
27   -// ProfilerStart("PROFILEME.log");
28   -
29 31 // pull the data back out of the Mat
30 32 void* const* dataPtr = src.m().ptr<void*>();
31 33 int rows = *((int*)dataPtr[1]);
... ... @@ -35,21 +37,19 @@ private:
35 37 Mat dstMat = Mat(rows, cols, type);
36 38 switch(type) {
37 39 case CV_32FC1:
38   - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
  40 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<float>(), rows, cols);
39 41 break;
40 42 case CV_8UC1:
41   - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
  43 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols);
42 44 break;
43 45 case CV_8UC3:
44   - br::cuda::cudacopyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
  46 + cuda::copyfrom::wrapper(dataPtr[0], dstMat.ptr<unsigned char>(), rows, cols * 3);
45 47 break;
46 48 default:
47   - cout << "ERR: Invalid image format" << endl;
  49 + cout << "ERR: Invalid image type (" << type << ")" << endl;
48 50 break;
49 51 }
50 52 dst = dstMat;
51   -
52   -// ProfilerStop();
53 53 }
54 54 };
55 55  
... ...
openbr/plugins/cuda/copyfrom.cu
1 1 #include "cudadefines.hpp"
2 2  
3   -namespace br { namespace cuda { namespace cudacopyfrom {
  3 +namespace br { namespace cuda { namespace copyfrom {
4 4 template <typename T> void wrapper(void* src, T* dst, int rows, int cols) {
5 5 cudaError_t err;
6 6 CUDA_SAFE_MEMCPY(dst, src, rows*cols*sizeof(T), cudaMemcpyDeviceToHost, &err);
... ...
openbr/plugins/cuda/copyto.cpp
... ... @@ -8,14 +8,20 @@ using namespace std;
8 8  
9 9 using namespace cv;
10 10  
11   -extern string type2str(int type);
12   -
13   -namespace br { namespace cuda { namespace cudacopyto {
  11 +// definitions from the CUDA source file
  12 +namespace br { namespace cuda { namespace copyto {
14 13 template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols);
15 14 }}}
16 15  
17 16 namespace br
18 17 {
  18 +
  19 + /*!
  20 + * \ingroup transforms
  21 + * \brief Copies a transform to the GPU.
  22 + * \author Colin Heinzmann \cite DepthDeluxe
  23 + * \note Method: Automatically matches image dimensions, works for 32-bit single channel, 8-bit single channel, and 8-bit 3 channel
  24 + */
19 25 class CUDACopyTo : public UntrainableTransform
20 26 {
21 27 Q_OBJECT
... ... @@ -25,7 +31,7 @@ private:
25 31 {
26 32 const Mat& srcMat = src.m();
27 33 const int rows = srcMat.rows;
28   - const int cols = srcMat.cols;
  34 + const int cols = srcMat.cols;
29 35  
30 36 // output will be a single pointer to graphics card memory
31 37 Mat dstMat = Mat(4, 1, DataType<void*>::type);
... ... @@ -39,16 +45,16 @@ private:
39 45 void* cudaMemPtr;
40 46 switch(srcMat.type()) {
41 47 case CV_32FC1:
42   - br::cuda::cudacopyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
  48 + cuda::copyto::wrapper(srcMat.ptr<float>(), &dstMatData[0], rows, cols);
43 49 break;
44 50 case CV_8UC1:
45   - br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
  51 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, cols);
46 52 break;
47 53 case CV_8UC3:
48   - br::cuda::cudacopyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
  54 + cuda::copyto::wrapper(srcMat.ptr<unsigned char>(), &dstMatData[0], rows, 3*cols);
49 55 break;
50 56 default:
51   - cout << "ERR: Invalid image type! " << type2str(srcMat.type()) << endl;
  57 + cout << "ERR: Invalid image type (" << srcMat.type() << ")" << endl;
52 58 return;
53 59 }
54 60  
... ...
openbr/plugins/cuda/copyto.cu
1 1 #include "cudadefines.hpp"
2 2  
3   -namespace br { namespace cuda { namespace cudacopyto {
  3 +namespace br { namespace cuda { namespace copyto {
  4 +
4 5 template <typename T> void wrapper(const T* in, void** out, const int rows, const int cols) {
5 6 cudaError_t err;
6 7 CUDA_SAFE_MALLOC(out, rows*cols*sizeof(T), &err);
... ... @@ -9,4 +10,5 @@ namespace br { namespace cuda { namespace cudacopyto {
9 10  
10 11 template void wrapper(const float* in, void** out, const int rows, const int cols);
11 12 template void wrapper(const unsigned char* in, void** out, const int rows, const int cols);
  13 +
12 14 }}}
... ...
openbr/plugins/cuda/cudaaffine.cpp
... ... @@ -33,159 +33,158 @@ using namespace std;
33 33 #include <openbr/plugins/openbr_internal.h>
34 34 #include <openbr/core/opencvutils.h>
35 35  
36   -#include "MatManager.hpp"
37   -
38 36 using namespace cv;
39 37  
40   -namespace br { namespace cuda {
41   - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
42   -}}
  38 +// definitions from the CUDA source file
  39 +namespace br { namespace cuda { namespace affine {
  40 + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
  41 +}}}
43 42  
44 43 namespace br
45 44 {
46 45  
47   -/*!
48   - * \ingroup transforms
49   - * \brief Performs a two or three point registration.
50   - * \author Josh Klontz \cite jklontz
51   - * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
52   - */
53   -class CUDAAffineTransform : public UntrainableTransform
54   -{
55   - Q_OBJECT
56   - Q_ENUMS(Method)
57   -
58   -public:
59   - /*!< */
60   - enum Method { Near = INTER_NEAREST,
61   - Area = INTER_AREA,
62   - Bilin = INTER_LINEAR,
63   - Cubic = INTER_CUBIC,
64   - Lanczo = INTER_LANCZOS4};
65   -
66   -private:
67   - Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
68   - Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
69   - Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
70   - Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
71   - Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
72   - Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
73   - Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
74   - Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
75   - Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
76   - Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
77   - Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
78   - BR_PROPERTY(int, width, 64)
79   - BR_PROPERTY(int, height, 64)
80   - BR_PROPERTY(float, x1, 0)
81   - BR_PROPERTY(float, y1, 0)
82   - BR_PROPERTY(float, x2, -1)
83   - BR_PROPERTY(float, y2, -1)
84   - BR_PROPERTY(float, x3, -1)
85   - BR_PROPERTY(float, y3, -1)
86   - BR_PROPERTY(Method, method, Bilin)
87   - BR_PROPERTY(bool, storeAffine, false)
88   - BR_PROPERTY(bool, warpPoints, false)
89   -
90   - static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
91   - {
92   - float dx = b.x - a.x;
93   - float dy = b.y - a.y;
94   - return Point2f(a.x - dy, a.y + dx);
95   - }
96   -
97   - void project(const Template &src, Template &dst) const
98   - {
99   - const bool twoPoints = ((x3 == -1) || (y3 == -1));
100   -
101   - Point2f dstPoints[3];
102   - dstPoints[0] = Point2f(x1*width, y1*height);
103   - dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
104   - if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
105   - else dstPoints[2] = Point2f(x3*width, y3*height);
106   -
107   - Point2f srcPoints[3];
108   - if (src.file.contains("Affine_0") &&
109   - src.file.contains("Affine_1") &&
110   - (src.file.contains("Affine_2") || twoPoints)) {
111   - srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
112   - srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
113   - if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
114   - } else {
115   - const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
116   -
117   - if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
118   - resize(src, dst, Size(width, height));
119   - return;
120   - } else {
121   - srcPoints[0] = landmarks[0];
122   - srcPoints[1] = landmarks[1];
123   - if (!twoPoints) srcPoints[2] = landmarks[2];
124   - }
125   - }
126   - if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
127   -
128   - // Code section being altered (original)
129   - //
130   - // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
131   - // warpAffine(src, dst, affineTransform, Size(width, height), method);
132   - //
133   - // end original
134   -
135   - Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
136   -
137   - void* const* srcDataPtr = src.m().ptr<void*>();
138   - int rows = *((int*)srcDataPtr[1]);
139   - int cols = *((int*)srcDataPtr[2]);
140   - int type = *((int*)srcDataPtr[3]);
141   -
142   - Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
143   - void** dstDataPtr = dstMat.ptr<void*>();
144   -
145   - dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
146   - dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
147   - dstDataPtr[3] = srcDataPtr[3];
148   -
149   - // Print the transform
150   - //for(int x = 0; x < affineTransform.rows; x++){
151   - //for(int y = 0; y < affineTransform.cols; y++){
152   - //printf("%8.3f\t", affineTransform.at<double>(x, y));
153   - //}
154   - //printf("\n");
155   - //}
156   -
157   - br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
158   -
159   - // end altered code
160   -
161   - if (warpPoints) {
162   - QList<QPointF> points = src.file.points();
163   - QList<QPointF> rotatedPoints;
164   - for (int i=0; i<points.size(); i++) {
165   - rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
166   - points.at(i).y()*affineTransform.at<double>(0,1)+
167   - affineTransform.at<double>(0,2),
168   - points.at(i).x()*affineTransform.at<double>(1,0)+
169   - points.at(i).y()*affineTransform.at<double>(1,1)+
170   - affineTransform.at<double>(1,2)));
171   - }
172   -
173   - dst.file.setPoints(rotatedPoints);
174   - }
175   -
176   - if (storeAffine) {
177   - QList<float> affineParams;
178   - for (int i = 0 ; i < 2; i++)
179   - for (int j = 0; j < 3; j++)
180   - affineParams.append(affineTransform.at<double>(i, j));
181   - dst.file.setList("affineParameters", affineParams);
182   - }
183   -
184   - dst = dstMat;
185   - }
186   -};
187   -
188   -BR_REGISTER(Transform, CUDAAffineTransform)
  46 + /*!
  47 + * \ingroup transforms
  48 + * \brief Performs a two or three point registration on the GPU. Modified from stock OpenBR implementation
  49 + * \author Greg Schrock \cite gls022
  50 + * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargement, Bilin for fast enlargement.
  51 + */
  52 + class CUDAAffineTransform : public UntrainableTransform
  53 + {
  54 + Q_OBJECT
  55 + Q_ENUMS(Method)
  56 +
  57 + public:
  58 + /*!< */
  59 + enum Method { Near = INTER_NEAREST,
  60 + Area = INTER_AREA,
  61 + Bilin = INTER_LINEAR,
  62 + Cubic = INTER_CUBIC,
  63 + Lanczo = INTER_LANCZOS4};
  64 +
  65 + private:
  66 + Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
  67 + Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
  68 + Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
  69 + Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
  70 + Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
  71 + Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
  72 + Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
  73 + Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
  74 + Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
  75 + Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
  76 + Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
  77 + BR_PROPERTY(int, width, 64)
  78 + BR_PROPERTY(int, height, 64)
  79 + BR_PROPERTY(float, x1, 0)
  80 + BR_PROPERTY(float, y1, 0)
  81 + BR_PROPERTY(float, x2, -1)
  82 + BR_PROPERTY(float, y2, -1)
  83 + BR_PROPERTY(float, x3, -1)
  84 + BR_PROPERTY(float, y3, -1)
  85 + BR_PROPERTY(Method, method, Bilin)
  86 + BR_PROPERTY(bool, storeAffine, false)
  87 + BR_PROPERTY(bool, warpPoints, false)
  88 +
  89 + static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
  90 + {
  91 + float dx = b.x - a.x;
  92 + float dy = b.y - a.y;
  93 + return Point2f(a.x - dy, a.y + dx);
  94 + }
  95 +
  96 + void project(const Template &src, Template &dst) const
  97 + {
  98 + const bool twoPoints = ((x3 == -1) || (y3 == -1));
  99 +
  100 + Point2f dstPoints[3];
  101 + dstPoints[0] = Point2f(x1*width, y1*height);
  102 + dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
  103 + if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
  104 + else dstPoints[2] = Point2f(x3*width, y3*height);
  105 +
  106 + Point2f srcPoints[3];
  107 + if (src.file.contains("Affine_0") &&
  108 + src.file.contains("Affine_1") &&
  109 + (src.file.contains("Affine_2") || twoPoints)) {
  110 + srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
  111 + srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
  112 + if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
  113 + } else {
  114 + const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
  115 +
  116 + if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
  117 + resize(src, dst, Size(width, height));
  118 + return;
  119 + } else {
  120 + srcPoints[0] = landmarks[0];
  121 + srcPoints[1] = landmarks[1];
  122 + if (!twoPoints) srcPoints[2] = landmarks[2];
  123 + }
  124 + }
  125 + if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
  126 +
  127 + // Code section being altered (original)
  128 + //
  129 + // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
  130 + // warpAffine(src, dst, affineTransform, Size(width, height), method);
  131 + //
  132 + // end original
  133 +
  134 + Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
  135 +
  136 + void* const* srcDataPtr = src.m().ptr<void*>();
  137 + int rows = *((int*)srcDataPtr[1]);
  138 + int cols = *((int*)srcDataPtr[2]);
  139 + int type = *((int*)srcDataPtr[3]);
  140 +
  141 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  142 + void** dstDataPtr = dstMat.ptr<void*>();
  143 +
  144 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
  145 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
  146 + dstDataPtr[3] = srcDataPtr[3];
  147 +
  148 + // Print the transform
  149 + //for(int x = 0; x < affineTransform.rows; x++){
  150 + //for(int y = 0; y < affineTransform.cols; y++){
  151 + //printf("%8.3f\t", affineTransform.at<double>(x, y));
  152 + //}
  153 + //printf("\n");
  154 + //}
  155 +
  156 + cuda::affine::wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
  157 +
  158 + // end altered code
  159 +
  160 + if (warpPoints) {
  161 + QList<QPointF> points = src.file.points();
  162 + QList<QPointF> rotatedPoints;
  163 + for (int i=0; i<points.size(); i++) {
  164 + rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
  165 + points.at(i).y()*affineTransform.at<double>(0,1)+
  166 + affineTransform.at<double>(0,2),
  167 + points.at(i).x()*affineTransform.at<double>(1,0)+
  168 + points.at(i).y()*affineTransform.at<double>(1,1)+
  169 + affineTransform.at<double>(1,2)));
  170 + }
  171 +
  172 + dst.file.setPoints(rotatedPoints);
  173 + }
  174 +
  175 + if (storeAffine) {
  176 + QList<float> affineParams;
  177 + for (int i = 0 ; i < 2; i++)
  178 + for (int j = 0; j < 3; j++)
  179 + affineParams.append(affineTransform.at<double>(i, j));
  180 + dst.file.setList("affineParameters", affineParams);
  181 + }
  182 +
  183 + dst = dstMat;
  184 + }
  185 + };
  186 +
  187 + BR_REGISTER(Transform, CUDAAffineTransform)
189 188  
190 189 } // namespace br
191 190  
... ...
openbr/plugins/cuda/cudaaffine.cu
... ... @@ -11,30 +11,30 @@ using namespace std;
11 11 using namespace cv;
12 12 using namespace cv::gpu;
13 13  
14   -namespace br { namespace cuda {
  14 +namespace br { namespace cuda { namespace affine {
15 15  
16   - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  16 + __device__ __forceinline__ uint8_t getPixelValueDevice(int row, int col, uint8_t* srcPtr, int rows, int cols) {
17 17 if (row < 0 || row > rows || col < 0 || col > cols) {
18 18 if (row > rows || col > cols) {
19 19 return 0;
20 20 } else{
21   - return 0;
  21 + return 0;
22 22 }
23 23 }
24 24 return (srcPtr + row*cols)[col];
25 25 }
26 26  
27 27  
28   - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_bilinear_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
  28 + __device__ __forceinline__ uint8_t getBilinearPixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
29 29 // don't do anything if the index is out of bounds
30 30 if (row < 0 || row > rows || col < 0 || col > cols) {
31 31 if (row > rows || col > cols) {
32 32 return 0;
33 33 } else{
34   - return 0;
  34 + return 0;
35 35 }
36 36 }
37   -
  37 +
38 38 // http://www.sci.utah.edu/~acoste/uou/Image/project3/ArthurCOSTE_Project3.pdf
39 39 // Bilinear Transformation
40 40 // f(Px, Py) = f(Q11)×(1−Rx)×(1−Sy)+f(Q21)×(Rx)×(1−Sy)+f(Q12)×(1−Rx)×(Sy)+f(Q22)×(Rx)×(Sy)
... ... @@ -48,22 +48,22 @@ namespace br { namespace cuda {
48 48 double d_row = row - row1;
49 49 double d_col = col - col1;
50 50  
51   - int Q11 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
52   - int Q21 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
53   - int Q12 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
54   - int Q22 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
  51 + int Q11 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
  52 + int Q21 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
  53 + int Q12 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
  54 + int Q22 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
55 55  
56 56 double val = Q22*(d_row*d_col) + Q12*((1-d_row)*d_col) + Q21*(d_row*(1-d_col)) + Q11*((1-d_row)*(1-d_col));
57 57 return ((uint8_t) round(val));
58 58 }
59 59  
60   - __device__ __forceinline__ uint8_t cudaaffine_kernel_get_distance_pixel_value(double row, double col, uint8_t* srcPtr, int rows, int cols) {
  60 + __device__ __forceinline__ uint8_t getDistancePixelValueDevice(double row, double col, uint8_t* srcPtr, int rows, int cols) {
61 61 // don't do anything if the index is out of bounds
62 62 if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) {
63 63 if (row >= rows || col >= cols) {
64 64 return 0;
65 65 } else{
66   - return 0;
  66 + return 0;
67 67 }
68 68 }
69 69  
... ... @@ -90,10 +90,10 @@ namespace br { namespace cuda {
90 90 double w3 = d3/sum;
91 91 double w4 = d4/sum;
92 92  
93   - uint8_t v1 = cudaaffine_kernel_get_pixel_value(row1, col1, srcPtr, rows, cols);
94   - uint8_t v2 = cudaaffine_kernel_get_pixel_value(row2, col1, srcPtr, rows, cols);
95   - uint8_t v3 = cudaaffine_kernel_get_pixel_value(row1, col2, srcPtr, rows, cols);
96   - uint8_t v4 = cudaaffine_kernel_get_pixel_value(row2, col2, srcPtr, rows, cols);
  93 + uint8_t v1 = getPixelValueDevice(row1, col1, srcPtr, rows, cols);
  94 + uint8_t v2 = getPixelValueDevice(row2, col1, srcPtr, rows, cols);
  95 + uint8_t v3 = getPixelValueDevice(row1, col2, srcPtr, rows, cols);
  96 + uint8_t v4 = getPixelValueDevice(row2, col2, srcPtr, rows, cols);
97 97  
98 98 return round(w1*v1 + w2*v2 + w3*v3 + w4*v4);
99 99 }
... ... @@ -105,16 +105,16 @@ namespace br { namespace cuda {
105 105 * src_row - The computed source pixel row (mapping from this row)
106 106 * src_col - The computed source pixel column (mapping from this col)
107 107 */
108   - __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
  108 + __device__ __forceinline__ void getSrcCoordDevice(double *trans_inv, int dst_row, int dst_col, double* src_row_pnt, double* src_col_pnt){
109 109 *src_col_pnt = dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6];
110 110 *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
111 111  
112 112 //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]);
113 113  
114 114 }
115   -
116 115  
117   - __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
  116 +
  117 + __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
118 118 int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
119 119 int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
120 120 int dstIndex = dstRowInd*dst_cols + dstColInd;
... ... @@ -134,15 +134,15 @@ namespace br { namespace cuda {
134 134 }
135 135 }
136 136  
137   - cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
138   - //const uint8_t cval = cudaaffine_kernel_get_distance_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
139   - const uint8_t cval = cudaaffine_kernel_get_bilinear_pixel_value(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
140   - //const uint8_t cval = cudaaffine_kernel_get_pixel_value(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
  137 + getSrcCoordDevice(trans_inv, dstRowInd, dstColInd, &srcRowPnt, &srcColPnt);
  138 + //const uint8_t cval = getDistancePixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
  139 + const uint8_t cval = getBilinearPixelValueDevice(srcRowPnt, srcColPnt, srcPtr, src_rows, src_cols); // Get initial pixel value
  140 + //const uint8_t cval = getPixelValueDevice(round(srcRowPnt), round(srcColPnt), srcPtr, src_rows, src_cols); // Get initial pixel value
141 141  
142 142 dstPtr[dstIndex] = cval;
143 143 }
144 144  
145   - void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
  145 + void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
146 146 cudaError_t err;
147 147 double* gpuInverse;
148 148  
... ... @@ -152,7 +152,7 @@ namespace br { namespace cuda {
152 152  
153 153 //************************************************************************
154 154 // Input affine is a 2x3 Mat whose transpose is used in the computations
155   - // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
  155 + // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ]
156 156 // See "Digital Image Warping" by George Wolberg (p. 50)
157 157 //************************************************************************
158 158  
... ... @@ -210,7 +210,7 @@ namespace br { namespace cuda {
210 210  
211 211 CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);
212 212  
213   - cudaaffine_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
  213 + affineKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
214 214 CUDA_KERNEL_ERR_CHK(&err);
215 215  
216 216 CUDA_SAFE_FREE(srcPtr, &err);
... ... @@ -225,5 +225,4 @@ namespace br { namespace cuda {
225 225 // }
226 226 // printf("\n");
227 227 }
228   -} // end cuda
229   -} // end br
  228 +}}}
... ...
openbr/plugins/cuda/cudacvtfloat.cpp
... ... @@ -7,7 +7,8 @@ using namespace cv;
7 7  
8 8 #include <openbr/plugins/openbr_internal.h>
9 9  
10   -namespace br { namespace cuda { namespace cudacvtfloat {
  10 +// definitions from the CUDA source file
  11 +namespace br { namespace cuda { namespace cvtfloat {
11 12 void wrapper(void* src, void** dst, int rows, int cols);
12 13 }}}
13 14  
... ... @@ -16,7 +17,7 @@ namespace br
16 17  
17 18 /*!
18 19 * \ingroup transforms
19   - * \brief Converts byte to floating point
  20 + * \brief Converts 8-bit images currently on GPU into 32-bit floating point equivalent.
20 21 * \author Colin Heinzmann \cite DepthDeluxe
21 22 */
22 23 class CUDACvtFloatTransform : public UntrainableTransform
... ... @@ -45,7 +46,7 @@ class CUDACvtFloatTransform : public UntrainableTransform
45 46 dstDataPtr[2] = srcDataPtr[2];
46 47 dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1;
47 48  
48   - br::cuda::cudacvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  49 + cuda::cvtfloat::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
49 50 dst = dstMat;
50 51 }
51 52 };
... ...
openbr/plugins/cuda/cudacvtfloat.cu
... ... @@ -3,7 +3,7 @@ using namespace std;
3 3  
4 4 #include "cudadefines.hpp"
5 5  
6   -namespace br { namespace cuda { namespace cudacvtfloat {
  6 +namespace br { namespace cuda { namespace cvtfloat {
7 7  
8 8 __global__ void kernel(const unsigned char* src, float* dst, int rows, int cols) {
9 9 // get my index
... ... @@ -20,12 +20,6 @@ namespace br { namespace cuda { namespace cudacvtfloat {
20 20 }
21 21  
22 22 void wrapper(void* src, void** dst, int rows, int cols) {
23   - //unsigned char* cudaSrc;
24   - //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char));
25   - //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice);
26   -
27   - //float* cudaDst;
28   - //cudaMalloc(&cudaDst, rows*cols*sizeof(float));
29 23 cudaError_t err;
30 24 CUDA_SAFE_MALLOC(dst, rows*cols*sizeof(float), &err);
31 25  
... ...
openbr/plugins/cuda/CUDAL2.cpp renamed to openbr/plugins/cuda/cudal2.cpp
... ... @@ -19,7 +19,8 @@ using namespace std;
19 19  
20 20 #include <openbr/plugins/openbr_internal.h>
21 21  
22   -namespace br { namespace cuda { namespace L2{
  22 +// definitions from the CUDA source file
  23 +namespace br { namespace cuda { namespace L2 {
23 24 void wrapper(float* cudaAPtr, float* cudaBPtr, int length, float* outPtr);
24 25 }}}
25 26  
... ... @@ -29,7 +30,7 @@ namespace br
29 30 /*!
30 31 * \ingroup distances
31 32 * \brief L2 distance computed using eigen.
32   - * \author Josh Klontz \cite jklontz
  33 + * \author Colin Heinzmann \cite DepthDeluxe
33 34 */
34 35 class CUDAL2Distance : public UntrainableDistance
35 36 {
... ... @@ -45,7 +46,7 @@ class CUDAL2Distance : public UntrainableDistance
45 46 float* cudaBPtr = (float*)b.ptr<void*>()[0];
46 47  
47 48 float out;
48   - br::cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
  49 + cuda::L2::wrapper(cudaAPtr, cudaBPtr, rows*cols, &out);
49 50  
50 51 return out;
51 52 }
... ... @@ -55,4 +56,4 @@ BR_REGISTER(Distance, CUDAL2Distance)
55 56  
56 57 } // namespace br
57 58  
58   -#include "cuda/CUDAL2.moc"
  59 +#include "cuda/cudal2.moc"
... ...
openbr/plugins/cuda/CUDAL2.cu renamed to openbr/plugins/cuda/cudal2.cu
... ... @@ -4,7 +4,7 @@
4 4  
5 5 namespace br { namespace cuda { namespace L2 {
6 6  
7   - __global__ void my_subtract_kernel(float* aPtr, float* bPtr, float* workPtr, int length) {
  7 + __global__ void subtractKernel(float* aPtr, float* bPtr, float* workPtr, int length) {
8 8 int index = blockIdx.x*blockDim.x+threadIdx.x;
9 9  
10 10 if (index >= length) {
... ... @@ -18,7 +18,7 @@ namespace br { namespace cuda { namespace L2 {
18 18 workPtr[index] = workPtr[index] * workPtr[index];
19 19 }
20 20  
21   - __global__ void collapse_kernel(float* inPtr, float* outPtr, int length) {
  21 + __global__ void collapseKernel(float* inPtr, float* outPtr, int length) {
22 22 // make sure there is only one thread that we are calling
23 23 if (blockIdx.x != 0 || threadIdx.x != 0) {
24 24 return;
... ... @@ -45,11 +45,11 @@ namespace br { namespace cuda { namespace L2 {
45 45 // perform the subtraction
46 46 int threadsPerBlock = 64;
47 47 int numBlocks = length / threadsPerBlock + 1;
48   - my_subtract_kernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
  48 + subtractKernel<<<threadsPerBlock, numBlocks>>>(cudaAPtr, cudaBPtr, cudaWorkBufferPtr, length);
49 49 CUDA_KERNEL_ERR_CHK(&err);
50 50  
51 51 // perform the collapse
52   - collapse_kernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
  52 + collapseKernel<<<1,1>>>(cudaWorkBufferPtr, cudaOutPtr, length);
53 53 CUDA_KERNEL_ERR_CHK(&err);
54 54  
55 55 // copy the single value back to the destination
... ... @@ -63,6 +63,3 @@ namespace br { namespace cuda { namespace L2 {
63 63 CUDA_SAFE_FREE(cudaWorkBufferPtr, &err);
64 64 }
65 65 }}}
66   -
67   -// 128CUDAEigenfaces on 6400 ATT: 54.367s
68   -// 128CUDAEigenfacesL2 on 6400 ATT:
... ...
openbr/plugins/cuda/cudalbp.cpp
... ... @@ -31,44 +31,21 @@ using namespace std;
31 31  
32 32 #include <openbr/plugins/openbr_internal.h>
33 33  
34   -#include "MatManager.hpp"
35   -
36 34 using namespace cv;
37 35  
38   -string type2str(int type) {
39   - string r;
40   -
41   - uchar depth = type & CV_MAT_DEPTH_MASK;
42   - uchar chans = 1 + (type >> CV_CN_SHIFT);
43   -
44   - switch ( depth ) {
45   - case CV_8U: r = "8U"; break;
46   - case CV_8S: r = "8S"; break;
47   - case CV_16U: r = "16U"; break;
48   - case CV_16S: r = "16S"; break;
49   - case CV_32S: r = "32S"; break;
50   - case CV_32F: r = "32F"; break;
51   - case CV_64F: r = "64F"; break;
52   - default: r = "User"; break;
53   - }
54   -
55   - r += "C";
56   - r += (chans+'0');
57   -
58   - return r;
59   -}
60   -
61   -namespace br { namespace cuda {
62   - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
63   - void cudalbp_init_wrapper(uint8_t* lut);
64   -}}
  36 +// definitions from the CUDA source file
  37 +namespace br { namespace cuda { namespace lbp {
  38 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols);
  39 + void initializeWrapper(uint8_t* lut);
  40 +}}}
65 41  
66 42 namespace br
67 43 {
68 44 /*!
69 45 * \ingroup transforms
70   - * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA
71   - * \author Colin Heinzmann, Li Li \cite DepthDeluxe, booli
  46 + * \brief Convert the image into a feature vector using Local Binary Patterns in CUDA. Modified from stock OpenBR plugin.
  47 + * \author Colin Heinzmann \cite DepthDeluxe
  48 + * \author Li Li \cite booli
72 49 */
73 50 class CUDALBPTransform : public UntrainableTransform
74 51 {
... ... @@ -84,8 +61,6 @@ class CUDALBPTransform : public UntrainableTransform
84 61 uchar lut[256];
85 62 uchar null;
86 63  
87   - //cuda::MatManager* matManager;
88   -
89 64 public:
90 65 /* Returns the number of 0->1 or 1->0 transitions in i */
91 66 static int numTransitions(int i)
... ... @@ -136,36 +111,14 @@ class CUDALBPTransform : public UntrainableTransform
136 111 if (!set[i])
137 112 lut[i] = null; // Set to null id
138 113  
139   - // init the mat manager for managing 10 mats
140   - //matManager = new cuda::MatManager(10);
141   -
142 114 // copy lut over to the GPU
143   - br::cuda::cudalbp_init_wrapper(lut);
  115 + cuda::lbp::initializeWrapper(lut);
144 116  
145 117 std::cout << "Initialized CUDALBP" << std::endl;
146 118 }
147 119  
148 120 void project(const Template &src, Template &dst) const
149 121 {
150   - //Mat& m = (Mat&)src.m();
151   - //cuda::MatManager::matindex a;
152   - //cuda::MatManager::matindex b;
153   - //a = matManager->reserve(m);
154   - //matManager->upload(a, m);
155   -
156   - // reserve the second mat and check the dimensiosn
157   - //b = matManager->reserve(m);
158   -
159   - //uint8_t* srcMatPtr = matManager->get_mat_pointer_from_index(a);
160   - //uint8_t* dstMatPtr = matManager->get_mat_pointer_from_index(b);
161   - //br::cuda::cudalbp_wrapper(srcMatPtr, dstMatPtr, lutGpuPtr, m.cols, m.rows, m.step1());
162   -
163   - //matManager->download(b, dst);
164   -
165   - // release both the mats
166   - //matManager->release(a);
167   - //matManager->release(b);
168   -
169 122 void* const* srcDataPtr = src.m().ptr<void*>();
170 123 int rows = *((int*)srcDataPtr[1]);
171 124 int cols = *((int*)srcDataPtr[2]);
... ... @@ -177,13 +130,13 @@ class CUDALBPTransform : public UntrainableTransform
177 130 dstDataPtr[2] = srcDataPtr[2];
178 131 dstDataPtr[3] = srcDataPtr[3];
179 132  
180   - br::cuda::cudalbp_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  133 + cuda::lbp::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
181 134 dst = dstMat;
182 135 }
183 136 };
184 137  
185 138 BR_REGISTER(Transform, CUDALBPTransform)
186 139  
187   -} // namespace br
  140 +}
188 141  
189 142 #include "cuda/cudalbp.moc"
... ...
openbr/plugins/cuda/cudalbp.cu
... ... @@ -9,14 +9,18 @@ using namespace std;
9 9 using namespace cv;
10 10 using namespace cv::gpu;
11 11  
12   -namespace br { namespace cuda {
  12 +/*
  13 + * These are the CUDA functions for CUDALBP. See cudalbp.cpp for more details
  14 + */
  15 +
  16 +namespace br { namespace cuda { namespace lbp {
13 17 uint8_t* lut;
14 18  
15   - __device__ __forceinline__ uint8_t cudalbp_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
  19 + __device__ __forceinline__ uint8_t getPixelValueKernel(int row, int col, uint8_t* srcPtr, int rows, int cols) {
16 20 return (srcPtr + row*cols)[col];
17 21 }
18 22  
19   - __global__ void cudalbp_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
  23 + __global__ void lutKernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols, uint8_t* lut)
20 24 {
21 25 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
22 26 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
... ... @@ -34,22 +38,22 @@ namespace br { namespace cuda {
34 38 }
35 39 }
36 40  
37   - const uint8_t cval = cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
38   - uint8_t val = lut[(cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
39   - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
40   - (cudalbp_kernel_get_pixel_value(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
41   - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
42   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
43   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
44   - (cudalbp_kernel_get_pixel_value(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
45   - (cudalbp_kernel_get_pixel_value(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];
  41 + const uint8_t cval = getPixelValueKernel(rowInd+0*radius, colInd+0*radius, srcPtr, rows, cols);//(srcPtr[(rowInd*srcStep+0*radius)*m.cols+colInd+0*radius]); // center value
  42 + uint8_t val = lut[(getPixelValueKernel(rowInd-1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 128 : 0) |
  43 + (getPixelValueKernel(rowInd-1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 64 : 0) |
  44 + (getPixelValueKernel(rowInd-1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 32 : 0) |
  45 + (getPixelValueKernel(rowInd+0*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 16 : 0) |
  46 + (getPixelValueKernel(rowInd+1*radius, colInd+1*radius, srcPtr, rows, cols) >= cval ? 8 : 0) |
  47 + (getPixelValueKernel(rowInd+1*radius, colInd+0*radius, srcPtr, rows, cols) >= cval ? 4 : 0) |
  48 + (getPixelValueKernel(rowInd+1*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 2 : 0) |
  49 + (getPixelValueKernel(rowInd+0*radius, colInd-1*radius, srcPtr, rows, cols) >= cval ? 1 : 0)];
46 50  
47 51 // store calculated value away in the right place
48 52 dstPtr[index] = val;
49 53 }
50 54  
51 55 //void cudalbp_wrapper(uint8_t* srcPtr, uint8_t* dstPtr, uint8_t* lut, int imageWidth, int imageHeight, size_t step)
52   - void cudalbp_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  56 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
53 57 {
54 58 cudaError_t err;
55 59  
... ... @@ -59,15 +63,15 @@ namespace br { namespace cuda {
59 63 rows/threadsPerBlock.y + 1);
60 64  
61 65 CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
62   - cudalbp_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
  66 + lutKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), rows, cols, lut);
63 67 CUDA_KERNEL_ERR_CHK(&err);
64 68  
65 69 CUDA_SAFE_FREE(srcPtr, &err);
66 70 }
67 71  
68   - void cudalbp_init_wrapper(uint8_t* cpuLut) {
  72 + void initializeWrapper(uint8_t* cpuLut) {
69 73 cudaError_t err;
70 74 CUDA_SAFE_MALLOC(&lut, 256*sizeof(uint8_t), &err);
71 75 CUDA_SAFE_MEMCPY(lut, cpuLut, 256*sizeof(uint8_t), cudaMemcpyHostToDevice, &err);
72 76 }
73   -}}
  77 +}}}
... ...
openbr/plugins/cuda/cudapca.cpp
... ... @@ -29,8 +29,9 @@ using namespace cv;
29 29 #include <openbr/core/eigenutils.h>
30 30 #include <openbr/core/opencvutils.h>
31 31  
  32 +// definitions from the CUDA source file
32 33 namespace br { namespace cuda { namespace pca {
33   - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
  34 + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems);
34 35 void wrapper(void* src, void** dst);
35 36 }}}
36 37  
... ... @@ -38,9 +39,7 @@ namespace br
38 39 {
39 40 /*!
40 41 * \ingroup transforms
41   - * \brief Projects input into learned Principal Component Analysis subspace using CUDA.
42   - * \author Brendan Klare \cite bklare
43   - * \author Josh Klontz \cite jklontz
  42 + * \brief Projects input into learned Principal Component Analysis subspace using CUDA. Modified from original PCA plugin.
44 43 * \author Colin Heinzmann \cite DepthDeluxe
45 44 *
46 45 * \br_property float keep Options are: [keep < 0 - All eigenvalues are retained, keep == 0 - No PCA is performed and the eigenvectors form an identity matrix, 0 < keep < 1 - Keep is the fraction of the variance to retain, keep >= 1 - keep is the number of leading eigenvectors to retain] Default is 0.95.
... ... @@ -134,22 +133,9 @@ private:
134 133 dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = keep;
135 134 dstDataPtr[3] = srcDataPtr[3];
136 135  
137   - br::cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
  136 + cuda::pca::wrapper(srcDataPtr[0], &dstDataPtr[0]);
138 137  
139 138 dst = dstMat;
140   -
141   - //dst = cv::Mat(1, keep, CV_32FC1);
142   -
143   - // perform the operation on the graphics card
144   - //cuda::cudapca_projectwrapper((float*)src.m().ptr<float>(), (float*)dst.m().ptr<float>());
145   -
146   - // Map Eigen into OpenCV
147   - //Mat cpuDst = cv::Mat(1, keep, CV_32FC1);
148   - //Eigen::Map<const Eigen::MatrixXf> inMap(src.m().ptr<float>(), src.m().rows*src.m().cols, 1);
149   - //Eigen::Map<Eigen::MatrixXf> outMap(cpuDst.ptr<float>(), keep, 1);
150   -
151   - // Do projection
152   - //outMap = eVecs.transpose() * (inMap - mean);
153 139 }
154 140  
155 141 void store(QDataStream &stream) const
... ... @@ -161,14 +147,6 @@ private:
161 147 {
162 148 stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
163 149  
164   - cout << "Mean Dimensions" << endl;
165   - cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
166   - cout << "eVecs Dimensions" << endl;
167   - cout << "\tRows: " << eVecs.rows() << " Cols: " << eVecs.cols() << endl;
168   - cout << "eVals Dimensions" << endl;
169   - cout << "\tRows: " << eVals.rows() << " Cols: " << eVals.cols() << endl;
170   - cout << "Keep: " << keep << endl;
171   -
172 150 // TODO(colin): use Eigen Map class to generate map files so we don't have to copy the data
173 151 // serialize the eigenvectors
174 152 float* evBuffer = new float[eVecs.rows() * eVecs.cols()];
... ... @@ -187,7 +165,7 @@ private:
187 165 }
188 166  
189 167 // call the wrapper function
190   - br::cuda::pca::loadwrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
  168 + cuda::pca::initializeWrapper(evBuffer, eVecs.rows(), eVecs.cols(), meanBuffer, mean.rows()*mean.cols());
191 169  
192 170 delete evBuffer;
193 171 delete meanBuffer;
... ...
openbr/plugins/cuda/cudapca.cu
... ... @@ -9,6 +9,10 @@ using namespace std;
9 9 using namespace cv;
10 10 using namespace cv::gpu;
11 11  
  12 +/*
  13 + * These are the CUDA functions for CUDAPCA. See cudapca.cpp for more details
  14 + */
  15 +
12 16 namespace br { namespace cuda { namespace pca {
13 17 __global__ void multiplyKernel(float* src, float* intermediaryBuffer, float* evPtr, int evRows, int evCols, int stepSize) {
14 18 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
... ... @@ -68,7 +72,7 @@ namespace br { namespace cuda { namespace pca {
68 72 int _numSteps; int _stepSize;
69 73 float* intermediaryBuffer;
70 74  
71   - void loadwrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
  75 + void initializeWrapper(float* evPtr, int evRows, int evCols, float* meanPtr, int meanElems) {
72 76 _evRows = evRows; _evCols = evCols;
73 77 _meanElems = meanElems;
74 78  
... ... @@ -95,7 +99,6 @@ namespace br { namespace cuda { namespace pca {
95 99 cudaError_t err;
96 100 CUDA_SAFE_MALLOC(dst, _evCols*sizeof(float), &err);
97 101  
98   -
99 102 // subtract out the mean of the image (mean is 1xpixels in size)
100 103 int threadsPerBlock = 64;
101 104 int numBlocks = _meanElems / threadsPerBlock + 1;
... ... @@ -114,8 +117,5 @@ namespace br { namespace cuda { namespace pca {
114 117 CUDA_KERNEL_ERR_CHK(&err);
115 118  
116 119 CUDA_SAFE_FREE(src, &err); // TODO(colin): figure out why adding this free causes memory corruption...
117   -
118   - // copy the data back to the CPU
119   - //cudaMemcpy(dst, _cudaDstPtr, _evCols*sizeof(float), cudaMemcpyDeviceToHost);
120 120 }
121 121 }}}
... ...
openbr/plugins/cuda/cudargb2grayscale.cpp
... ... @@ -25,17 +25,18 @@
25 25  
26 26 using namespace cv;
27 27  
28   -namespace br { namespace cuda{
29   - void cudargb2grayscale_wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
30   -}}
  28 +// definitions from the CUDA source file
  29 +namespace br { namespace cuda { namespace rgb2grayscale {
  30 + void wrapper(void* srcPtr, void**dstPtr, int rows, int cols);
  31 +}}}
31 32  
32 33 namespace br
33 34 {
34 35  
35 36 /*!
36 37 * \ingroup transforms
37   - * \brief Colorspace conversion.
38   - * \author Li Li \cite Josh Klontz \cite jklontz
  38 + * \brief Converts 3-channel images to grayscale
  39 + * \author Li Li \cite booli
39 40 */
40 41 class CUDARGB2GrayScaleTransform : public UntrainableTransform
41 42 {
... ... @@ -57,8 +58,8 @@ private:
57 58 dstDataPtr[2] = srcDataPtr[2];
58 59 dstDataPtr[3] = srcDataPtr[3];
59 60 *((int*)dstDataPtr[3]) = CV_8UC1; // not sure if the type of the new mat is the same
60   -
61   - br::cuda::cudargb2grayscale_wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
  61 +
  62 + cuda::rgb2grayscale::wrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols);
62 63 dst = dstMat;
63 64  
64 65 /*
... ...
openbr/plugins/cuda/cudargb2grayscale.cu
... ... @@ -12,9 +12,9 @@ using namespace std;
12 12 using namespace cv;
13 13 using namespace cv::gpu;
14 14  
15   -namespace br{ namespace cuda {
  15 +namespace br { namespace cuda { namespace rgb2grayscale {
16 16  
17   - __global__ void cudargb2grayscale_kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
  17 + __global__ void kernel(uint8_t* srcPtr, uint8_t* dstPtr, int rows, int cols)
18 18 {
19 19 int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
20 20 int colInd = blockIdx.x*blockDim.x+threadIdx.x;
... ... @@ -31,7 +31,7 @@ namespace br{ namespace cuda {
31 31 return;
32 32 }
33 33  
34   - void cudargb2grayscale_wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
  34 + void wrapper(void* srcPtr, void** dstPtr, int rows, int cols)
35 35 {
36 36 cudaError_t err;
37 37 dim3 threadsPerBlock(9, 9);
... ... @@ -39,9 +39,9 @@ namespace br{ namespace cuda {
39 39 rows/threadsPerBlock.y + 1);
40 40 CUDA_SAFE_MALLOC(dstPtr, rows*cols*sizeof(uint8_t), &err);
41 41  
42   - cudargb2grayscale_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
  42 + kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*) (*dstPtr), rows, cols);
43 43 CUDA_KERNEL_ERR_CHK(&err);
44 44 CUDA_SAFE_FREE(srcPtr, &err);
45   - }
  45 + }
46 46  
47   -}}
  47 +}}}
... ...
openbr/plugins/cuda/passthrough.cpp deleted
1   -#include <openbr/plugins/openbr_internal.h>
2   -
3   -#include <opencv2/imgproc/imgproc.hpp>
4   -#include <opencv2/gpu/gpu.hpp>
5   -
6   -using namespace cv;
7   -using namespace cv::gpu;
8   -
9   -#include "passthrough.hpp"
10   -
11   -#include <iostream>
12   -
13   -
14   -namespace br
15   -{
16   - class CUDAPassthroughTransform : public UntrainableTransform
17   - {
18   - Q_OBJECT
19   -
20   -private:
21   - void project(const Template &src, Template &dst) const
22   - {
23   - // note: if you convert the image to grayscale, you get 8UC1
24   -
25   - // upload the src mat to the GPU
26   - GpuMat srcGpuMat, dstGpuMat;
27   - srcGpuMat.upload(src.m());
28   - dstGpuMat.upload(src.m());
29   -
30   - br::cuda::passthrough_wrapper(srcGpuMat, dstGpuMat);
31   -
32   - dstGpuMat.download(dst.m());
33   -
34   - // TODO(colin): add delete code
35   - srcGpuMat.release();
36   - dstGpuMat.release();
37   -
38   - printf("srcGpuMat empty: %d\n", (int)srcGpuMat.empty());
39   - printf("dstGpuMat empty: %d\n", (int)srcGpuMat.empty());
40   - }
41   - };
42   -
43   - BR_REGISTER(Transform, CUDAPassthroughTransform);
44   -}
45   -
46   -#include "cuda/passthrough.moc"
openbr/plugins/cuda/passthrough.cu deleted
1   -// note: Using 8-bit unsigned 1 channel images
2   -
3   -#include <opencv2/gpu/gpu.hpp>
4   -
5   -using namespace cv;
6   -using namespace cv::gpu;
7   -
8   -#include "passthrough.hpp"
9   -
10   -namespace br { namespace cuda {
11   - __global__ void passthrough_kernel(uint8_t* srcPtr, uint8_t* dstPtr, size_t srcStep, size_t dstStep, int cols, int rows) {
12   - int rowInd = blockIdx.y*blockDim.y+threadIdx.y;
13   - int colInd = blockIdx.x*blockDim.x+threadIdx.x;
14   -
15   - // don't do anything if we are outside the allowable positions
16   - if (rowInd >= rows || colInd >= cols)
17   - return;
18   -
19   - uint8_t srcVal = (srcPtr + rowInd*srcStep)[colInd];
20   - uint8_t* rowDstPtr = dstPtr + rowInd*dstStep;
21   -
22   - rowDstPtr[colInd] = srcVal;
23   - }
24   -
25   - void passthrough_wrapper(GpuMat& src, GpuMat& dst) {
26   - // convert the GpuMats to pointers
27   - uint8_t* srcPtr = (uint8_t*)src.data;
28   - uint8_t* dstPtr = (uint8_t*)dst.data;
29   -
30   - int imageWidth = src.cols;
31   - int imageHeight = src.rows;
32   -
33   - // make 8 * 8 = 64 square block
34   - dim3 threadsPerBlock(8, 8);
35   - dim3 numBlocks(imageWidth / threadsPerBlock.x + 1,
36   - imageHeight / threadsPerBlock.y + 1);
37   -
38   - passthrough_kernel<<<numBlocks, threadsPerBlock>>>(srcPtr, dstPtr, src.step, dst.step, imageWidth, imageHeight);
39   - }
40   -}}
41   -
42   -
43   -// read http://stackoverflow.com/questions/31927297/array-of-ptrstepszgpumat-to-a-c-cuda-kernel
openbr/plugins/cuda/passthrough.hpp deleted
1   -namespace br { namespace cuda {
2   - void passthrough_wrapper(GpuMat& src, GpuMat& dst);
3   -}}