From e534ff9c263a2e69048f865230a7b7122e02535e Mon Sep 17 00:00:00 2001
From: Greg Schrock <gls022@bucknell.edu>
Date: Mon, 29 Feb 2016 16:01:16 -0500
Subject: [PATCH] Added preliminary functional cuda affine

---
 openbr/plugins/cuda/cudaaffine.cpp | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 openbr/plugins/cuda/cudaaffine.cu  | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 345 insertions(+), 0 deletions(-)
 create mode 100644 openbr/plugins/cuda/cudaaffine.cpp
 create mode 100644 openbr/plugins/cuda/cudaaffine.cu
diff --git a/openbr/plugins/cuda/cudaaffine.cpp b/openbr/plugins/cuda/cudaaffine.cpp
new file mode 100644
index 0000000..46dd71f
--- /dev/null
+++ b/openbr/plugins/cuda/cudaaffine.cpp
@@ -0,0 +1,192 @@
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * Copyright 2012 The MITRE Corporation                                      *
+ *                                                                           *
+ * Licensed under the Apache License, Version 2.0 (the "License");           *
+ * you may not use this file except in compliance with the License.          *
+ * You may obtain a copy of the License at                                   *
+ *                                                                           *
+ *     http://www.apache.org/licenses/LICENSE-2.0                            *
+ *                                                                           *
+ * Unless required by applicable law or agreed to in writing, software       *
+ * distributed under the License is distributed on an "AS IS" BASIS,         *
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
+ * See the License for the specific language governing permissions and       *
+ * limitations under the License.                                            *
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+
+#include <iostream>
+using namespace std;
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <pthread.h>
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/imgproc/imgproc_c.h>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/highgui/highgui_c.h>
+#include <limits>
+
+#include <openbr/plugins/openbr_internal.h>
+#include <openbr/core/opencvutils.h>
+
+#include "MatManager.hpp"
+
+using namespace cv;
+
+namespace br { namespace cuda {
+	void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
+}}
+
+namespace br
+{
+
+/*!
+ * \ingroup transforms
+ * \brief Performs a two or three point registration.
+ * \author Josh Klontz \cite jklontz
+ * \note Method: Area should be used for shrinking an image, Cubic for slow but accurate enlargment, Bilin for fast enlargement.
+ */
+class CUDAAffineTransform : public UntrainableTransform
+{
+    Q_OBJECT
+    Q_ENUMS(Method)
+
+public:
+    /*!< */
+    enum Method { Near = INTER_NEAREST,
+                  Area = INTER_AREA,
+                  Bilin = INTER_LINEAR,
+                  Cubic = INTER_CUBIC,
+                  Lanczo = INTER_LANCZOS4};
+
+private:
+    Q_PROPERTY(int width READ get_width WRITE set_width RESET reset_width STORED false)
+    Q_PROPERTY(int height READ get_height WRITE set_height RESET reset_height STORED false)
+    Q_PROPERTY(float x1 READ get_x1 WRITE set_x1 RESET reset_x1 STORED false)
+    Q_PROPERTY(float y1 READ get_y1 WRITE set_y1 RESET reset_y1 STORED false)
+    Q_PROPERTY(float x2 READ get_x2 WRITE set_x2 RESET reset_x2 STORED false)
+    Q_PROPERTY(float y2 READ get_y2 WRITE set_y2 RESET reset_y2 STORED false)
+    Q_PROPERTY(float x3 READ get_x3 WRITE set_x3 RESET reset_x3 STORED false)
+    Q_PROPERTY(float y3 READ get_y3 WRITE set_y3 RESET reset_y3 STORED false)
+    Q_PROPERTY(Method method READ get_method WRITE set_method RESET reset_method STORED false)
+    Q_PROPERTY(bool storeAffine READ get_storeAffine WRITE set_storeAffine RESET reset_storeAffine STORED false)
+    Q_PROPERTY(bool warpPoints READ get_warpPoints WRITE set_warpPoints RESET reset_warpPoints STORED false)
+    BR_PROPERTY(int, width, 64)
+    BR_PROPERTY(int, height, 64)
+    BR_PROPERTY(float, x1, 0)
+    BR_PROPERTY(float, y1, 0)
+    BR_PROPERTY(float, x2, -1)
+    BR_PROPERTY(float, y2, -1)
+    BR_PROPERTY(float, x3, -1)
+    BR_PROPERTY(float, y3, -1)
+    BR_PROPERTY(Method, method, Bilin)
+    BR_PROPERTY(bool, storeAffine, false)
+    BR_PROPERTY(bool, warpPoints, false)
+
+    static Point2f getThirdAffinePoint(const Point2f &a, const Point2f &b)
+    {
+        float dx = b.x - a.x;
+        float dy = b.y - a.y;
+        return Point2f(a.x - dy, a.y + dx);
+    }
+
+    void project(const Template &src, Template &dst) const
+    {
+        const bool twoPoints = ((x3 == -1) || (y3 == -1));
+
+        Point2f dstPoints[3];
+        dstPoints[0] = Point2f(x1*width, y1*height);
+        dstPoints[1] = Point2f((x2 == -1 ? 1 - x1 : x2)*width, (y2 == -1 ? y1 : y2)*height);
+        if (twoPoints) dstPoints[2] = getThirdAffinePoint(dstPoints[0], dstPoints[1]);
+        else           dstPoints[2] = Point2f(x3*width, y3*height);
+
+        Point2f srcPoints[3];
+        if (src.file.contains("Affine_0") &&
+            src.file.contains("Affine_1") &&
+            (src.file.contains("Affine_2") || twoPoints)) {
+            srcPoints[0] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_0"));
+            srcPoints[1] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_1"));
+            if (!twoPoints) srcPoints[2] = OpenCVUtils::toPoint(src.file.get<QPointF>("Affine_2"));
+        } else {
+            const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
+
+            if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
+                resize(src, dst, Size(width, height));
+                return;
+            } else {
+                srcPoints[0] = landmarks[0];
+                srcPoints[1] = landmarks[1];
+                if (!twoPoints) srcPoints[2] = landmarks[2];
+            }
+        }
+        if (twoPoints) srcPoints[2] = getThirdAffinePoint(srcPoints[0], srcPoints[1]);
+
+        // Code section being altered (original)
+        //
+        // Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+        // warpAffine(src, dst, affineTransform, Size(width, height), method);
+        //
+        // end original
+
+        Mat affineTransform = getAffineTransform(srcPoints, dstPoints);
+
+        void* const* srcDataPtr = src.m().ptr<void*>();
+        int rows = *((int*)srcDataPtr[1]);
+        int cols = *((int*)srcDataPtr[2]);
+        int type = *((int*)srcDataPtr[3]);
+
+        Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
+        void** dstDataPtr = dstMat.ptr<void*>();
+
+        dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height;  // rows
+        dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width;   // cols
+        dstDataPtr[3] = srcDataPtr[3];
+
+        // Print the transform
+        //for(int x = 0; x < affineTransform.rows; x++){
+            //for(int y = 0; y < affineTransform.cols; y++){
+                //printf("%8.3f\t", affineTransform.at<double>(x, y));
+            //}
+            //printf("\n");
+        //}
+
+        br::cuda::cudaaffine_wrapper(srcDataPtr[0], &dstDataPtr[0], affineTransform, rows, cols, height, width);
+
+        // end altered code
+
+        if (warpPoints) {
+            QList<QPointF> points = src.file.points();
+            QList<QPointF> rotatedPoints;
+            for (int i=0; i<points.size(); i++) {
+                rotatedPoints.append(QPointF(points.at(i).x()*affineTransform.at<double>(0,0)+
+                                             points.at(i).y()*affineTransform.at<double>(0,1)+
+                                             affineTransform.at<double>(0,2),
+                                             points.at(i).x()*affineTransform.at<double>(1,0)+
+                                             points.at(i).y()*affineTransform.at<double>(1,1)+
+                                             affineTransform.at<double>(1,2)));
+            }
+
+            dst.file.setPoints(rotatedPoints);
+        }
+
+        if (storeAffine) {
+            QList<float> affineParams;
+            for (int i = 0 ; i < 2; i++)
+                for (int j = 0; j < 3; j++)
+                    affineParams.append(affineTransform.at<double>(i, j));
+            dst.file.setList("affineParameters", affineParams);
+        }
+
+        dst = dstMat;
+    }
+};
+
+BR_REGISTER(Transform, CUDAAffineTransform)
+
+} // namespace br
+
+#include "cuda/cudaaffine.moc"
diff --git a/openbr/plugins/cuda/cudaaffine.cu b/openbr/plugins/cuda/cudaaffine.cu
new file mode 100644
index 0000000..11f81d2
--- /dev/null
+++ b/openbr/plugins/cuda/cudaaffine.cu
@@ -0,0 +1,153 @@
+#include <iostream>
+using namespace std;
+
+#include <opencv2/gpu/gpu.hpp>
+#include <opencv2/opencv.hpp>
+#include <stdio.h>
+#include <math.h>
+
+#include "cudadefines.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+namespace br { namespace cuda {
+
+    __device__ __forceinline__ uint8_t cudaaffine_kernel_get_pixel_value(int row, int col, uint8_t* srcPtr, int rows, int cols) {
+        // don't do anything if the index is out of bounds
+        if (row < 1 || row >= rows-1 || col < 1 || col >= cols-1) {
+            if (row >= rows || col >= cols) {
+                return 0;
+            } else{
+                return 0; }
+        }
+        return (srcPtr + row*cols)[col];
+    }
+
+    /*
+     * trans_inv          - A pointer to a one-dimensional representation of the inverse of the transform matrix 3x3
+     * dst_row            - The destination row (mapping to this row)
+     * dst_col            - The destination column (mapping to this column)
+     * src_row            - The computed source pixel row (mapping from this row)
+     * src_col            - The computed source pixel column (mapping from this col)
+     */
+    __device__ __forceinline__ void cudaaffine_kernel_get_src_coord(double *trans_inv, int dst_row, int dst_col, int* src_row, int* src_col){
+        *src_col = round(dst_col * trans_inv[0] + dst_row * trans_inv[3] + trans_inv[6]);
+        *src_row = round(dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7]);
+
+        //printf("Dst: [%d, %d, 1] = [%d, %d, 1] \n[ %0.4f, %0.4f, %0.4f] \n[ %0.4f, %0.4f, %0.4f ]\n[ %0.4f, %0.4f, %0.4f ]\n\n", *src_col, *src_row, dst_col, dst_row, trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]);
+
+		}
+				
+
+    __global__ void cudaaffine_kernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
+        int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
+        int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
+        int dstIndex = dstRowInd*dst_cols + dstColInd;
+
+        //printf("Kernel Inv:\n[%0.4f %0.4f %0.4f]\n[%0.4f %0.4f %0.4f]\n[%0.4f %0.4f %0.4f]\n\n", trans_inv[0], trans_inv[1], trans_inv[2], trans_inv[3], trans_inv[4], trans_inv[5], trans_inv[6], trans_inv[7], trans_inv[8]);
+
+        int srcRowInd;
+        int srcColInd;
+
+        // don't do anything if the index is out of bounds
+        if (dstRowInd < 1 || dstRowInd >= dst_rows-1 || dstColInd < 1 || dstColInd >= dst_cols-1) {
+            if (dstRowInd >= dst_rows || dstColInd >= dst_cols) {
+                return;
+            } else{
+                dstPtr[dstIndex] = 0;
+                return;
+            }
+        }
+
+        cudaaffine_kernel_get_src_coord(trans_inv, dstRowInd, dstColInd, &srcRowInd, &srcColInd);
+        const uint8_t cval = cudaaffine_kernel_get_pixel_value(srcRowInd, srcColInd, srcPtr, src_rows, src_cols); // Get initial pixel value
+
+        dstPtr[dstIndex] = cval;
+    }
+
+    void cudaaffine_wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
+        cudaError_t err;
+        double* gpuInverse;
+
+        dim3 threadsPerBlock(8, 8);
+        dim3 numBlocks(dst_cols/threadsPerBlock.x + 1,
+                       dst_rows/threadsPerBlock.y + 1);
+
+        //************************************************************************
+        // Input affine is a 2x3 Mat whose transpose is used in the computations
+        // [x, y, 1] = [u, v, 1] [ a^T | [0 0 1]^T ] 
+        // See "Digital Image Warping" by George Wolburg (p. 50)
+        //************************************************************************
+
+        // get new transform elements
+        double a11 = affineTransform.at<double>(0, 0);
+        double a12 = affineTransform.at<double>(1, 0);
+        double a21 = affineTransform.at<double>(0, 1);
+        double a22 = affineTransform.at<double>(1, 1);
+        double a31 = affineTransform.at<double>(0, 2);
+        double a32 = affineTransform.at<double>(1, 2);
+        // double a23 = 0;
+        // double a13 = 0;
+        // double a33 = 1;
+
+        // compute transform inverse
+        double det = 1 / (a11*a22 - a21*a12);
+
+        double affineInverse[9];
+        affineInverse[0] = a22 * det;
+        affineInverse[1] = -a12 * det;
+        affineInverse[2] = 0;
+        affineInverse[3] = -a21 * det;
+        affineInverse[4] = a11 * det;
+        affineInverse[5] = 0;
+        affineInverse[6] = (a21*a32 - a31*a22) * det;
+        affineInverse[7] = (a31*a12 - a11*a32) * det;
+        affineInverse[8] = (a11*a22 - a21*a12) * det;
+
+        // Move from affineTransform to gpuAffine (currently fake)
+        // double fakeAffine[6];
+        // fakeAffine[0] = affineTransform.at<double>(0, 0);
+        // fakeAffine[1] = affineTransform.at<double>(0, 1);
+        // fakeAffine[2] = affineTransform.at<double>(0, 2);
+        // fakeAffine[3] = affineTransform.at<double>(1, 0);
+        // fakeAffine[4] = affineTransform.at<double>(1, 1);
+        // fakeAffine[5] = affineTransform.at<double>(1, 2);
+
+        // printf("\n");
+        // printf("%f\t%f\t%f\n", a11, a12, 0.0);
+        // printf("%f\t%f\t%f\n", a21, a22, 0.0);
+        // printf("%f\t%f\t%f\n", a31, a32, 1.0);
+        // printf("\n");
+
+        // printf("Affine Inverse:\n");
+        // for(int i = 0; i < 3; i++){
+            // for(int j = 0; j < 3; j++){
+                // printf("%f\t", affineInverse[3*i + j]);
+            // }
+            // printf("\n");
+        // }
+
+
+        CUDA_SAFE_MALLOC(dstPtr, dst_rows*dst_cols*sizeof(uint8_t), &err);
+        CUDA_SAFE_MALLOC(&gpuInverse, 3*3*sizeof(double), &err);
+
+        CUDA_SAFE_MEMCPY(gpuInverse, affineInverse, 9*sizeof(double), cudaMemcpyHostToDevice, &err);
+
+        cudaaffine_kernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)(*dstPtr), gpuInverse, src_rows, src_cols, dst_rows, dst_cols);
+        CUDA_KERNEL_ERR_CHK(&err);
+
+        CUDA_SAFE_FREE(srcPtr, &err);
+        CUDA_SAFE_FREE(gpuInverse, &err);
+
+        // printf("\n\n");
+        // for(int i = 0; i < cols; i++){
+            // for(int j = 0; j < src_rows; j++){
+                // printf("%4d\t", ((uint8_t*) dstPtr)[j*cols + i]);
+            // }
+            // printf("\n");
+        // }
+        // printf("\n");
+    }
+} // end cuda
+} // end br
--
libgit2 0.21.4