Commit d254fe2a76a807732520377334d745907fadd06b

Authored by DepthDeluxe
1 parent 555b8f38

Fixed affine transform bug: when too few landmarks are available, resize the image on the GPU via cuda::affine::resizeWrapper instead of calling host cv::resize on the CUDA-backed Mat

openbr/plugins/cuda/cudaaffine.cpp
... ... @@ -38,6 +38,7 @@ using namespace cv;
38 38  
39 39 // definitions from the CUDA source file
40 40 namespace br { namespace cuda { namespace affine {
  41 + void resizeWrapper(void* srcPtr, void** dstPtr, int src_rows, int src_cols, int dst_rows, int dst_cols);
41 42 void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols);
42 43 }}}
43 44  
... ... @@ -115,7 +116,20 @@ namespace br
115 116 const QList<Point2f> landmarks = OpenCVUtils::toPoints(src.file.points());
116 117  
117 118 if ((landmarks.size() < 2) || (!twoPoints && (landmarks.size() < 3))) {
118   - resize(src, dst, Size(width, height));
  119 + void* const* srcDataPtr = src.m().ptr<void*>();
  120 + int rows = *((int*)srcDataPtr[1]);
  121 + int cols = *((int*)srcDataPtr[2]);
  122 + int type = *((int*)srcDataPtr[3]);
  123 +
  124 + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type());
  125 + void** dstDataPtr = dstMat.ptr<void*>();
  126 +
  127 + dstDataPtr[1] = srcDataPtr[1]; *((int*)dstDataPtr[1]) = height; // rows
  128 + dstDataPtr[2] = srcDataPtr[2]; *((int*)dstDataPtr[2]) = width; // cols
  129 + dstDataPtr[3] = srcDataPtr[3];
  130 +
  131 + cuda::affine::resizeWrapper(srcDataPtr[0], &dstDataPtr[0], rows, cols, height, width);
  132 + dst = dstMat;
119 133 return;
120 134 } else {
121 135 srcPoints[0] = landmarks[0];
... ...
openbr/plugins/cuda/cudaaffine.cu
... ... @@ -127,6 +127,28 @@ namespace br { namespace cuda { namespace affine {
127 127 *src_row_pnt = dst_col * trans_inv[1] + dst_row * trans_inv[4] + trans_inv[7];
128 128 }
129 129  
  130 + __global__ void bilinearKernel(uint8_t* srcPtr, uint8_t* dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
  131 + int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
  132 + int dstColInd = blockIdx.x*blockDim.x+threadIdx.x;
  133 + int dstIndex = dstRowInd*dstCols + dstColInd;
  134 +
  135 + // don't do anything if the index is out of bounds
  136 + if (dstRowInd < 1 || dstRowInd >= dstRows-1 || dstColInd < 1 || dstColInd >= dstCols-1) {
  137 + if (dstRowInd >= dstRows || dstColInd >= dstCols) {
  138 + return;
  139 + } else{
  140 + dstPtr[dstIndex] = 0;
  141 + return;
  142 + }
  143 + }
  144 +
  145 + double rowScaleFactor = (double)dstRows / (double)srcRows;
  146 + double colScaleFactor = (double)dstCols / (double)srcCols;
  147 +
  148 + uint8_t out = getBilinearPixelValueDevice(dstRowInd/rowScaleFactor, dstColInd/colScaleFactor, srcPtr, srcRows, srcCols);
  149 +
  150 + dstPtr[dstIndex] = out;
  151 + }
130 152  
131 153 __global__ void affineKernel(uint8_t* srcPtr, uint8_t* dstPtr, double* trans_inv, int src_rows, int src_cols, int dst_rows, int dst_cols){
132 154 int dstRowInd = blockIdx.y*blockDim.y+threadIdx.y;
... ... @@ -152,6 +174,24 @@ namespace br { namespace cuda { namespace affine {
152 174 dstPtr[dstIndex] = cval;
153 175 }
154 176  
  177 + void resizeWrapper(void* srcPtr, void** dstPtr, int srcRows, int srcCols, int dstRows, int dstCols) {
  178 + // perform bilinear filtering
  179 +
  180 + // allocate space for destination
  181 + cudaError_t err;
  182 + CUDA_SAFE_MALLOC(dstPtr, dstRows*dstCols*sizeof(uint8_t), &err);
  183 +
  184 + // call the bilinear kernel function
  185 + dim3 threadsPerBlock(8, 8);
  186 + dim3 numBlocks(dstCols/threadsPerBlock.x + 1,
  187 + dstRows/threadsPerBlock.y + 1);
  188 +
  189 + bilinearKernel<<<numBlocks, threadsPerBlock>>>((uint8_t*)srcPtr, (uint8_t*)*dstPtr, srcRows, srcCols, dstRows, dstCols);
  190 + CUDA_KERNEL_ERR_CHK(&err);
  191 +
  192 + CUDA_SAFE_FREE(srcPtr, &err);
  193 + }
  194 +
155 195 void wrapper(void* srcPtr, void** dstPtr, Mat affineTransform, int src_rows, int src_cols, int dst_rows, int dst_cols) {
156 196 cudaError_t err;
157 197 double* gpuInverse;
... ...