Commit a2eadb178f1efa3714e2386f0b45490b0e4fc97b
1 parent
5e16788d
processing can now be done totally on the graphics card
Showing
5 changed files
with
48 additions
and
27 deletions
openbr/plugins/cuda/copyfrom.cpp
| ... | ... | @@ -11,7 +11,7 @@ using namespace cv; |
| 11 | 11 | // extern CUDA declaration |
| 12 | 12 | namespace br { namespace cuda { namespace cudacopyfrom { |
| 13 | 13 | //template <typename T> void wrapper(void* src, T* out, int rows, int cols) { |
| 14 | - void wrapper(void* src, unsigned char* out, const int rows, const int cols); | |
| 14 | + void wrapper(void* src, float* out, const int rows, const int cols); | |
| 15 | 15 | }}} |
| 16 | 16 | |
| 17 | 17 | namespace br |
| ... | ... | @@ -23,7 +23,7 @@ namespace br |
| 23 | 23 | private: |
| 24 | 24 | void project(const Template &src, Template &dst) const |
| 25 | 25 | { |
| 26 | - cout << "CUDACopyFrom Start" << endl; | |
| 26 | + cout << "CUDACopyFrom Start" << endl << endl << endl; | |
| 27 | 27 | |
| 28 | 28 | // pull the data back out of the Mat |
| 29 | 29 | void* const* dataPtr = src.m().ptr<void*>(); |
| ... | ... | @@ -32,16 +32,26 @@ private: |
| 32 | 32 | int cols = *((int*)dataPtr[2]); |
| 33 | 33 | int type = *((int*)dataPtr[3]); |
| 34 | 34 | |
| 35 | + if (type != CV_32FC1) { | |
| 36 | + cout << "ERR: Invalid data type!" << endl; | |
| 37 | + return; | |
| 38 | + } | |
| 39 | + | |
| 35 | 40 | cout << "cudaMemPtr: " << cudaMemPtr << endl; |
| 36 | 41 | cout << "rows: " << rows << endl; |
| 37 | 42 | cout << "cols: " << cols << endl; |
| 38 | 43 | cout << "type: " << type << endl; |
| 39 | 44 | |
| 40 | - dst = Mat(rows, cols, type); | |
| 41 | - | |
| 42 | - br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dst.m().ptr<unsigned char>(), rows, cols); | |
| 45 | + Mat dstMat = Mat(rows, cols, type); | |
| 46 | + br::cuda::cudacopyfrom::wrapper(cudaMemPtr, dstMat.ptr<float>(), rows, cols); | |
| 47 | + dst = dstMat; | |
| 43 | 48 | |
| 44 | 49 | cout << "CUDACopyFrom End" << endl; |
| 50 | + | |
| 51 | + cout << "DST Data" << endl; | |
| 52 | + cout << "rows: " << dstMat.rows << endl; | |
| 53 | + cout << "cols: " << dstMat.cols << endl; | |
| 54 | + cout << "type: " << dstMat.type() << endl; | |
| 45 | 55 | } |
| 46 | 56 | }; |
| 47 | 57 | ... | ... |
openbr/plugins/cuda/copyfrom.cu
| 1 | 1 | namespace br { namespace cuda { namespace cudacopyfrom { |
| 2 | 2 | //template <typename T> void wrapper(void* src, T* out, int rows, int cols) { |
| 3 | - void wrapper(void* src, unsigned char* out, const int rows, const int cols) { | |
| 4 | - cudaMemcpy(out, src, rows*cols*sizeof(unsigned char), cudaMemcpyDeviceToHost); | |
| 3 | + void wrapper(void* src, float* dst, const int rows, const int cols) { | |
| 4 | + cudaMemcpy(dst, src, rows*cols*sizeof(float), cudaMemcpyDeviceToHost); | |
| 5 | 5 | cudaFree(src); |
| 6 | 6 | } |
| 7 | 7 | }}} | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cpp
| ... | ... | @@ -7,7 +7,10 @@ using namespace cv; |
| 7 | 7 | |
| 8 | 8 | #include <openbr/plugins/openbr_internal.h> |
| 9 | 9 | |
| 10 | -#include "cudacvtfloat.hpp" | |
| 10 | + | |
| 11 | +namespace br { namespace cuda { namespace cudacvtfloat { | |
| 12 | + void wrapper(const unsigned char* src, void** dst, int rows, int cols); | |
| 13 | +}}} | |
| 11 | 14 | |
| 12 | 15 | namespace br |
| 13 | 16 | { |
| ... | ... | @@ -24,20 +27,32 @@ class CUDACvtFloatTransform : public UntrainableTransform |
| 24 | 27 | public: |
| 25 | 28 | void project(const Template &src, Template &dst) const |
| 26 | 29 | { |
| 30 | + cout << "CUDACvtFloat Start" << endl; | |
| 31 | + | |
| 32 | + void* const* srcDataPtr = src.m().ptr<void*>(); | |
| 33 | + void* srcMemPtr = srcDataPtr[0]; | |
| 34 | + int rows = *((int*)srcDataPtr[1]); | |
| 35 | + int cols = *((int*)srcDataPtr[2]); | |
| 36 | + int type = *((int*)srcDataPtr[3]); | |
| 37 | + | |
| 27 | 38 | // assume the image type is 256-monochrome |
| 28 | 39 | // TODO(colin): real exception handling |
| 29 | - if (src.m().type() != CV_8UC1) { | |
| 40 | + if (type != CV_8UC1) { | |
| 30 | 41 | cout << "ERR: Invalid memory format" << endl; |
| 31 | 42 | return; |
| 32 | 43 | } |
| 33 | 44 | |
| 45 | + // build the destination mat | |
| 46 | + Mat dstMat = Mat(src.m().rows, src.m().cols, src.m().type()); | |
| 47 | + void** dstDataPtr = dstMat.ptr<void*>(); | |
| 48 | + dstDataPtr[1] = srcDataPtr[1]; | |
| 49 | + dstDataPtr[2] = srcDataPtr[2]; | |
| 50 | + dstDataPtr[3] = srcDataPtr[3]; *((int*)dstDataPtr[3]) = CV_32FC1; | |
| 34 | 51 | |
| 35 | - int rows = src.m().rows; | |
| 36 | - int cols = src.m().cols; | |
| 37 | - | |
| 38 | - dst = Mat(rows, cols, CV_32FC1); | |
| 52 | + br::cuda::cudacvtfloat::wrapper((const unsigned char*)srcMemPtr, &dstDataPtr[0], rows, cols); | |
| 53 | + dst = dstMat; | |
| 39 | 54 | |
| 40 | - br::cuda::cudacvtfloat::wrapper((const unsigned char*)src.m().ptr<unsigned char>(), dst.m().ptr<float>(), rows, cols); | |
| 55 | + cout << "CUDACvtFloat End" << endl; | |
| 41 | 56 | } |
| 42 | 57 | }; |
| 43 | 58 | ... | ... |
openbr/plugins/cuda/cudacvtfloat.cu
| ... | ... | @@ -14,13 +14,15 @@ namespace br { namespace cuda { namespace cudacvtfloat { |
| 14 | 14 | dst[index] = (float)src[index]; |
| 15 | 15 | } |
| 16 | 16 | |
| 17 | - void wrapper(const unsigned char* src, float* dst, int rows, int cols) { | |
| 18 | - unsigned char* cudaSrc; | |
| 19 | - cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); | |
| 20 | - cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); | |
| 17 | + void wrapper(const unsigned char* src, void** dst, int rows, int cols) { | |
| 18 | + //unsigned char* cudaSrc; | |
| 19 | + //cudaMalloc(&cudaSrc, rows*cols*sizeof(unsigned char)); | |
| 20 | + //cudaMemcpy(cudaSrc, src, rows*cols*sizeof(unsigned char), cudaMemcpyHostToDevice); | |
| 21 | 21 | |
| 22 | - float* cudaDst; | |
| 23 | - cudaMalloc(&cudaDst, rows*cols*sizeof(float)); | |
| 22 | + //float* cudaDst; | |
| 23 | + //cudaMalloc(&cudaDst, rows*cols*sizeof(float)); | |
| 24 | + | |
| 25 | + cudaMalloc(dst, rows*cols*sizeof(float)); | |
| 24 | 26 | |
| 25 | 27 | dim3 threadsPerBlock(8, 8); |
| 26 | 28 | dim3 blocks( |
| ... | ... | @@ -28,10 +30,7 @@ namespace br { namespace cuda { namespace cudacvtfloat { |
| 28 | 30 | rows / threadsPerBlock.y + 1 |
| 29 | 31 | ); |
| 30 | 32 | |
| 31 | - kernel<<<threadsPerBlock, blocks>>>(cudaSrc, cudaDst, rows, cols); | |
| 32 | - | |
| 33 | - // copy the data back to the destination | |
| 34 | - cudaMemcpy(dst, cudaDst, rows*cols*sizeof(float), cudaMemcpyDeviceToHost); | |
| 33 | + kernel<<<threadsPerBlock, blocks>>>(src, (float*)(*dst), rows, cols); | |
| 35 | 34 | } |
| 36 | 35 | |
| 37 | 36 | }}} | ... | ... |
openbr/plugins/cuda/cudacvtfloat.hpp deleted