From 1a6637f6d6294ee1119d01a4947abf7c4a17787f Mon Sep 17 00:00:00 2001 From: DepthDeluxe Date: Wed, 2 Mar 2016 14:07:35 -0500 Subject: [PATCH] Accumulate on the stack instead of in shared memory; increases PCA performance by >50% --- openbr/plugins/cuda/cudapca.cpp | 6 +----- openbr/plugins/cuda/cudapca.cu | 6 ++++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/openbr/plugins/cuda/cudapca.cpp b/openbr/plugins/cuda/cudapca.cpp index cccd1f1..998404b 100644 --- a/openbr/plugins/cuda/cudapca.cpp +++ b/openbr/plugins/cuda/cudapca.cpp @@ -159,11 +159,7 @@ private: void load(QDataStream &stream) { - Eigen::MatrixXf originalEVecs; - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs; - - // perform transpose before copying over - eVecs = originalEVecs; //originalEVecs.transpose(); + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; cout << "Mean Dimensions" << endl; cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; diff --git a/openbr/plugins/cuda/cudapca.cu b/openbr/plugins/cuda/cudapca.cu index 1f1fd12..8351e85 100644 --- a/openbr/plugins/cuda/cudapca.cu +++ b/openbr/plugins/cuda/cudapca.cu @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca { return; } - dst[colInd] = 0; + float acc = 0; for (int i=0; i < evRows; i++) { - dst[colInd] += evPtr[evCols*i + colInd] * src[i]; + acc += evPtr[evCols*i + colInd] * src[i]; } + + dst[colInd] = acc; } __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { -- libgit2 0.21.4