Commit 1a6637f6d6294ee1119d01a4947abf7c4a17787f
1 parent
c837c1d9
accumulation done on stack not shared memory
increases PCA performance by >50%
Showing
2 changed files
with
5 additions
and
7 deletions
openbr/plugins/cuda/cudapca.cpp
| @@ -159,11 +159,7 @@ private: | @@ -159,11 +159,7 @@ private: | ||
| 159 | 159 | ||
| 160 | void load(QDataStream &stream) | 160 | void load(QDataStream &stream) |
| 161 | { | 161 | { |
| 162 | - Eigen::MatrixXf originalEVecs; | ||
| 163 | - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs; | ||
| 164 | - | ||
| 165 | - // perform transpose before copying over | ||
| 166 | - eVecs = originalEVecs; //originalEVecs.transpose(); | 162 | + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; |
| 167 | 163 | ||
| 168 | cout << "Mean Dimensions" << endl; | 164 | cout << "Mean Dimensions" << endl; |
| 169 | cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; | 165 | cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; |
openbr/plugins/cuda/cudapca.cu
| @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca { | @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca { | ||
| 18 | return; | 18 | return; |
| 19 | } | 19 | } |
| 20 | 20 | ||
| 21 | - dst[colInd] = 0; | 21 | + float acc = 0; |
| 22 | for (int i=0; i < evRows; i++) { | 22 | for (int i=0; i < evRows; i++) { |
| 23 | - dst[colInd] += evPtr[evCols*i + colInd] * src[i]; | 23 | + acc += evPtr[evCols*i + colInd] * src[i]; |
| 24 | } | 24 | } |
| 25 | + | ||
| 26 | + dst[colInd] = acc; | ||
| 25 | } | 27 | } |
| 26 | 28 | ||
| 27 | __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { | 29 | __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { |