Commit 1a6637f6d6294ee1119d01a4947abf7c4a17787f
1 parent
c837c1d9
Accumulation is done on the stack, not in shared memory;
this increases PCA performance by >50%.
Showing 2 changed files with 5 additions and 7 deletions
openbr/plugins/cuda/cudapca.cpp
| ... | ... | @@ -159,11 +159,7 @@ private: |
| 159 | 159 | |
| 160 | 160 | void load(QDataStream &stream) |
| 161 | 161 | { |
| 162 | - Eigen::MatrixXf originalEVecs; | |
| 163 | - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs; | |
| 164 | - | |
| 165 | - // perform transpose before copying over | |
| 166 | - eVecs = originalEVecs; //originalEVecs.transpose(); | |
| 162 | + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs; | |
| 167 | 163 | |
| 168 | 164 | cout << "Mean Dimensions" << endl; |
| 169 | 165 | cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; | ... | ... |
openbr/plugins/cuda/cudapca.cu
| ... | ... | @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca { |
| 18 | 18 | return; |
| 19 | 19 | } |
| 20 | 20 | |
| 21 | - dst[colInd] = 0; | |
| 21 | + float acc = 0; | |
| 22 | 22 | for (int i=0; i < evRows; i++) { |
| 23 | - dst[colInd] += evPtr[evCols*i + colInd] * src[i]; | |
| 23 | + acc += evPtr[evCols*i + colInd] * src[i]; | |
| 24 | 24 | } |
| 25 | + | |
| 26 | + dst[colInd] = acc; | |
| 25 | 27 | } |
| 26 | 28 | |
| 27 | 29 | __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { | ... | ... |