Commit 1a6637f6d6294ee1119d01a4947abf7c4a17787f

Authored by DepthDeluxe
1 parent c837c1d9

accumulate the dot product in a thread-local variable (acc) instead of read-modify-writing dst[colInd] on every loop iteration

This increases PCA performance by more than 50%.
openbr/plugins/cuda/cudapca.cpp
... ... @@ -159,11 +159,7 @@ private:
159 159  
160 160 void load(QDataStream &stream)
161 161 {
162   - Eigen::MatrixXf originalEVecs;
163   - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs;
164   -
165   - // perform transpose before copying over
166   - eVecs = originalEVecs; //originalEVecs.transpose();
  162 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
167 163  
168 164 cout << "Mean Dimensions" << endl;
169 165 cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
... ...
openbr/plugins/cuda/cudapca.cu
... ... @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca {
18 18 return;
19 19 }
20 20  
21   - dst[colInd] = 0;
  21 + float acc = 0;
22 22 for (int i=0; i < evRows; i++) {
23   - dst[colInd] += evPtr[evCols*i + colInd] * src[i];
  23 + acc += evPtr[evCols*i + colInd] * src[i];
24 24 }
  25 +
  26 + dst[colInd] = acc;
25 27 }
26 28  
27 29 __global__ void subtractMeanKernel(float* out, float* mean, int numCols) {
... ...