Commit 1a6637f6d6294ee1119d01a4947abf7c4a17787f

Authored by DepthDeluxe
1 parent c837c1d9

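Accumulate the dot product in a per-thread local variable (acc) and write dst[colInd] once at the end, instead of updating dst[colInd] in memory on every loop iteration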

Increases PCA performance by more than 50%.
openbr/plugins/cuda/cudapca.cpp
@@ -159,11 +159,7 @@ private: @@ -159,11 +159,7 @@ private:
159 159
160 void load(QDataStream &stream) 160 void load(QDataStream &stream)
161 { 161 {
162 - Eigen::MatrixXf originalEVecs;  
163 - stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs;  
164 -  
165 - // perform transpose before copying over  
166 - eVecs = originalEVecs; //originalEVecs.transpose(); 162 + stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
167 163
168 cout << "Mean Dimensions" << endl; 164 cout << "Mean Dimensions" << endl;
169 cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl; 165 cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
openbr/plugins/cuda/cudapca.cu
@@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca { @@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca {
18 return; 18 return;
19 } 19 }
20 20
21 - dst[colInd] = 0; 21 + float acc = 0;
22 for (int i=0; i < evRows; i++) { 22 for (int i=0; i < evRows; i++) {
23 - dst[colInd] += evPtr[evCols*i + colInd] * src[i]; 23 + acc += evPtr[evCols*i + colInd] * src[i];
24 } 24 }
  25 +
  26 + dst[colInd] = acc;
25 } 27 }
26 28
27 __global__ void subtractMeanKernel(float* out, float* mean, int numCols) { 29 __global__ void subtractMeanKernel(float* out, float* mean, int numCols) {