Accumulation is done on the stack rather than in shared memory

Increases PCA performance by more than 50%.

accumulation done on stack not shared memory
increases PCA performance by >50%
DepthDeluxe
1 parent c837c1d9
Showing 2 changed files with 5 additions and 7 deletions
openbr/plugins/cuda/cudapca.cpp
openbr/plugins/cuda/cudapca.cu
@@ -159,11 +159,7 @@ private:
     void load(QDataStream &stream)
     {
-        Eigen::MatrixXf originalEVecs;
-        stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> originalEVecs;
-
-        // perform transpose before copying over
-        eVecs = originalEVecs; //originalEVecs.transpose();
+        stream >> keep >> drop >> whiten >> originalRows >> mean >> eVals >> eVecs;
         cout << "Mean Dimensions" << endl;
         cout << "\tRows: " << mean.rows() << " Cols: " << mean.cols() << endl;
@@ -18,10 +18,12 @@ namespace br { namespace cuda { namespace pca {
       return;
     }
-    dst[colInd] = 0;
+    float acc = 0;
     for (int i=0; i < evRows; i++) {
-      dst[colInd] += evPtr[evCols*i + colInd] * src[i];
+      acc += evPtr[evCols*i + colInd] * src[i];
     }
+
+    dst[colInd] = acc;
   }
   __global__ void subtractMeanKernel(float* out, float* mean, int numCols) {