Commit c849ef1bd11d6b3ff78ce90efcc629d7af55c6cc
Committed by
GitHub
Merge pull request #493 from DepthDeluxe/master
Further performance enhancements of CUDAPCA
Showing
1 changed file
with
48 additions
and
1 deletion
openbr/plugins/cuda/cudapca.cpp
| @@ -297,7 +297,7 @@ protected: | @@ -297,7 +297,7 @@ protected: | ||
| 297 | for (int i=0; i<keep; i++) { | 297 | for (int i=0; i<keep; i++) { |
| 298 | int index = i+drop; | 298 | int index = i+drop; |
| 299 | eVals(i) = allEVals(index); | 299 | eVals(i) = allEVals(index); |
| 300 | - eVecs.col(i) = allEVecs.col(index).cast<float>() / allEVecs.col(index).norm(); | 300 | + eVecs.col(i) = allEVecs.col(index).cast<float>(); |
| 301 | if (whiten) eVecs.col(i) /= sqrt(eVals(i)); | 301 | if (whiten) eVecs.col(i) /= sqrt(eVals(i)); |
| 302 | } | 302 | } |
| 303 | 303 | ||
| @@ -519,6 +519,29 @@ protected: | @@ -519,6 +519,29 @@ protected: | ||
| 519 | dimsIn // ldc | 519 | dimsIn // ldc |
| 520 | ); | 520 | ); |
| 521 | 521 | ||
| 522 | + // normalize result then divide the column by the norm | ||
| 523 | + for (int i=0; i < instances; i++) { | ||
| 524 | + // compute the norm | ||
| 525 | + double norm; | ||
| 526 | + cublasDnrm2( | ||
| 527 | + cublasHandle, | ||
| 528 | + dimsIn, | ||
| 529 | + cudaMultedAllEVecs+i*dimsIn, | ||
| 530 | + 1, | ||
| 531 | + &norm | ||
| 532 | + ); | ||
| 533 | + | ||
| 534 | + // now divide by it | ||
| 535 | + norm = 1.0/norm; | ||
| 536 | + cublasDscal( | ||
| 537 | + cublasHandle, | ||
| 538 | + dimsIn, | ||
| 539 | + &norm, | ||
| 540 | + cudaMultedAllEVecs+i*dimsIn, | ||
| 541 | + 1 | ||
| 542 | + ); | ||
| 543 | + } | ||
| 544 | + | ||
| 522 | // get the eigenvectors from the multiplied value | 545 | // get the eigenvectors from the multiplied value |
| 523 | cublasGetMatrix( | 546 | cublasGetMatrix( |
| 524 | dimsIn, | 547 | dimsIn, |
| @@ -533,6 +556,30 @@ protected: | @@ -533,6 +556,30 @@ protected: | ||
| 533 | // free the memory used for multiplication | 556 | // free the memory used for multiplication |
| 534 | CUDA_SAFE_FREE(cudaMultedAllEVecs, &cudaError); | 557 | CUDA_SAFE_FREE(cudaMultedAllEVecs, &cudaError); |
| 535 | } else { | 558 | } else { |
| 559 | + // normalize result then divide the column by the norm | ||
| 560 | + for (int i=0; i < instances; i++) { | ||
| 561 | + // compute the norm | ||
| 562 | + double norm; | ||
| 563 | + cublasDnrm2( | ||
| 564 | + cublasHandle, | ||
| 565 | + covRows, | ||
| 566 | + cudaUPtr+i*covRows, | ||
| 567 | + 1, | ||
| 568 | + &norm | ||
| 569 | + ); | ||
| 570 | + | ||
| 571 | + // now divide by it | ||
| 572 | + norm = 1.0/norm; | ||
| 573 | + cublasDscal( | ||
| 574 | + cublasHandle, | ||
| 575 | + covRows, | ||
| 576 | + &norm, | ||
| 577 | + cudaUPtr+i*covRows, | ||
| 578 | + 1 | ||
| 579 | + ); | ||
| 580 | + } | ||
| 581 | + | ||
| 582 | + | ||
| 536 | // get the eigenvectors straight from the SVD | 583 | // get the eigenvectors straight from the SVD |
| 537 | cublasGetMatrix( | 584 | cublasGetMatrix( |
| 538 | covRows, | 585 | covRows, |