Commit ba9975a284718a1b33b325ecbf91f50a5df53322
1 parent
3791830c
KTS-2395
"Create index migration script" Implemented. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7360 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
1 changed file
with
221 additions
and
18 deletions
search2/indexing/indexerCore.inc.php
| @@ -525,6 +525,55 @@ abstract class Indexer | @@ -525,6 +525,55 @@ abstract class Indexer | ||
| 525 | } | 525 | } |
| 526 | } | 526 | } |
| 527 | 527 | ||
/**
 * Checks whether the indexer (and optionally its extractors) are healthy.
 *
 * A lock file named 'index.log.lock' in the configured cache directory is
 * used so that a persistent problem is written to the log only once: the
 * problem is logged when the lock file does not exist yet, and the lock
 * file is (re)touched on every failing call. When a full check passes
 * again, the lock file is removed and a "resolved" message is logged.
 *
 * @param bool $simple When true, only diagnose() is consulted and the
 *                     method returns as soon as it passes; the extractor
 *                     check and the lock file clean-up are skipped.
 * @return bool True when the requested diagnostics pass, false otherwise.
 */
private function doesDiagnosticsPass($simple=false)
{
    global $default;

    $ktConfig =& KTConfig::getSingleton();
    $lockPath = $ktConfig->get('cache/cacheDirectory') . '/index.log.lock';

    // Stage 1: the indexer itself.
    $indexerIssue = $this->diagnose();
    if (!is_null($indexerIssue))
    {
        // Only report the first time; the lock file marks "already reported".
        if (!is_file($lockPath))
        {
            $default->log->error(_kt('Indexer problem: ') . $indexerIssue);
        }
        touch($lockPath);
        return false;
    }

    if ($simple)
    {
        return true;
    }

    // Stage 2: the individual extractors.
    $extractorIssues = $this->diagnoseExtractors();
    if (empty($extractorIssues))
    {
        // Everything is healthy: clear the marker left by an earlier failure.
        if (is_file($lockPath))
        {
            $default->log->info(_kt('Issues with the indexer have been resolved!'));
            unlink($lockPath);
        }

        return true;
    }

    if (!is_file($lockPath))
    {
        foreach($extractorIssues as $issue)
        {
            $default->log->error(sprintf(_kt('%s problem: %s'), $issue['name'],$issue['diagnosis']));
        }
    }
    touch($lockPath);

    return false;
}
| 576 | + | ||
| 528 | /** | 577 | /** |
| 529 | * The main function that may be called repeatedly to index documents. | 578 | * The main function that may be called repeatedly to index documents. |
| 530 | * | 579 | * |
| @@ -534,8 +583,12 @@ abstract class Indexer | @@ -534,8 +583,12 @@ abstract class Indexer | ||
| 534 | { | 583 | { |
| 535 | global $default; | 584 | global $default; |
| 536 | 585 | ||
| 537 | - $config =& KTConfig::getSingleton(); | 586 | + if (!$this->doesDiagnosticsPass()) |
| 587 | + { | ||
| 588 | + return; | ||
| 589 | + } | ||
| 538 | 590 | ||
| 591 | + $config =& KTConfig::getSingleton(); | ||
| 539 | if (is_null($max)) | 592 | if (is_null($max)) |
| 540 | { | 593 | { |
| 541 | $max = $config->get('indexer/batchDocuments',20); | 594 | $max = $config->get('indexer/batchDocuments',20); |
| @@ -600,17 +653,28 @@ abstract class Indexer | @@ -600,17 +653,28 @@ abstract class Indexer | ||
| 600 | $indexDocument = in_array($docinfo['what'], array('A','C')); | 653 | $indexDocument = in_array($docinfo['what'], array('A','C')); |
| 601 | $indexDiscussion = in_array($docinfo['what'], array('A','D')); | 654 | $indexDiscussion = in_array($docinfo['what'], array('A','D')); |
| 602 | 655 | ||
| 603 | - if ($this->debug) $default->log->debug("Indexing docid: $docId extension: '$extension' mimetype: '$mimeType' extractor: '$extractorClass'"); | 656 | + if ($this->debug) |
| 657 | + { | ||
| 658 | + $default->log->debug(sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass)); | ||
| 659 | + } | ||
| 604 | 660 | ||
| 605 | if (empty($extractorClass)) | 661 | if (empty($extractorClass)) |
| 606 | { | 662 | { |
| 607 | - if ($this->debug) $default->log->debug("No extractor for docid: $docId"); | 663 | + if ($this->debug) |
| 664 | + { | ||
| 665 | + $default->log->debug(sprintf(_kt("No extractor for docid: %d"),$docId)); | ||
| 666 | + } | ||
| 608 | 667 | ||
| 609 | Indexer::unqueueDocument($docId); | 668 | Indexer::unqueueDocument($docId); |
| 610 | continue; | 669 | continue; |
| 611 | } | 670 | } |
| 612 | 671 | ||
| 613 | - if ($this->debug) print "Processing document $docId.\n"; | 672 | + if ($this->debug) |
| 673 | + { | ||
| 674 | + $default->log->info(sprintf(_kt("Processing document %d.\n"),$docId)); | ||
| 675 | + } | ||
| 676 | + | ||
| 677 | + $removeFromQueue = true; | ||
| 614 | if ($indexDocument) | 678 | if ($indexDocument) |
| 615 | { | 679 | { |
| 616 | if (array_key_exists($extractorClass, $extractorCache)) | 680 | if (array_key_exists($extractorClass, $extractorCache)) |
| @@ -623,7 +687,7 @@ abstract class Indexer | @@ -623,7 +687,7 @@ abstract class Indexer | ||
| 623 | 687 | ||
| 624 | if (!class_exists($extractorClass)) | 688 | if (!class_exists($extractorClass)) |
| 625 | { | 689 | { |
| 626 | - $default->log->error("indexDocuments: extractor '$extractorClass' does not exist."); | 690 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' does not exist."),$extractorClass)); |
| 627 | continue; | 691 | continue; |
| 628 | } | 692 | } |
| 629 | 693 | ||
| @@ -632,13 +696,13 @@ abstract class Indexer | @@ -632,13 +696,13 @@ abstract class Indexer | ||
| 632 | 696 | ||
| 633 | if (is_null($extractor)) | 697 | if (is_null($extractor)) |
| 634 | { | 698 | { |
| 635 | - $default->log->error("indexDocuments: extractor '$extractorClass' not resolved - it is null."); | 699 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' not resolved - it is null."),$extractorClass)); |
| 636 | continue; | 700 | continue; |
| 637 | } | 701 | } |
| 638 | 702 | ||
| 639 | if (!($extractor instanceof DocumentExtractor)) | 703 | if (!($extractor instanceof DocumentExtractor)) |
| 640 | { | 704 | { |
| 641 | - $default->log->error("indexDocuments: extractor '$extractorClass' is not a document extractor class."); | 705 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass)); |
| 642 | continue; | 706 | continue; |
| 643 | } | 707 | } |
| 644 | 708 | ||
| @@ -648,7 +712,7 @@ abstract class Indexer | @@ -648,7 +712,7 @@ abstract class Indexer | ||
| 648 | 712 | ||
| 649 | if (empty($sourceFile) || !is_file($sourceFile)) | 713 | if (empty($sourceFile) || !is_file($sourceFile)) |
| 650 | { | 714 | { |
| 651 | - $default->log->error("indexDocuments: source file '$sourceFile' for document $docId does not exist."); | 715 | + $default->log->error(sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId)); |
| 652 | Indexer::unqueueDocument($docId); | 716 | Indexer::unqueueDocument($docId); |
| 653 | continue; | 717 | continue; |
| 654 | } | 718 | } |
| @@ -659,14 +723,14 @@ abstract class Indexer | @@ -659,14 +723,14 @@ abstract class Indexer | ||
| 659 | $result = @copy($sourceFile, $intermediate); | 723 | $result = @copy($sourceFile, $intermediate); |
| 660 | if ($result === false) | 724 | if ($result === false) |
| 661 | { | 725 | { |
| 662 | - $default->log->error("Could not create intermediate file from document $docId"); | 726 | + $default->log->error(sprintf(_kt("Could not create intermediate file from document %d"),$docId)); |
| 663 | // problem. lets try again later. probably permission related. log the issue. | 727 | // problem. lets try again later. probably permission related. log the issue. |
| 664 | continue; | 728 | continue; |
| 665 | } | 729 | } |
| 666 | $sourceFile = $intermediate; | 730 | $sourceFile = $intermediate; |
| 667 | } | 731 | } |
| 668 | 732 | ||
| 669 | - $targetFile = tempnam($tempPath, 'ktindexer') . '.txt'; | 733 | + $targetFile = tempnam($tempPath, 'ktindexer'); |
| 670 | 734 | ||
| 671 | $extractor->setSourceFile($sourceFile); | 735 | $extractor->setSourceFile($sourceFile); |
| 672 | $extractor->setMimeType($mimeType); | 736 | $extractor->setMimeType($mimeType); |
| @@ -675,7 +739,10 @@ abstract class Indexer | @@ -675,7 +739,10 @@ abstract class Indexer | ||
| 675 | $extractor->setDocument($document); | 739 | $extractor->setDocument($document); |
| 676 | $extractor->setIndexingStatus(null); | 740 | $extractor->setIndexingStatus(null); |
| 677 | $extractor->setExtractionStatus(null); | 741 | $extractor->setExtractionStatus(null); |
| 678 | - if ($this->debug) $default->log->debug("Extra Info docid: $docId Source File: '$sourceFile' Target File: '$targetFile'"); | 742 | + if ($this->debug) |
| 743 | + { | ||
| 744 | + $default->log->debug(sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile)); | ||
| 745 | + } | ||
| 679 | 746 | ||
| 680 | $this->executeHook($extractor, 'pre_extract'); | 747 | $this->executeHook($extractor, 'pre_extract'); |
| 681 | $this->executeHook($extractor, 'pre_extract', $mimeType); | 748 | $this->executeHook($extractor, 'pre_extract', $mimeType); |
| @@ -691,7 +758,10 @@ abstract class Indexer | @@ -691,7 +758,10 @@ abstract class Indexer | ||
| 691 | { | 758 | { |
| 692 | $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); | 759 | $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); |
| 693 | 760 | ||
| 694 | - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); | 761 | + if (!$indexStatus) |
| 762 | + { | ||
| 763 | + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId)); | ||
| 764 | + } | ||
| 695 | 765 | ||
| 696 | $extractor->setIndexingStatus($indexStatus); | 766 | $extractor->setIndexingStatus($indexStatus); |
| 697 | } | 767 | } |
| @@ -699,13 +769,16 @@ abstract class Indexer | @@ -699,13 +769,16 @@ abstract class Indexer | ||
| 699 | { | 769 | { |
| 700 | if (!$this->filterText($targetFile)) | 770 | if (!$this->filterText($targetFile)) |
| 701 | { | 771 | { |
| 702 | - $default->log->error("Problem filtering document $docId"); | 772 | + $default->log->error(sprintf(_kt("Problem filtering document %d"),$docId)); |
| 703 | } | 773 | } |
| 704 | else | 774 | else |
| 705 | { | 775 | { |
| 706 | $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); | 776 | $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); |
| 707 | 777 | ||
| 708 | - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); | 778 | + if (!$indexStatus) |
| 779 | + { | ||
| 780 | + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocument"),$docId)); | ||
| 781 | + } | ||
| 709 | 782 | ||
| 710 | $extractor->setIndexingStatus($indexStatus); | 783 | $extractor->setIndexingStatus($indexStatus); |
| 711 | } | 784 | } |
| @@ -717,7 +790,7 @@ abstract class Indexer | @@ -717,7 +790,7 @@ abstract class Indexer | ||
| 717 | else | 790 | else |
| 718 | { | 791 | { |
| 719 | $extractor->setExtractionStatus(false); | 792 | $extractor->setExtractionStatus(false); |
| 720 | - $default->log->error("Could not extract contents from document $docId"); | 793 | + $default->log->error(sprintf(_kt("Could not extract contents from document %d"),$docId)); |
| 721 | } | 794 | } |
| 722 | 795 | ||
| 723 | $this->executeHook($extractor, 'post_extract', $mimeType); | 796 | $this->executeHook($extractor, 'post_extract', $mimeType); |
| @@ -729,17 +802,147 @@ abstract class Indexer | @@ -729,17 +802,147 @@ abstract class Indexer | ||
| 729 | } | 802 | } |
| 730 | 803 | ||
| 731 | @unlink($targetFile); | 804 | @unlink($targetFile); |
| 805 | + $removeFromQueue = $indexStatus; | ||
| 732 | } | 806 | } |
| 733 | else | 807 | else |
| 734 | { | 808 | { |
| 735 | $this->indexDiscussion($docId); | 809 | $this->indexDiscussion($docId); |
| 736 | } | 810 | } |
| 737 | 811 | ||
| 738 | - Indexer::unqueueDocument($docId); | ||
| 739 | - if ($this->debug) $default->log->debug("Done indexing docid: $docId"); | 812 | + if ($removeFromQueue) |
| 813 | + { | ||
| 814 | + Indexer::unqueueDocument($docId); | ||
| 815 | + } | ||
| 816 | + if ($this->debug) | ||
| 817 | + { | ||
| 818 | + $default->log->debug(sprintf(_kt("Done indexing docid: %d"),$docId)); | ||
| 819 | + } | ||
| 740 | 820 | ||
| 741 | } | 821 | } |
| 742 | - if ($this->debug) print "Done.\n"; | 822 | + if ($this->debug) |
| 823 | + { | ||
| 824 | + $default->log->debug(_kt("Done.")); | ||
| 825 | + } | ||
| 826 | + } | ||
| 827 | + | ||
/**
 * Moves a batch of rows from the document_text table into the index.
 *
 * For every row the stored text is written to a temporary file and passed
 * to indexDocumentAndDiscussion(); the row is deleted once indexing
 * succeeds. Rows whose document can no longer be loaded are deleted
 * without being indexed. A 'migration.lock' file in the configured cache
 * directory prevents overlapping runs. Progress is recorded in the system
 * settings migrationStarted, migrationTime, migratedDocuments and
 * migrationComplete.
 *
 * The batch is processed in five smaller queries. Each query continues
 * after the last document id seen in this call, so a row that fails to
 * index is attempted once per call rather than once per query.
 *
 * @param int|null $max Maximum number of rows to process in this call.
 *                      When null, the 'indexer/batchMigrateDocument'
 *                      configuration value (default 500) is used.
 * @return void
 */
public function migrateDocuments($max=null)
{
    global $default;

    if (!$this->doesDiagnosticsPass(true))
    {
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: migration lockfile detected. exiting.'));
        return;
    }
    touch($lockFile);
    $default->log->info(_kt('migrateDocuments: starting!'));

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    $maxLoops = 5;

    // Keep the per-query limit at 1 or more: "LIMIT 0" returns no rows, which
    // would be mistaken for an empty table and mark the migration complete.
    $max = max(1, (int) floor($max / $maxLoops));

    // The original code passed an undefined $tempPath to tempnam(), which
    // falls back to the system temporary directory; make that explicit.
    // NOTE(review): if the indexer has a configured temp directory, prefer it here - confirm.
    $tempPath = sys_get_temp_dir();

    $start = KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;       // rows successfully migrated in this call
    $leftBehind = 0;    // rows that could not be migrated or removed
    $lastDocId = null;  // highest document id already handled in this call

    for($loop=0;$loop<$maxLoops;$loop++)
    {
        // Continue after the last row seen so failing rows are not re-read.
        $where = is_null($lastDocId) ? '' : "WHERE document_id > $lastDocId";

        $sql = "SELECT
                    document_id, document_text
                FROM
                    document_text
                $where
                ORDER BY document_id
                LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->error(sprintf(_kt('migrateDocuments: Could not query document_text: %s'), $result->getMessage()));
            break;
        }

        if (count($result) == 0)
        {
            // The table is only known to be empty when nothing was skipped.
            $noDocs = ($leftBehind == 0);
            break;
        }

        foreach($result as $docinfo)
        {
            $docId = (int) $docinfo['document_id'];
            $lastDocId = $docId;

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                $deleted = DBUtil::runQuery($sql);
                if (PEAR::isError($deleted))
                {
                    $leftBehind++;
                }
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'), $docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if ($targetFile === false || file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                if ($targetFile !== false)
                {
                    // tempnam() already created the file; do not leak it.
                    @unlink($targetFile);
                }
                $leftBehind++;
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                $deleted = DBUtil::runQuery($sql);
                if (PEAR::isError($deleted))
                {
                    // Indexed, but the row is still present and will be seen again.
                    $leftBehind++;
                }
                else
                {
                    $numDocs++;
                }
            }
            else
            {
                $leftBehind++;
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', true);
    }
}
| 744 | 947 | ||
| 745 | /** | 948 | /** |