Commit ba9975a284718a1b33b325ecbf91f50a5df53322
1 parent
3791830c
KTS-2395
"Create index migration script" Implemented. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7360 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
1 changed file
with
221 additions
and
18 deletions
search2/indexing/indexerCore.inc.php
| ... | ... | @@ -525,6 +525,55 @@ abstract class Indexer |
| 525 | 525 | } |
| 526 | 526 | } |
| 527 | 527 | |
/**
 * Runs the indexer self-checks and reports whether indexing may proceed.
 *
 * A lock file ('index.log.lock' in the configured cache directory) is used
 * to avoid logging the same problem on every invocation: problems are only
 * written to the log when the lock file is absent, and the lock file is
 * (re)touched whenever a check fails. When the full set of checks passes
 * and the lock file exists, a "resolved" message is logged and the lock
 * file is removed.
 *
 * @param bool $simple When true, only $this->diagnose() is consulted; the
 *                     extractor checks and the lock file cleanup are skipped.
 * @return bool True when the requested checks pass, false otherwise.
 */
private function doesDiagnosticsPass($simple=false)
{
    global $default;

    $ktConfig =& KTConfig::getSingleton();
    $lockPath = $ktConfig->get('cache/cacheDirectory') . '/index.log.lock';

    // Stage 1: the indexer itself. Any non-null result is treated as a problem description.
    $indexerProblem = $this->diagnose();
    if (!is_null($indexerProblem))
    {
        // Only log when the lock file is absent, i.e. the problem has not been reported yet.
        if (!is_file($lockPath))
        {
            $default->log->error(_kt('Indexer problem: ') . $indexerProblem);
        }
        touch($lockPath);
        return false;
    }

    // The simple check stops here and deliberately leaves the lock file alone.
    if ($simple)
    {
        return true;
    }

    // Stage 2: the extractors. An empty result means every extractor is fine.
    $extractorProblems = $this->diagnoseExtractors();
    if (empty($extractorProblems))
    {
        // Everything passes again: announce the recovery once and drop the lock file.
        if (is_file($lockPath))
        {
            $default->log->info(_kt('Issues with the indexer have been resolved!'));
            unlink($lockPath);
        }

        return true;
    }

    if (!is_file($lockPath))
    {
        foreach ($extractorProblems as $problem)
        {
            $default->log->error(sprintf(_kt('%s problem: %s'), $problem['name'],$problem['diagnosis']));
        }
    }
    touch($lockPath);

    return false;
}
| 576 | + | |
| 528 | 577 | /** |
| 529 | 578 | * The main function that may be called repeatedly to index documents. |
| 530 | 579 | * |
| ... | ... | @@ -534,8 +583,12 @@ abstract class Indexer |
| 534 | 583 | { |
| 535 | 584 | global $default; |
| 536 | 585 | |
| 537 | - $config =& KTConfig::getSingleton(); | |
| 586 | + if (!$this->doesDiagnosticsPass()) | |
| 587 | + { | |
| 588 | + return; | |
| 589 | + } | |
| 538 | 590 | |
| 591 | + $config =& KTConfig::getSingleton(); | |
| 539 | 592 | if (is_null($max)) |
| 540 | 593 | { |
| 541 | 594 | $max = $config->get('indexer/batchDocuments',20); |
| ... | ... | @@ -600,17 +653,28 @@ abstract class Indexer |
| 600 | 653 | $indexDocument = in_array($docinfo['what'], array('A','C')); |
| 601 | 654 | $indexDiscussion = in_array($docinfo['what'], array('A','D')); |
| 602 | 655 | |
| 603 | - if ($this->debug) $default->log->debug("Indexing docid: $docId extension: '$extension' mimetype: '$mimeType' extractor: '$extractorClass'"); | |
| 656 | + if ($this->debug) | |
| 657 | + { | |
| 658 | + $default->log->debug(sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass)); | |
| 659 | + } | |
| 604 | 660 | |
| 605 | 661 | if (empty($extractorClass)) |
| 606 | 662 | { |
| 607 | - if ($this->debug) $default->log->debug("No extractor for docid: $docId"); | |
| 663 | + if ($this->debug) | |
| 664 | + { | |
| 665 | + $default->log->debug(sprintf(_kt("No extractor for docid: %d"),$docId)); | |
| 666 | + } | |
| 608 | 667 | |
| 609 | 668 | Indexer::unqueueDocument($docId); |
| 610 | 669 | continue; |
| 611 | 670 | } |
| 612 | 671 | |
| 613 | - if ($this->debug) print "Processing document $docId.\n"; | |
| 672 | + if ($this->debug) | |
| 673 | + { | |
| 674 | + $default->log->info(sprintf(_kt("Processing document %d.\n"),$docId)); | |
| 675 | + } | |
| 676 | + | |
| 677 | + $removeFromQueue = true; | |
| 614 | 678 | if ($indexDocument) |
| 615 | 679 | { |
| 616 | 680 | if (array_key_exists($extractorClass, $extractorCache)) |
| ... | ... | @@ -623,7 +687,7 @@ abstract class Indexer |
| 623 | 687 | |
| 624 | 688 | if (!class_exists($extractorClass)) |
| 625 | 689 | { |
| 626 | - $default->log->error("indexDocuments: extractor '$extractorClass' does not exist."); | |
| 690 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' does not exist."),$extractorClass)); | |
| 627 | 691 | continue; |
| 628 | 692 | } |
| 629 | 693 | |
| ... | ... | @@ -632,13 +696,13 @@ abstract class Indexer |
| 632 | 696 | |
| 633 | 697 | if (is_null($extractor)) |
| 634 | 698 | { |
| 635 | - $default->log->error("indexDocuments: extractor '$extractorClass' not resolved - it is null."); | |
| 699 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' not resolved - it is null."),$extractorClass)); | |
| 636 | 700 | continue; |
| 637 | 701 | } |
| 638 | 702 | |
| 639 | 703 | if (!($extractor instanceof DocumentExtractor)) |
| 640 | 704 | { |
| 641 | - $default->log->error("indexDocuments: extractor '$extractorClass' is not a document extractor class."); | |
| 705 | + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass)); | |
| 642 | 706 | continue; |
| 643 | 707 | } |
| 644 | 708 | |
| ... | ... | @@ -648,7 +712,7 @@ abstract class Indexer |
| 648 | 712 | |
| 649 | 713 | if (empty($sourceFile) || !is_file($sourceFile)) |
| 650 | 714 | { |
| 651 | - $default->log->error("indexDocuments: source file '$sourceFile' for document $docId does not exist."); | |
| 715 | + $default->log->error(sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId)); | |
| 652 | 716 | Indexer::unqueueDocument($docId); |
| 653 | 717 | continue; |
| 654 | 718 | } |
| ... | ... | @@ -659,14 +723,14 @@ abstract class Indexer |
| 659 | 723 | $result = @copy($sourceFile, $intermediate); |
| 660 | 724 | if ($result === false) |
| 661 | 725 | { |
| 662 | - $default->log->error("Could not create intermediate file from document $docId"); | |
| 726 | + $default->log->error(sprintf(_kt("Could not create intermediate file from document %d"),$docId)); | |
| 663 | 727 | // problem. lets try again later. probably permission related. log the issue. |
| 664 | 728 | continue; |
| 665 | 729 | } |
| 666 | 730 | $sourceFile = $intermediate; |
| 667 | 731 | } |
| 668 | 732 | |
| 669 | - $targetFile = tempnam($tempPath, 'ktindexer') . '.txt'; | |
| 733 | + $targetFile = tempnam($tempPath, 'ktindexer'); | |
| 670 | 734 | |
| 671 | 735 | $extractor->setSourceFile($sourceFile); |
| 672 | 736 | $extractor->setMimeType($mimeType); |
| ... | ... | @@ -675,7 +739,10 @@ abstract class Indexer |
| 675 | 739 | $extractor->setDocument($document); |
| 676 | 740 | $extractor->setIndexingStatus(null); |
| 677 | 741 | $extractor->setExtractionStatus(null); |
| 678 | - if ($this->debug) $default->log->debug("Extra Info docid: $docId Source File: '$sourceFile' Target File: '$targetFile'"); | |
| 742 | + if ($this->debug) | |
| 743 | + { | |
| 744 | + $default->log->debug(sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile)); | |
| 745 | + } | |
| 679 | 746 | |
| 680 | 747 | $this->executeHook($extractor, 'pre_extract'); |
| 681 | 748 | $this->executeHook($extractor, 'pre_extract', $mimeType); |
| ... | ... | @@ -691,7 +758,10 @@ abstract class Indexer |
| 691 | 758 | { |
| 692 | 759 | $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); |
| 693 | 760 | |
| 694 | - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); | |
| 761 | + if (!$indexStatus) | |
| 762 | + { | |
| 763 | + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId)); | |
| 764 | + } | |
| 695 | 765 | |
| 696 | 766 | $extractor->setIndexingStatus($indexStatus); |
| 697 | 767 | } |
| ... | ... | @@ -699,13 +769,16 @@ abstract class Indexer |
| 699 | 769 | { |
| 700 | 770 | if (!$this->filterText($targetFile)) |
| 701 | 771 | { |
| 702 | - $default->log->error("Problem filtering document $docId"); | |
| 772 | + $default->log->error(sprintf(_kt("Problem filtering document %d"),$docId)); | |
| 703 | 773 | } |
| 704 | 774 | else |
| 705 | 775 | { |
| 706 | 776 | $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); |
| 707 | 777 | |
| 708 | - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); | |
| 778 | + if (!$indexStatus) | |
| 779 | + { | |
| 780 | + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocument"),$docId)); | |
| 781 | + } | |
| 709 | 782 | |
| 710 | 783 | $extractor->setIndexingStatus($indexStatus); |
| 711 | 784 | } |
| ... | ... | @@ -717,7 +790,7 @@ abstract class Indexer |
| 717 | 790 | else |
| 718 | 791 | { |
| 719 | 792 | $extractor->setExtractionStatus(false); |
| 720 | - $default->log->error("Could not extract contents from document $docId"); | |
| 793 | + $default->log->error(sprintf(_kt("Could not extract contents from document %d"),$docId)); | |
| 721 | 794 | } |
| 722 | 795 | |
| 723 | 796 | $this->executeHook($extractor, 'post_extract', $mimeType); |
| ... | ... | @@ -729,17 +802,147 @@ abstract class Indexer |
| 729 | 802 | } |
| 730 | 803 | |
| 731 | 804 | @unlink($targetFile); |
| 805 | + $removeFromQueue = $indexStatus; | |
| 732 | 806 | } |
| 733 | 807 | else |
| 734 | 808 | { |
| 735 | 809 | $this->indexDiscussion($docId); |
| 736 | 810 | } |
| 737 | 811 | |
| 738 | - Indexer::unqueueDocument($docId); | |
| 739 | - if ($this->debug) $default->log->debug("Done indexing docid: $docId"); | |
| 812 | + if ($removeFromQueue) | |
| 813 | + { | |
| 814 | + Indexer::unqueueDocument($docId); | |
| 815 | + } | |
| 816 | + if ($this->debug) | |
| 817 | + { | |
| 818 | + $default->log->debug(sprintf(_kt("Done indexing docid: %d"),$docId)); | |
| 819 | + } | |
| 740 | 820 | |
| 741 | 821 | } |
| 742 | - if ($this->debug) print "Done.\n"; | |
| 822 | + if ($this->debug) | |
| 823 | + { | |
| 824 | + $default->log->debug(_kt("Done.")); | |
| 825 | + } | |
| 826 | + } | |
| 827 | + | |
/**
 * Migrates text stored in the document_text table into the index.
 *
 * Each call performs up to five passes. Every pass selects a batch of rows
 * from document_text (ordered by document_id), writes each row's text to a
 * temporary file and indexes it via indexDocumentAndDiscussion(). Rows are
 * deleted from document_text once indexed, or when the owning document can
 * no longer be loaded. A 'migration.lock' file in the cache directory
 * prevents overlapping runs.
 *
 * System settings written: 'migrationStarted' (first run only),
 * 'migrationTime' and 'migratedDocuments' (accumulated), and
 * 'migrationComplete' once document_text is found to be empty.
 *
 * @param int|null $max Maximum number of rows to process in this call. When
 *                      null, the 'indexer/batchMigrateDocument' config value
 *                      (default 500) is used.
 * @return void
 */
public function migrateDocuments($max=null)
{
    global $default;

    if (!$this->doesDiagnosticsPass(true))
    {
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: migration lockfile detected. exiting.'));
        return;
    }
    touch($lockFile);
    $default->log->info(_kt('migrateDocuments: starting!'));

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    $maxLoops = 5;

    // Never let the per-pass batch size reach zero: "LIMIT 0" returns no rows,
    // which would be mistaken for "nothing left to migrate".
    $max = max(1, (int) floor($max / $maxLoops));

    // Intermediate text files go to the system temp directory, which is where
    // tempnam() ends up when it is not given a usable directory.
    $tempPath = sys_get_temp_dir();

    $start = KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for ($loop = 0; $loop < $maxLoops; $loop++)
    {
        $sql = "SELECT
                    document_id, document_text
                FROM
                    document_text
                ORDER BY document_id
                LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->error(sprintf(_kt('migrateDocuments: Could not query document_text: %s'), $result->getMessage()));
            break;
        }

        if (count($result) == 0)
        {
            $noDocs = true;
            break;
        }

        // Set when at least one row was removed from document_text in this pass.
        $progressed = false;

        foreach ($result as $docinfo)
        {
            $docId = (int) $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                // The owning document is gone, so the stored text is orphaned: drop it.
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $progressed = true;
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'), $docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if ($targetFile === false || file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                if ($targetFile !== false)
                {
                    // tempnam() already created the file; do not leave it behind.
                    @unlink($targetFile);
                }
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $progressed = true;
                // Only documents that were actually indexed count as migrated.
                $numDocs++;
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }

        if (!$progressed)
        {
            // Nothing was removed, so the next pass would select exactly the
            // same rows and fail the same way. Leave them for a later run.
            break;
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', true);
    }
}
| 744 | 947 | |
| 745 | 948 | /** | ... | ... |