Commit ba9975a284718a1b33b325ecbf91f50a5df53322

Authored by conradverm
1 parent 3791830c

KTS-2395

"Create index migration script"
Implemented.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7360 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/indexerCore.inc.php
@@ -525,6 +525,55 @@ abstract class Indexer @@ -525,6 +525,55 @@ abstract class Indexer
525 } 525 }
526 } 526 }
527 527
  528 + private function doesDiagnosticsPass($simple=false)
  529 + {
  530 + global $default;
  531 +
  532 + $config =& KTConfig::getSingleton();
  533 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  534 + // this function will create the lockfile if an error is detected. It will be removed as soon
  535 + // as the problems with the indexer are removed.
  536 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  537 +
  538 + $diagnosis = $this->diagnose();
  539 + if (!is_null($diagnosis))
  540 + {
  541 + if (!is_file($lockFile))
  542 + {
  543 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  544 + }
  545 + touch($lockFile);
  546 + return false;
  547 + }
  548 +
  549 + if ($simple)
  550 + {
  551 + return true;
  552 + }
  553 +
  554 + $diagnosis = $this->diagnoseExtractors();
  555 + if (!empty($diagnosis))
  556 + {
  557 + if (!is_file($lockFile))
  558 + {
  559 + foreach($diagnosis as $diag)
  560 + {
  561 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  562 + }
  563 + }
  564 + touch($lockFile);
  565 + return false;
  566 + }
  567 +
  568 + if (is_file($lockFile))
  569 + {
  570 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  571 + unlink($lockFile);
  572 + }
  573 +
  574 + return true;
  575 + }
  576 +
528 /** 577 /**
529 * The main function that may be called repeatedly to index documents. 578 * The main function that may be called repeatedly to index documents.
530 * 579 *
@@ -534,8 +583,12 @@ abstract class Indexer @@ -534,8 +583,12 @@ abstract class Indexer
534 { 583 {
535 global $default; 584 global $default;
536 585
537 - $config =& KTConfig::getSingleton(); 586 + if (!$this->doesDiagnosticsPass())
  587 + {
  588 + return;
  589 + }
538 590
  591 + $config =& KTConfig::getSingleton();
539 if (is_null($max)) 592 if (is_null($max))
540 { 593 {
541 $max = $config->get('indexer/batchDocuments',20); 594 $max = $config->get('indexer/batchDocuments',20);
@@ -600,17 +653,28 @@ abstract class Indexer @@ -600,17 +653,28 @@ abstract class Indexer
600 $indexDocument = in_array($docinfo['what'], array('A','C')); 653 $indexDocument = in_array($docinfo['what'], array('A','C'));
601 $indexDiscussion = in_array($docinfo['what'], array('A','D')); 654 $indexDiscussion = in_array($docinfo['what'], array('A','D'));
602 655
603 - if ($this->debug) $default->log->debug("Indexing docid: $docId extension: '$extension' mimetype: '$mimeType' extractor: '$extractorClass'"); 656 + if ($this->debug)
  657 + {
  658 + $default->log->debug(sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass));
  659 + }
604 660
605 if (empty($extractorClass)) 661 if (empty($extractorClass))
606 { 662 {
607 - if ($this->debug) $default->log->debug("No extractor for docid: $docId"); 663 + if ($this->debug)
  664 + {
  665 + $default->log->debug(sprintf(_kt("No extractor for docid: %d"),$docId));
  666 + }
608 667
609 Indexer::unqueueDocument($docId); 668 Indexer::unqueueDocument($docId);
610 continue; 669 continue;
611 } 670 }
612 671
613 - if ($this->debug) print "Processing document $docId.\n"; 672 + if ($this->debug)
  673 + {
  674 + $default->log->info(sprintf(_kt("Processing document %d.\n"),$docId));
  675 + }
  676 +
  677 + $removeFromQueue = true;
614 if ($indexDocument) 678 if ($indexDocument)
615 { 679 {
616 if (array_key_exists($extractorClass, $extractorCache)) 680 if (array_key_exists($extractorClass, $extractorCache))
@@ -623,7 +687,7 @@ abstract class Indexer @@ -623,7 +687,7 @@ abstract class Indexer
623 687
624 if (!class_exists($extractorClass)) 688 if (!class_exists($extractorClass))
625 { 689 {
626 - $default->log->error("indexDocuments: extractor '$extractorClass' does not exist."); 690 + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' does not exist."),$extractorClass));
627 continue; 691 continue;
628 } 692 }
629 693
@@ -632,13 +696,13 @@ abstract class Indexer @@ -632,13 +696,13 @@ abstract class Indexer
632 696
633 if (is_null($extractor)) 697 if (is_null($extractor))
634 { 698 {
635 - $default->log->error("indexDocuments: extractor '$extractorClass' not resolved - it is null."); 699 + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' not resolved - it is null."),$extractorClass));
636 continue; 700 continue;
637 } 701 }
638 702
639 if (!($extractor instanceof DocumentExtractor)) 703 if (!($extractor instanceof DocumentExtractor))
640 { 704 {
641 - $default->log->error("indexDocuments: extractor '$extractorClass' is not a document extractor class."); 705 + $default->log->error(sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass));
642 continue; 706 continue;
643 } 707 }
644 708
@@ -648,7 +712,7 @@ abstract class Indexer @@ -648,7 +712,7 @@ abstract class Indexer
648 712
649 if (empty($sourceFile) || !is_file($sourceFile)) 713 if (empty($sourceFile) || !is_file($sourceFile))
650 { 714 {
651 - $default->log->error("indexDocuments: source file '$sourceFile' for document $docId does not exist."); 715 + $default->log->error(sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId));
652 Indexer::unqueueDocument($docId); 716 Indexer::unqueueDocument($docId);
653 continue; 717 continue;
654 } 718 }
@@ -659,14 +723,14 @@ abstract class Indexer @@ -659,14 +723,14 @@ abstract class Indexer
659 $result = @copy($sourceFile, $intermediate); 723 $result = @copy($sourceFile, $intermediate);
660 if ($result === false) 724 if ($result === false)
661 { 725 {
662 - $default->log->error("Could not create intermediate file from document $docId"); 726 + $default->log->error(sprintf(_kt("Could not create intermediate file from document %d"),$docId));
663 // problem. lets try again later. probably permission related. log the issue. 727 // problem. lets try again later. probably permission related. log the issue.
664 continue; 728 continue;
665 } 729 }
666 $sourceFile = $intermediate; 730 $sourceFile = $intermediate;
667 } 731 }
668 732
669 - $targetFile = tempnam($tempPath, 'ktindexer') . '.txt'; 733 + $targetFile = tempnam($tempPath, 'ktindexer');
670 734
671 $extractor->setSourceFile($sourceFile); 735 $extractor->setSourceFile($sourceFile);
672 $extractor->setMimeType($mimeType); 736 $extractor->setMimeType($mimeType);
@@ -675,7 +739,10 @@ abstract class Indexer @@ -675,7 +739,10 @@ abstract class Indexer
675 $extractor->setDocument($document); 739 $extractor->setDocument($document);
676 $extractor->setIndexingStatus(null); 740 $extractor->setIndexingStatus(null);
677 $extractor->setExtractionStatus(null); 741 $extractor->setExtractionStatus(null);
678 - if ($this->debug) $default->log->debug("Extra Info docid: $docId Source File: '$sourceFile' Target File: '$targetFile'"); 742 + if ($this->debug)
  743 + {
  744 + $default->log->debug(sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile));
  745 + }
679 746
680 $this->executeHook($extractor, 'pre_extract'); 747 $this->executeHook($extractor, 'pre_extract');
681 $this->executeHook($extractor, 'pre_extract', $mimeType); 748 $this->executeHook($extractor, 'pre_extract', $mimeType);
@@ -691,7 +758,10 @@ abstract class Indexer @@ -691,7 +758,10 @@ abstract class Indexer
691 { 758 {
692 $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); 759 $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
693 760
694 - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); 761 + if (!$indexStatus)
  762 + {
  763 + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId));
  764 + }
695 765
696 $extractor->setIndexingStatus($indexStatus); 766 $extractor->setIndexingStatus($indexStatus);
697 } 767 }
@@ -699,13 +769,16 @@ abstract class Indexer @@ -699,13 +769,16 @@ abstract class Indexer
699 { 769 {
700 if (!$this->filterText($targetFile)) 770 if (!$this->filterText($targetFile))
701 { 771 {
702 - $default->log->error("Problem filtering document $docId"); 772 + $default->log->error(sprintf(_kt("Problem filtering document %d"),$docId));
703 } 773 }
704 else 774 else
705 { 775 {
706 $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); 776 $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
707 777
708 - if (!$indexStatus) $default->log->error("Problem indexing document $docId"); 778 + if (!$indexStatus)
  779 + {
  780 + $default->log->error(sprintf(_kt("Problem indexing document %d - indexDocument"),$docId));
  781 + }
709 782
710 $extractor->setIndexingStatus($indexStatus); 783 $extractor->setIndexingStatus($indexStatus);
711 } 784 }
@@ -717,7 +790,7 @@ abstract class Indexer @@ -717,7 +790,7 @@ abstract class Indexer
717 else 790 else
718 { 791 {
719 $extractor->setExtractionStatus(false); 792 $extractor->setExtractionStatus(false);
720 - $default->log->error("Could not extract contents from document $docId"); 793 + $default->log->error(sprintf(_kt("Could not extract contents from document %d"),$docId));
721 } 794 }
722 795
723 $this->executeHook($extractor, 'post_extract', $mimeType); 796 $this->executeHook($extractor, 'post_extract', $mimeType);
@@ -729,17 +802,147 @@ abstract class Indexer @@ -729,17 +802,147 @@ abstract class Indexer
729 } 802 }
730 803
731 @unlink($targetFile); 804 @unlink($targetFile);
  805 + $removeFromQueue = $indexStatus;
732 } 806 }
733 else 807 else
734 { 808 {
735 $this->indexDiscussion($docId); 809 $this->indexDiscussion($docId);
736 } 810 }
737 811
738 - Indexer::unqueueDocument($docId);  
739 - if ($this->debug) $default->log->debug("Done indexing docid: $docId"); 812 + if ($removeFromQueue)
  813 + {
  814 + Indexer::unqueueDocument($docId);
  815 + }
  816 + if ($this->debug)
  817 + {
  818 + $default->log->debug(sprintf(_kt("Done indexing docid: %d"),$docId));
  819 + }
740 820
741 } 821 }
742 - if ($this->debug) print "Done.\n"; 822 + if ($this->debug)
  823 + {
  824 + $default->log->debug(_kt("Done."));
  825 + }
  826 + }
  827 +
  828 + public function migrateDocuments($max=null)
  829 + {
  830 + if (!$this->doesDiagnosticsPass(true))
  831 + {
  832 + return;
  833 + }
  834 +
  835 + $config =& KTConfig::getSingleton();
  836 + if (is_null($max))
  837 + {
  838 + $max = $config->get('indexer/batchMigrateDocument',500);
  839 + }
  840 +
  841 + global $default;
  842 +
  843 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  844 + if (is_file($lockFile))
  845 + {
  846 + $default->log->info(_kt('migrateDocuments: migration lockfile detected. exiting.'));
  847 + return;
  848 + }
  849 + touch($lockFile);
  850 + $default->log->info(_kt('migrateDocuments: starting!'));
  851 +
  852 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  853 + if (is_null($startTime))
  854 + {
  855 + KTUtil::setSystemSetting('migrationStarted', time());
  856 + }
  857 +
  858 + $maxLoops = 5;
  859 +
  860 + $max = floor($max / $maxLoops);
  861 +
  862 + $start =KTUtil::getBenchmarkTime();
  863 + $noDocs = false;
  864 + $numDocs = 0;
  865 +
  866 + for($loop=0;$loop<$maxLoops;$loop++)
  867 + {
  868 +
  869 + $sql = "SELECT
  870 + document_id, document_text
  871 + FROM
  872 + document_text
  873 + ORDER BY document_id
  874 + LIMIT $max";
  875 + $result = DBUtil::getResultArray($sql);
  876 + if (PEAR::isError($result))
  877 + {
  878 + break;
  879 + }
  880 +
  881 + $docs = count($result);
  882 + if ($docs == 0)
  883 + {
  884 + $noDocs = true;
  885 + break;
  886 + }
  887 + $numDocs += $docs;
  888 +
  889 + foreach($result as $docinfo)
  890 + {
  891 + $docId = $docinfo['document_id'];
  892 +
  893 + $document = Document::get($docId);
  894 + if (PEAR::isError($document) || is_null($document))
  895 + {
  896 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  897 + DBUtil::runQuery($sql);
  898 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!',$docId)));
  899 + continue;
  900 + }
  901 +
  902 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  903 +
  904 + $targetFile = tempnam($tempPath, 'ktindexer');
  905 +
  906 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  907 + {
  908 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  909 + continue;
  910 + }
  911 + // free memory asap ;)
  912 + unset($docinfo['document_text']);
  913 +
  914 + $title = $document->getName();
  915 +
  916 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  917 +
  918 + if ($indexStatus)
  919 + {
  920 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  921 + DBUtil::runQuery($sql);
  922 + }
  923 + else
  924 + {
  925 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  926 + }
  927 +
  928 + @unlink($targetFile);
  929 + }
  930 + }
  931 +
  932 + @unlink($lockFile);
  933 +
  934 + $time = KTUtil::getBenchmarkTime() - $start;
  935 +
  936 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  937 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  938 +
  939 + $default->log->info(sprintf(_kt('migrateDocuments: done in %d seconds!'), $time));
  940 + if ($noDocs)
  941 + {
  942 + $default->log->info(_kt('migrateDocuments: Completed!'));
  943 + KTUtil::setSystemSetting('migrationComplete', true);
  944 + }
  945 +
743 } 946 }
744 947
745 /** 948 /**