From ebf14f1e2be0bc59efd8d0adfeb915b855c31285 Mon Sep 17 00:00:00 2001 From: Megan Watson Date: Fri, 20 Nov 2009 17:21:05 +0200 Subject: [PATCH] Rebuilt Lucene with extra debug statements. Split the processors and indexing into 2 queues. PT: 1731097 --- bin/luceneserver/ktlucene.jar | Bin 23504 -> 0 bytes call_home.php | 18 ++++++++++++++++++ search2/documentProcessor/bin/documentProcessor.php | 3 ++- search2/documentProcessor/documentProcessor.inc.php | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- search2/indexing/extractors/OpenOfficeTextExtractor.inc.php | 8 ++++---- search2/indexing/extractors/OpenXmlTextExtractor.inc.php | 5 ++++- search2/indexing/indexerCore.inc.php | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- sql/mysql/install/data.sql | 3 ++- sql/mysql/install/structure.sql | 16 +++++++++++++++- sql/mysql/upgrade/3.7.0.2/processor_queue.sql | 13 +++++++++++++ 10 files changed, 266 insertions(+), 39 deletions(-) create mode 100644 call_home.php create mode 100644 sql/mysql/upgrade/3.7.0.2/processor_queue.sql diff --git a/bin/luceneserver/ktlucene.jar b/bin/luceneserver/ktlucene.jar index cf908d4..92192d0 100644 Binary files a/bin/luceneserver/ktlucene.jar and b/bin/luceneserver/ktlucene.jar differ diff --git a/call_home.php b/call_home.php new file mode 100644 index 0000000..7520d41 --- /dev/null +++ b/call_home.php @@ -0,0 +1,18 @@ +||||| +*/ + +$data = isset($_REQUEST['system_info']) ? 
strip_tags($_REQUEST['system_info']) : ''; + +if(empty($data)){ + exit(0); +} + +$file = 'var/system_info.txt'; +$fp = fopen($file, 'a'); +fwrite($fp, $data."\n"); +fclose($fp); + +exit(0); +?> \ No newline at end of file diff --git a/search2/documentProcessor/bin/documentProcessor.php b/search2/documentProcessor/bin/documentProcessor.php index 297fa3e..e1a003e 100644 --- a/search2/documentProcessor/bin/documentProcessor.php +++ b/search2/documentProcessor/bin/documentProcessor.php @@ -6,7 +6,7 @@ * KnowledgeTree Community Edition * Document Management Made Simple * Copyright (C) 2008, 2009 KnowledgeTree Inc. - * + * * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 3 as published by the @@ -47,6 +47,7 @@ chdir(dirname(__FILE__)); require_once('../documentProcessor.inc.php'); $documentProcessor = DocumentProcessor::get(); +$documentProcessor->processIndexQueue(); $documentProcessor->processQueue(); exit; ?> diff --git a/search2/documentProcessor/documentProcessor.inc.php b/search2/documentProcessor/documentProcessor.inc.php index 3e54746..9967e57 100644 --- a/search2/documentProcessor/documentProcessor.inc.php +++ b/search2/documentProcessor/documentProcessor.inc.php @@ -6,7 +6,7 @@ * KnowledgeTree Community Edition * Document Management Made Simple * Copyright (C) 2008, 2009 KnowledgeTree Inc. - * + * * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 3 as published by the @@ -105,6 +105,11 @@ class DocumentProcessor return $singleton; } + /** + * Load the processors that will get run on the documents, eg pdf generation + * + * @return array + */ private function loadProcessors() { // Get list of registered processors (plugins) @@ -116,7 +121,7 @@ class DocumentProcessor if(PEAR::isError($results)){ global $default; - $default->log->debug('documentProcessor: error loading processors').' 
- '.$results->getMessage(); + $default->log->error('documentProcessor: error loading processors - '.$results->getMessage()); return false; } @@ -139,34 +144,65 @@ class DocumentProcessor return $processors; } - public function processQueue() + /** + * Fetch the documents in the indexing queue and start the indexer + * + */ + public function processIndexQueue() { global $default; - $default->log->debug('documentProcessor: starting'); + + if(!$default->enableIndexing){ + $default->log->debug('documentProcessor: indexer disabled'); + return ; + } + + $default->log->debug('documentProcessor: starting indexer'); // Check for lock file to ensure processor is not currently running $cacheDir = $default->cacheDirectory; $lockFile = $cacheDir . DIRECTORY_SEPARATOR . 'document_processor.lock'; if(file_exists($lockFile)){ - // lock file exists, exit - $default->log->debug('documentProcessor: stopping, lock file in place '.$lockFile); - return ; + // If something causes the document processor to stop part way through processing, the lock + // file will remain stopping the document processor from resuming. To workaround this problem + // we check the creation date of the lockfile and remove it if it is older than 24 hours or + // 48 hours if the batch size is greater than 1000 documents. + $stat = stat($lockFile); + $created = $stat['mtime']; + + $gap = 24; + if($this->limit > 1000){ + $gap = 48; + $default->log->warn('documentProcessor: batch size of documents to index is set to '.$this->limit.', this could cause problems.'); + } + $check = time() - ($gap*60*60); + + if($check > $created){ + $default->log->error('documentProcessor: lock file is older than '.$gap.' 
hours, deleting it to restart indexing - '.$lockFile); + @unlink($lockFile); + }else{ + // lock file exists, exit + // through a warning if the lock file is older than half an hour + $small_gap = time() - (30*60); + if($small_gap > $created){ + $default->log->warn('documentProcessor: stopping, lock file in place since '. date('Y-m-d H:i:s', $created) .' - '.$lockFile); + } + return ; + } } - if($default->enableIndexing){ - // Setup indexing - load extractors, run diagnostics - if($this->indexer->preIndexingSetup() === false){ - $default->log->debug('documentProcessor: stopping - indexer setup failed.'); - return; - } + // Setup indexing - load extractors, run diagnostics + if($this->indexer->preIndexingSetup() === false){ + $default->log->error('documentProcessor: stopping - indexer setup failed.'); + return; } // Get document queue $queue = $this->indexer->getDocumentsQueue($this->limit); if(empty($queue)){ - $default->log->debug('documentProcessor: stopping - no documents in processing queue'); + $default->log->debug('documentProcessor: stopping - no documents in indexing queue'); return ; } @@ -177,7 +213,8 @@ class DocumentProcessor foreach($queue as $item){ // Get the document object - $document = Document::get($item['document_id']); + $docId = $item['document_id']; + $document = Document::get($docId); if (PEAR::isError($document)) { @@ -186,9 +223,54 @@ class DocumentProcessor } // index document - if($default->enableIndexing){ - $this->indexer->processDocument($document, $item); - } + $this->indexer->processDocument($document, $item); + } + + // update the indexer statistics + $this->indexer->updateIndexStats(); + + // Remove lock file to indicate processing has completed + if(file_exists($lockFile)){ + @unlink($lockFile); + } + + $default->log->debug('documentProcessor: stopping indexer, batch completed'); + } + + /** + * Fetch the process queue for running the processors on + * + */ + public function processQueue() + { + global $default; + 
$default->log->debug('documentProcessor: starting processing'); + + // Get processing queue + // Use the same batch size as the indexer (for now) + // If the batch size is huge then reset it to a smaller number + // Open office leaks memory, so we don't want to do too many documents at once + $batch = ($this->limit > 500) ? 500 : $this->limit; + + $queue = $this->indexer->getDocumentProcessingQueue($batch); + + if(empty($queue)){ + $default->log->debug('documentProcessor: stopping - no documents in processing queue'); + return ; + } + + // Process queue + foreach($queue as $item){ + + // Get the document object + $docId = $item['document_id']; + $document = Document::get($docId); + + if (PEAR::isError($document)) + { + Indexer::unqueueDocFromProcessing($docId, "Cannot resolve document id: {$document->getMessage()}", 'error'); + continue; + } // loop through processors if($this->processors !== false){ @@ -204,19 +286,13 @@ class DocumentProcessor // Process document $processor->setDocument($document); $processor->processDocument(); + + Indexer::unqueueDocFromProcessing($docId, "Document processed", 'debug'); } } } - // update the indexer statistics - $this->indexer->updateIndexStats(); - - // Remove lock file to indicate processing has completed - if(file_exists($lockFile)){ - @unlink($lockFile); - } - - $default->log->debug('documentProcessor: stopping'); + $default->log->debug('documentProcessor: stopping processing, batch completed'); } /** diff --git a/search2/indexing/extractors/OpenOfficeTextExtractor.inc.php b/search2/indexing/extractors/OpenOfficeTextExtractor.inc.php index 36e18cc..dfee453 100644 --- a/search2/indexing/extractors/OpenOfficeTextExtractor.inc.php +++ b/search2/indexing/extractors/OpenOfficeTextExtractor.inc.php @@ -1,14 +1,11 @@ unzip) { return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip); diff --git a/search2/indexing/extractors/OpenXmlTextExtractor.inc.php b/search2/indexing/extractors/OpenXmlTextExtractor.inc.php index 
4cb09ba..609889d 100644 --- a/search2/indexing/extractors/OpenXmlTextExtractor.inc.php +++ b/search2/indexing/extractors/OpenXmlTextExtractor.inc.php @@ -6,7 +6,7 @@ * KnowledgeTree Community Edition * Document Management Made Simple * Copyright (C) 2008, 2009 KnowledgeTree Inc. - * + * * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 3 as published by the @@ -37,6 +37,8 @@ * */ +require_once(KT_DIR.'/thirdparty/peclzip/pclzip.lib.php'); + class OpenXmlTextExtractor extends ExternalDocumentExtractor { public function __construct() @@ -321,6 +323,7 @@ class OpenXmlTextExtractor extends ExternalDocumentExtractor */ public function diagnose() { + return null; if (false === $this->unzip) { return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip); diff --git a/search2/indexing/indexerCore.inc.php b/search2/indexing/indexerCore.inc.php index ead6e54..9b26082 100755 --- a/search2/indexing/indexerCore.inc.php +++ b/search2/indexing/indexerCore.inc.php @@ -6,7 +6,7 @@ * KnowledgeTree Community Edition * Document Management Made Simple * Copyright (C) 2008, 2009 KnowledgeTree Inc. 
- * + * * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 3 as published by the @@ -643,6 +643,16 @@ abstract class Indexer $default->log->debug("index: Queuing indexing of $document_id"); + // Appending the process queue to the index for convenience + // Don't want to complicate matters by creating too many new classes and files + Indexer::unqueueDocFromProcessing($document_id); + + // enqueue item + $date = date('Y-m-d H:i:s'); + $sql = "INSERT INTO process_queue(document_id, date_added) VALUES($document_id, '$date')"; + DBUtil::runQuery($sql); + + $default->log->debug("Processing queue: Queuing document for processing - $document_id"); } private static function incrementCount() @@ -722,8 +732,37 @@ abstract class Indexer DBUtil::runQuery($sql); $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted"); + + // Multiple indexing processes cannot occur at the same time - the lock file prevents this. + // However if the indexing is interrupted the documents can get stuck in the queue with the processdate set + // but never having been indexed. To prevent this we will clear the processdate on all documents without errors. + $sql = 'UPDATE index_files SET processdate = null where processdate is not null and status_msg is null'; + $res = DBUtil::runQuery($sql); + + if(PEAR::isError($res)){ + $default->log->error("Indexer::clearoutDeleted: something happened ".$res->getMessage()); + } + + $default->log->debug("Indexer::clearoutDeleted: resetting processdate for documents that may be stuck"); } + /** + * Clearout the processing of documents that no longer exist. 
+ * + */ + public static function clearoutDeletedFromProcessor() + { + global $default; + + $sql = 'DELETE FROM + process_queue + WHERE + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR + NOT EXISTS(SELECT process_queue.document_id FROM documents WHERE process_queue.document_id=documents.id)'; + $result = DBUtil::runQuery($sql); + + $default->log->debug("Process queue: removed documents from processing queue that have been deleted"); + } /** * Check if a document is scheduled to be indexed @@ -1191,7 +1230,7 @@ abstract class Indexer } /** - * Get the queue of documents for processing + * Get the queue of documents for indexing * Refactored from indexDocuments() */ public function getDocumentsQueue($max = null) @@ -1222,7 +1261,7 @@ abstract class Indexer if (PEAR::isError($result)) { //unlink($indexLockFile); - if ($this->debug) $default->log->debug('indexDocuments: stopping - db error'); + if ($this->debug) $default->log->error('indexDocuments: stopping - db error'); return; } KTUtil::setSystemSetting('luceneIndexingDate', time()); @@ -1253,6 +1292,51 @@ abstract class Indexer } /** + * Get the queue of documents for processing + * + */ + public function getDocumentProcessingQueue($max = null) + { + global $default; + $max = (empty($max)) ? 20 : $max; + + // Cleanup the queue + Indexer::clearoutDeletedFromProcessor(); + + $date = date('Y-m-d H:i:s'); + // identify the indexers that must run + // mysql specific limit! 
+ $sql = "SELECT + pq.document_id, mt.filetypes, mt.mimetypes + FROM + process_queue pq + INNER JOIN documents d ON pq.document_id=d.id + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id + INNER JOIN mime_types mt ON dcv.mime_id=mt.id + WHERE + (pq.date_processed IS NULL or pq.date_processed < date_sub('$date', interval 1 day)) AND dmv.status_id=1 + ORDER BY date_added + LIMIT $max"; + + $result = DBUtil::getResultArray($sql); + if (PEAR::isError($result)) + { + $default->log->error('Processing queue: stopping - db error: '.$result->getMessage()); + return; + } + + // bail if no work to do + if (count($result) == 0) + { + $default->log->debug('Processing queue: stopping - no work to be done'); + return; + } + + return $result; + } + + /** * Process a document - extract text and index it * Refactored from indexDocuments() * @@ -1813,7 +1897,7 @@ abstract class Indexer } /** - * Remove the document from the queue. This is normally called when it has been processed. + * Remove the document from the indexing queue. This is normally called when it has been processed. * * @param int $docid */ @@ -1829,6 +1913,23 @@ abstract class Indexer } /** + * Remove the document from the processing queue. This is normally called when it has been processed. + * + * @param int $docid + */ + public static function unqueueDocFromProcessing($docid, $reason=false, $level='debug') + { + $sql = "DELETE FROM process_queue WHERE document_id=$docid"; + $result = DBUtil::runQuery($sql); + + if ($reason !== false) + { + global $default; + $default->log->$level("Processor queue: removing document $docid from the queue - $reason"); + } + } + + /** * Run a query on the index. 
* * @param string $query diff --git a/sql/mysql/install/data.sql b/sql/mysql/install/data.sql index 1a7d4d3..1fe1968 100755 --- a/sql/mysql/install/data.sql +++ b/sql/mysql/install/data.sql @@ -1774,7 +1774,8 @@ INSERT INTO `upgrades` VALUES (230,'sql*3.7.0-1*0*3.7.0-1/hide_zip_config.sql','Database upgrade to version 3.7.0-1: Hide zip config','2009-09-01 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'), (231,'sql*3.7.0-1*0*3.7.0-1/mime_extractors_reset.sql','Database upgrade to version 3.7.0-1: Mime extractors reset','2009-09-01 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'), (232,'upgrade*3.7.0-1*99*upgrade3.7.0-1','Upgrade from version 3.6.3 to 3.7.0-1','2009-11-13 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'), -(233,'upgrade*3.7.0.2*99*upgrade3.7.0.2','Upgrade from version 3.7.0-1 to 3.7.0.2','2009-11-19 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2'); +(233,'sql*3.7.0.2*0*3.7.0.2/processor_queue.sql','Database upgrade to version 3.7.0.2: Processor Queue','2009-09-01 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2'), +(234,'upgrade*3.7.0.2*99*upgrade3.7.0.2','Upgrade from version 3.7.0-1 to 3.7.0.2','2009-11-19 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2'); /*!40000 ALTER TABLE `upgrades` ENABLE KEYS */; UNLOCK TABLES; diff --git a/sql/mysql/install/structure.sql b/sql/mysql/install/structure.sql index e86d9a6..e4a604d 100644 --- a/sql/mysql/install/structure.sql +++ b/sql/mysql/install/structure.sql @@ -4,7 +4,7 @@ -- KnowledgeTree Community Edition -- Document Management Made Simple -- Copyright (C) 2008, 2009 KnowledgeTree Inc. 
--- +-- -- -- This program is free software; you can redistribute it and/or modify it under -- the terms of the GNU General Public License version 3 as published by the @@ -1327,6 +1327,20 @@ CREATE TABLE `plugins` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8; -- +-- Table structure for table `process_queue` +-- + +CREATE table `process_queue` ( + `document_id` int(11) NOT NULL, + `date_added` timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP, + `date_processed` timestamp, + `status_msg` mediumtext, + `process_type` varchar(20), + PRIMARY KEY (`document_id`), + CONSTRAINT `process_queue_ibfk_1` FOREIGN KEY (`document_id`) REFERENCES `documents` (`id`) ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8; + +-- -- Table structure for table `role_allocations` -- diff --git a/sql/mysql/upgrade/3.7.0.2/processor_queue.sql b/sql/mysql/upgrade/3.7.0.2/processor_queue.sql new file mode 100644 index 0000000..81250da --- /dev/null +++ b/sql/mysql/upgrade/3.7.0.2/processor_queue.sql @@ -0,0 +1,13 @@ +-- +-- Table structure for table `process_queue` +-- + +CREATE table `process_queue` ( + `document_id` int(11) NOT NULL, + `date_added` timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP, + `date_processed` timestamp, + `status_msg` mediumtext, + `process_type` varchar(20), + PRIMARY KEY (`document_id`), + CONSTRAINT `process_queue_ibfk_1` FOREIGN KEY (`document_id`) REFERENCES `documents` (`id`) ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8; -- libgit2 0.21.4