Commit ebf14f1e2be0bc59efd8d0adfeb915b855c31285

Authored by Megan Watson
1 parent 7c1f0c28

Rebuilt Lucene with extra debug statements. Split the processors and indexing into 2 queues.

PT: 1731097

Committed by: Megan Watson
bin/luceneserver/ktlucene.jar
No preview for this file type
call_home.php 0 → 100644
  1 +<?php
  2 +/*
  3 +* Data incoming format: <installation guid>|<user count>|<document count>|<KT version>|<KT edition>|<OS info>
  4 +*/
  5 +
  6 +$data = isset($_REQUEST['system_info']) ? strip_tags($_REQUEST['system_info']) : '';
  7 +
  8 +if(empty($data)){
  9 + exit(0);
  10 +}
  11 +
  12 +$file = 'var/system_info.txt';
  13 +$fp = fopen($file, 'a');
  14 +fwrite($fp, $data."\n");
  15 +fclose($fp);
  16 +
  17 +exit(0);
  18 +?>
0 19 \ No newline at end of file
... ...
search2/documentProcessor/bin/documentProcessor.php
... ... @@ -6,7 +6,7 @@
6 6 * KnowledgeTree Community Edition
7 7 * Document Management Made Simple
8 8 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
9   - *
  9 + *
10 10 *
11 11 * This program is free software; you can redistribute it and/or modify it under
12 12 * the terms of the GNU General Public License version 3 as published by the
... ... @@ -47,6 +47,7 @@ chdir(dirname(__FILE__));
47 47 require_once('../documentProcessor.inc.php');
48 48  
49 49 $documentProcessor = DocumentProcessor::get();
  50 +$documentProcessor->processIndexQueue();
50 51 $documentProcessor->processQueue();
51 52 exit;
52 53 ?>
... ...
search2/documentProcessor/documentProcessor.inc.php
... ... @@ -6,7 +6,7 @@
6 6 * KnowledgeTree Community Edition
7 7 * Document Management Made Simple
8 8 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
9   - *
  9 + *
10 10 *
11 11 * This program is free software; you can redistribute it and/or modify it under
12 12 * the terms of the GNU General Public License version 3 as published by the
... ... @@ -105,6 +105,11 @@ class DocumentProcessor
105 105 return $singleton;
106 106 }
107 107  
  108 + /**
  109 + * Load the processors that will get run on the documents, eg pdf generation
  110 + *
  111 + * @return array
  112 + */
108 113 private function loadProcessors()
109 114 {
110 115 // Get list of registered processors (plugins)
... ... @@ -116,7 +121,7 @@ class DocumentProcessor
116 121  
117 122 if(PEAR::isError($results)){
118 123 global $default;
119   - $default->log->debug('documentProcessor: error loading processors').' - '.$results->getMessage();
  124 + $default->log->error('documentProcessor: error loading processors').' - '.$results->getMessage();
120 125 return false;
121 126 }
122 127  
... ... @@ -139,34 +144,65 @@ class DocumentProcessor
139 144 return $processors;
140 145 }
141 146  
142   - public function processQueue()
  147 + /**
  148 + * Fetch the documents in the indexing queue and start the indexer
  149 + *
  150 + */
  151 + public function processIndexQueue()
143 152 {
144 153 global $default;
145   - $default->log->debug('documentProcessor: starting');
  154 +
  155 + if(!$default->enableIndexing){
  156 + $default->log->debug('documentProcessor: indexer disabled');
  157 + return ;
  158 + }
  159 +
  160 + $default->log->debug('documentProcessor: starting indexer');
146 161  
147 162 // Check for lock file to ensure processor is not currently running
148 163 $cacheDir = $default->cacheDirectory;
149 164 $lockFile = $cacheDir . DIRECTORY_SEPARATOR . 'document_processor.lock';
150 165  
151 166 if(file_exists($lockFile)){
152   - // lock file exists, exit
153   - $default->log->debug('documentProcessor: stopping, lock file in place '.$lockFile);
154   - return ;
  167 + // If something causes the document processor to stop part way through processing, the lock
  168 + // file will remain, stopping the document processor from resuming. To work around this problem
  169 + // we check the modification time of the lockfile and remove it if it is older than 24 hours, or
  170 + // 48 hours if the batch size is greater than 1000 documents.
  171 + $stat = stat($lockFile);
  172 + $created = $stat['mtime'];
  173 +
  174 + $gap = 24;
  175 + if($this->limit > 1000){
  176 + $gap = 48;
  177 + $default->log->warn('documentProcessor: batch size of documents to index is set to '.$this->limit.', this could cause problems.');
  178 + }
  179 + $check = time() - ($gap*60*60);
  180 +
  181 + if($check > $created){
  182 + $default->log->error('documentProcessor: lock file is older than '.$gap.' hours, deleting it to restart indexing - '.$lockFile);
  183 + @unlink($lockFile);
  184 + }else{
  185 + // lock file exists, exit
  186 + // log a warning if the lock file is older than half an hour
  187 + $small_gap = time() - (30*60);
  188 + if($small_gap > $created){
  189 + $default->log->warn('documentProcessor: stopping, lock file in place since '. date('Y-m-d H:i:s', $created) .' - '.$lockFile);
  190 + }
  191 + return ;
  192 + }
155 193 }
156 194  
157   - if($default->enableIndexing){
158   - // Setup indexing - load extractors, run diagnostics
159   - if($this->indexer->preIndexingSetup() === false){
160   - $default->log->debug('documentProcessor: stopping - indexer setup failed.');
161   - return;
162   - }
  195 + // Setup indexing - load extractors, run diagnostics
  196 + if($this->indexer->preIndexingSetup() === false){
  197 + $default->log->error('documentProcessor: stopping - indexer setup failed.');
  198 + return;
163 199 }
164 200  
165 201 // Get document queue
166 202 $queue = $this->indexer->getDocumentsQueue($this->limit);
167 203  
168 204 if(empty($queue)){
169   - $default->log->debug('documentProcessor: stopping - no documents in processing queue');
  205 + $default->log->debug('documentProcessor: stopping - no documents in indexing queue');
170 206 return ;
171 207 }
172 208  
... ... @@ -177,7 +213,8 @@ class DocumentProcessor
177 213 foreach($queue as $item){
178 214  
179 215 // Get the document object
180   - $document = Document::get($item['document_id']);
  216 + $docId = $item['document_id'];
  217 + $document = Document::get($docId);
181 218  
182 219 if (PEAR::isError($document))
183 220 {
... ... @@ -186,9 +223,54 @@ class DocumentProcessor
186 223 }
187 224  
188 225 // index document
189   - if($default->enableIndexing){
190   - $this->indexer->processDocument($document, $item);
191   - }
  226 + $this->indexer->processDocument($document, $item);
  227 + }
  228 +
  229 + // update the indexer statistics
  230 + $this->indexer->updateIndexStats();
  231 +
  232 + // Remove lock file to indicate processing has completed
  233 + if(file_exists($lockFile)){
  234 + @unlink($lockFile);
  235 + }
  236 +
  237 + $default->log->debug('documentProcessor: stopping indexer, batch completed');
  238 + }
  239 +
  240 + /**
  241 + * Fetch the process queue for running the processors on
  242 + *
  243 + */
  244 + public function processQueue()
  245 + {
  246 + global $default;
  247 + $default->log->debug('documentProcessor: starting processing');
  248 +
  249 + // Get processing queue
  250 + // Use the same batch size as the indexer (for now)
  251 + // If the batch size is huge then reset it to a smaller number
  252 + // Open office leaks memory, so we don't want to do too many documents at once
  253 + $batch = ($this->limit > 500) ? 500 : $this->limit;
  254 +
  255 + $queue = $this->indexer->getDocumentProcessingQueue($batch);
  256 +
  257 + if(empty($queue)){
  258 + $default->log->debug('documentProcessor: stopping - no documents in processing queue');
  259 + return ;
  260 + }
  261 +
  262 + // Process queue
  263 + foreach($queue as $item){
  264 +
  265 + // Get the document object
  266 + $docId = $item['document_id'];
  267 + $document = Document::get($docId);
  268 +
  269 + if (PEAR::isError($document))
  270 + {
  271 + Indexer::unqueueDocFromProcessing($docId, "Cannot resolve document id: {$document->getMessage()}", 'error');
  272 + continue;
  273 + }
192 274  
193 275 // loop through processors
194 276 if($this->processors !== false){
... ... @@ -204,19 +286,13 @@ class DocumentProcessor
204 286 // Process document
205 287 $processor->setDocument($document);
206 288 $processor->processDocument();
  289 +
  290 + Indexer::unqueueDocFromProcessing($docId, "Document processed", 'debug');
207 291 }
208 292 }
209 293 }
210 294  
211   - // update the indexer statistics
212   - $this->indexer->updateIndexStats();
213   -
214   - // Remove lock file to indicate processing has completed
215   - if(file_exists($lockFile)){
216   - @unlink($lockFile);
217   - }
218   -
219   - $default->log->debug('documentProcessor: stopping');
  295 + $default->log->debug('documentProcessor: stopping processing, batch completed');
220 296 }
221 297  
222 298 /**
... ...
search2/indexing/extractors/OpenOfficeTextExtractor.inc.php
1 1 <?php
2   -
3   -require_once(KT_DIR.'/thirdparty/peclzip/pclzip.lib.php');
4   -
5 2 /**
6 3 * $Id:$
7 4 *
8 5 * KnowledgeTree Community Edition
9 6 * Document Management Made Simple
10 7 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
11   - *
  8 + *
12 9 *
13 10 * This program is free software; you can redistribute it and/or modify it under
14 11 * the terms of the GNU General Public License version 3 as published by the
... ... @@ -39,6 +36,8 @@ require_once(KT_DIR.&#39;/thirdparty/peclzip/pclzip.lib.php&#39;);
39 36 *
40 37 */
41 38  
  39 +require_once(KT_DIR.'/thirdparty/peclzip/pclzip.lib.php');
  40 +
42 41 class OpenOfficeTextExtractor extends ExternalDocumentExtractor
43 42 {
44 43 public function __construct()
... ... @@ -138,6 +137,7 @@ class OpenOfficeTextExtractor extends ExternalDocumentExtractor
138 137 */
139 138 public function diagnose()
140 139 {
  140 + return null;
141 141 if (false === $this->unzip)
142 142 {
143 143 return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip);
... ...
search2/indexing/extractors/OpenXmlTextExtractor.inc.php
... ... @@ -6,7 +6,7 @@
6 6 * KnowledgeTree Community Edition
7 7 * Document Management Made Simple
8 8 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
9   - *
  9 + *
10 10 *
11 11 * This program is free software; you can redistribute it and/or modify it under
12 12 * the terms of the GNU General Public License version 3 as published by the
... ... @@ -37,6 +37,8 @@
37 37 *
38 38 */
39 39  
  40 +require_once(KT_DIR.'/thirdparty/peclzip/pclzip.lib.php');
  41 +
40 42 class OpenXmlTextExtractor extends ExternalDocumentExtractor
41 43 {
42 44 public function __construct()
... ... @@ -321,6 +323,7 @@ class OpenXmlTextExtractor extends ExternalDocumentExtractor
321 323 */
322 324 public function diagnose()
323 325 {
  326 + return null;
324 327 if (false === $this->unzip)
325 328 {
326 329 return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip);
... ...
search2/indexing/indexerCore.inc.php
... ... @@ -6,7 +6,7 @@
6 6 * KnowledgeTree Community Edition
7 7 * Document Management Made Simple
8 8 * Copyright (C) 2008, 2009 KnowledgeTree Inc.
9   - *
  9 + *
10 10 *
11 11 * This program is free software; you can redistribute it and/or modify it under
12 12 * the terms of the GNU General Public License version 3 as published by the
... ... @@ -643,6 +643,16 @@ abstract class Indexer
643 643  
644 644 $default->log->debug("index: Queuing indexing of $document_id");
645 645  
  646 + // Appending the process queue to the index for convenience
  647 + // Don't want to complicate matters by creating too many new classes and files
  648 + Indexer::unqueueDocFromProcessing($document_id);
  649 +
  650 + // enqueue item
  651 + $date = date('Y-m-d H:i:s');
  652 + $sql = "INSERT INTO process_queue(document_id, date_added) VALUES($document_id, '$date')";
  653 + DBUtil::runQuery($sql);
  654 +
  655 + $default->log->debug("Processing queue: Queuing document for processing - $document_id");
646 656 }
647 657  
648 658 private static function incrementCount()
... ... @@ -722,8 +732,37 @@ abstract class Indexer
722 732 DBUtil::runQuery($sql);
723 733  
724 734 $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  735 +
  736 + // Multiple indexing processes cannot occur at the same time - the lock file prevents this.
  737 + // However if the indexing is interrupted the documents can get stuck in the queue with the processdate set
  738 + // but never having been indexed. To prevent this we will clear the processdate on all documents without errors.
  739 + $sql = 'UPDATE index_files SET processdate = null where processdate is not null and status_msg is null';
  740 + $res = DBUtil::runQuery($sql);
  741 +
  742 + if(PEAR::isError($res)){
  743 + $default->log->error("Indexer::clearoutDeleted: something happened ".$res->getMessage);
  744 + }
  745 +
  746 + $default->log->debug("Indexer::clearoutDeleted: resetting processdate for documents that may be stuck");
725 747 }
726 748  
  749 + /**
  750 + * Clearout the processing of documents that no longer exist.
  751 + *
  752 + */
  753 + public static function clearoutDeletedFromProcessor()
  754 + {
  755 + global $default;
  756 +
  757 + $sql = 'DELETE FROM
  758 + process_queue
  759 + WHERE
  760 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  761 + NOT EXISTS(SELECT process_queue.document_id FROM documents WHERE process_queue.document_id=documents.id)';
  762 + $result = DBUtil::runQuery($sql);
  763 +
  764 + $default->log->debug("Process queue: removed documents from processing queue that have been deleted");
  765 + }
727 766  
728 767 /**
729 768 * Check if a document is scheduled to be indexed
... ... @@ -1191,7 +1230,7 @@ abstract class Indexer
1191 1230 }
1192 1231  
1193 1232 /**
1194   - * Get the queue of documents for processing
  1233 + * Get the queue of documents for indexing
1195 1234 * Refactored from indexDocuments()
1196 1235 */
1197 1236 public function getDocumentsQueue($max = null)
... ... @@ -1222,7 +1261,7 @@ abstract class Indexer
1222 1261 if (PEAR::isError($result))
1223 1262 {
1224 1263 //unlink($indexLockFile);
1225   - if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  1264 + if ($this->debug) $default->log->error('indexDocuments: stopping - db error');
1226 1265 return;
1227 1266 }
1228 1267 KTUtil::setSystemSetting('luceneIndexingDate', time());
... ... @@ -1253,6 +1292,51 @@ abstract class Indexer
1253 1292 }
1254 1293  
1255 1294 /**
  1295 + * Get the queue of documents for processing
  1296 + *
  1297 + * @param int|null $max Maximum number of queue entries to fetch; values below 1 (or non-numeric) fall back to 20
  1298 + * @return array|null The rows returned by DBUtil::getResultArray(), or null on a db error or when nothing is queued
  1299 + */
  1300 + public function getDocumentProcessingQueue($max = null)
  1301 + {
  1302 + global $default;
  1303 + // $max is interpolated into the LIMIT clause below, so force it to a positive integer
  1304 + $max = ((int)$max > 0) ? (int)$max : 20;
  1305 + // Cleanup the queue
  1306 + Indexer::clearoutDeletedFromProcessor();
  1307 +
  1308 + // select entries never processed, or last processed more than a day ago, whose metadata version has status_id 1
  1309 + // mysql specific limit!
  1310 + $date = date('Y-m-d H:i:s');
  1311 + $sql = "SELECT
  1312 + pq.document_id, mt.filetypes, mt.mimetypes
  1313 + FROM
  1314 + process_queue pq
  1315 + INNER JOIN documents d ON pq.document_id=d.id
  1316 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1317 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1318 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1319 + WHERE
  1320 + (pq.date_processed IS NULL or pq.date_processed < date_sub('$date', interval 1 day)) AND dmv.status_id=1
  1321 + ORDER BY date_added
  1322 + LIMIT $max";
  1323 + $result = DBUtil::getResultArray($sql);
  1324 + if (PEAR::isError($result))
  1325 + {
  1326 + $default->log->error('Processing queue: stopping - db error: '.$result->getMessage());
  1327 + return;
  1328 + }
  1329 +
  1330 + // bail if no work to do
  1331 + if (count($result) == 0)
  1332 + {
  1333 + $default->log->debug('Processing queue: stopping - no work to be done');
  1334 + return;
  1335 + }
  1336 + return $result;
  1337 + }
  1338 +
  1339 + /**
1256 1340 * Process a document - extract text and index it
1257 1341 * Refactored from indexDocuments()
1258 1342 *
... ... @@ -1813,7 +1897,7 @@ abstract class Indexer
1813 1897 }
1814 1898  
1815 1899 /**
1816   - * Remove the document from the queue. This is normally called when it has been processed.
  1900 + * Remove the document from the indexing queue. This is normally called when it has been processed.
1817 1901 *
1818 1902 * @param int $docid
1819 1903 */
... ... @@ -1829,6 +1913,23 @@ abstract class Indexer
1829 1913 }
1830 1914  
1831 1915 /**
  1916 + * Remove the document from the processing queue. This is normally called when it has been processed.
  1917 + *
  1918 + * @param int $docid Document id; cast to an integer before it is placed in the SQL statement
  1919 + * @param string|false $reason Message to log about the removal, or false (default) to log nothing
  1920 + * @param string $level Name of the logger method used to write $reason, eg 'debug' or 'error'
  1921 + */
  1922 + public static function unqueueDocFromProcessing($docid, $reason=false, $level='debug')
  1923 + {
  1924 + $docid = (int)$docid;
  1925 + DBUtil::runQuery("DELETE FROM process_queue WHERE document_id=$docid");
  1926 + if ($reason !== false) {
  1927 + global $default;
  1928 + $default->log->$level("Processor queue: removing document $docid from the queue - $reason");
  1929 + }
  1930 + }
  1931 +
  1932 + /**
1832 1933 * Run a query on the index.
1833 1934 *
1834 1935 * @param string $query
... ...
sql/mysql/install/data.sql
... ... @@ -1774,7 +1774,8 @@ INSERT INTO `upgrades` VALUES
1774 1774 (230,'sql*3.7.0-1*0*3.7.0-1/hide_zip_config.sql','Database upgrade to version 3.7.0-1: Hide zip config','2009-09-01 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'),
1775 1775 (231,'sql*3.7.0-1*0*3.7.0-1/mime_extractors_reset.sql','Database upgrade to version 3.7.0-1: Mime extractors reset','2009-09-01 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'),
1776 1776 (232,'upgrade*3.7.0-1*99*upgrade3.7.0-1','Upgrade from version 3.6.3 to 3.7.0-1','2009-11-13 00:00:00',1,'upgrade*3.7.0-1*99*upgrade3.7.0-1'),
1777   -(233,'upgrade*3.7.0.2*99*upgrade3.7.0.2','Upgrade from version 3.7.0-1 to 3.7.0.2','2009-11-19 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2');
  1777 +(233,'sql*3.7.0.2*0*3.7.0.2/processor_queue.sql','Database upgrade to version 3.7.0-1: Processor Queue','2009-09-01 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2'),
  1778 +(234,'upgrade*3.7.0.2*99*upgrade3.7.0.2','Upgrade from version 3.7.0-1 to 3.7.0.2','2009-11-19 00:00:00',1,'upgrade*3.7.0.2*99*upgrade3.7.0.2');
1778 1779 /*!40000 ALTER TABLE `upgrades` ENABLE KEYS */;
1779 1780 UNLOCK TABLES;
1780 1781  
... ...
sql/mysql/install/structure.sql
... ... @@ -4,7 +4,7 @@
4 4 -- KnowledgeTree Community Edition
5 5 -- Document Management Made Simple
6 6 -- Copyright (C) 2008, 2009 KnowledgeTree Inc.
7   ---
  7 +--
8 8 --
9 9 -- This program is free software; you can redistribute it and/or modify it under
10 10 -- the terms of the GNU General Public License version 3 as published by the
... ... @@ -1327,6 +1327,20 @@ CREATE TABLE `plugins` (
1327 1327 ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
1328 1328  
1329 1329 --
  1330 +-- Table structure for table `process_queue`
  1331 +-- date_added keeps the time the document was queued (no ON UPDATE); date_processed stays NULL until processed
  1332 +
  1333 +CREATE TABLE `process_queue` (
  1334 + `document_id` int(11) NOT NULL,
  1335 + `date_added` timestamp NOT NULL default CURRENT_TIMESTAMP,
  1336 + `date_processed` timestamp NULL default NULL,
  1337 + `status_msg` mediumtext,
  1338 + `process_type` varchar(20),
  1339 + PRIMARY KEY (`document_id`),
  1340 + CONSTRAINT `process_queue_ibfk_1` FOREIGN KEY (`document_id`) REFERENCES `documents` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
  1341 +) ENGINE=InnoDB DEFAULT CHARSET=utf8;
  1342 +
  1343 +--
1330 1344 -- Table structure for table `role_allocations`
1331 1345 --
1332 1346  
... ...
sql/mysql/upgrade/3.7.0.2/processor_queue.sql 0 → 100644
  1 +--
  2 +-- Table structure for table `process_queue`
  3 +--
  4 +
  5 +CREATE table `process_queue` (
  6 + `document_id` int(11) NOT NULL,
  7 + `date_added` timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
  8 + `date_processed` timestamp,
  9 + `status_msg` mediumtext,
  10 + `process_type` varchar(20),
  11 + PRIMARY KEY (`document_id`),
  12 + CONSTRAINT `process_queue_ibfk_1` FOREIGN KEY (`document_id`) REFERENCES `documents` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
  13 +) ENGINE=InnoDB DEFAULT CHARSET=utf8;
... ...