Commit ca9e23dd6281155fc86bcd97be9d1a3022b94d50

Authored by kevin_fourie
1 parent f4eb662e

Merged in from DEV trunk...

KTS-3444
"Indexing needs to be more resilient when encountering errors with open office"
Fixed.

Committed By: Conrad Vermeulen
Reviewed By: Philip Arkoll


git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/branches/3.5.3-Branch@8655 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/extractorCore.inc.php
@@ -557,7 +557,10 @@ abstract class TextExtractor extends DocumentExtractor @@ -557,7 +557,10 @@ abstract class TextExtractor extends DocumentExtractor
557 */ 557 */
558 public function extractTextContent() 558 public function extractTextContent()
559 { 559 {
560 - $content = file_get_contents($this->sourcefile); 560 +
  561 + $config = KTConfig::getSingleton();
  562 + $maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default
  563 + $content = substr(file_get_contents($this->sourcefile), 0, $maxTextSize);
561 if (false === $content) 564 if (false === $content)
562 { 565 {
563 return false; 566 return false;
search2/indexing/extractors/OOTextExtractor.inc.php
@@ -123,13 +123,53 @@ class OOTextExtractor extends ExternalDocumentExtractor @@ -123,13 +123,53 @@ class OOTextExtractor extends ExternalDocumentExtractor
123 123
124 public function extractTextContent() 124 public function extractTextContent()
125 { 125 {
126 - if (false === parent::extractTextContent()) 126 + global $default;
  127 +
  128 + $docId = $this->document->getId();
  129 +
  130 + if (empty($this->extension))
  131 + {
  132 + $default->log->info("DocumentId: $docId - Document does not have an extension");
  133 + Indexer::unqueueDocument($docId, sprintf(("Removing document from queue: documentId %d"),$docId));
  134 + return false;
  135 + }
  136 +
  137 + // Open Office does not support the following files
  138 + if (in_array($this->extension, array('xlt')))
  139 + {
  140 + $default->log->info("DocumentId: $docId - Document does not have an extension");
  141 + Indexer::unqueueDocument($docId, sprintf(("Removing document from queue: documentId %d"),$docId));
  142 + return false;
  143 + }
  144 +
  145 + if (false === parent::extractTextContent())
127 { 146 {
  147 + if (strpos($this->output, 'OpenOffice process not found or not listening') !== false)
  148 + {
  149 + $indexer = Indexer::get();
  150 + $indexer->restartBatch();
  151 + return false;
  152 + }
  153 + elseif (strpos($this->output, 'Unexpected connection closure') !== false
  154 + || strpos($this->output, '\'NoneType\' object has no attribute \'storeToURL\'') !== false
  155 + || strpos($this->output, 'The document could not be opened for conversion. This could indicate an unsupported mimetype.') !== false
  156 + || strpos($this->output, 'URL seems to be an unsupported one.') !== false
  157 + || strpos($this->output, '__main__.com.sun.star.task.ErrorCodeIOException') !== false)
  158 + {
  159 + $default->log->info("DocumentId: $docId - Suspect the file cannot be indexed by Open Office.");
  160 + file_put_contents($this->targetfile, '');
  161 + $indexer = Indexer::get();
  162 + $indexer->restartBatch();
  163 +
  164 + Indexer::unqueueDocument($docId, sprintf(_kt("Removing document from queue: documentId %d"),$docId));
  165 + return true;
  166 + }
128 return false; 167 return false;
129 } 168 }
130 169
131 if ($this->targetExtension != 'html') 170 if ($this->targetExtension != 'html')
132 { 171 {
  172 + file_put_contents($this->targetfile, '');
133 return true; 173 return true;
134 } 174 }
135 $content = file_get_contents($this->targetfile); 175 $content = file_get_contents($this->targetfile);
search2/indexing/extractors/PDFExtractor.inc.php
@@ -59,15 +59,26 @@ class PDFExtractor extends ApplicationExtractor @@ -59,15 +59,26 @@ class PDFExtractor extends ApplicationExtractor
59 59
60 protected function exec($cmd) 60 protected function exec($cmd)
61 { 61 {
  62 + global $default;
62 $res = parent::exec($cmd); 63 $res = parent::exec($cmd);
63 64
64 - if (false === $res && (strpos($this->output, 'Copying of text from this document is not allowed') !== false)) 65 + if (false === $res && ((strpos($this->output, 'Copying of text from this document is not allowed') !== false) ||
  66 + (strpos($this->output, 'Incorrect password') !== false)))
65 { 67 {
66 $this->output = ''; 68 $this->output = '';
67 file_put_contents($this->targetfile, _kt('Security properties on the PDF document prevent text from being extracted.')); 69 file_put_contents($this->targetfile, _kt('Security properties on the PDF document prevent text from being extracted.'));
  70 + $default->log->info('Security properties on the PDF document prevent text from being extracted.');
68 return true; 71 return true;
69 } 72 }
70 73
  74 + if (false === $res && (strpos($this->output, 'PDF file is damaged') !== false))
  75 + {
  76 + $this->output = '';
  77 + $default->log->info('PDF file is damaged');
  78 + return true;
  79 + }
  80 +
  81 +
71 if (false === $res && (strpos($this->output, '(continuing anyway)') !== false)) 82 if (false === $res && (strpos($this->output, '(continuing anyway)') !== false))
72 { 83 {
73 $this->output = ''; 84 $this->output = '';
search2/indexing/indexerCore.inc.php
1 -<?php  
2 -  
3 -/**  
4 - * $Id:$  
5 - *  
6 - * KnowledgeTree Community Edition  
7 - * Document Management Made Simple  
8 - * Copyright (C) 2008 KnowledgeTree Inc.  
9 - * Portions copyright The Jam Warehouse Software (Pty) Limited  
10 - *  
11 - * This program is free software; you can redistribute it and/or modify it under  
12 - * the terms of the GNU General Public License version 3 as published by the  
13 - * Free Software Foundation.  
14 - *  
15 - * This program is distributed in the hope that it will be useful, but WITHOUT  
16 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS  
17 - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more  
18 - * details.  
19 - *  
20 - * You should have received a copy of the GNU General Public License  
21 - * along with this program. If not, see <http://www.gnu.org/licenses/>.  
22 - *  
23 - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,  
24 - * California 94120-7775, or email info@knowledgetree.com.  
25 - *  
26 - * The interactive user interfaces in modified source and object code versions  
27 - * of this program must display Appropriate Legal Notices, as required under  
28 - * Section 5 of the GNU General Public License version 3.  
29 - *  
30 - * In accordance with Section 7(b) of the GNU General Public License version 3,  
31 - * these Appropriate Legal Notices must retain the display of the "Powered by  
32 - * KnowledgeTree" logo and retain the original copyright notice. If the display of the  
33 - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices  
34 - * must display the words "Powered by KnowledgeTree" and retain the original  
35 - * copyright notice.  
36 - * Contributor( s): ______________________________________  
37 - *  
38 - */  
39 -  
40 -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');  
41 -require_once('indexing/extractorCore.inc.php');  
42 -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');  
43 -  
44 -  
45 -class IndexerInconsistencyException extends Exception {};  
46 -  
47 -class QueryResultItem  
48 -{  
49 - protected $document_id;  
50 - protected $title;  
51 - protected $rank;  
52 - protected $text;  
53 - protected $filesize;  
54 - protected $fullpath;  
55 - protected $live;  
56 - protected $version;  
57 - protected $mimeType;  
58 - protected $filename;  
59 - protected $thumbnail; // TODO: if not null, gui can display a thumbnail  
60 - protected $viewer; // TODO: if not null, a viewer can be used to view the document  
61 - protected $document;  
62 - protected $checkedOutUser;  
63 - protected $dateCheckedout;  
64 - protected $workflowState;  
65 - protected $workflow;  
66 - protected $modifiedBy;  
67 - protected $dateModified;  
68 - protected $createdBy;  
69 - protected $dateCreated;  
70 - protected $owner;  
71 - protected $immutable;  
72 - protected $deleted;  
73 - protected $status;  
74 - protected $folderId;  
75 - protected $storagePath;  
76 - protected $documentType;  
77 - protected $mimeIconPath;  
78 - protected $mimeDisplay;  
79 - protected $oemDocumentNo;  
80 -  
81 - public function __construct($document_id, $rank=null, $title=null, $text=null)  
82 - {  
83 - $this->document_id=(int) $document_id;  
84 - $this->rank= $rank;  
85 - $this->title=$title;  
86 - $this->text = $text;  
87 - $this->live = true;  
88 - $this->loadDocumentInfo();  
89 - }  
90 -  
91 - protected function __isset($property)  
92 - {  
93 - switch($property)  
94 - {  
95 - case 'DocumentID': return isset($this->document_id);  
96 - case 'Rank': return isset($this->rank);  
97 - case 'Text': return isset($this->text);  
98 - case 'Title': return isset($this->title);  
99 - case null: break;  
100 - default:  
101 - throw new Exception("Unknown property '$property' to get on QueryResultItem");  
102 - }  
103 - return true; // should not be reached  
104 - }  
105 -  
106 - public function loadDocumentInfo()  
107 - {  
108 - global $default;  
109 - $sql = "SELECT  
110 - d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,  
111 - dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,  
112 - mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,  
113 - cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,  
114 - mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title  
115 - FROM  
116 - documents d  
117 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id  
118 - INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id  
119 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
120 - LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id  
121 - LEFT JOIN folders f ON f.id=d.folder_id  
122 - LEFT JOIN users cou ON d.checked_out_user_id=cou.id  
123 - LEFT JOIN workflows w ON dmv.workflow_id=w.id  
124 - LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id  
125 - LEFT JOIN mime_documents md ON mt.mime_document_id = md.id  
126 - LEFT JOIN users mbu ON d.modified_user_id=mbu.id  
127 - LEFT JOIN users cbu ON d.creator_id=cbu.id  
128 - LEFT JOIN users ou ON d.owner_id=ou.id  
129 - WHERE  
130 - d.id=$this->document_id";  
131 -  
132 - $result = DBUtil::getOneResult($sql);  
133 -  
134 - if (PEAR::isError($result) || empty($result))  
135 - {  
136 - $this->live = false;  
137 - if (PEAR::isError($result))  
138 - {  
139 - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());  
140 - }  
141 -  
142 - $default->log->error('QueryResultItem: $result is null');  
143 - $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';  
144 - $default->log->error('QueryResultItem: ' . $msg);  
145 - // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository  
146 - throw new IndexerInconsistencyException(_kt($msg));  
147 - }  
148 -  
149 - // document_id, relevance, text, title  
150 -  
151 - $this->documentType = $result['document_type'];  
152 - $this->filename=$result['filename'];  
153 - $this->filesize = KTUtil::filesizeToString($result['filesize']);  
154 - $this->folderId = $result['folder_id'];  
155 - $this->title = $result['title'];  
156 -  
157 - $this->createdBy = $result['createdbyuser'];  
158 - $this->dateCreated = $result['created'];  
159 -  
160 - $this->modifiedBy = $result['modifiedbyuser'];  
161 - $this->dateModified = $result['modified'];  
162 -  
163 - $this->checkedOutUser = $result['checkoutuser'];  
164 - $this->dateCheckedout = $result['checkedout'];  
165 -  
166 - $this->owner = $result['owneruser'];  
167 -  
168 - $this->version = $result['major_version'] . '.' . $result['minor_version'];  
169 -  
170 - $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';  
171 -  
172 - $this->workflow = $result['workflow'];  
173 - $this->workflowState = $result['workflowstate'];  
174 -  
175 - $this->oemDocumentNo = $result['oem_no'];  
176 - if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';  
177 -  
178 - if (is_null($result['name']))  
179 - {  
180 - $this->fullpath = '(orphaned)';  
181 - }  
182 - else  
183 - {  
184 - $this->fullpath = $result['full_path'];  
185 - }  
186 -  
187 - $this->mimeType = $result['mimetype'];  
188 - $this->mimeIconPath = $result['mime_icon_path'];  
189 - $this->mimeDisplay = $result['mime_display'];  
190 -  
191 - $this->storagePath = $result['storage_path'];  
192 - $this->status = Document::getStatusString($result['status_id']);  
193 - }  
194 -  
195 - protected function __get($property)  
196 - {  
197 - switch($property)  
198 - {  
199 - case null: return '';  
200 - case 'DocumentID': return (int) $this->document_id;  
201 - case 'Relevance':  
202 - case 'Rank': return (float) $this->rank;  
203 - case 'Text': return (string) $this->text;  
204 - case 'Title': return (string) $this->title;  
205 - case 'FullPath': return (string) $this->fullpath;  
206 - case 'IsLive': return (bool) $this->live;  
207 - case 'Filesize': return $this->filesize;  
208 - case 'Version': return (string) $this->version;  
209 - case 'Filename': return (string)$this->filename;  
210 - case 'FolderId': return (int)$this->folderId;  
211 - case 'OemDocumentNo': return (string) $this->oemDocumentNo;  
212 - case 'Document':  
213 - if (is_null($this->document))  
214 - {  
215 - $this->document = Document::get($this->document_id);  
216 - }  
217 - return $this->document;  
218 - case 'IsAvailable':  
219 - return $this->Document->isLive();  
220 - case 'CheckedOutUser':  
221 - case 'CheckedOutBy':  
222 - return (string) $this->checkedOutUser;  
223 - case 'WorkflowOnly':  
224 - case 'Workflow':  
225 - return (string)$this->workflow;  
226 - case 'WorkflowStateOnly':  
227 - case 'WorkflowState':  
228 - return (string)$this->workflowState;  
229 - case 'WorkflowAndState':  
230 - if (is_null($this->workflow))  
231 - {  
232 - return '';  
233 - }  
234 - return "$this->workflow - $this->workflowState";  
235 - case 'MimeType':  
236 - return (string) $this->mimeType;  
237 - case 'MimeIconPath':  
238 - return (string) $this->mimeIconPath;  
239 - case 'MimeDisplay':  
240 - return (string) $this->mimeDisplay;  
241 - case 'DateCheckedOut':  
242 - return (string) $this->dateCheckedout;  
243 - case 'ModifiedBy':  
244 - return (string) $this->modifiedBy;  
245 - case 'DateModified':  
246 - return (string) $this->dateModified;  
247 - case 'CreatedBy':  
248 - return (string) $this->createdBy;  
249 - case 'DateCreated':  
250 - return (string) $this->dateCreated;  
251 - case 'Owner':  
252 - case 'OwnedBy':  
253 - return (string) $this->owner;  
254 - case 'IsImmutable':  
255 - case 'Immutable':  
256 - return (bool) $this->immutable;  
257 - case 'Status':  
258 - return $this->status;  
259 - case 'StoragePath':  
260 - return $this->storagePath;  
261 - case 'DocumentType':  
262 - return $this->documentType;  
263 - case 'Permissions':  
264 - return 'not available';  
265 - case 'CanBeReadByUser':  
266 - if (!$this->live)  
267 - return false;  
268 - if (Permission::userHasDocumentReadPermission($this->Document))  
269 - return true;  
270 - if (Permission::adminIsInAdminMode())  
271 - return true;  
272 - return false;  
273 - default:  
274 - throw new Exception("Unknown property '$property' to get on QueryResultItem");  
275 - }  
276 - return ''; // Should not be reached  
277 - }  
278 -  
279 - protected function __set($property, $value)  
280 - {  
281 - switch($property)  
282 - {  
283 - case 'Rank': $this->rank = number_format($value,2,'.',','); break;  
284 - case 'Title': $this->title = $value; break;  
285 - case 'Text': $this->text = $value; break;  
286 - default:  
287 - throw new Exception("Unknown property '$property' to set on QueryResultItem");  
288 - }  
289 - }  
290 -}  
291 -  
292 -function MatchResultCompare($a, $b)  
293 -{  
294 - if ($a->Rank == $b->Rank) {  
295 - return 0;  
296 - }  
297 - return ($a->Rank < $b->Rank) ? -1 : 1;  
298 -}  
299 -  
300 -abstract class Indexer  
301 -{  
302 - /**  
303 - * Cache of extractors  
304 - *  
305 - * @var array  
306 - */  
307 - private $extractorCache;  
308 -  
309 - /**  
310 - * Indicates if the indexer will do logging.  
311 - *  
312 - * @var boolean  
313 - */  
314 - private $debug;  
315 - /**  
316 - * Cache on mime related hooks  
317 - *  
318 - * @var unknown_type  
319 - */  
320 - private $mimeHookCache;  
321 - /**  
322 - * Cache on general hooks.  
323 - *  
324 - * @var array  
325 - */  
326 - private $generalHookCache;  
327 -  
328 - /**  
329 - * This is a path to the extractors.  
330 - *  
331 - * @var string  
332 - */  
333 - private $extractorPath;  
334 - /**  
335 - * This is a path to the hooks.  
336 - *  
337 - * @var string  
338 - */  
339 - private $hookPath;  
340 -  
341 - private $enabledExtractors;  
342 -  
343 - /**  
344 - * Initialise the indexer  
345 - *  
346 - */  
347 - protected function __construct()  
348 - {  
349 - $config = KTConfig::getSingleton();  
350 -  
351 - $this->extractorCache = array();  
352 - $this->debug = $config->get('indexer/debug', true);  
353 - $this->hookCache = array();  
354 - $this->generalHookCache = array();  
355 - $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');  
356 - $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');  
357 -  
358 - $this->loadExtractorStatus();  
359 - }  
360 -  
361 - /**  
362 - * Get the list if enabled extractors  
363 - *  
364 - */  
365 - private function loadExtractorStatus()  
366 - {  
367 - $sql = "SELECT id, name FROM mime_extractors WHERE active=1";  
368 - $rs = DBUtil::getResultArray($sql);  
369 - $this->enabledExtractors = array();  
370 - foreach($rs as $item)  
371 - {  
372 - $this->enabledExtractors[] = $item['name'];  
373 - }  
374 - }  
375 -  
376 - private function isExtractorEnabled($extractor)  
377 - {  
378 - return in_array($extractor, $this->enabledExtractors);  
379 - }  
380 -  
381 - /**  
382 - * Returns a reference to the main class  
383 - *  
384 - * @return Indexer  
385 - */  
386 - public static function get()  
387 - {  
388 - static $singleton = null;  
389 -  
390 - if (is_null($singleton))  
391 - {  
392 - $config = KTConfig::getSingleton();  
393 - $classname = $config->get('indexer/coreClass');  
394 -  
395 - require_once('indexing/indexers/' . $classname . '.inc.php');  
396 -  
397 - if (!class_exists($classname))  
398 - {  
399 - throw new Exception("Class '$classname' does not exist.");  
400 - }  
401 -  
402 - $singleton = new $classname;  
403 - }  
404 -  
405 - return $singleton;  
406 - }  
407 -  
408 - public abstract function deleteDocument($docid);  
409 -  
410 - /**  
411 - * Remove the association of all extractors to mime types on the database.  
412 - *  
413 - */  
414 - public function clearExtractors()  
415 - {  
416 - global $default;  
417 -  
418 - $sql = "update mime_types set extractor_id=null";  
419 - DBUtil::runQuery($sql);  
420 -  
421 - $sql = "delete from mime_extractors";  
422 - DBUtil::runQuery($sql);  
423 -  
424 - if ($this->debug) $default->log->debug('clearExtractors');  
425 - }  
426 -  
427 - /**  
428 - * lookup the name of the extractor class based on the mime type.  
429 - *  
430 - * @param string $type  
431 - * @return string  
432 - */  
433 - public static function resolveExtractor($type)  
434 - {  
435 - global $default;  
436 - $sql = "select extractor from mime_types where filetypes='$type'";  
437 - $class = DBUtil::getOneResultKey($sql,'extractor');  
438 - if (PEAR::isError($class))  
439 - {  
440 - $default->log->error("resolveExtractor: cannot resolve $type");  
441 - return $class;  
442 - }  
443 - if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));  
444 - return $class;  
445 - }  
446 -  
447 - /**  
448 - * Return all the discussion text.  
449 - *  
450 - * @param int $docid  
451 - * @return string  
452 - */  
453 - public static function getDiscussionText($docid)  
454 - {  
455 - $sql = "SELECT  
456 - dc.subject, dc.body  
457 - FROM  
458 - discussion_threads dt  
459 - INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id  
460 - WHERE  
461 - dt.document_id=$docid";  
462 - $result = DBUtil::getResultArray($sql);  
463 - $text = '';  
464 -  
465 - foreach($result as $record)  
466 - {  
467 - $text .= $record['subject'] . "\n" . $record['body'] . "\n";  
468 - }  
469 -  
470 - return $text;  
471 - }  
472 -  
473 - /**  
474 - * Schedule the indexing of a document.  
475 - *  
476 - * @param string $document  
477 - * @param string $what  
478 - */  
479 - public static function index($document, $what='A')  
480 - {  
481 - global $default;  
482 -  
483 - if (is_numeric($document))  
484 - {  
485 - $document = Document::get($document+0);  
486 - }  
487 -  
488 - if (PEAR::isError($document))  
489 - {  
490 - $default->log->error("index: Could not index document: " .$document->getMessage());  
491 - return;  
492 - }  
493 -  
494 - $document_id = $document->getId();  
495 - $userid=$_SESSION['userID'];  
496 - if (empty($userid)) $userid=1;  
497 -  
498 - // we dequeue the document so that there are no issues when enqueuing  
499 - Indexer::unqueueDocument($document_id);  
500 -  
501 - // enqueue item  
502 - $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";  
503 - DBUtil::runQuery($sql);  
504 -  
505 - $default->log->debug("index: Queuing indexing of $document_id");  
506 -  
507 - }  
508 -  
509 - private static function incrementCount()  
510 - {  
511 - // Get count from system settings  
512 - $count = Indexer::getIndexedDocumentCount();  
513 - $count = (int)$count + 1;  
514 - Indexer::updateIndexedDocumentCount($count);  
515 - }  
516 -  
517 - public static function getIndexedDocumentCount()  
518 - {  
519 - $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);  
520 - return (int) $count;  
521 - }  
522 -  
523 - public static function updateIndexedDocumentCount($cnt = 0)  
524 - {  
525 - KTUtil::setSystemSetting('indexedDocumentCount', $cnt);  
526 - }  
527 -  
528 - public static function reindexQueue()  
529 - {  
530 - $sql = "UPDATE index_files SET processdate = null";  
531 - DBUtil::runQuery($sql);  
532 - }  
533 -  
534 - public static function reindexDocument($documentId)  
535 - {  
536 - $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";  
537 - DBUtil::runQuery($sql);  
538 - }  
539 -  
540 -  
541 -  
542 - public static function indexAll()  
543 - {  
544 - $userid=$_SESSION['userID'];  
545 - if (empty($userid)) $userid=1;  
546 -  
547 - $sql = "DELETE FROM index_files";  
548 - DBUtil::runQuery($sql);  
549 -  
550 - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";  
551 - DBUtil::runQuery($sql);  
552 - }  
553 -  
554 - /**  
555 - * Clearout the scheduling of documents that no longer exist.  
556 - *  
557 - */  
558 - public static function clearoutDeleted()  
559 - {  
560 - global $default;  
561 -  
562 - $sql = 'DELETE FROM  
563 - index_files  
564 - WHERE  
565 - document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR  
566 - NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';  
567 - DBUtil::runQuery($sql);  
568 -  
569 - $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");  
570 - }  
571 -  
572 -  
573 - /**  
574 - * Check if a document is scheduled to be indexed  
575 - *  
576 - * @param mixed $document This may be a document or document id  
577 - * @return boolean  
578 - */  
579 - public static function isDocumentScheduled($document)  
580 - {  
581 - if (is_numeric($document))  
582 - {  
583 - $docid = $document;  
584 - }  
585 - else if ($document instanceof Document)  
586 - {  
587 - $docid = $document->getId();  
588 - }  
589 - else  
590 - {  
591 - return false;  
592 - }  
593 - $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";  
594 - $result = DBUtil::getResultArray($sql);  
595 - return count($result) > 0;  
596 - }  
597 -  
598 - /**  
599 - * Filters text removing redundant characters such as continuous newlines and spaces.  
600 - *  
601 - * @param string $filename  
602 - */  
603 - private function filterText($filename)  
604 - {  
605 - $content = file_get_contents($filename);  
606 -  
607 - $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');  
608 - $tgt = array("\n","\n",' ',' ',' ');  
609 -  
610 - // shrink what is being stored.  
611 - do  
612 - {  
613 - $orig = $content;  
614 - $content = preg_replace($src, $tgt, $content);  
615 - } while ($content != $orig);  
616 -  
617 - return file_put_contents($filename, $content) !== false;  
618 - }  
619 -  
620 - /**  
621 - * Load hooks for text extraction process.  
622 - *  
623 - */  
624 - private function loadExtractorHooks()  
625 - {  
626 - $this->generalHookCache = array();  
627 - $this->mimeHookCache = array();  
628 -  
629 -  
630 - $dir = opendir(SearchHelper::correctPath($this->hookPath));  
631 - while (($file = readdir($dir)) !== false)  
632 - {  
633 - if (substr($file,-12) == 'Hook.inc.php')  
634 - {  
635 - require_once($this->hookPath . '/' . $file);  
636 - $class = substr($file, 0, -8);  
637 -  
638 - if (!class_exists($class))  
639 - {  
640 - continue;  
641 - }  
642 -  
643 - $hook = new $class;  
644 - if (!($class instanceof ExtractorHook))  
645 - {  
646 - continue;  
647 - }  
648 -  
649 - $mimeTypes = $hook->registerMimeTypes();  
650 - if (is_null($mimeTypes))  
651 - {  
652 - $this->generalHookCache[] = & $hook;  
653 - }  
654 - else  
655 - {  
656 - foreach($mimeTypes as $type)  
657 - {  
658 - $this->mimeHookCache[$type][] = & $hook;  
659 - }  
660 - }  
661 -  
662 - }  
663 - }  
664 - closedir($dir);  
665 - }  
666 -  
667 - /**  
668 - * This is a refactored function to execute the hooks.  
669 - *  
670 - * @param DocumentExtractor $extractor  
671 - * @param string $phase  
672 - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.  
673 - */  
674 - private function executeHook($extractor, $phase, $mimeType = null)  
675 - {  
676 - $hooks = array();  
677 - if (is_null($mimeType))  
678 - {  
679 - $hooks = $this->generalHookCache;  
680 - }  
681 - else  
682 - {  
683 - if (array_key_exists($mimeType, $this->mimeHookCache))  
684 - {  
685 - $hooks = $this->mimeHookCache[$mimeType];  
686 - }  
687 - }  
688 - if (empty($hooks))  
689 - {  
690 - return;  
691 - }  
692 -  
693 - foreach($hooks as $hook)  
694 - {  
695 - $hook->$phase($extractor);  
696 - }  
697 - }  
698 -  
699 - private function doesDiagnosticsPass($simple=false)  
700 - {  
701 - global $default;  
702 -  
703 - $config =& KTConfig::getSingleton();  
704 - // create a index log lock file in case there are errors, and we don't need to log them forever!  
705 - // this function will create the lockfile if an error is detected. It will be removed as soon  
706 - // as the problems with the indexer are removed.  
707 - $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';  
708 -  
709 - $diagnosis = $this->diagnose();  
710 - if (!is_null($diagnosis))  
711 - {  
712 - if (!is_file($lockFile))  
713 - {  
714 - $default->log->error(_kt('Indexer problem: ') . $diagnosis);  
715 - }  
716 - touch($lockFile);  
717 - return false;  
718 - }  
719 -  
720 - if ($simple)  
721 - {  
722 - return true;  
723 - }  
724 -  
725 - $diagnosis = $this->diagnoseExtractors();  
726 - if (!empty($diagnosis))  
727 - {  
728 - if (!is_file($lockFile))  
729 - {  
730 - foreach($diagnosis as $diag)  
731 - {  
732 - $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));  
733 - }  
734 - }  
735 - touch($lockFile);  
736 - return false;  
737 - }  
738 -  
739 - if (is_file($lockFile))  
740 - {  
741 - $default->log->info(_kt('Issues with the indexer have been resolved!'));  
742 - unlink($lockFile);  
743 - }  
744 -  
745 - return true;  
746 - }  
747 -  
748 - /**  
749 - * This does the initial mime type association between mime types and text extractors  
750 - *  
751 - */  
752 - public function checkForRegisteredTypes()  
753 - {  
754 - global $default;  
755 -  
756 - // we are only doing this once!  
757 - $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);  
758 - if ($initRegistered)  
759 - {  
760 - return;  
761 - }  
762 - if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');  
763 -  
764 - $date = date('Y-m-d H:i');  
765 - $sql = "UPDATE scheduler_tasks SET run_time='$date'";  
766 - DBUtil::runQuery($sql);  
767 -  
768 - $this->registerTypes(true);  
769 -  
770 - $disable = array(  
771 - OS_WINDOWS=>array('PSExtractor'),  
772 - OS_UNIX => array()  
773 - );  
774 -  
775 - $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];  
776 -  
777 - foreach($disableForOS as $extractor)  
778 - {  
779 - $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";  
780 - DBUtil::runQuery($sql);  
781 - $default->log->info("checkForRegisteredTypes: disabled '$extractor'");  
782 - }  
783 -  
784 - if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');  
785 - KTUtil::setSystemSetting('mimeTypesRegistered', true);  
786 - }  
787 -  
788 - private function updatePendingDocumentStatus($documentId, $message, $level)  
789 - {  
790 - $this->indexingHistory .= "\n" . $level . ': ' . $message;  
791 - $message = sanitizeForSQL($this->indexingHistory);  
792 - $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";  
793 - DBUtil::runQuery($sql);  
794 - }  
795 -  
796 - /**  
797 - *  
798 - * @param int $documentId  
799 - * @param string $message  
800 - * @param string $level This may be info, error, debug  
801 - */  
802 - private function logPendingDocumentInfoStatus($documentId, $message, $level)  
803 - {  
804 - $this->updatePendingDocumentStatus($documentId, $message, $level);  
805 - global $default;  
806 -  
807 - switch ($level)  
808 - {  
809 - case 'debug':  
810 - if ($this->debug)  
811 - {  
812 - $default->log->debug($message);  
813 - }  
814 - break;  
815 - default:  
816 - $default->log->$level($message);  
817 - }  
818 - }  
819 -  
820 -  
821 -  
822 - public function getExtractor($extractorClass)  
823 - {  
824 - if (empty($extractorClass))  
825 - {  
826 - return null;  
827 - }  
828 -  
829 - $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';  
830 - if (!file_exists($includeFile))  
831 - {  
832 - throw new Exception("Extractor file does not exist: $includeFile");  
833 - }  
834 -  
835 - require_once($includeFile);  
836 -  
837 - if (!class_exists($extractorClass))  
838 - {  
839 - throw new Exception("Extractor '$classname' not defined in file: $includeFile");  
840 - }  
841 -  
842 - $extractor = new $extractorClass();  
843 -  
844 - if (!($extractor instanceof DocumentExtractor))  
845 - {  
846 - throw new Exception("Class $classname was expected to be of type DocumentExtractor");  
847 - }  
848 -  
849 - return $extractor;  
850 - }  
851 -  
852 - public static function getIndexingQueue($problemItemsOnly=true)  
853 - {  
854 -  
855 - if ($problemItemsOnly)  
856 - {  
857 - $sql = "SELECT  
858 - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename  
859 - FROM  
860 - index_files iff  
861 - INNER JOIN documents d ON iff.document_id=d.id  
862 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
863 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
864 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
865 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
866 - WHERE  
867 - (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1  
868 - ORDER BY indexdate ";  
869 - }  
870 - else  
871 - {  
872 - $sql = "SELECT  
873 - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename  
874 - FROM  
875 - index_files iff  
876 - INNER JOIN documents d ON iff.document_id=d.id  
877 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
878 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
879 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
880 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
881 - WHERE  
882 - (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1  
883 - ORDER BY indexdate ";  
884 - }  
885 - $aResult = DBUtil::getResultArray($sql);  
886 -  
887 - return $aResult;  
888 - }  
889 -  
890 - public static function getPendingIndexingQueue()  
891 - {  
892 - return Indexer::getIndexingQueue(false);  
893 - }  
894 -  
895 - /**  
896 - * The main function that may be called repeatedly to index documents.  
897 - *  
898 - * @param int $max Default 20  
899 - */  
900 - public function indexDocuments($max=null)  
901 - {  
902 - global $default;  
903 - $config =& KTConfig::getSingleton();  
904 -  
905 - /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';  
906 - if (is_file($indexLockFile))  
907 - {  
908 - $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');  
909 - $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');  
910 - return;  
911 - }  
912 - touch($indexLockFile);*/  
913 -  
914 -  
915 - $this->checkForRegisteredTypes();  
916 -  
917 - if ($this->debug) $default->log->debug('indexDocuments: start');  
918 - if (!$this->doesDiagnosticsPass())  
919 - {  
920 - //unlink($indexLockFile);  
921 - if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');  
922 - return;  
923 - }  
924 -  
925 - if (is_null($max))  
926 - {  
927 - $max = $config->get('indexer/batchDocuments',20);  
928 - }  
929 -  
930 - $this->loadExtractorHooks();  
931 -  
932 - Indexer::clearoutDeleted();  
933 -  
934 - $date = date('Y-m-d H:i:s');  
935 - // identify the indexers that must run  
936 - // mysql specific limit!  
937 - $sql = "SELECT  
938 - iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what  
939 - FROM  
940 - index_files iff  
941 - INNER JOIN documents d ON iff.document_id=d.id  
942 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
943 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
944 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
945 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
946 - WHERE  
947 - (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1  
948 - ORDER BY indexdate  
949 - LIMIT $max";  
950 - $result = DBUtil::getResultArray($sql);  
951 - if (PEAR::isError($result))  
952 - {  
953 - //unlink($indexLockFile);  
954 - if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');  
955 - return;  
956 - }  
957 - KTUtil::setSystemSetting('luceneIndexingDate', time());  
958 -  
959 - // bail if no work to do  
960 - if (count($result) == 0)  
961 - {  
962 - //unlink($indexLockFile);  
963 - if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');  
964 - return;  
965 - }  
966 -  
967 - // identify any documents that need indexing and mark them  
968 - // so they are not taken in a followup run  
969 - $ids = array();  
970 - foreach($result as $docinfo)  
971 - {  
972 - $ids[] = $docinfo['document_id'];  
973 - }  
974 -  
975 - // mark the documents as being processed  
976 -  
977 - $ids=implode(',',$ids);  
978 - $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";  
979 - DBUtil::runQuery($sql);  
980 -  
981 - $extractorCache = array();  
982 - $storageManager = KTStorageManagerUtil::getSingleton();  
983 -  
984 - $tempPath = $config->get("urls/tmpDirectory");  
985 -  
986 - foreach($result as $docinfo)  
987 - {  
988 - // increment indexed documents count  
989 - Indexer::incrementCount();  
990 -  
991 - $docId=$docinfo['document_id'];  
992 - $extension=$docinfo['filetypes'];  
993 - $mimeType=$docinfo['mimetypes'];  
994 - $extractorClass=$docinfo['extractor'];  
995 - $indexDocument = in_array($docinfo['what'], array('A','C'));  
996 - $indexDiscussion = in_array($docinfo['what'], array('A','D'));  
997 - $this->indexingHistory = '';  
998 -  
999 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');  
1000 -  
1001 - if (empty($extractorClass))  
1002 - {  
1003 - /*  
1004 -  
1005 - if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.  
1006 -  
1007 - */  
1008 - if ($indexDiscussion)  
1009 - {  
1010 - $indexDocument = false;  
1011 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');  
1012 - }  
1013 - else  
1014 - {  
1015 - Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));  
1016 - continue;  
1017 - }  
1018 - }  
1019 - else  
1020 - {  
1021 - /*  
1022 -  
1023 - If an extractor is available, we must ensure it is enabled.  
1024 -  
1025 - */  
1026 -  
1027 - if (!$this->isExtractorEnabled($extractorClass))  
1028 - {  
1029 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');  
1030 - continue;  
1031 - }  
1032 - }  
1033 -  
1034 - if ($this->debug)  
1035 - {  
1036 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');  
1037 - }  
1038 -  
1039 - $document = Document::get($docId);  
1040 - if (PEAR::isError($document))  
1041 - {  
1042 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');  
1043 - continue;  
1044 - }  
1045 -  
1046 - $filename = $document->getFileName();  
1047 - if (substr($filename,0,1) == '~')  
1048 - {  
1049 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');  
1050 - continue;  
1051 - }  
1052 -  
1053 - $removeFromQueue = true;  
1054 - if ($indexDocument)  
1055 - {  
1056 - if (array_key_exists($extractorClass, $extractorCache))  
1057 - {  
1058 - $extractor = $extractorCache[$extractorClass];  
1059 - }  
1060 - else  
1061 - {  
1062 - $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);  
1063 - }  
1064 -  
1065 - if (!($extractor instanceof DocumentExtractor))  
1066 - {  
1067 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');  
1068 - continue;  
1069 - }  
1070 -  
1071 -  
1072 -  
1073 - $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();  
1074 - $sourceFile = $storageManager->temporaryFile($document);  
1075 -  
1076 - if (empty($sourceFile) || !is_file($sourceFile))  
1077 - {  
1078 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');  
1079 - continue;  
1080 - }  
1081 -  
1082 - if ($extractor->needsIntermediateSourceFile())  
1083 - {  
1084 - $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);  
1085 -  
1086 - $intermediate = $tempPath . '/'. $docId . '.' . $extension;  
1087 - $result = @copy($sourceFile, $intermediate);  
1088 - if ($result === false)  
1089 - {  
1090 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');  
1091 - // problem. lets try again later. probably permission related. log the issue.  
1092 - continue;  
1093 - }  
1094 - $sourceFile = $intermediate;  
1095 - }  
1096 -  
1097 - $targetFile = tempnam($tempPath, 'ktindexer');  
1098 -  
1099 - $extractor->setSourceFile($sourceFile);  
1100 - $extractor->setMimeType($mimeType);  
1101 - $extractor->setExtension($extension);  
1102 - $extractor->setTargetFile($targetFile);  
1103 - $extractor->setDocument($document);  
1104 - $extractor->setIndexingStatus(null);  
1105 - $extractor->setExtractionStatus(null);  
1106 -  
1107 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');  
1108 -  
1109 - $this->executeHook($extractor, 'pre_extract');  
1110 - $this->executeHook($extractor, 'pre_extract', $mimeType);  
1111 - $removeFromQueue = false;  
1112 -  
1113 - if ($extractor->extractTextContent())  
1114 - {  
1115 - // the extractor may need to create another target file  
1116 - $targetFile = $extractor->getTargetFile();  
1117 -  
1118 - $extractor->setExtractionStatus(true);  
1119 - $this->executeHook($extractor, 'pre_index');  
1120 - $this->executeHook($extractor, 'pre_index', $mimeType);  
1121 -  
1122 - $title = $document->getName();  
1123 - if ($indexDiscussion)  
1124 - {  
1125 - if (!$this->filterText($targetFile))  
1126 - {  
1127 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');  
1128 - }  
1129 - else  
1130 - {  
1131 - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);  
1132 - $removeFromQueue = $indexStatus;  
1133 - if (!$indexStatus)  
1134 - {  
1135 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');  
1136 - }  
1137 -  
1138 - $extractor->setIndexingStatus($indexStatus);  
1139 - }  
1140 - }  
1141 - else  
1142 - {  
1143 - if (!$this->filterText($targetFile))  
1144 - {  
1145 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');  
1146 - }  
1147 - else  
1148 - {  
1149 - $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);  
1150 - $removeFromQueue = $indexStatus;  
1151 -  
1152 - if (!$indexStatus)  
1153 - {  
1154 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');  
1155 - $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');  
1156 - }  
1157 -  
1158 - $extractor->setIndexingStatus($indexStatus);  
1159 - }  
1160 - }  
1161 -  
1162 - $this->executeHook($extractor, 'post_index', $mimeType);  
1163 - $this->executeHook($extractor, 'post_index');  
1164 - }  
1165 - else  
1166 - {  
1167 - $extractor->setExtractionStatus(false);  
1168 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');  
1169 - $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');  
1170 - }  
1171 -  
1172 - $this->executeHook($extractor, 'post_extract', $mimeType);  
1173 - $this->executeHook($extractor, 'post_extract');  
1174 -  
1175 - if ($extractor->needsIntermediateSourceFile())  
1176 - {  
1177 - @unlink($sourceFile);  
1178 - }  
1179 -  
1180 - @unlink($targetFile);  
1181 -  
1182 - }  
1183 - else  
1184 - {  
1185 - $indexStatus = $this->indexDiscussion($docId);  
1186 - $removeFromQueue = $indexStatus;  
1187 - }  
1188 -  
1189 - if ($removeFromQueue)  
1190 - {  
1191 - Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));  
1192 - }  
1193 - else  
1194 - {  
1195 - if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));  
1196 - }  
1197 - }  
1198 - if ($this->debug) $default->log->debug('indexDocuments: done');  
1199 - //unlink($indexLockFile);  
1200 - }  
1201 -  
1202 - public function migrateDocuments($max=null)  
1203 - {  
1204 - global $default;  
1205 -  
1206 - $default->log->info(_kt('migrateDocuments: starting'));  
1207 -  
1208 - if (!$this->doesDiagnosticsPass(true))  
1209 - {  
1210 - $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));  
1211 - return;  
1212 - }  
1213 -  
1214 - if (KTUtil::getSystemSetting('migrationComplete') == 'true')  
1215 - {  
1216 - $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));  
1217 - return;  
1218 - }  
1219 -  
1220 - $config =& KTConfig::getSingleton();  
1221 - if (is_null($max))  
1222 - {  
1223 - $max = $config->get('indexer/batchMigrateDocument',500);  
1224 - }  
1225 -  
1226 - $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';  
1227 - if (is_file($lockFile))  
1228 - {  
1229 - $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));  
1230 - return;  
1231 - }  
1232 - touch($lockFile);  
1233 -  
1234 - $startTime = KTUtil::getSystemSetting('migrationStarted');  
1235 - if (is_null($startTime))  
1236 - {  
1237 - KTUtil::setSystemSetting('migrationStarted', time());  
1238 - }  
1239 -  
1240 - $maxLoops = 5;  
1241 -  
1242 - $max = ceil($max / $maxLoops);  
1243 -  
1244 - $start =KTUtil::getBenchmarkTime();  
1245 - $noDocs = false;  
1246 - $numDocs = 0;  
1247 -  
1248 - for($loop=0;$loop<$maxLoops;$loop++)  
1249 - {  
1250 -  
1251 - $sql = "SELECT  
1252 - document_id, document_text  
1253 - FROM  
1254 - document_text  
1255 - ORDER BY document_id  
1256 - LIMIT $max";  
1257 - $result = DBUtil::getResultArray($sql);  
1258 - if (PEAR::isError($result))  
1259 - {  
1260 - $default->log->info(_kt('migrateDocuments: db error'));  
1261 - break;  
1262 - }  
1263 -  
1264 - $docs = count($result);  
1265 - if ($docs == 0)  
1266 - {  
1267 - $noDocs = true;  
1268 - break;  
1269 - }  
1270 - $numDocs += $docs;  
1271 -  
1272 - foreach($result as $docinfo)  
1273 - {  
1274 - $docId = $docinfo['document_id'];  
1275 -  
1276 - $document = Document::get($docId);  
1277 - if (PEAR::isError($document) || is_null($document))  
1278 - {  
1279 - $sql = "DELETE FROM document_text WHERE document_id=$docId";  
1280 - DBUtil::runQuery($sql);  
1281 - $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));  
1282 - continue;  
1283 - }  
1284 -  
1285 - $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();  
1286 -  
1287 - $targetFile = tempnam($tempPath, 'ktindexer');  
1288 -  
1289 - if (file_put_contents($targetFile, $docinfo['document_text']) === false)  
1290 - {  
1291 - $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));  
1292 - continue;  
1293 - }  
1294 - // free memory asap ;)  
1295 - unset($docinfo['document_text']);  
1296 -  
1297 - $title = $document->getName();  
1298 -  
1299 - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);  
1300 -  
1301 - if ($indexStatus)  
1302 - {  
1303 - $sql = "DELETE FROM document_text WHERE document_id=$docId";  
1304 - DBUtil::runQuery($sql);  
1305 - }  
1306 - else  
1307 - {  
1308 - $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));  
1309 - }  
1310 -  
1311 - @unlink($targetFile);  
1312 - }  
1313 - }  
1314 -  
1315 - @unlink($lockFile);  
1316 -  
1317 - $time = KTUtil::getBenchmarkTime() - $start;  
1318 -  
1319 - KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);  
1320 - KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);  
1321 -  
1322 - $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));  
1323 - if ($noDocs)  
1324 - {  
1325 - $default->log->info(_kt('migrateDocuments: Completed!'));  
1326 - KTUtil::setSystemSetting('migrationComplete', 'true');  
1327 - schedulerUtil::deleteByName('Index Migration');  
1328 - $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));  
1329 - }  
1330 - }  
1331 -  
    /**
     * Index a document. Concrete indexer subclasses must implement this function.
     *
     * Callers in this class treat the return value as a success flag.
     *
     * @param int $docId
     * @param string $textFile Path of the file holding the text to index
     * @param string $title
     * @param string $version Callers in this class pass "major.minor"
     */
    protected abstract function indexDocument($docId, $textFile, $title, $version);
1339 -  
1340 -  
1341 - public function updateDocumentIndex($docId, $text)  
1342 - {  
1343 - $config = KTConfig::getSingleton();  
1344 - $tempPath = $config->get("urls/tmpDirectory");  
1345 - $tempFile = tempnam($tempPath,'ud_');  
1346 -  
1347 - file_put_contents($tempFile, $text);  
1348 -  
1349 - $document = Document::get($docId);  
1350 - $title = $document->getDescription();  
1351 - $version = $document->getVersion();  
1352 -  
1353 - $result = $this->indexDocument($docId, $tempFile, $title, $version);  
1354 -  
1355 - if (file_exists($tempFile))  
1356 - {  
1357 - unlink($tempFile);  
1358 - }  
1359 -  
1360 - return $result;  
1361 - }  
1362 -  
    /**
     * Index a discussion. Concrete indexer subclasses must implement this function.
     *
     * Callers in this class treat the return value as a success flag.
     *
     * @param int $docId
     */
    protected abstract function indexDiscussion($docId);
1369 -  
    /**
     * Diagnose the indexer, e.g. check that the indexing server is running.
     * Implemented by the concrete indexer.
     *
     */
    public abstract function diagnose();
1375 -  
1376 - /**  
1377 - * Diagnose the extractors.  
1378 - *  
1379 - * @return array  
1380 - */  
1381 - public function diagnoseExtractors()  
1382 - {  
1383 - $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');  
1384 - $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));  
1385 -  
1386 - return $diagnosis;  
1387 - }  
1388 -  
1389 - /**  
1390 - * This is a refactored diagnose function.  
1391 - *  
1392 - * @param string $path  
1393 - * @param string $class  
1394 - * @param string $extension  
1395 - * @return array  
1396 - */  
1397 - private function _diagnose($path, $baseclass, $extension)  
1398 - {  
1399 - global $default;  
1400 -  
1401 - $diagnoses = array();  
1402 -  
1403 - $dir = opendir(SearchHelper::correctPath($path));  
1404 - $extlen = - strlen($extension);  
1405 -  
1406 - while (($file = readdir($dir)) !== false)  
1407 - {  
1408 - if (substr($file,0,1) == '.')  
1409 - {  
1410 - continue;  
1411 - }  
1412 - if (substr($file,$extlen) != $extension)  
1413 - {  
1414 - $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));  
1415 - continue;  
1416 - }  
1417 -  
1418 - require_once($path . '/' . $file);  
1419 -  
1420 - $class = substr($file, 0, -8);  
1421 - if (!class_exists($class))  
1422 - {  
1423 - $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));  
1424 - continue;  
1425 - }  
1426 -  
1427 - if (!$this->isExtractorEnabled($class))  
1428 - {  
1429 - $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));  
1430 - continue;  
1431 - }  
1432 -  
1433 - $extractor = new $class();  
1434 - if (!is_a($extractor, $baseclass))  
1435 - {  
1436 - $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));  
1437 - continue;  
1438 - }  
1439 -  
1440 - $types = $extractor->getSupportedMimeTypes();  
1441 - if (empty($types))  
1442 - {  
1443 - if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));  
1444 - continue;  
1445 - }  
1446 -  
1447 - $diagnosis=$extractor->diagnose();  
1448 - if (empty($diagnosis))  
1449 - {  
1450 - continue;  
1451 - }  
1452 - $diagnoses[$class] = array(  
1453 - 'name'=>$extractor->getDisplayName(),  
1454 - 'diagnosis'=>$diagnosis  
1455 - );  
1456 -  
1457 - }  
1458 - closedir($dir);  
1459 -  
1460 - return $diagnoses;  
1461 - }  
1462 -  
1463 -  
1464 - /**  
1465 - * Register the extractor types.  
1466 - *  
1467 - * @param boolean $clear. Optional. Defaults to false.  
1468 - */  
1469 - public function registerTypes($clear=false)  
1470 - {  
1471 - if ($clear)  
1472 - {  
1473 - $this->clearExtractors();  
1474 - }  
1475 - $dir = opendir(SearchHelper::correctPath($this->extractorPath));  
1476 - while (($file = readdir($dir)) !== false)  
1477 - {  
1478 - if (substr($file,-17) == 'Extractor.inc.php')  
1479 - {  
1480 - require_once($this->extractorPath . '/' . $file);  
1481 - $class = substr($file, 0, -8);  
1482 -  
1483 - if (!class_exists($class))  
1484 - {  
1485 - // if the class does not exist, we can't do anything.  
1486 - continue;  
1487 - }  
1488 -  
1489 - $extractor = new $class;  
1490 - if ($extractor instanceof DocumentExtractor)  
1491 - {  
1492 - $extractor->registerMimeTypes();  
1493 - }  
1494 - }  
1495 - }  
1496 - closedir($dir);  
1497 - }  
1498 -  
1499 - /**  
1500 - * This is used as a possible obtimisation effort. It may be overridden in that case.  
1501 - *  
1502 - * @param int $docId  
1503 - * @param string $textFile  
1504 - */  
1505 - protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)  
1506 - {  
1507 - $this->indexDocument($docId, $textFile, $title, $version);  
1508 - $this->indexDiscussion($docId);  
1509 - }  
1510 -  
1511 - /**  
1512 - * Remove the document from the queue. This is normally called when it has been processed.  
1513 - *  
1514 - * @param int $docid  
1515 - */  
1516 - public static function unqueueDocument($docid, $reason=false, $level='debug')  
1517 - {  
1518 - $sql = "DELETE FROM index_files WHERE document_id=$docid";  
1519 - DBUtil::runQuery($sql);  
1520 - if ($reason !== false)  
1521 - {  
1522 - global $default;  
1523 - $default->log->$level("Indexer: removing document $docid from the queue - $reason");  
1524 - }  
1525 - }  
1526 -  
    /**
     * Run a query on the index. Implemented by the concrete indexer.
     *
     * @param string $query
     * @return array
     */
    public abstract function query($query);
1534 -  
1535 - /**  
1536 - * Converts an integer to a string that can be easily compared and reversed.  
1537 - *  
1538 - * @param int $int  
1539 - * @return string  
1540 - */  
1541 - public static function longToString($int)  
1542 - {  
1543 - $maxlen = 14;  
1544 -  
1545 - $a2z = array('a','b','c','d','e','f','g','h','i','j');  
1546 - $o29 = array('0','1','2','3','4','5','6','7','8','9');  
1547 - $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;  
1548 -  
1549 - return str_replace($o29, $a2z, $l);  
1550 - }  
1551 -  
1552 - /**  
1553 - * Converts a string to an integer.  
1554 - *  
1555 - * @param string $str  
1556 - * @return int  
1557 - */  
1558 - public static function stringToLong($str)  
1559 - {  
1560 - $a2z = array('a','b','c','d','e','f','g','h','i','j');  
1561 - $o29 = array('0','1','2','3','4','5','6','7','8','9');  
1562 -  
1563 - $int = str_replace($a2z, $o29, $str) + 0;  
1564 -  
1565 - return $int;  
1566 - }  
1567 -  
1568 - /**  
1569 - * Possibly we can optimise indexes. This method must be overriden.  
1570 - * The new function must call the parent!  
1571 - *  
1572 - */  
1573 - public function optimise()  
1574 - {  
1575 - KTUtil::setSystemSetting('luceneOptimisationDate', time());  
1576 - }  
1577 -  
    /**
     * Shuts down the indexer.
     *
     * The default implementation does nothing.
     */
    public function shutdown()
    {
        // do nothing generally
    }
1586 -  
    /**
     * Returns the name of the indexer. Implemented by the concrete indexer.
     *
     * @return string
     */
    public abstract function getDisplayName();
1593 -  
1594 -  
    /**
     * Returns the number of non-deleted documents in the index.
     * Implemented by the concrete indexer.
     *
     * @return int
     */
    public abstract function getDocumentsInIndex();
1601 -  
1602 - /**  
1603 - * Returns the path to the index directory  
1604 - *  
1605 - * @return string  
1606 - */  
1607 - public function getIndexDirectory()  
1608 - {  
1609 - $config = KTConfig::getSingleton();  
1610 - $directory = $config->get('indexer/luceneDirectory');  
1611 - return $directory;  
1612 - }  
1613 -}  
1614 -  
1615 -?> 1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
  40 +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
  41 +require_once('indexing/extractorCore.inc.php');
  42 +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +
  44 +
/**
 * Thrown by QueryResultItem::loadDocumentInfo() when the database has no record
 * matching a document id returned by the document indexer.
 */
class IndexerInconsistencyException extends Exception {};
  46 +
  47 +class QueryResultItem
  48 +{
  49 + protected $document_id;
  50 + protected $title;
  51 + protected $rank;
  52 + protected $text;
  53 + protected $filesize;
  54 + protected $fullpath;
  55 + protected $live;
  56 + protected $version;
  57 + protected $mimeType;
  58 + protected $filename;
  59 + protected $thumbnail; // TODO: if not null, gui can display a thumbnail
  60 + protected $viewer; // TODO: if not null, a viewer can be used to view the document
  61 + protected $document;
  62 + protected $checkedOutUser;
  63 + protected $dateCheckedout;
  64 + protected $workflowState;
  65 + protected $workflow;
  66 + protected $modifiedBy;
  67 + protected $dateModified;
  68 + protected $createdBy;
  69 + protected $dateCreated;
  70 + protected $owner;
  71 + protected $immutable;
  72 + protected $deleted;
  73 + protected $status;
  74 + protected $folderId;
  75 + protected $storagePath;
  76 + protected $documentType;
  77 + protected $mimeIconPath;
  78 + protected $mimeDisplay;
  79 + protected $oemDocumentNo;
  80 +
  81 + public function __construct($document_id, $rank=null, $title=null, $text=null)
  82 + {
  83 + $this->document_id=(int) $document_id;
  84 + $this->rank= $rank;
  85 + $this->title=$title;
  86 + $this->text = $text;
  87 + $this->live = true;
  88 + $this->loadDocumentInfo();
  89 + }
  90 +
  91 + protected function __isset($property)
  92 + {
  93 + switch($property)
  94 + {
  95 + case 'DocumentID': return isset($this->document_id);
  96 + case 'Rank': return isset($this->rank);
  97 + case 'Text': return isset($this->text);
  98 + case 'Title': return isset($this->title);
  99 + case null: break;
  100 + default:
  101 + throw new Exception("Unknown property '$property' to get on QueryResultItem");
  102 + }
  103 + return true; // should not be reached
  104 + }
  105 +
  106 + public function loadDocumentInfo()
  107 + {
  108 + global $default;
  109 + $sql = "SELECT
  110 + d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
  111 + dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
  112 + mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
  113 + cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
  114 + mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
  115 + FROM
  116 + documents d
  117 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
  118 + INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
  119 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  120 + LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
  121 + LEFT JOIN folders f ON f.id=d.folder_id
  122 + LEFT JOIN users cou ON d.checked_out_user_id=cou.id
  123 + LEFT JOIN workflows w ON dmv.workflow_id=w.id
  124 + LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
  125 + LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
  126 + LEFT JOIN users mbu ON d.modified_user_id=mbu.id
  127 + LEFT JOIN users cbu ON d.creator_id=cbu.id
  128 + LEFT JOIN users ou ON d.owner_id=ou.id
  129 + WHERE
  130 + d.id=$this->document_id";
  131 +
  132 + $result = DBUtil::getOneResult($sql);
  133 +
  134 + if (PEAR::isError($result) || empty($result))
  135 + {
  136 + $this->live = false;
  137 + if (PEAR::isError($result))
  138 + {
  139 + throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
  140 + }
  141 +
  142 + $default->log->error('QueryResultItem: $result is null');
  143 + $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
  144 + $default->log->error('QueryResultItem: ' . $msg);
  145 + // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
  146 + throw new IndexerInconsistencyException(_kt($msg));
  147 + }
  148 +
  149 + // document_id, relevance, text, title
  150 +
  151 + $this->documentType = $result['document_type'];
  152 + $this->filename=$result['filename'];
  153 + $this->filesize = KTUtil::filesizeToString($result['filesize']);
  154 + $this->folderId = $result['folder_id'];
  155 + $this->title = $result['title'];
  156 +
  157 + $this->createdBy = $result['createdbyuser'];
  158 + $this->dateCreated = $result['created'];
  159 +
  160 + $this->modifiedBy = $result['modifiedbyuser'];
  161 + $this->dateModified = $result['modified'];
  162 +
  163 + $this->checkedOutUser = $result['checkoutuser'];
  164 + $this->dateCheckedout = $result['checkedout'];
  165 +
  166 + $this->owner = $result['owneruser'];
  167 +
  168 + $this->version = $result['major_version'] . '.' . $result['minor_version'];
  169 +
  170 + $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
  171 +
  172 + $this->workflow = $result['workflow'];
  173 + $this->workflowState = $result['workflowstate'];
  174 +
  175 + $this->oemDocumentNo = $result['oem_no'];
  176 + if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
  177 +
  178 + if (is_null($result['name']))
  179 + {
  180 + $this->fullpath = '(orphaned)';
  181 + }
  182 + else
  183 + {
  184 + $this->fullpath = $result['full_path'];
  185 + }
  186 +
  187 + $this->mimeType = $result['mimetype'];
  188 + $this->mimeIconPath = $result['mime_icon_path'];
  189 + $this->mimeDisplay = $result['mime_display'];
  190 +
  191 + $this->storagePath = $result['storage_path'];
  192 + $this->status = Document::getStatusString($result['status_id']);
  193 + }
  194 +
  195 + protected function __get($property)
  196 + {
  197 + switch($property)
  198 + {
  199 + case null: return '';
  200 + case 'DocumentID': return (int) $this->document_id;
  201 + case 'Relevance':
  202 + case 'Rank': return (float) $this->rank;
  203 + case 'Text': return (string) $this->text;
  204 + case 'Title': return (string) $this->title;
  205 + case 'FullPath': return (string) $this->fullpath;
  206 + case 'IsLive': return (bool) $this->live;
  207 + case 'Filesize': return $this->filesize;
  208 + case 'Version': return (string) $this->version;
  209 + case 'Filename': return (string)$this->filename;
  210 + case 'FolderId': return (int)$this->folderId;
  211 + case 'OemDocumentNo': return (string) $this->oemDocumentNo;
  212 + case 'Document':
  213 + if (is_null($this->document))
  214 + {
  215 + $this->document = Document::get($this->document_id);
  216 + }
  217 + return $this->document;
  218 + case 'IsAvailable':
  219 + return $this->Document->isLive();
  220 + case 'CheckedOutUser':
  221 + case 'CheckedOutBy':
  222 + return (string) $this->checkedOutUser;
  223 + case 'WorkflowOnly':
  224 + case 'Workflow':
  225 + return (string)$this->workflow;
  226 + case 'WorkflowStateOnly':
  227 + case 'WorkflowState':
  228 + return (string)$this->workflowState;
  229 + case 'WorkflowAndState':
  230 + if (is_null($this->workflow))
  231 + {
  232 + return '';
  233 + }
  234 + return "$this->workflow - $this->workflowState";
  235 + case 'MimeType':
  236 + return (string) $this->mimeType;
  237 + case 'MimeIconPath':
  238 + return (string) $this->mimeIconPath;
  239 + case 'MimeDisplay':
  240 + return (string) $this->mimeDisplay;
  241 + case 'DateCheckedOut':
  242 + return (string) $this->dateCheckedout;
  243 + case 'ModifiedBy':
  244 + return (string) $this->modifiedBy;
  245 + case 'DateModified':
  246 + return (string) $this->dateModified;
  247 + case 'CreatedBy':
  248 + return (string) $this->createdBy;
  249 + case 'DateCreated':
  250 + return (string) $this->dateCreated;
  251 + case 'Owner':
  252 + case 'OwnedBy':
  253 + return (string) $this->owner;
  254 + case 'IsImmutable':
  255 + case 'Immutable':
  256 + return (bool) $this->immutable;
  257 + case 'Status':
  258 + return $this->status;
  259 + case 'StoragePath':
  260 + return $this->storagePath;
  261 + case 'DocumentType':
  262 + return $this->documentType;
  263 + case 'Permissions':
  264 + return 'not available';
  265 + case 'CanBeReadByUser':
  266 + if (!$this->live)
  267 + return false;
  268 + if (Permission::userHasDocumentReadPermission($this->Document))
  269 + return true;
  270 + if (Permission::adminIsInAdminMode())
  271 + return true;
  272 + return false;
  273 + default:
  274 + throw new Exception("Unknown property '$property' to get on QueryResultItem");
  275 + }
  276 + return ''; // Should not be reached
  277 + }
  278 +
  279 + protected function __set($property, $value)
  280 + {
  281 + switch($property)
  282 + {
  283 + case 'Rank': $this->rank = number_format($value,2,'.',','); break;
  284 + case 'Title': $this->title = $value; break;
  285 + case 'Text': $this->text = $value; break;
  286 + default:
  287 + throw new Exception("Unknown property '$property' to set on QueryResultItem");
  288 + }
  289 + }
  290 +}
  291 +
  292 +function MatchResultCompare($a, $b)
  293 +{
  294 + if ($a->Rank == $b->Rank) {
  295 + return 0;
  296 + }
  297 + return ($a->Rank < $b->Rank) ? -1 : 1;
  298 +}
  299 +
  300 +abstract class Indexer
  301 +{
  302 + /**
  303 + * Cache of extractors
  304 + *
  305 + * @var array
  306 + */
  307 + private $extractorCache;
  308 +
  309 + /**
  310 + * Indicates if the indexer will do logging.
  311 + *
  312 + * @var boolean
  313 + */
  314 + private $debug;
  315 + /**
  316 + * Cache on mime related hooks
  317 + *
  318 + * @var unknown_type
  319 + */
  320 + private $mimeHookCache;
  321 + /**
  322 + * Cache on general hooks.
  323 + *
  324 + * @var array
  325 + */
  326 + private $generalHookCache;
  327 +
  328 + /**
  329 + * This is a path to the extractors.
  330 + *
  331 + * @var string
  332 + */
  333 + private $extractorPath;
  334 + /**
  335 + * This is a path to the hooks.
  336 + *
  337 + * @var string
  338 + */
  339 + private $hookPath;
  340 +
  341 + private $enabledExtractors;
  342 +
  343 + /**
  344 + * Initialise the indexer
  345 + *
  346 + */
  347 + protected function __construct()
  348 + {
  349 + $config = KTConfig::getSingleton();
  350 +
  351 + $this->extractorCache = array();
  352 + $this->debug = $config->get('indexer/debug', true);
  353 + $this->hookCache = array();
  354 + $this->generalHookCache = array();
  355 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  356 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  357 +
  358 + $this->loadExtractorStatus();
  359 + }
  360 +
  361 + /**
  362 + * Get the list of enabled extractors
  363 + *
  364 + */
  365 + private function loadExtractorStatus()
  366 + {
  367 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  368 + $rs = DBUtil::getResultArray($sql);
  369 + $this->enabledExtractors = array();
  370 + foreach($rs as $item)
  371 + {
  372 + $this->enabledExtractors[] = $item['name'];
  373 + }
  374 + }
  375 +
  376 + private function isExtractorEnabled($extractor)
  377 + {
  378 + return in_array($extractor, $this->enabledExtractors);
  379 + }
  380 +
  381 + /**
  382 + * Returns a reference to the main class
  383 + *
  384 + * @return Indexer
  385 + */
  386 + public static function get()
  387 + {
  388 + static $singleton = null;
  389 +
  390 + if (is_null($singleton))
  391 + {
  392 + $config = KTConfig::getSingleton();
  393 + $classname = $config->get('indexer/coreClass');
  394 +
  395 + require_once('indexing/indexers/' . $classname . '.inc.php');
  396 +
  397 + if (!class_exists($classname))
  398 + {
  399 + throw new Exception("Class '$classname' does not exist.");
  400 + }
  401 +
  402 + $singleton = new $classname;
  403 + }
  404 +
  405 + return $singleton;
  406 + }
  407 +
  /**
   * Abstract hook that each concrete indexer class must implement.
   *
   * NOTE(review): going by the name, this presumably removes the given
   * document from the index - confirm against the concrete indexers.
   *
   * @param mixed $docid Identifier of the document (presumably the numeric document id).
   */
  public abstract function deleteDocument($docid);
  409 +
  410 + /**
  411 + * Remove the association of all extractors to mime types on the database.
  412 + *
  413 + */
  414 + public function clearExtractors()
  415 + {
  416 + global $default;
  417 +
  418 + $sql = "update mime_types set extractor_id=null";
  419 + DBUtil::runQuery($sql);
  420 +
  421 + $sql = "delete from mime_extractors";
  422 + DBUtil::runQuery($sql);
  423 +
  424 + if ($this->debug) $default->log->debug('clearExtractors');
  425 + }
  426 +
  427 + /**
  428 + * lookup the name of the extractor class based on the mime type.
  429 + *
  430 + * @param string $type
  431 + * @return string
  432 + */
  433 + public static function resolveExtractor($type)
  434 + {
  435 + global $default;
  436 + $sql = "select extractor from mime_types where filetypes='$type'";
  437 + $class = DBUtil::getOneResultKey($sql,'extractor');
  438 + if (PEAR::isError($class))
  439 + {
  440 + $default->log->error("resolveExtractor: cannot resolve $type");
  441 + return $class;
  442 + }
  443 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  444 + return $class;
  445 + }
  446 +
  447 + /**
  448 + * Return all the discussion text.
  449 + *
  450 + * @param int $docid
  451 + * @return string
  452 + */
  453 + public static function getDiscussionText($docid)
  454 + {
  455 + $sql = "SELECT
  456 + dc.subject, dc.body
  457 + FROM
  458 + discussion_threads dt
  459 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  460 + WHERE
  461 + dt.document_id=$docid";
  462 + $result = DBUtil::getResultArray($sql);
  463 + $text = '';
  464 +
  465 + foreach($result as $record)
  466 + {
  467 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  468 + }
  469 +
  470 + return $text;
  471 + }
  472 +
  473 + /**
  474 + * Schedule the indexing of a document.
  475 + *
  476 + * @param string $document
  477 + * @param string $what
  478 + */
  479 + public static function index($document, $what='A')
  480 + {
  481 + global $default;
  482 +
  483 + if (is_numeric($document))
  484 + {
  485 + $document = Document::get($document+0);
  486 + }
  487 +
  488 + if (PEAR::isError($document))
  489 + {
  490 + $default->log->error("index: Could not index document: " .$document->getMessage());
  491 + return;
  492 + }
  493 +
  494 + $document_id = $document->getId();
  495 + $userid=$_SESSION['userID'];
  496 + if (empty($userid)) $userid=1;
  497 +
  498 + // we dequeue the document so that there are no issues when enqueuing
  499 + Indexer::unqueueDocument($document_id);
  500 +
  501 + // enqueue item
  502 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  503 + DBUtil::runQuery($sql);
  504 +
  505 + $default->log->debug("index: Queuing indexing of $document_id");
  506 +
  507 + }
  508 +
  509 + private static function incrementCount()
  510 + {
  511 + // Get count from system settings
  512 + $count = Indexer::getIndexedDocumentCount();
  513 + $count = (int)$count + 1;
  514 + Indexer::updateIndexedDocumentCount($count);
  515 + }
  516 +
  517 + public static function getIndexedDocumentCount()
  518 + {
  519 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  520 + return (int) $count;
  521 + }
  522 +
  523 + public static function updateIndexedDocumentCount($cnt = 0)
  524 + {
  525 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  526 + }
  527 +
  528 + public static function reindexQueue()
  529 + {
  530 + $sql = "UPDATE index_files SET processdate = null";
  531 + DBUtil::runQuery($sql);
  532 + }
  533 +
  534 + public static function reindexDocument($documentId)
  535 + {
  536 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  537 + DBUtil::runQuery($sql);
  538 + }
  539 +
  540 +
  541 +
  542 + public static function indexAll()
  543 + {
  544 + $userid=$_SESSION['userID'];
  545 + if (empty($userid)) $userid=1;
  546 +
  547 + $sql = "DELETE FROM index_files";
  548 + DBUtil::runQuery($sql);
  549 +
  550 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  551 + DBUtil::runQuery($sql);
  552 + }
  553 +
  554 + /**
  555 + * Clearout the scheduling of documents that no longer exist.
  556 + *
  557 + */
  558 + public static function clearoutDeleted()
  559 + {
  560 + global $default;
  561 +
  562 + $sql = 'DELETE FROM
  563 + index_files
  564 + WHERE
  565 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  566 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  567 + DBUtil::runQuery($sql);
  568 +
  569 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  570 + }
  571 +
  572 +
  573 + /**
  574 + * Check if a document is scheduled to be indexed
  575 + *
  576 + * @param mixed $document This may be a document or document id
  577 + * @return boolean
  578 + */
  579 + public static function isDocumentScheduled($document)
  580 + {
  581 + if (is_numeric($document))
  582 + {
  583 + $docid = $document;
  584 + }
  585 + else if ($document instanceof Document)
  586 + {
  587 + $docid = $document->getId();
  588 + }
  589 + else
  590 + {
  591 + return false;
  592 + }
  593 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  594 + $result = DBUtil::getResultArray($sql);
  595 + return count($result) > 0;
  596 + }
  597 +
  598 + /**
  599 + * Filters text removing redundant characters such as continuous newlines and spaces.
  600 + *
  601 + * @param string $filename
  602 + */
  603 + private function filterText($filename)
  604 + {
  605 + $content = file_get_contents($filename);
  606 +
  607 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  608 + $tgt = array("\n","\n",' ',' ',' ');
  609 +
  610 + // shrink what is being stored.
  611 + do
  612 + {
  613 + $orig = $content;
  614 + $content = preg_replace($src, $tgt, $content);
  615 + } while ($content != $orig);
  616 +
  617 + return file_put_contents($filename, $content) !== false;
  618 + }
  619 +
  620 + /**
  621 + * Load hooks for text extraction process.
  622 + *
  623 + */
  624 + private function loadExtractorHooks()
  625 + {
  626 + $this->generalHookCache = array();
  627 + $this->mimeHookCache = array();
  628 +
  629 +
  630 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  631 + while (($file = readdir($dir)) !== false)
  632 + {
  633 + if (substr($file,-12) == 'Hook.inc.php')
  634 + {
  635 + require_once($this->hookPath . '/' . $file);
  636 + $class = substr($file, 0, -8);
  637 +
  638 + if (!class_exists($class))
  639 + {
  640 + continue;
  641 + }
  642 +
  643 + $hook = new $class;
  644 + if (!($class instanceof ExtractorHook))
  645 + {
  646 + continue;
  647 + }
  648 +
  649 + $mimeTypes = $hook->registerMimeTypes();
  650 + if (is_null($mimeTypes))
  651 + {
  652 + $this->generalHookCache[] = & $hook;
  653 + }
  654 + else
  655 + {
  656 + foreach($mimeTypes as $type)
  657 + {
  658 + $this->mimeHookCache[$type][] = & $hook;
  659 + }
  660 + }
  661 +
  662 + }
  663 + }
  664 + closedir($dir);
  665 + }
  666 +
  667 + /**
  668 + * This is a refactored function to execute the hooks.
  669 + *
  670 + * @param DocumentExtractor $extractor
  671 + * @param string $phase
  672 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  673 + */
  674 + private function executeHook($extractor, $phase, $mimeType = null)
  675 + {
  676 + $hooks = array();
  677 + if (is_null($mimeType))
  678 + {
  679 + $hooks = $this->generalHookCache;
  680 + }
  681 + else
  682 + {
  683 + if (array_key_exists($mimeType, $this->mimeHookCache))
  684 + {
  685 + $hooks = $this->mimeHookCache[$mimeType];
  686 + }
  687 + }
  688 + if (empty($hooks))
  689 + {
  690 + return;
  691 + }
  692 +
  693 + foreach($hooks as $hook)
  694 + {
  695 + $hook->$phase($extractor);
  696 + }
  697 + }
  698 +
  699 + private function doesDiagnosticsPass($simple=false)
  700 + {
  701 + global $default;
  702 +
  703 + $config =& KTConfig::getSingleton();
  704 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  705 + // this function will create the lockfile if an error is detected. It will be removed as soon
  706 + // as the problems with the indexer are removed.
  707 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  708 +
  709 + $diagnosis = $this->diagnose();
  710 + if (!is_null($diagnosis))
  711 + {
  712 + if (!is_file($lockFile))
  713 + {
  714 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  715 + }
  716 + touch($lockFile);
  717 + return false;
  718 + }
  719 +
  720 + if ($simple)
  721 + {
  722 + return true;
  723 + }
  724 +
  725 + $diagnosis = $this->diagnoseExtractors();
  726 + if (!empty($diagnosis))
  727 + {
  728 + if (!is_file($lockFile))
  729 + {
  730 + foreach($diagnosis as $diag)
  731 + {
  732 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  733 + }
  734 + }
  735 + touch($lockFile);
  736 + return false;
  737 + }
  738 +
  739 + if (is_file($lockFile))
  740 + {
  741 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  742 + unlink($lockFile);
  743 + }
  744 +
  745 + return true;
  746 + }
  747 +
  748 + /**
  749 + * This does the initial mime type association between mime types and text extractors
  750 + *
  751 + */
  752 + public function checkForRegisteredTypes()
  753 + {
  754 + global $default;
  755 +
  756 + // we are only doing this once!
  757 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  758 + if ($initRegistered)
  759 + {
  760 + return;
  761 + }
  762 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  763 +
  764 + $date = date('Y-m-d H:i');
  765 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  766 + DBUtil::runQuery($sql);
  767 +
  768 + $this->registerTypes(true);
  769 +
  770 + $disable = array(
  771 + OS_WINDOWS=>array('PSExtractor'),
  772 + OS_UNIX => array()
  773 + );
  774 +
  775 + $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];
  776 +
  777 + foreach($disableForOS as $extractor)
  778 + {
  779 + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
  780 + DBUtil::runQuery($sql);
  781 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  782 + }
  783 +
  784 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  785 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  786 + }
  787 +
  788 + private function updatePendingDocumentStatus($documentId, $message, $level)
  789 + {
  790 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  791 + $message = sanitizeForSQL($this->indexingHistory);
  792 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  793 + DBUtil::runQuery($sql);
  794 + }
  795 +
  796 + private $restartCurrentBatch = false;
  797 +
  798 + public function restartBatch()
  799 + {
  800 + $this->restartCurrentBatch = true;
  801 + }
  802 +
  803 + /**
  804 + *
  805 + * @param int $documentId
  806 + * @param string $message
  807 + * @param string $level This may be info, error, debug
  808 + */
  809 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  810 + {
  811 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  812 + global $default;
  813 +
  814 + switch ($level)
  815 + {
  816 + case 'debug':
  817 + if ($this->debug)
  818 + {
  819 + $default->log->debug($message);
  820 + }
  821 + break;
  822 + default:
  823 + $default->log->$level($message);
  824 + }
  825 + }
  826 +
  827 +
  828 +
  829 + public function getExtractor($extractorClass)
  830 + {
  831 + if (empty($extractorClass))
  832 + {
  833 + return null;
  834 + }
  835 +
  836 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  837 + if (!file_exists($includeFile))
  838 + {
  839 + throw new Exception("Extractor file does not exist: $includeFile");
  840 + }
  841 +
  842 + require_once($includeFile);
  843 +
  844 + if (!class_exists($extractorClass))
  845 + {
  846 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  847 + }
  848 +
  849 + $extractor = new $extractorClass();
  850 +
  851 + if (!($extractor instanceof DocumentExtractor))
  852 + {
  853 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  854 + }
  855 +
  856 + return $extractor;
  857 + }
  858 +
  859 + public static function getIndexingQueue($problemItemsOnly=true)
  860 + {
  861 +
  862 + if ($problemItemsOnly)
  863 + {
  864 + $sql = "SELECT
  865 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  866 + FROM
  867 + index_files iff
  868 + INNER JOIN documents d ON iff.document_id=d.id
  869 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  870 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  871 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  872 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  873 + WHERE
  874 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  875 + ORDER BY indexdate ";
  876 + }
  877 + else
  878 + {
  879 + $sql = "SELECT
  880 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  881 + FROM
  882 + index_files iff
  883 + INNER JOIN documents d ON iff.document_id=d.id
  884 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  885 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  886 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  887 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  888 + WHERE
  889 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  890 + ORDER BY indexdate ";
  891 + }
  892 + $aResult = DBUtil::getResultArray($sql);
  893 +
  894 + return $aResult;
  895 + }
  896 +
  897 + public static function getPendingIndexingQueue()
  898 + {
  899 + return Indexer::getIndexingQueue(false);
  900 + }
  901 +
  902 + /**
  903 + * The main function that may be called repeatedly to index documents.
  904 + *
  905 + * @param int $max Default 20
  906 + */
  907 + public function indexDocuments($max=null)
  908 + {
  909 + global $default;
  910 + $config =& KTConfig::getSingleton();
  911 +
  912 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  913 + if (is_file($indexLockFile))
  914 + {
  915 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  916 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  917 + return;
  918 + }
  919 + touch($indexLockFile);*/
  920 +
  921 +
  922 + $this->checkForRegisteredTypes();
  923 +
  924 + if ($this->debug) $default->log->debug('indexDocuments: start');
  925 + if (!$this->doesDiagnosticsPass())
  926 + {
  927 + //unlink($indexLockFile);
  928 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  929 + return;
  930 + }
  931 +
  932 + if (is_null($max))
  933 + {
  934 + $max = $config->get('indexer/batchDocuments',20);
  935 + }
  936 +
  937 + $this->loadExtractorHooks();
  938 +
  939 + Indexer::clearoutDeleted();
  940 +
  941 + $date = date('Y-m-d H:i:s');
  942 + // identify the indexers that must run
  943 + // mysql specific limit!
  944 + $sql = "SELECT
  945 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  946 + FROM
  947 + index_files iff
  948 + INNER JOIN documents d ON iff.document_id=d.id
  949 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  950 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  951 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  952 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  953 + WHERE
  954 + (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
  955 + ORDER BY indexdate
  956 + LIMIT $max";
  957 + $result = DBUtil::getResultArray($sql);
  958 + if (PEAR::isError($result))
  959 + {
  960 + //unlink($indexLockFile);
  961 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  962 + return;
  963 + }
  964 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  965 +
  966 + // bail if no work to do
  967 + if (count($result) == 0)
  968 + {
  969 + //unlink($indexLockFile);
  970 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  971 + return;
  972 + }
  973 +
  974 + // identify any documents that need indexing and mark them
  975 + // so they are not taken in a followup run
  976 + $ids = array();
  977 + foreach($result as $docinfo)
  978 + {
  979 + $ids[] = $docinfo['document_id'];
  980 + }
  981 +
  982 + // mark the documents as being processed
  983 +
  984 + $ids=implode(',',$ids);
  985 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  986 + DBUtil::runQuery($sql);
  987 +
  988 + $extractorCache = array();
  989 + $storageManager = KTStorageManagerUtil::getSingleton();
  990 +
  991 + $tempPath = $config->get("urls/tmpDirectory");
  992 +
  993 + foreach($result as $docinfo)
  994 + {
  995 + // increment indexed documents count
  996 + Indexer::incrementCount();
  997 +
  998 + $docId=$docinfo['document_id'];
  999 + $extension=$docinfo['filetypes'];
  1000 + $mimeType=$docinfo['mimetypes'];
  1001 + $extractorClass=$docinfo['extractor'];
  1002 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  1003 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  1004 + $this->indexingHistory = '';
  1005 +
  1006 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1007 +
  1008 + if (empty($extractorClass))
  1009 + {
  1010 + /*
  1011 +
  1012 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1013 +
  1014 + */
  1015 + if ($indexDiscussion)
  1016 + {
  1017 + $indexDocument = false;
  1018 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1019 + }
  1020 + else
  1021 + {
  1022 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1023 + continue;
  1024 + }
  1025 + }
  1026 + else
  1027 + {
  1028 + /*
  1029 +
  1030 + If an extractor is available, we must ensure it is enabled.
  1031 +
  1032 + */
  1033 +
  1034 + if (!$this->isExtractorEnabled($extractorClass))
  1035 + {
  1036 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1037 + continue;
  1038 + }
  1039 + }
  1040 +
  1041 + if ($this->debug)
  1042 + {
  1043 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1044 + }
  1045 +
  1046 + $document = Document::get($docId);
  1047 + if (PEAR::isError($document))
  1048 + {
  1049 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1050 + continue;
  1051 + }
  1052 +
  1053 + if ($this->restartCurrentBatch)
  1054 + {
  1055 + Indexer::unqueueDocument($docId);
  1056 + Indexer::index($docId, 'A');
  1057 + continue;
  1058 + }
  1059 +
  1060 +
  1061 + $filename = $document->getFileName();
  1062 + if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
  1063 + {
  1064 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1065 + continue;
  1066 + }
  1067 +
  1068 + $removeFromQueue = true;
  1069 + if ($indexDocument)
  1070 + {
  1071 + if (array_key_exists($extractorClass, $extractorCache))
  1072 + {
  1073 + $extractor = $extractorCache[$extractorClass];
  1074 + }
  1075 + else
  1076 + {
  1077 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1078 + }
  1079 +
  1080 + if (!($extractor instanceof DocumentExtractor))
  1081 + {
  1082 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1083 + continue;
  1084 + }
  1085 +
  1086 +
  1087 +
  1088 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1089 + $sourceFile = $storageManager->temporaryFile($document);
  1090 +
  1091 + if (empty($sourceFile) || !is_file($sourceFile))
  1092 + {
  1093 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1094 + continue;
  1095 + }
  1096 +
  1097 + if ($extractor->needsIntermediateSourceFile())
  1098 + {
  1099 + //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1100 +
  1101 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1102 + $result = @copy($sourceFile, $intermediate);
  1103 + if ($result === false)
  1104 + {
  1105 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1106 + // problem. lets try again later. probably permission related. log the issue.
  1107 + continue;
  1108 + }
  1109 + $sourceFile = $intermediate;
  1110 + }
  1111 +
  1112 + $targetFile = tempnam($tempPath, 'ktindexer');
  1113 +
  1114 + $extractor->setSourceFile($sourceFile);
  1115 + $extractor->setMimeType($mimeType);
  1116 + $extractor->setExtension($extension);
  1117 + $extractor->setTargetFile($targetFile);
  1118 + $extractor->setDocument($document);
  1119 + $extractor->setIndexingStatus(null);
  1120 + $extractor->setExtractionStatus(null);
  1121 +
  1122 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1123 +
  1124 + $this->executeHook($extractor, 'pre_extract');
  1125 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1126 + $removeFromQueue = false;
  1127 +
  1128 + if ($extractor->extractTextContent())
  1129 + {
  1130 + // the extractor may need to create another target file
  1131 + $targetFile = $extractor->getTargetFile();
  1132 +
  1133 + $extractor->setExtractionStatus(true);
  1134 + $this->executeHook($extractor, 'pre_index');
  1135 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1136 +
  1137 + $title = $document->getName();
  1138 + if ($indexDiscussion)
  1139 + {
  1140 + if (!$this->filterText($targetFile))
  1141 + {
  1142 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1143 + }
  1144 + else
  1145 + {
  1146 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1147 + $removeFromQueue = $indexStatus;
  1148 + if (!$indexStatus)
  1149 + {
  1150 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1151 + }
  1152 +
  1153 + $extractor->setIndexingStatus($indexStatus);
  1154 + }
  1155 + }
  1156 + else
  1157 + {
  1158 + if (!$this->filterText($targetFile))
  1159 + {
  1160 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1161 + }
  1162 + else
  1163 + {
  1164 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1165 + $removeFromQueue = $indexStatus;
  1166 +
  1167 + if (!$indexStatus)
  1168 + {
  1169 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1170 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1171 + }
  1172 +
  1173 + $extractor->setIndexingStatus($indexStatus);
  1174 + }
  1175 + }
  1176 +
  1177 + $this->executeHook($extractor, 'post_index', $mimeType);
  1178 + $this->executeHook($extractor, 'post_index');
  1179 + }
  1180 + else
  1181 + {
  1182 + $extractor->setExtractionStatus(false);
  1183 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1184 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1185 + }
  1186 +
  1187 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1188 + $this->executeHook($extractor, 'post_extract');
  1189 +
  1190 + if ($extractor->needsIntermediateSourceFile())
  1191 + {
  1192 + @unlink($sourceFile);
  1193 + }
  1194 +
  1195 + @unlink($targetFile);
  1196 +
  1197 + }
  1198 + else
  1199 + {
  1200 + $indexStatus = $this->indexDiscussion($docId);
  1201 + $removeFromQueue = $indexStatus;
  1202 + }
  1203 +
  1204 + if ($removeFromQueue)
  1205 + {
  1206 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1207 + }
  1208 + else
  1209 + {
  1210 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1211 + }
  1212 + }
  1213 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1214 + //unlink($indexLockFile);
  1215 + }
  1216 +
  1217 + public function migrateDocuments($max=null)
  1218 + {
  1219 + global $default;
  1220 +
  1221 + $default->log->info(_kt('migrateDocuments: starting'));
  1222 +
  1223 + if (!$this->doesDiagnosticsPass(true))
  1224 + {
  1225 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1226 + return;
  1227 + }
  1228 +
  1229 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1230 + {
  1231 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1232 + return;
  1233 + }
  1234 +
  1235 + $config =& KTConfig::getSingleton();
  1236 + if (is_null($max))
  1237 + {
  1238 + $max = $config->get('indexer/batchMigrateDocument',500);
  1239 + }
  1240 +
  1241 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1242 + if (is_file($lockFile))
  1243 + {
  1244 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1245 + return;
  1246 + }
  1247 + touch($lockFile);
  1248 +
  1249 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1250 + if (is_null($startTime))
  1251 + {
  1252 + KTUtil::setSystemSetting('migrationStarted', time());
  1253 + }
  1254 +
  1255 + $maxLoops = 5;
  1256 +
  1257 + $max = ceil($max / $maxLoops);
  1258 +
  1259 + $start =KTUtil::getBenchmarkTime();
  1260 + $noDocs = false;
  1261 + $numDocs = 0;
  1262 +
  1263 + for($loop=0;$loop<$maxLoops;$loop++)
  1264 + {
  1265 +
  1266 + $sql = "SELECT
  1267 + document_id, document_text
  1268 + FROM
  1269 + document_text
  1270 + ORDER BY document_id
  1271 + LIMIT $max";
  1272 + $result = DBUtil::getResultArray($sql);
  1273 + if (PEAR::isError($result))
  1274 + {
  1275 + $default->log->info(_kt('migrateDocuments: db error'));
  1276 + break;
  1277 + }
  1278 +
  1279 + $docs = count($result);
  1280 + if ($docs == 0)
  1281 + {
  1282 + $noDocs = true;
  1283 + break;
  1284 + }
  1285 + $numDocs += $docs;
  1286 +
  1287 + foreach($result as $docinfo)
  1288 + {
  1289 + $docId = $docinfo['document_id'];
  1290 +
  1291 + $document = Document::get($docId);
  1292 + if (PEAR::isError($document) || is_null($document))
  1293 + {
  1294 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1295 + DBUtil::runQuery($sql);
  1296 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1297 + continue;
  1298 + }
  1299 +
  1300 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1301 +
  1302 + $targetFile = tempnam($tempPath, 'ktindexer');
  1303 +
  1304 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1305 + {
  1306 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1307 + continue;
  1308 + }
  1309 + // free memory asap ;)
  1310 + unset($docinfo['document_text']);
  1311 +
  1312 + $title = $document->getName();
  1313 +
  1314 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1315 +
  1316 + if ($indexStatus)
  1317 + {
  1318 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1319 + DBUtil::runQuery($sql);
  1320 + }
  1321 + else
  1322 + {
  1323 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1324 + }
  1325 +
  1326 + @unlink($targetFile);
  1327 + }
  1328 + }
  1329 +
  1330 + @unlink($lockFile);
  1331 +
  1332 + $time = KTUtil::getBenchmarkTime() - $start;
  1333 +
  1334 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1335 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1336 +
  1337 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1338 + if ($noDocs)
  1339 + {
  1340 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1341 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1342 + schedulerUtil::deleteByName('Index Migration');
  1343 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1344 + }
  1345 + }
  1346 +
  1347 + /**
  1348 + * Index a document. The base class must override this function.
  1349 + *
  1350 + * @param int $docId
  1351 + * @param string $textFile
  1352 + */
  1353 + protected abstract function indexDocument($docId, $textFile, $title, $version);
  1354 +
  1355 +
  1356 + public function updateDocumentIndex($docId, $text)
  1357 + {
  1358 + $config = KTConfig::getSingleton();
  1359 + $tempPath = $config->get("urls/tmpDirectory");
  1360 + $tempFile = tempnam($tempPath,'ud_');
  1361 +
  1362 + file_put_contents($tempFile, $text);
  1363 +
  1364 + $document = Document::get($docId);
  1365 + $title = $document->getDescription();
  1366 + $version = $document->getVersion();
  1367 +
  1368 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1369 +
  1370 + if (file_exists($tempFile))
  1371 + {
  1372 + unlink($tempFile);
  1373 + }
  1374 +
  1375 + return $result;
  1376 + }
  1377 +
  1378 + /**
  1379 + * Index a discussion. The base class must override this function.
  1380 + *
  1381 + * @param int $docId
  1382 + */
  1383 + protected abstract function indexDiscussion($docId);
  1384 +
  1385 + /**
  1386 + * Diagnose the indexer. e.g. Check that the indexing server is running.
  1387 + *
  1388 + */
  1389 + public abstract function diagnose();
  1390 +
  1391 + /**
  1392 + * Diagnose the extractors.
  1393 + *
  1394 + * @return array
  1395 + */
  1396 + public function diagnoseExtractors()
  1397 + {
  1398 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1399 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1400 +
  1401 + return $diagnosis;
  1402 + }
  1403 +
  1404 + /**
  1405 + * This is a refactored diagnose function.
  1406 + *
  1407 + * @param string $path
  1408 + * @param string $class
  1409 + * @param string $extension
  1410 + * @return array
  1411 + */
  1412 + private function _diagnose($path, $baseclass, $extension)
  1413 + {
  1414 + global $default;
  1415 +
  1416 + $diagnoses = array();
  1417 +
  1418 + $dir = opendir(SearchHelper::correctPath($path));
  1419 + $extlen = - strlen($extension);
  1420 +
  1421 + while (($file = readdir($dir)) !== false)
  1422 + {
  1423 + if (substr($file,0,1) == '.')
  1424 + {
  1425 + continue;
  1426 + }
  1427 + if (substr($file,$extlen) != $extension)
  1428 + {
  1429 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1430 + continue;
  1431 + }
  1432 +
  1433 + require_once($path . '/' . $file);
  1434 +
  1435 + $class = substr($file, 0, -8);
  1436 + if (!class_exists($class))
  1437 + {
  1438 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1439 + continue;
  1440 + }
  1441 +
  1442 + if (!$this->isExtractorEnabled($class))
  1443 + {
  1444 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1445 + continue;
  1446 + }
  1447 +
  1448 + $extractor = new $class();
  1449 + if (!is_a($extractor, $baseclass))
  1450 + {
  1451 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1452 + continue;
  1453 + }
  1454 +
  1455 + $types = $extractor->getSupportedMimeTypes();
  1456 + if (empty($types))
  1457 + {
  1458 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1459 + continue;
  1460 + }
  1461 +
  1462 + $diagnosis=$extractor->diagnose();
  1463 + if (empty($diagnosis))
  1464 + {
  1465 + continue;
  1466 + }
  1467 + $diagnoses[$class] = array(
  1468 + 'name'=>$extractor->getDisplayName(),
  1469 + 'diagnosis'=>$diagnosis
  1470 + );
  1471 +
  1472 + }
  1473 + closedir($dir);
  1474 +
  1475 + return $diagnoses;
  1476 + }
  1477 +
  1478 +
  1479 + /**
  1480 + * Register the extractor types.
  1481 + *
  1482 + * @param boolean $clear. Optional. Defaults to false.
  1483 + */
  1484 + public function registerTypes($clear=false)
  1485 + {
  1486 + if ($clear)
  1487 + {
  1488 + $this->clearExtractors();
  1489 + }
  1490 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1491 + while (($file = readdir($dir)) !== false)
  1492 + {
  1493 + if (substr($file,-17) == 'Extractor.inc.php')
  1494 + {
  1495 + require_once($this->extractorPath . '/' . $file);
  1496 + $class = substr($file, 0, -8);
  1497 +
  1498 + if (!class_exists($class))
  1499 + {
  1500 + // if the class does not exist, we can't do anything.
  1501 + continue;
  1502 + }
  1503 +
  1504 + $extractor = new $class;
  1505 + if ($extractor instanceof DocumentExtractor)
  1506 + {
  1507 + $extractor->registerMimeTypes();
  1508 + }
  1509 + }
  1510 + }
  1511 + closedir($dir);
  1512 + }
  1513 +
  1514 + /**
  1515 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1516 + *
  1517 + * @param int $docId
  1518 + * @param string $textFile
  1519 + */
  1520 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1521 + {
  1522 + $this->indexDocument($docId, $textFile, $title, $version);
  1523 + $this->indexDiscussion($docId);
  1524 + }
  1525 +
  1526 + /**
  1527 + * Remove the document from the queue. This is normally called when it has been processed.
  1528 + *
  1529 + * @param int $docid
  1530 + */
  1531 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1532 + {
  1533 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1534 + DBUtil::runQuery($sql);
  1535 + if ($reason !== false)
  1536 + {
  1537 + global $default;
  1538 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1539 + }
  1540 + }
  1541 +
  1542 + /**
  1543 + * Run a query on the index.
  1544 + *
  1545 + * @param string $query
  1546 + * @return array
  1547 + */
  1548 + public abstract function query($query);
  1549 +
  1550 + /**
  1551 + * Converts an integer to a string that can be easily compared and reversed.
  1552 + *
  1553 + * @param int $int
  1554 + * @return string
  1555 + */
  1556 + public static function longToString($int)
  1557 + {
  1558 + $maxlen = 14;
  1559 +
  1560 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1561 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1562 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1563 +
  1564 + return str_replace($o29, $a2z, $l);
  1565 + }
  1566 +
  1567 + /**
  1568 + * Converts a string to an integer.
  1569 + *
  1570 + * @param string $str
  1571 + * @return int
  1572 + */
  1573 + public static function stringToLong($str)
  1574 + {
  1575 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1576 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1577 +
  1578 + $int = str_replace($a2z, $o29, $str) + 0;
  1579 +
  1580 + return $int;
  1581 + }
  1582 +
  1583 + /**
  1584 + * Possibly we can optimise indexes. This method must be overriden.
  1585 + * The new function must call the parent!
  1586 + *
  1587 + */
  1588 + public function optimise()
  1589 + {
  1590 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1591 + }
  1592 +
  1593 + /**
  1594 + * Shuts down the indexer
  1595 + *
  1596 + */
  1597 + public function shutdown()
  1598 + {
  1599 + // do nothing generally
  1600 + }
  1601 +
  1602 + /**
  1603 + * Returns the name of the indexer.
  1604 + *
  1605 + * @return string
  1606 + */
  1607 + public abstract function getDisplayName();
  1608 +
  1609 +
  1610 + /**
  1611 + * Returns the number of non-deleted documents in the index.
  1612 + *
  1613 + * @return int
  1614 + */
  1615 + public abstract function getDocumentsInIndex();
  1616 +
  1617 + /**
  1618 + * Returns the path to the index directory
  1619 + *
  1620 + * @return string
  1621 + */
  1622 + public function getIndexDirectory()
  1623 + {
  1624 + $config = KTConfig::getSingleton();
  1625 + $directory = $config->get('indexer/luceneDirectory');
  1626 + return $directory;
  1627 + }
  1628 +}
  1629 +
  1630 +?>