Commit 4f809981a4726dd0053e7670ca28066005f21e94

Authored by Megan Watson
1 parent 06bfc0d7

KTS-3806

"The number of documents in the indexing queue is incorrect on the Document Indexer Statistics dashlet"
Fixed. Adjusted sql to ignore problem documents.

Committed by: Megan Watson
Reviewed by: Conrad Vermeulen



git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@9511 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 1 changed file with 1862 additions and 1862 deletions
search2/indexing/indexerCore.inc.php
1 -<?php  
2 -  
3 -/**  
4 - * $Id:$  
5 - *  
6 - * KnowledgeTree Community Edition  
7 - * Document Management Made Simple  
8 - * Copyright (C) 2008 KnowledgeTree Inc.  
9 - * Portions copyright The Jam Warehouse Software (Pty) Limited  
10 - *  
11 - * This program is free software; you can redistribute it and/or modify it under  
12 - * the terms of the GNU General Public License version 3 as published by the  
13 - * Free Software Foundation.  
14 - *  
15 - * This program is distributed in the hope that it will be useful, but WITHOUT  
16 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS  
17 - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more  
18 - * details.  
19 - *  
20 - * You should have received a copy of the GNU General Public License  
21 - * along with this program. If not, see <http://www.gnu.org/licenses/>.  
22 - *  
23 - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,  
24 - * California 94120-7775, or email info@knowledgetree.com.  
25 - *  
26 - * The interactive user interfaces in modified source and object code versions  
27 - * of this program must display Appropriate Legal Notices, as required under  
28 - * Section 5 of the GNU General Public License version 3.  
29 - *  
30 - * In accordance with Section 7(b) of the GNU General Public License version 3,  
31 - * these Appropriate Legal Notices must retain the display of the "Powered by  
32 - * KnowledgeTree" logo and retain the original copyright notice. If the display of the  
33 - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices  
34 - * must display the words "Powered by KnowledgeTree" and retain the original  
35 - * copyright notice.  
36 - * Contributor( s): ______________________________________  
37 - *  
38 - */  
39 -  
40 -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');  
41 -require_once('indexing/extractorCore.inc.php');  
42 -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');  
43 -require_once(KT_DIR . '/ktapi/ktapi.inc.php');  
44 -  
/**
 * Raised when the document indexer reports a hit for which the repository
 * database holds no matching record.
 */
class IndexerInconsistencyException extends Exception
{
}
46 -  
47 -// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...  
48 -  
/**
 * Base class for a single hit returned by a search query.
 *
 * Holds the id, title, rank, text snippet and full path of the hit. Unknown
 * property reads/writes are routed through __get()/__set() to the matching
 * getXxx()/setXxx() method, so `$item->Rank` resolves to `getRank()`.
 */
class QueryResultItem
{
    protected $id;
    protected $title;
    protected $rank;
    protected $text;
    protected $fullpath;

    /**
     * @param mixed  $id
     * @param string $title
     * @param mixed  $rank
     * @param string $text
     * @param string $fullpath
     */
    public function __construct($id, $title, $rank, $text, $fullpath)
    {
        $this->id = $id;
        $this->title = $title;
        $this->rank = $rank;
        $this->text = $text;
        $this->fullpath = $fullpath;
    }

    public function getId() { return $this->id; }
    public function getRealId() { return $this->id; }

    public function getIsProxy() { return $this instanceof ProxyResultItem; }
    // Folder/document detection is based on the class name prefix.
    public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder'; }
    public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document'; }

    /**
     * Store the rank rounded to two decimals.
     *
     * No thousands separator is used: getRelevance() casts the stored string
     * back to float, and a "," would cut "1,234.50" down to 1.0.
     *
     * @param float $value
     */
    public function setRank($value)
    {
        $this->rank = number_format($value, 2, '.', '');
    }

    public function getIsLive()
    {
        return true;
    }

    public function setTitle($value)
    {
        $this->title = $value;
    }

    public function setText($value)
    {
        $this->text = $value;
    }

    public function getRelevance() { return (float) $this->rank; }
    public function getRank() { return $this->getRelevance(); }
    public function getText() { return (string) $this->text; }
    public function getTitle() { return (string) $this->title; }
    public function getFullPath() { return (string) $this->fullpath; }

    /**
     * Resolve a property read to the matching getter.
     *
     * @param string $property
     * @return mixed '' for an empty name, the getter result when one exists,
     *               otherwise the "unknown" placeholder.
     */
    protected function __get($property)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'get' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method();
        }
        return $this->getUnknown();
    }

    /**
     * Placeholder returned for properties that have no getter.
     *
     * @return string
     */
    protected function getUnknown()
    {
        return _kt('n/a');
    }

    /**
     * Resolve a property write to the matching setter.
     *
     * @param string $property
     * @param mixed  $value
     * @throws Exception when no setter exists for the property.
     */
    protected function __set($property, $value)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }
        throw new Exception("Unknown property '$property' to set on QueryResultItem");
    }
}
134 -  
/**
 * A result item that stands in for another result item (the proxy target).
 *
 * The proxy carries its own id; title and real id are read from the wrapped
 * item, and any getter/setter this class lacks is forwarded to it.
 */
class ProxyResultItem extends QueryResultItem
{
    protected $proxy;
    protected $proxyId;

    /**
     * @param mixed           $proxyId Id of the proxy itself.
     * @param QueryResultItem $proxy   The wrapped result item.
     */
    public function __construct($proxyId, $proxy)
    {
        // getTitle must be *called*; the bare `$proxy->getTitle` was a property read
        // that never returned the wrapped item's title.
        parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
        $this->proxyId = $proxyId;
        $this->proxy = $proxy;
    }

    public function getId() { return $this->proxyId; }
    public function getTitle() { return $this->proxy->getTitle(); }
    public function getRealId() { return $this->proxy->getId(); }

    /**
     * Read a property via this object's getter, falling back to the wrapped item's getter.
     *
     * @param string $property
     * @return mixed
     */
    protected function __get($property)
    {
        $method = 'get' . $property;

        if (method_exists($this, $method))
        {
            return $this->$method();
        }
        else
        {
            return $this->proxy->$method();
        }
    }

    /**
     * Write a property via this object's setter, falling back to the wrapped item's setter.
     *
     * @param string $property
     * @param mixed  $value
     * @return mixed
     */
    protected function __set($property, $value)
    {
        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }
        else
        {
            return $this->proxy->$method($value);
        }
    }
}
178 -  
/**
 * Search hit that refers to a document.
 *
 * On construction the document's details are loaded from the database with a
 * single query per item (see loadDocumentInfo()).
 */
class DocumentResultItem extends QueryResultItem
{
    protected $filesize;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;

    /**
     * @param int         $document_id
     * @param mixed       $rank
     * @param string|null $title
     * @param string|null $text
     * @param string|null $fullpath
     * @throws IndexerInconsistencyException when the document has no database record.
     * @throws Exception on a database error.
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($document_id, $title, $rank, $text, $fullpath);
        $this->live = true;
        $this->loadDocumentInfo();
    }

    /**
     * Populate this item from the documents table and its related tables.
     *
     * Marks the item as not live and throws when the lookup fails or returns nothing.
     *
     * @throws IndexerInconsistencyException when no row matches the document id.
     * @throws Exception on a database error.
     */
    // TODO: this is bad. must refactor to do the query on the group of documents.
    public function loadDocumentInfo()
    {
        global $default;
        $sql = "SELECT
                    d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
                    dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
                    mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
                    cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
                    mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
                FROM
                    documents d
                    INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
                    INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
                    INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                    LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
                    LEFT JOIN folders f ON f.id=d.folder_id
                    LEFT JOIN users cou ON d.checked_out_user_id=cou.id
                    LEFT JOIN workflows w ON dmv.workflow_id=w.id
                    LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
                    LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
                    LEFT JOIN users mbu ON d.modified_user_id=mbu.id
                    LEFT JOIN users cbu ON d.creator_id=cbu.id
                    LEFT JOIN users ou ON d.owner_id=ou.id
                WHERE
                    d.id=$this->id";

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(_kt($msg));
        }

        // document_id, relevance, text, title

        $this->documentType = $result['document_type'];
        $this->filename = $result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // The folder is LEFT JOINed: a null folder name means the document has no folder row.
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        if (empty($this->mimeIconPath))
        {
            $this->mimeIconPath = 'unspecified_type';
        }
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    public function getDocumentID() { return $this->getId(); }
    public function getIsLive() { return (bool) $this->live; }
    public function getFilesize() { return $this->filesize; }
    public function getVersion() { return (string) $this->version; }
    public function getFilename() { return (string)$this->filename; }
    public function getFolderId() { return (int)$this->folderId; }
    public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
    public function getDocument() { return Document::get($this->id); }
    // $this->Document is resolved by the inherited __get() to getDocument().
    public function getIsAvailable() { return $this->Document->isLive(); }
    public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
    // Misspelled name kept for existing callers; getCheckedOutBy() is the correctly spelled alias.
    public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
    public function getCheckedOutBy() { return $this->getCheckedOutUser(); }
    public function getWorkflowOnly() { return (string)$this->workflow; }
    // Delegates to getWorkflowOnly(), mirroring getWorkflowState(); previously this called itself and never returned.
    public function getWorkflow() { return $this->getWorkflowOnly(); }
    public function getWorkflowStateOnly() { return (string)$this->workflowState; }
    public function getWorkflowState() { return $this->getWorkflowStateOnly(); }
    public function getWorkflowAndState() {
        if (is_null($this->workflow))
        {
            return '';
        }
        return "$this->workflow - $this->workflowState";
    }
    public function getMimeType() { return (string) $this->mimeType; }
    public function getMimeIconPath() { return (string) $this->mimeIconPath; }
    public function getMimeDisplay() { return (string) $this->mimeDisplay; }
    public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
    public function getModifiedBy() { return (string) $this->modifiedBy; }
    public function getDateModified() { return (string) $this->dateModified; }
    public function getCreatedBy() { return (string) $this->createdBy; }
    public function getDateCreated() { return (string) $this->dateCreated; }
    public function getOwner() { return (string) $this->owner; }
    public function getOwnedBy() { return $this->getOwner(); }
    public function getIsImmutable() { return (bool) $this->immutable; }
    public function getImmutable() { return $this->getIsImmutable(); }
    public function getStatus() { return $this->status; }
    public function getStoragePath() { return $this->storagePath; }
    public function getDocumentType() { return $this->documentType; }
    public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }

    /**
     * @return bool True when the item is live and the current user has read
     *              permission on the document or is in admin mode.
     */
    public function getCanBeReadByUser() {
        if (!$this->live)
            return false;
        if (Permission::userHasDocumentReadPermission($this->Document))
            return true;
        if (Permission::adminIsInAdminMode())
            return true;
        return false;
    }
}
357 -  
/**
 * Search hit that refers to a folder.
 *
 * Title, full path, parent id and creator name are loaded from the Folder
 * object when the item is constructed.
 */
class FolderResultItem extends QueryResultItem
{
    protected $folder;
    protected $createdBy;
    protected $parentId;

    /**
     * @param int         $folder_id
     * @param mixed       $rank
     * @param string|null $title
     * @param string|null $text
     * @param string|null $fullpath
     * @throws Exception when the folder cannot be loaded.
     */
    public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($folder_id, $title, $rank, $text, $fullpath);
        $this->loadFolderInfo();
    }

    public function getFolderID() { return $this->getId(); }
    public function getParentID() { return $this->parentId; }
    public function getCreatedBy() { return $this->createdBy; }
    public function getMimeIconPath() { return 'folder'; }
    public function getFolder() { return Folder::get($this->getFolderID()); }
    // $this->Folder is resolved by the inherited __get() to getFolder().
    public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }

    /**
     * Populate title, full path, parent id and creator name from the folder.
     *
     * @throws Exception when Folder::get() returns an error.
     */
    public function loadFolderInfo()
    {
        $folder = $this->getFolder();
        if (PEAR::isError($folder))
        {
            // Report the error carried by $folder; the old code read an undefined $result here.
            throw new Exception('Database exception! There appears to be an error in the system: ' .$folder->getMessage());
        }
        $this->title = $folder->getName();
        $this->fullpath = '/' . $folder->getFullPath();
        $this->parentId = $folder->getParentId();

        $user = User::get($folder->getCreatorID());
        $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
    }

}
394 -  
/**
 * Proxy result item representing a shortcut to a document.
 */
class DocumentShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed The id of the shortcut itself.
     */
    public function getDocumentID()
    {
        return $this->getId();
    }

    /**
     * @return string The wrapped item's icon path with a "_shortcut" suffix.
     */
    public function getMimeIconPath()
    {
        $targetIcon = $this->proxy->getMimeIconPath();
        return $targetIcon . '_shortcut';
    }
}
401 -  
/**
 * Proxy result item representing a shortcut to a folder.
 */
class FolderShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed The id of the shortcut itself.
     */
    public function getFolderID()
    {
        return $this->getId();
    }

    /**
     * @return string Fixed icon name used for folder shortcuts.
     */
    public function getMimeIconPath()
    {
        return 'folder_shortcut';
    }
}
408 -  
/**
 * Comparison callback ordering two result items by their Rank, ascending.
 *
 * @param object $a
 * @param object $b
 * @return int 0 when the ranks are equal, -1 when $a ranks lower, 1 otherwise.
 */
function MatchResultCompare($a, $b)
{
    $rankA = $a->Rank;
    $rankB = $b->Rank;

    if ($rankA != $rankB)
    {
        if ($rankA < $rankB)
        {
            return -1;
        }
        return 1;
    }
    return 0;
}
416 -  
417 -abstract class Indexer  
418 -{  
419 - /**  
420 - * Cache of extractors  
421 - *  
422 - * @var array  
423 - */  
424 - private $extractorCache;  
425 -  
426 - /**  
427 - * Indicates if the indexer will do logging.  
428 - *  
429 - * @var boolean  
430 - */  
431 - private $debug;  
432 - /**  
433 - * Cache on mime related hooks  
434 - *  
435 - * @var unknown_type  
436 - */  
437 - private $mimeHookCache;  
438 - /**  
439 - * Cache on general hooks.  
440 - *  
441 - * @var array  
442 - */  
443 - private $generalHookCache;  
444 -  
445 - /**  
446 - * This is a path to the extractors.  
447 - *  
448 - * @var string  
449 - */  
450 - private $extractorPath;  
451 - /**  
452 - * This is a path to the hooks.  
453 - *  
454 - * @var string  
455 - */  
456 - private $hookPath;  
457 -  
458 - private $enabledExtractors;  
459 -  
460 - /**  
461 - * Initialise the indexer  
462 - *  
463 - */  
protected function __construct()
{
    $config = KTConfig::getSingleton();

    $this->extractorCache = array();
    $this->debug = $config->get('indexer/debug', true);
    // Initialise the declared $mimeHookCache property. The previous code assigned to an
    // undeclared "hookCache", leaving $mimeHookCache null until loadExtractorHooks() ran.
    $this->mimeHookCache = array();
    $this->generalHookCache = array();
    $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
    $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');

    // Read the list of active extractors from the database.
    $this->loadExtractorStatus();
}
477 -  
478 - /**  
479 - * Get the list if enabled extractors  
480 - *  
481 - */  
private function loadExtractorStatus()
{
    // Collect the name of every extractor flagged active in mime_extractors.
    $names = array();
    $rows = DBUtil::getResultArray("SELECT id, name FROM mime_extractors WHERE active=1");
    foreach ($rows as $row)
    {
        $names[] = $row['name'];
    }
    $this->enabledExtractors = $names;
}
492 -  
/**
 * Tell whether the named extractor is in the list of enabled extractors.
 *
 * @param string $extractor
 * @return boolean
 */
private function isExtractorEnabled($extractor)
{
    $enabled = $this->enabledExtractors;
    return in_array($extractor, $enabled);
}
497 -  
498 - /**  
499 - * Returns a reference to the main class  
500 - *  
501 - * @return Indexer  
502 - */  
public static function get()
{
    static $singleton = null;

    // Already built on an earlier call: hand back the same instance.
    if (!is_null($singleton))
    {
        return $singleton;
    }

    // The concrete indexer class is named by the 'indexer/coreClass' setting.
    $classname = KTConfig::getSingleton()->get('indexer/coreClass');

    require_once('indexing/indexers/' . $classname . '.inc.php');

    if (!class_exists($classname))
    {
        throw new Exception("Class '$classname' does not exist.");
    }

    $singleton = new $classname;

    return $singleton;
}
524 -  
525 - public abstract function deleteDocument($docid);  
526 -  
527 - /**  
528 - * Remove the association of all extractors to mime types on the database.  
529 - *  
530 - */  
public function clearExtractors()
{
    global $default;

    // Detach every mime type from its extractor, then drop the extractor rows.
    DBUtil::runQuery("update mime_types set extractor_id=null");
    DBUtil::runQuery("delete from mime_extractors");

    if ($this->debug)
    {
        $default->log->debug('clearExtractors');
    }
}
543 -  
544 - /**  
545 - * lookup the name of the extractor class based on the mime type.  
546 - *  
547 - * @param string $type  
548 - * @return string  
549 - */  
public static function resolveExtractor($type)
{
    global $default;
    // NOTE(review): $type is interpolated into the SQL unescaped - confirm callers only pass trusted values.
    $sql = "select extractor from mime_types where filetypes='$type'";
    $class = DBUtil::getOneResultKey($sql,'extractor');
    if (PEAR::isError($class))
    {
        $default->log->error("resolveExtractor: cannot resolve $type");
        return $class;
    }
    // This method is static, so there is no $this to read the debug flag from;
    // take it from the same config key the constructor uses.
    $debug = KTConfig::getSingleton()->get('indexer/debug', true);
    if ($debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
    return $class;
}
563 -  
564 - /**  
565 - * Return all the discussion text.  
566 - *  
567 - * @param int $docid  
568 - * @return string  
569 - */  
public static function getDiscussionText($docid)
{
    $sql = "SELECT
                dc.subject, dc.body
            FROM
                discussion_threads dt
                INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
            WHERE
                dt.document_id=$docid";
    $comments = DBUtil::getResultArray($sql);

    // Each comment contributes its subject and body, one per line.
    $pieces = array();
    foreach ($comments as $comment)
    {
        $pieces[] = $comment['subject'] . "\n" . $comment['body'] . "\n";
    }

    return implode('', $pieces);
}
589 -  
590 - /**  
591 - * Schedule the indexing of a document.  
592 - *  
593 - * @param string $document  
594 - * @param string $what  
595 - */  
public static function index($document, $what='A')
{
    global $default;

    // A numeric argument is treated as a document id and loaded.
    if (is_numeric($document))
    {
        $document = Document::get($document+0);
    }

    if (PEAR::isError($document))
    {
        $default->log->error("index: Could not index document: " .$document->getMessage());
        return;
    }

    $document_id = $document->getId();

    // Fall back to user id 1 when the session carries no user.
    $userid = $_SESSION['userID'];
    if (empty($userid))
    {
        $userid = 1;
    }

    // Remove any existing queue entry first so the insert below cannot clash with it.
    Indexer::unqueueDocument($document_id);

    DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')");

    $default->log->debug("index: Queuing indexing of $document_id");
}
625 -  
/**
 * Add one to the indexed-document counter kept in the system settings.
 */
private static function incrementCount()
{
    $next = (int) Indexer::getIndexedDocumentCount() + 1;
    Indexer::updateIndexedDocumentCount($next);
}
633 -  
/**
 * Read the 'indexedDocumentCount' system setting, defaulting to 0.
 *
 * @return int
 */
public static function getIndexedDocumentCount()
{
    return (int) KTUtil::getSystemSetting('indexedDocumentCount', 0);
}
639 -  
/**
 * Store a new value in the 'indexedDocumentCount' system setting.
 *
 * @param int $cnt
 */
public static function updateIndexedDocumentCount($cnt = 0)
{
    $settingName = 'indexedDocumentCount';
    KTUtil::setSystemSetting($settingName, $cnt);
}
644 -  
/**
 * Clear the process date on every row of the indexing queue.
 */
public static function reindexQueue()
{
    DBUtil::runQuery("UPDATE index_files SET processdate = null");
}
650 -  
/**
 * Clear the process date and status message of one document's queue entry.
 *
 * @param int $documentId
 */
public static function reindexDocument($documentId)
{
    $query = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
    DBUtil::runQuery($query);
}
656 -  
657 -  
658 -  
/**
 * Empty the indexing queue and re-fill it with every document whose status_id is 1.
 */
public static function indexAll()
{
    // Fall back to user id 1 when the session carries no user.
    $userid = $_SESSION['userID'];
    if (empty($userid))
    {
        $userid = 1;
    }

    DBUtil::runQuery("DELETE FROM index_files");

    DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)");
}
670 -  
/**
 * Queue every not-yet-queued document with status_id 1 whose full path lies under the folder.
 *
 * @param Folder|FolderProxy $folder
 * @throws Exception when $folder is neither a Folder nor a FolderProxy.
 */
public static function indexFolder($folder)
{
    // Fall back to user id 1 when the session carries no user.
    $userid = $_SESSION['userID'];
    if (empty($userid))
    {
        $userid = 1;
    }

    if (!($folder instanceof Folder || $folder instanceof FolderProxy))
    {
        throw new Exception('Folder expected');
    }

    $full_path = $folder->getFullPath();

    DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)");
}
686 -  
687 - /**  
688 - * Clearout the scheduling of documents that no longer exist.  
689 - *  
690 - */  
public static function clearoutDeleted()
{
    global $default;

    // Drop queue rows whose document has status_id 3 or has no row in documents at all.
    $query = 'DELETE FROM
                index_files
            WHERE
                document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
                NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
    DBUtil::runQuery($query);

    $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
}
704 -  
705 -  
706 - /**  
707 - * Check if a document is scheduled to be indexed  
708 - *  
709 - * @param mixed $document This may be a document or document id  
710 - * @return boolean  
711 - */  
public static function isDocumentScheduled($document)
{
    // Anything that is neither a numeric id nor a Document cannot be in the queue.
    if (!is_numeric($document) && !($document instanceof Document))
    {
        return false;
    }

    $docid = is_numeric($document) ? $document : $document->getId();

    $rows = DBUtil::getResultArray("SELECT 1 FROM index_files WHERE document_id=$docid");
    return count($rows) > 0;
}
730 -  
731 - /**  
732 - * Filters text removing redundant characters such as continuous newlines and spaces.  
733 - *  
734 - * @param string $filename  
735 - */  
private function filterText($filename)
{
    // Applied in order on every pass: CR/LF -> LF, double LF -> LF, LF -> space,
    // tab -> space, double space -> single space.
    $patterns = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
    $replacements = array("\n","\n",' ',' ',' ');

    $content = file_get_contents($filename);

    // Repeat the substitutions until a pass leaves the text unchanged.
    while (true)
    {
        $before = $content;
        $content = preg_replace($patterns, $replacements, $content);
        if ($content == $before)
        {
            break;
        }
    }

    // The shrunken text is written back over the same file.
    return file_put_contents($filename, $content) !== false;
}
752 -  
753 - /**  
754 - * Load hooks for text extraction process.  
755 - *  
756 - */  
private function loadExtractorHooks()
{
    $this->generalHookCache = array();
    $this->mimeHookCache = array();

    $dir = opendir(SearchHelper::correctPath($this->hookPath));
    if ($dir === false)
    {
        // The hook directory could not be opened: leave both caches empty
        // rather than calling readdir() on an invalid handle.
        return;
    }

    while (($file = readdir($dir)) !== false)
    {
        // Only files named "<Something>Hook.inc.php" are hook definitions.
        if (substr($file,-12) != 'Hook.inc.php')
        {
            continue;
        }

        require_once($this->hookPath . '/' . $file);
        // Strip ".inc.php" to get the class name.
        $class = substr($file, 0, -8);

        if (!class_exists($class))
        {
            continue;
        }

        $hook = new $class;
        // Test the instance, not the class-name string (a string is never an ExtractorHook).
        if (!($hook instanceof ExtractorHook))
        {
            continue;
        }

        // Hooks are stored by value: storing a reference to $hook would make every
        // cached entry point at whichever hook the loop created last.
        $mimeTypes = $hook->registerMimeTypes();
        if (is_null($mimeTypes))
        {
            // No mime types registered: the hook applies generally.
            $this->generalHookCache[] = $hook;
        }
        else
        {
            foreach($mimeTypes as $type)
            {
                $this->mimeHookCache[$type][] = $hook;
            }
        }
    }
    closedir($dir);
}
799 -  
800 - /**  
801 - * This is a refactored function to execute the hooks.  
802 - *  
803 - * @param DocumentExtractor $extractor  
804 - * @param string $phase  
805 - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.  
806 - */  
private function executeHook($extractor, $phase, $mimeType = null)
{
    // Pick the hook list: general hooks when no mime type is given,
    // otherwise the hooks cached for that mime type (if any).
    if (is_null($mimeType))
    {
        $selected = $this->generalHookCache;
    }
    elseif (array_key_exists($mimeType, $this->mimeHookCache))
    {
        $selected = $this->mimeHookCache[$mimeType];
    }
    else
    {
        $selected = array();
    }

    if (empty($selected))
    {
        return;
    }

    // $phase names the hook method to invoke on each hook.
    foreach ($selected as $currentHook)
    {
        $currentHook->$phase($extractor);
    }
}
831 -  
/**
 * Run the indexer diagnostics and, unless $simple is set, the extractor diagnostics.
 *
 * A lock file in the cache directory records that problems were already
 * logged, so repeated failures are not logged again. It is removed once
 * all diagnostics pass.
 *
 * @param boolean $simple When true, skip the extractor diagnostics.
 * @return boolean True when the diagnostics that were run reported no problem.
 */
private function doesDiagnosticsPass($simple=false)
{
    global $default;

    $config =& KTConfig::getSingleton();
    $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';

    // Stage 1: the indexer itself.
    $indexerProblem = $this->diagnose();
    if (!is_null($indexerProblem))
    {
        $alreadyLogged = is_file($lockFile);
        if (!$alreadyLogged)
        {
            $default->log->error(_kt('Indexer problem: ') . $indexerProblem);
        }
        touch($lockFile);
        return false;
    }

    if ($simple)
    {
        return true;
    }

    // Stage 2: the individual extractors.
    $extractorProblems = $this->diagnoseExtractors();
    if (!empty($extractorProblems))
    {
        $alreadyLogged = is_file($lockFile);
        if (!$alreadyLogged)
        {
            foreach ($extractorProblems as $problem)
            {
                $default->log->error(sprintf(_kt('%s problem: %s'), $problem['name'],$problem['diagnosis']));
            }
        }
        touch($lockFile);
        return false;
    }

    // Everything passed: a leftover lock file means earlier problems are now fixed.
    if (is_file($lockFile))
    {
        $default->log->info(_kt('Issues with the indexer have been resolved!'));
        unlink($lockFile);
    }

    return true;
}
880 -  
881 - /**  
882 - * This does the initial mime type association between mime types and text extractors  
883 - *  
884 - */  
public function checkForRegisteredTypes()
{
    global $default;

    // we are only doing this once!
    $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
    if ($initRegistered)
    {
        return;
    }
    if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');

    // Set the run time of every scheduler task to the current minute.
    $date = date('Y-m-d H:i');
    $sql = "UPDATE scheduler_tasks SET run_time='$date'";
    DBUtil::runQuery($sql);

    $this->registerTypes(true);

    // Extractors to deactivate, keyed by operating system family.
    $disable = array(
        'windows'=>array('PSExtractor'),
        'unix' => array()
    );

    $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];

    if (!empty($disableForOS))
    {
        // Turn the list into a quoted, comma separated SQL value list.
        $disableForOS = '\'' . implode("','", $disableForOS) .'\'';

        $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)";
        DBUtil::runQuery($sql);
        // Log the names that were actually disabled; the old message referenced an undefined variable.
        $default->log->info("checkForRegisteredTypes: disabled $disableForOS");
    }
    $this->loadExtractorStatus();

    if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
    KTUtil::setSystemSetting('mimeTypesRegistered', true);
}
923 -  
/**
 * Append a line to the running indexing history and store the whole history
 * as the status message of the document's queue entry.
 *
 * @param int    $documentId
 * @param string $message
 * @param string $level
 */
private function updatePendingDocumentStatus($documentId, $message, $level)
{
    $this->indexingHistory .= "\n" . $level . ': ' . $message;

    // The accumulated history, not just the new line, is written to the row.
    $escaped = sanitizeForSQL($this->indexingHistory);
    DBUtil::runQuery("UPDATE index_files SET status_msg='$escaped' WHERE document_id=$documentId");
}
931 -  
932 - private $restartCurrentBatch = false;  
933 -  
/**
 * Raise the flag requesting that the current batch be restarted.
 */
public function restartBatch()
{
    $restart = true;
    $this->restartCurrentBatch = $restart;
}
938 -  
/**
 * Records a status line against the pending document and then forwards the
 * message to the system log at the requested level.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    global $default;

    $this->updatePendingDocumentStatus($documentId, $message, $level);

    if ($level == 'debug')
    {
        // debug lines only reach the log when debugging is switched on
        if ($this->debug)
        {
            $default->log->debug($message);
        }
        return;
    }

    // every other level maps directly onto the logger method of that name
    $default->log->$level($message);
}
962 -  
963 -  
964 -  
/**
 * Loads and instantiates the extractor class with the given name.
 *
 * The class is expected in SEARCH2_INDEXER_DIR/extractors/<class>.inc.php.
 *
 * @param string $extractorClass
 * @return DocumentExtractor|null null when no class name is given
 * @throws Exception when the file or class is missing, or the class is not
 *                   a DocumentExtractor
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass))
    {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile))
    {
        throw new Exception("Extractor file does not exist: $includeFile");
    }

    require_once($includeFile);

    // The messages below used an undefined $classname, so the offending
    // class was never named in the exception text.
    if (!class_exists($extractorClass))
    {
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();

    if (!($extractor instanceof DocumentExtractor))
    {
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}
994 -  
/**
 * Returns the queued documents (active documents only) ordered by index date.
 *
 * Note the default: with no argument only the problem items are returned,
 * i.e. queue entries that carry a non-empty status message.
 *
 * @param boolean $problemItemsOnly true: entries with a status message;
 *                                  false: entries without one (pending items)
 * @return array result of DBUtil::getResultArray()
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // The two variants of the query differ only in this filter, so the
    // query text is kept in one place.
    if ($problemItemsOnly)
    {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    }
    else
    {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $sql = "SELECT
            iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            $statusFilter AND d.status_id=1
        ORDER BY indexdate ";

    $aResult = DBUtil::getResultArray($sql);

    return $aResult;
}
1032 -  
/**
 * Returns the queue entries that have no status message recorded.
 *
 * @return array
 */
public static function getPendingIndexingQueue()
{
    $pendingItems = Indexer::getIndexingQueue(false);

    return $pendingItems;
}
1037 -  
/**
 * Recomputes the indexer statistics and diagnostics and stores them,
 * serialised, in the 'indexerStats', 'indexerDiagnostics' and
 * 'extractorDiagnostics' system settings.
 */
public function updateIndexStats()
{
    $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');

    $noOptimisation = false;
    if ($optimisationDate == '')
    {
        $optimisationDate = _kt('N/A');
        $optimisationPeriod = $optimisationDate;
    }
    else
    {
        $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
        // flag the index when the last optimisation is more than two days old
        $noOptimisation = $optimisationPeriod['days'] > 2;
        $optimisationPeriod = $optimisationPeriod['str'];
        $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
    }

    $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
    if ($indexingDate == '')
    {
        $indexingDate = _kt('N/A');
        $indexingPeriod = $indexingDate;
    }
    else
    {
        $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
        $indexingDate = date('Y-m-d H:i:s', $indexingDate);
    }

    $index = Indexer::get();
    $docsInIndex = $index->getDocumentsInIndex();

    // We are only interested in documents that are active. Queue entries
    // that carry a status message are problem documents; they are reported
    // as errors and must not be counted as waiting in the queue.
    $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
    $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

    // Both conditions must hold: 'is not null OR <> ...' also matched rows
    // whose message is the empty string.
    $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
    $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

    $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
    $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

    if ($docsInRepository == 0)
    {
        // avoid dividing by zero on an empty repository
        $indexingCoverage = '0.00%';
        $queueCoverage = $indexingCoverage;
    }
    else
    {
        // compute indexing coverage
        $indexingCoverage = _kt('Not Available');
        if (is_numeric($docsInIndex))
        {
            $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
            $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
        }

        // compute queue coverage
        $queueCoverage = _kt('Not Available');
        if (is_numeric($docsInQueue))
        {
            $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
            $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
        }
    }

    $stats = array(
        'optimisationDate'=>$optimisationDate,
        'optimisationPeriod'=>$optimisationPeriod,
        'indexingDate'=>$indexingDate,
        'indexingPeriod'=>$indexingPeriod,
        'docsInIndex'=>$docsInIndex,
        'docsInQueue'=>$docsInQueue,
        'errorsInQueue'=>$errorsInQueue,
        'docsInRepository'=>$docsInRepository,
        'indexingCoverage'=>$indexingCoverage,
        'queueCoverage'=>$queueCoverage,
        'noOptimisation'=>$noOptimisation
    );

    KTUtil::setSystemSetting('indexerStats', serialize($stats));

    $indexer = Indexer::get();

    $diagnosis = $indexer->diagnose();
    KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));

    $extractorDiagnosis = $indexer->diagnoseExtractors();

    KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
}
1131 -  
/**
 * The main function that may be called repeatedly to index documents.
 *
 * Selects up to $max queue entries that are unprocessed or whose processdate
 * is older than a day, stamps them with the current time so a follow-up run
 * does not pick them up, and then processes them one by one: resolve the
 * extractor, extract the text, filter it and hand it to the index. Entries
 * are removed from the queue on success or when they can never be indexed.
 *
 * @param int $max Maximum number of documents for this call. When null the
 *                 'indexer/batchDocuments' config value is used (default 20).
 */
public function indexDocuments($max=null)
{
    global $default;
    $config =& KTConfig::getSingleton();

    $this->checkForRegisteredTypes();

    if ($this->debug) $default->log->debug('indexDocuments: start');
    if (!$this->doesDiagnosticsPass())
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
        return;
    }

    if (is_null($max))
    {
        $max = $config->get('indexer/batchDocuments',20);
    }
    // $max is interpolated into the LIMIT clause below; force an integer.
    $max = (int) $max;

    $this->loadExtractorHooks();

    Indexer::clearoutDeleted();

    $date = date('Y-m-d H:i:s');
    // identify the indexers that must run
    // mysql specific limit!
    $sql = "SELECT
            iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
        ORDER BY indexdate
        LIMIT $max";
    $result = DBUtil::getResultArray($sql);
    if (PEAR::isError($result))
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
        return;
    }
    KTUtil::setSystemSetting('luceneIndexingDate', time());

    // bail if no work to do
    if (count($result) == 0)
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
        return;
    }

    // identify any documents that need indexing and mark them
    // so they are not taken in a followup run
    $ids = array();
    foreach($result as $docinfo)
    {
        $ids[] = $docinfo['document_id'];
    }

    // mark the documents as being processed
    $ids = implode(',', $ids);
    $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
    DBUtil::runQuery($sql);

    $extractorCache = array();
    $storageManager = KTStorageManagerUtil::getSingleton();

    $tempPath = $config->get("urls/tmpDirectory");

    foreach($result as $docinfo)
    {
        // increment indexed documents count
        Indexer::incrementCount();

        $docId = $docinfo['document_id'];
        $extension = $docinfo['filetypes'];
        $mimeType = $docinfo['mimetypes'];
        $extractorClass = $docinfo['extractor'];
        // 'what' is matched against: A (content and discussion), C (content), D (discussion)
        $indexDocument = in_array($docinfo['what'], array('A','C'));
        $indexDiscussion = in_array($docinfo['what'], array('A','D'));
        // the status history is kept per document
        $this->indexingHistory = '';

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');

        if (empty($extractorClass))
        {
            // if no extractor is found and we don't need to index discussions,
            // then we can remove the item from the queue.
            if ($indexDiscussion)
            {
                $indexDocument = false;
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
            }
            else
            {
                Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
                continue;
            }
        }
        else
        {
            // If an extractor is available, we must ensure it is enabled.
            if (!$this->isExtractorEnabled($extractorClass))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
                continue;
            }
        }

        if ($this->debug)
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
        }

        $document = Document::get($docId);
        if (PEAR::isError($document))
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
            continue;
        }

        if ($this->restartCurrentBatch)
        {
            // a restart was requested: put the document back on the queue untouched
            Indexer::unqueueDocument($docId);
            Indexer::index($docId, 'A');
            continue;
        }

        // a leading or trailing tilde marks a temporary file
        $filename = $document->getFileName();
        if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
            continue;
        }

        $removeFromQueue = true;
        if ($indexDocument)
        {
            if (array_key_exists($extractorClass, $extractorCache))
            {
                $extractor = $extractorCache[$extractorClass];
            }
            else
            {
                // getExtractor() throws when the extractor file or class is
                // missing. Left uncaught, that aborted the whole batch after
                // every document in it had already been marked as processed.
                try
                {
                    $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
                }
                catch (Exception $e)
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' could not be loaded: %s"),$extractorClass, $e->getMessage()), 'error');
                    continue;
                }
            }

            if (!($extractor instanceof DocumentExtractor))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
            $sourceFile = $storageManager->temporaryFile($document);

            if (empty($sourceFile) || !is_file($sourceFile))
            {
                Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
                continue;
            }

            if ($extractor->needsIntermediateSourceFile())
            {
                $intermediate = $tempPath . '/'. $docId . '.' . $extension;
                // kept in its own variable: the batch being iterated is
                // held in $result and must not be overwritten here
                $copied = @copy($sourceFile, $intermediate);
                if ($copied === false)
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
                    // problem. lets try again later. probably permission related. log the issue.
                    continue;
                }
                $sourceFile = $intermediate;
            }

            $targetFile = tempnam($tempPath, 'ktindexer');

            $extractor->setSourceFile($sourceFile);
            $extractor->setMimeType($mimeType);
            $extractor->setExtension($extension);
            $extractor->setTargetFile($targetFile);
            $extractor->setDocument($document);
            $extractor->setIndexingStatus(null);
            $extractor->setExtractionStatus(null);

            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');

            $this->executeHook($extractor, 'pre_extract');
            $this->executeHook($extractor, 'pre_extract', $mimeType);
            // from here on the entry only leaves the queue if indexing succeeds
            $removeFromQueue = false;

            if ($extractor->extractTextContent())
            {
                // the extractor may need to create another target file
                $targetFile = $extractor->getTargetFile();

                $extractor->setExtractionStatus(true);
                $this->executeHook($extractor, 'pre_index');
                $this->executeHook($extractor, 'pre_index', $mimeType);

                $title = $document->getName();

                if (!$this->filterText($targetFile))
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                }
                else
                {
                    if ($indexDiscussion)
                    {
                        $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
                        }
                    }
                    else
                    {
                        $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
                            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                        }
                    }

                    $removeFromQueue = $indexStatus;
                    $extractor->setIndexingStatus($indexStatus);
                }

                $this->executeHook($extractor, 'post_index', $mimeType);
                $this->executeHook($extractor, 'post_index');
            }
            else
            {
                $extractor->setExtractionStatus(false);
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
                $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
            }

            $this->executeHook($extractor, 'post_extract', $mimeType);
            $this->executeHook($extractor, 'post_extract');

            if ($extractor->needsIntermediateSourceFile())
            {
                // $sourceFile is the intermediate copy in this case
                @unlink($sourceFile);
            }

            @unlink($targetFile);
        }
        else
        {
            $indexStatus = $this->indexDiscussion($docId);
            $removeFromQueue = $indexStatus;
        }

        if ($removeFromQueue)
        {
            Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
        }
        else
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
        }
    }
    if ($this->debug) $default->log->debug('indexDocuments: done');
}
1446 -  
/**
 * Moves extracted text from the document_text table into the index.
 *
 * Works through document_text in up to five loops per call, indexing each
 * row and deleting it once indexed. A lock file in the cache directory
 * prevents overlapping runs. When the table is empty the migration is
 * marked complete and the 'Index Migration' scheduler entry is removed.
 *
 * @param int $max Maximum number of rows per call. When null the
 *                 'indexer/batchMigrateDocument' config value is used
 *                 (default 500).
 */
public function migrateDocuments($max=null)
{
    global $default;

    $default->log->info(_kt('migrateDocuments: starting'));

    if (!$this->doesDiagnosticsPass(true))
    {
        $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
        return;
    }

    if (KTUtil::getSystemSetting('migrationComplete') == 'true')
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    // $tempPath was never defined in this method, so tempnam() below was
    // called with an undefined directory argument.
    $tempPath = $config->get("urls/tmpDirectory");

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
        return;
    }
    touch($lockFile);

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    $maxLoops = 5;

    // the per-call budget is spread over the loops
    $max = ceil($max / $maxLoops);

    $start = KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for($loop=0;$loop<$maxLoops;$loop++)
    {
        $sql = "SELECT
                document_id, document_text
            FROM
                document_text
            ORDER BY document_id
            LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->info(_kt('migrateDocuments: db error'));
            break;
        }

        $docs = count($result);
        if ($docs == 0)
        {
            $noDocs = true;
            break;
        }
        $numDocs += $docs;

        foreach($result as $docinfo)
        {
            $docId = $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                // text without a document cannot be indexed; drop it
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if (file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                // tempnam() may already have created the file
                @unlink($targetFile);
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', 'true');
        schedulerUtil::deleteByName('Index Migration');
        $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
    }
}
1576 -  
1577 - /**  
1578 - * Index a document. Each concrete subclass must implement this function.  
1579 - *  
1580 - * @param int $docId  
1581 - * @param string $textFile  
1582 - */  
1583 - protected abstract function indexDocument($docId, $textFile, $title, $version);  
1584 -  
1585 -  
/**
 * Re-indexes a document from text supplied by the caller.
 *
 * The text is written to a temporary file in the configured tmp directory,
 * passed to indexDocument() and the file is removed again afterwards.
 *
 * @param int $docId
 * @param string $text
 * @return mixed the result of indexDocument(), or false when the document
 *               cannot be resolved or the temporary file cannot be written
 */
public function updateDocumentIndex($docId, $text)
{
    global $default;

    // Resolve the document first: calling methods on a failed lookup is a
    // fatal error and would also leave the temporary file behind.
    $document = Document::get($docId);
    if (PEAR::isError($document) || is_null($document))
    {
        $default->log->error("updateDocumentIndex: cannot resolve document id $docId");
        return false;
    }

    $config = KTConfig::getSingleton();
    $tempPath = $config->get("urls/tmpDirectory");
    $tempFile = tempnam($tempPath,'ud_');

    if (file_put_contents($tempFile, $text) === false)
    {
        $default->log->error("updateDocumentIndex: cannot write to '$tempFile' for document id $docId");
        // tempnam() may already have created the file
        @unlink($tempFile);
        return false;
    }

    $title = $document->getDescription();
    $version = $document->getVersion();

    $result = $this->indexDocument($docId, $tempFile, $title, $version);

    if (file_exists($tempFile))
    {
        unlink($tempFile);
    }

    return $result;
}
1607 -  
1608 - /**  
1609 - * Index a discussion. Each concrete subclass must implement this function.  
1610 - *  
1611 - * @param int $docId  
1612 - */  
1613 - protected abstract function indexDiscussion($docId);  
1614 -  
1615 - /**  
1616 - * Diagnose the indexer. e.g. Check that the indexing server is running.  
1617 - *  
1618 - */  
1619 - public abstract function diagnose();  
1620 -  
/**
 * Diagnose the extractors and the hooks.
 *
 * Runs the shared directory scan once over the extractor path and once over
 * the hook path and returns the merged findings.
 *
 * @return array
 */
public function diagnoseExtractors()
{
    $extractorFindings = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
    $hookFindings = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');

    return array_merge($extractorFindings, $hookFindings);
}
1633 -  
/**
 * Scans a directory for classes of the given type and collects what each
 * one reports from its own diagnose() method.
 *
 * Files are expected to be named <Class>.inc.php; the class name is taken
 * from the file name.
 *
 * @param string $path directory to scan
 * @param string $baseclass class every loaded class must be an instance of
 * @param string $extension required file name suffix, e.g. 'Extractor.inc.php'
 * @return array class name => array('name' => display name, 'diagnosis' => ...)
 *               for every class that reported a non-empty diagnosis
 */
private function _diagnose($path, $baseclass, $extension)
{
    global $default;

    $diagnoses = array();

    $dir = opendir(SearchHelper::correctPath($path));
    if ($dir === false)
    {
        // without a valid handle readdir()/closedir() below cannot be used
        $default->log->error(sprintf("diagnose: cannot open directory '%s'.", $path));
        return $diagnoses;
    }

    $extlen = - strlen($extension);

    while (($file = readdir($dir)) !== false)
    {
        // skip '.', '..' and hidden entries
        if (substr($file,0,1) == '.')
        {
            continue;
        }
        if (substr($file,$extlen) != $extension)
        {
            $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
            continue;
        }

        require_once($path . '/' . $file);

        // strip the 8 characters of '.inc.php' to get the class name
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
            continue;
        }

        if (!$this->isExtractorEnabled($class))
        {
            $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
            continue;
        }

        $extractor = new $class();
        if (!is_a($extractor, $baseclass))
        {
            // report the type that was actually checked; this scan is also
            // used with a base class other than DocumentExtractor
            $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type %s"), $class, $baseclass));
            continue;
        }

        $types = $extractor->getSupportedMimeTypes();
        if (empty($types))
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
            continue;
        }

        $diagnosis = $extractor->diagnose();
        if (empty($diagnosis))
        {
            // nothing to report for this class
            continue;
        }
        $diagnoses[$class] = array(
            'name'=>$extractor->getDisplayName(),
            'diagnosis'=>$diagnosis
        );
    }
    closedir($dir);

    return $diagnoses;
}
1707 -  
1708 -  
/**
 * Register the extractor types.
 *
 * Loads every '*Extractor.inc.php' file in the extractor path and lets each
 * DocumentExtractor found there register its mime types.
 *
 * @param boolean $clear Optional. Defaults to false. When true the existing
 *                       extractors are cleared first.
 */
public function registerTypes($clear=false)
{
    if ($clear)
    {
        $this->clearExtractors();
    }

    $handle = opendir(SearchHelper::correctPath($this->extractorPath));
    while (($entry = readdir($handle)) !== false)
    {
        if (substr($entry,-17) != 'Extractor.inc.php')
        {
            continue;
        }

        require_once($this->extractorPath . '/' . $entry);

        // the class name is the file name without '.inc.php'
        $className = substr($entry, 0, -8);
        if (!class_exists($className))
        {
            // if the class does not exist, we can't do anything.
            continue;
        }

        $candidate = new $className;
        if ($candidate instanceof DocumentExtractor)
        {
            $candidate->registerMimeTypes();
        }
    }
    closedir($handle);
}
1743 -  
/**
 * Indexes the document content and its discussion in one call. This exists
 * as a possible optimisation point and may be overridden for that purpose.
 *
 * Callers treat the return value as the success flag (it decides whether a
 * document leaves the indexing queue and whether migrated text is deleted),
 * so a status must be returned; previously nothing was returned and every
 * call therefore looked like a failure.
 *
 * @param int $docId
 * @param string $textFile
 * @param string $title
 * @param string $version
 * @return boolean true only when both the document and the discussion
 *                 were indexed successfully
 */
protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
{
    // both steps always run; the second is not skipped when the first fails
    $documentIndexed = $this->indexDocument($docId, $textFile, $title, $version);
    $discussionIndexed = $this->indexDiscussion($docId);

    return $documentIndexed && $discussionIndexed;
}
1755 -  
/**
 * Remove the document from the queue. This is normally called when it has
 * been processed.
 *
 * @param int $docid
 * @param string|boolean $reason Optional. When given, the removal is logged
 *                               together with this text.
 * @param string $level Optional. Logger method used for the message.
 *                      Defaults to 'debug'.
 */
public static function unqueueDocument($docid, $reason=false, $level='debug')
{
    DBUtil::runQuery("DELETE FROM index_files WHERE document_id=$docid");

    if ($reason === false)
    {
        // no reason supplied: remove silently
        return;
    }

    global $default;
    $default->log->$level("Indexer: removing document $docid from the queue - $reason");
}
1771 -  
1772 - /**  
1773 - * Run a query on the index.  
1774 - *  
1775 - * @param string $query  
1776 - * @return array  
1777 - */  
1778 - public abstract function query($query);  
1779 -  
/**
 * Converts an integer to a string that can be easily compared and reversed.
 *
 * The number is left-padded with zeros to 14 characters and every digit is
 * then replaced by a letter (0 => a ... 9 => j).
 *
 * @param int $int
 * @return string
 */
public static function longToString($int)
{
    $width = 14;

    // values that are already $width characters or longer are not padded
    $padded = str_pad("$int", $width, '0', STR_PAD_LEFT);

    return strtr($padded, '0123456789', 'abcdefghij');
}
1796 -  
/**
 * Converts a string to an integer.
 *
 * Every letter a..j is mapped back to its digit (a => 0 ... j => 9) and the
 * resulting digit string is converted to a number.
 *
 * @param string $str
 * @return int
 */
public static function stringToLong($str)
{
    $digits = strtr($str, 'abcdefghij', '0123456789');

    // adding zero turns the digit string into a number
    return $digits + 0;
}
1812 -  
/**
 * Possibly we can optimise indexes. This method must be overridden.
 * The overriding function must call the parent, which records the time of
 * the optimisation in the 'luceneOptimisationDate' system setting.
 */
public function optimise()
{
    $now = time();
    KTUtil::setSystemSetting('luceneOptimisationDate', $now);
}
1822 -  
/**
 * Shuts down the indexer.
 *
 * This implementation is intentionally empty.
 */
public function shutdown()
{
    // nothing to do generally
}
1831 -  
1832 - /**  
1833 - * Returns the name of the indexer.  
1834 - *  
1835 - * @return string  
1836 - */  
1837 - public abstract function getDisplayName();  
1838 -  
1839 -  
1840 - /**  
1841 - * Returns the number of non-deleted documents in the index.  
1842 - *  
1843 - * @return int  
1844 - */  
1845 - public abstract function getDocumentsInIndex();  
1846 -  
1847 - public abstract function isDocumentIndexed($documentId);  
1848 -  
/**
 * Returns the path to the index directory, as configured under
 * 'indexer/luceneDirectory'.
 *
 * @return string
 */
public function getIndexDirectory()
{
    return KTConfig::getSingleton()->get('indexer/luceneDirectory');
}
1860 -}  
1861 -  
1862 -?> 1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
  40 +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
  41 +require_once('indexing/extractorCore.inc.php');
  42 +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +require_once(KT_DIR . '/ktapi/ktapi.inc.php');
  44 +
/**
 * Raised when a search hit refers to a document for which the repository
 * database has no matching record.
 */
class IndexerInconsistencyException extends Exception
{
}
  46 +
  47 +// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
  48 +
/**
 * Base class for a single hit returned by a search query.
 *
 * Values can be reached through the explicit getters/setters or with
 * property syntax ($item->Title, $item->Rank = 0.5); the magic accessors
 * translate a property name into the matching getXxx()/setXxx() call.
 */
class QueryResultItem
{
    protected $id;
    protected $title;
    protected $rank;
    protected $text;
    protected $fullpath;

    /**
     * Stores the supplied values as given; no formatting is applied here.
     *
     * @param mixed $id
     * @param string $title
     * @param mixed $rank
     * @param string $text
     * @param string $fullpath
     */
    public function __construct($id, $title, $rank, $text, $fullpath)
    {
        $this->id       = $id;
        $this->title    = $title;
        $this->rank     = $rank;
        $this->text     = $text;
        $this->fullpath = $fullpath;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getRealId()
    {
        return $this->id;
    }

    public function getIsProxy()
    {
        return $this instanceof ProxyResultItem;
    }

    // The folder/document flags are derived from the class name prefix.
    public function getIsFolder()
    {
        return strncmp(get_class($this), 'Folder', 6) == 0;
    }

    public function getIsDocument()
    {
        return strncmp(get_class($this), 'Document', 8) == 0;
    }

    /**
     * Stores the rank formatted to two decimals ('.' decimal point,
     * ',' thousands separator).
     */
    public function setRank($value)
    {
        $this->rank = number_format($value, 2, '.', ',');
    }

    public function getIsLive()
    {
        return true;
    }

    public function setTitle($value)
    {
        $this->title = $value;
    }

    public function setText($value)
    {
        $this->text = $value;
    }

    public function getRelevance()
    {
        return (float) $this->rank;
    }

    public function getRank()
    {
        return $this->getRelevance();
    }

    public function getText()
    {
        return (string) $this->text;
    }

    public function getTitle()
    {
        return (string) $this->title;
    }

    public function getFullPath()
    {
        return (string) $this->fullpath;
    }

    /**
     * Property-style read: $item->Foo calls getFoo() when it exists,
     * otherwise the "unknown" placeholder is returned.
     */
    protected function __get($property)
    {
        if (empty($property))
        {
            return '';
        }

        $getter = 'get' . $property;
        if (!method_exists($this, $getter))
        {
            return $this->getUnknown();
        }
        return $this->$getter();
    }

    /**
     * Placeholder returned for properties that have no getter.
     */
    protected function getUnknown()
    {
        return _kt('n/a');
    }

    /**
     * Property-style write: $item->Foo = $v calls setFoo($v).
     *
     * @throws Exception when no matching setter exists
     */
    protected function __set($property, $value)
    {
        if (empty($property))
        {
            return '';
        }

        $setter = 'set' . $property;
        if (!method_exists($this, $setter))
        {
            throw new Exception("Unknown property '$property' to set on QueryResultItem");
        }
        return $this->$setter($value);
    }
}
  134 +
/**
 * A result item that stands in for another one (for example a shortcut).
 *
 * It carries its own id but forwards everything it does not define itself
 * to the wrapped item.
 */
class ProxyResultItem extends QueryResultItem
{
    protected $proxy;
    protected $proxyId;

    /**
     * @param mixed $proxyId id of the proxying item itself
     * @param QueryResultItem $proxy the wrapped result item
     */
    public function __construct($proxyId, $proxy)
    {
        // getTitle must be called as a method; reading it as a property goes
        // through the magic getter and yields the 'n/a' placeholder instead.
        parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
        $this->proxyId = $proxyId;
        $this->proxy = $proxy;
    }

    public function getId() { return $this->proxyId; }
    public function getTitle() { return $this->proxy->getTitle(); }
    public function getRealId() { return $this->proxy->getId(); }

    /**
     * Property-style read: use this object's getter when it has one,
     * otherwise call the same getter on the wrapped item.
     */
    protected function __get($property)
    {
        $method = 'get' . $property;

        if (method_exists($this, $method))
        {
            return $this->$method();
        }
        else
        {
            return $this->proxy->$method();
        }
    }

    /**
     * Property-style write: use this object's setter when it has one,
     * otherwise call the same setter on the wrapped item.
     */
    protected function __set($property, $value)
    {
        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }
        else
        {
            return $this->proxy->$method($value);
        }
    }
}
  178 +
/**
 * Search hit for a document.
 *
 * The constructor immediately loads the document's details from the
 * database (one query per item) and exposes them through getters.
 */
class DocumentResultItem extends QueryResultItem
{
    protected $filesize;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;

    /**
     * @param int $document_id
     * @param mixed $rank
     * @param string $title overwritten by the title stored in the database
     * @param string $text
     * @param string $fullpath overwritten by the folder path from the database
     * @throws IndexerInconsistencyException when the document has no database record
     * @throws Exception on a database error
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($document_id, $title, $rank, $text, $fullpath);
        $this->live = true;
        $this->loadDocumentInfo();
    }

    // TODO: this is bad. must refactor to do the query on the group of documents.
    /**
     * Loads the document's details from the database into this item.
     *
     * @throws IndexerInconsistencyException when no record matches the id
     * @throws Exception on a database error
     */
    public function loadDocumentInfo()
    {
        global $default;
        $sql = "SELECT
 d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
 dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
 mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
 cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
 mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
 FROM
 documents d
 INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
 INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
 INNER JOIN mime_types mt ON dcv.mime_id=mt.id
 LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
 LEFT JOIN folders f ON f.id=d.folder_id
 LEFT JOIN users cou ON d.checked_out_user_id=cou.id
 LEFT JOIN workflows w ON dmv.workflow_id=w.id
 LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
 LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
 LEFT JOIN users mbu ON d.modified_user_id=mbu.id
 LEFT JOIN users cbu ON d.creator_id=cbu.id
 LEFT JOIN users ou ON d.owner_id=ou.id
 WHERE
 d.id=$this->id";

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(_kt($msg));
        }

        // document_id, relevance, text, title

        $this->documentType = $result['document_type'];
        $this->filename=$result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // The folder is LEFT JOINed: no folder name means the document has
        // no folder row.
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        if (empty($this->mimeIconPath))
        {
            $this->mimeIconPath = 'unspecified_type';
        }
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    public function getDocumentID() { return $this->getId(); }
    public function getIsLive() { return (bool) $this->live; }
    public function getFilesize() { return $this->filesize; }
    public function getVersion() { return (string) $this->version; }
    public function getFilename() { return (string)$this->filename; }
    public function getFolderId() { return (int)$this->folderId; }
    public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
    public function getDocument() { return Document::get($this->id); }
    // $this->Document is resolved by the magic getter to getDocument().
    public function getIsAvailable() { return $this->Document->isLive(); }
    public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
    public function getCheckedOutBy() { return $this->getCheckedOutUser(); }
    // Misspelled name kept so existing callers continue to work.
    public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
    public function getWorkflowOnly() { return (string)$this->workflow; }
    // Previously called itself and recursed without end.
    public function getWorkflow() { return $this->getWorkflowOnly(); }
    public function getWorkflowStateOnly() { return (string)$this->workflowState; }
    public function getWorkflowState() { return $this->getWorkflowStateOnly(); }
    public function getWorkflowAndState() {
        if (is_null($this->workflow))
        {
            return '';
        }
        return "$this->workflow - $this->workflowState";
    }
    public function getMimeType() { return (string) $this->mimeType; }
    public function getMimeIconPath() { return (string) $this->mimeIconPath; }
    public function getMimeDisplay() { return (string) $this->mimeDisplay; }
    public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
    public function getModifiedBy() { return (string) $this->modifiedBy; }
    public function getDateModified() { return (string) $this->dateModified; }
    public function getCreatedBy() { return (string) $this->createdBy; }
    public function getDateCreated() { return (string) $this->dateCreated; }
    public function getOwner() { return (string) $this->owner; }
    public function getOwnedBy() { return $this->getOwner(); }
    public function getIsImmutable() { return (bool) $this->immutable; }
    public function getImmutable() { return $this->getIsImmutable(); }
    public function getStatus() { return $this->status; }
    public function getStoragePath() { return $this->storagePath; }
    public function getDocumentType() { return $this->documentType; }
    public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }

    /**
     * True when the item is live and either the read-permission check passes
     * or the administrator is in admin mode.
     *
     * @return boolean
     */
    public function getCanBeReadByUser() {
        if (!$this->live)
            return false;
        if (Permission::userHasDocumentReadPermission($this->Document))
            return true;
        if (Permission::adminIsInAdminMode())
            return true;
        return false;
    }
}
  357 +
/**
 * Search hit for a folder. The folder's name, path, parent and creator are
 * loaded when the item is constructed.
 */
class FolderResultItem extends QueryResultItem
{
    protected $folder;
    protected $createdBy;
    protected $parentId;

    /**
     * @param int $folder_id
     * @param mixed $rank
     * @param string $title overwritten by the folder name
     * @param string $text
     * @param string $fullpath overwritten by the folder's full path
     * @throws Exception when the folder cannot be loaded
     */
    public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($folder_id, $title, $rank, $text, $fullpath);
        $this->loadFolderInfo();
    }

    public function getFolderID() { return $this->getId(); }
    public function getParentID() { return $this->parentId; }
    public function getCreatedBy() { return $this->createdBy; }
    public function getMimeIconPath() { return 'folder'; }
    public function getFolder() { return Folder::get($this->getFolderID()); }
    // $this->Folder is resolved by the magic getter to getFolder().
    public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }

    /**
     * Loads title, full path, parent id and creator name from the folder.
     *
     * @throws Exception when Folder::get() returns an error
     */
    public function loadFolderInfo()
    {
        $folder = $this->getFolder();
        if (PEAR::isError($folder))
        {
            // The error object is $folder; the old code read an undefined $result here.
            throw new Exception('Database exception! There appears to be an error in the system: ' .$folder->getMessage());
        }
        $this->title = $folder->getName();
        $this->fullpath = '/' . $folder->getFullPath();
        $this->parentId = $folder->getParentId();

        $user = User::get($folder->getCreatorID());
        $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
    }

}
  394 +
/**
 * Search hit for a shortcut that points at a document. Anything not defined
 * here is handled by ProxyResultItem.
 */
class DocumentShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed the id of the shortcut itself
     */
    public function getDocumentID()
    {
        return $this->getId();
    }

    /**
     * @return string the wrapped item's icon name with '_shortcut' appended
     */
    public function getMimeIconPath()
    {
        $icon = $this->proxy->getMimeIconPath();
        return $icon . '_shortcut';
    }
}
  401 +
/**
 * Search hit for a shortcut that points at a folder. Anything not defined
 * here is handled by ProxyResultItem.
 */
class FolderShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed the id of the shortcut itself
     */
    public function getFolderID()
    {
        return $this->getId();
    }

    /**
     * @return string fixed icon name used for folder shortcuts
     */
    public function getMimeIconPath()
    {
        return 'folder_shortcut';
    }
}
  408 +
/**
 * Comparison callback that orders two result items by ascending Rank.
 *
 * @param object $a
 * @param object $b
 * @return int -1, 0 or 1
 */
function MatchResultCompare($a, $b)
{
    $left = $a->Rank;
    $right = $b->Rank;

    if ($left != $right)
    {
        return ($left < $right) ? -1 : 1;
    }
    return 0;
}
  416 +
  417 +abstract class Indexer
  418 +{
  419 + /**
  420 + * Cache of extractors
  421 + *
  422 + * @var array
  423 + */
  424 + private $extractorCache;
  425 +
  426 + /**
  427 + * Indicates if the indexer will do logging.
  428 + *
  429 + * @var boolean
  430 + */
  431 + private $debug;
  432 + /**
  433 + * Cache on mime related hooks
  434 + *
  435 + * @var array
  436 + */
  437 + private $mimeHookCache;
  438 + /**
  439 + * Cache on general hooks.
  440 + *
  441 + * @var array
  442 + */
  443 + private $generalHookCache;
  444 +
  445 + /**
  446 + * This is a path to the extractors.
  447 + *
  448 + * @var string
  449 + */
  450 + private $extractorPath;
  451 + /**
  452 + * This is a path to the hooks.
  453 + *
  454 + * @var string
  455 + */
  456 + private $hookPath;
  457 +
  458 + private $enabledExtractors;
  459 +
  460 + /**
  461 + * Initialise the indexer
  462 + *
  463 + */
  464 + protected function __construct()
  465 + {
  466 + $config = KTConfig::getSingleton();
  467 +
  468 + $this->extractorCache = array();
  469 + $this->debug = $config->get('indexer/debug', true);
  470 + $this->hookCache = array();
  471 + $this->generalHookCache = array();
  472 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  473 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  474 +
  475 + $this->loadExtractorStatus();
  476 + }
  477 +
  478 + /**
  479 + * Get the list if enabled extractors
  480 + *
  481 + */
  482 + private function loadExtractorStatus()
  483 + {
  484 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  485 + $rs = DBUtil::getResultArray($sql);
  486 + $this->enabledExtractors = array();
  487 + foreach($rs as $item)
  488 + {
  489 + $this->enabledExtractors[] = $item['name'];
  490 + }
  491 + }
  492 +
  493 + private function isExtractorEnabled($extractor)
  494 + {
  495 + return in_array($extractor, $this->enabledExtractors);
  496 + }
  497 +
  498 + /**
  499 + * Returns a reference to the main class
  500 + *
  501 + * @return Indexer
  502 + */
  503 + public static function get()
  504 + {
  505 + static $singleton = null;
  506 +
  507 + if (is_null($singleton))
  508 + {
  509 + $config = KTConfig::getSingleton();
  510 + $classname = $config->get('indexer/coreClass');
  511 +
  512 + require_once('indexing/indexers/' . $classname . '.inc.php');
  513 +
  514 + if (!class_exists($classname))
  515 + {
  516 + throw new Exception("Class '$classname' does not exist.");
  517 + }
  518 +
  519 + $singleton = new $classname;
  520 + }
  521 +
  522 + return $singleton;
  523 + }
  524 +
  525 + public abstract function deleteDocument($docid);
  526 +
  527 + /**
  528 + * Remove the association of all extractors to mime types on the database.
  529 + *
  530 + */
  531 + public function clearExtractors()
  532 + {
  533 + global $default;
  534 +
  535 + $sql = "update mime_types set extractor_id=null";
  536 + DBUtil::runQuery($sql);
  537 +
  538 + $sql = "delete from mime_extractors";
  539 + DBUtil::runQuery($sql);
  540 +
  541 + if ($this->debug) $default->log->debug('clearExtractors');
  542 + }
  543 +
  544 + /**
  545 + * lookup the name of the extractor class based on the mime type.
  546 + *
  547 + * @param string $type
  548 + * @return string
  549 + */
  550 + public static function resolveExtractor($type)
  551 + {
  552 + global $default;
  553 + $sql = "select extractor from mime_types where filetypes='$type'";
  554 + $class = DBUtil::getOneResultKey($sql,'extractor');
  555 + if (PEAR::isError($class))
  556 + {
  557 + $default->log->error("resolveExtractor: cannot resolve $type");
  558 + return $class;
  559 + }
  560 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  561 + return $class;
  562 + }
  563 +
  564 + /**
  565 + * Return all the discussion text.
  566 + *
  567 + * @param int $docid
  568 + * @return string
  569 + */
  570 + public static function getDiscussionText($docid)
  571 + {
  572 + $sql = "SELECT
  573 + dc.subject, dc.body
  574 + FROM
  575 + discussion_threads dt
  576 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  577 + WHERE
  578 + dt.document_id=$docid";
  579 + $result = DBUtil::getResultArray($sql);
  580 + $text = '';
  581 +
  582 + foreach($result as $record)
  583 + {
  584 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  585 + }
  586 +
  587 + return $text;
  588 + }
  589 +
  590 + /**
  591 + * Schedule the indexing of a document.
  592 + *
  593 + * @param string $document
  594 + * @param string $what
  595 + */
  596 + public static function index($document, $what='A')
  597 + {
  598 + global $default;
  599 +
  600 + if (is_numeric($document))
  601 + {
  602 + $document = Document::get($document+0);
  603 + }
  604 +
  605 + if (PEAR::isError($document))
  606 + {
  607 + $default->log->error("index: Could not index document: " .$document->getMessage());
  608 + return;
  609 + }
  610 +
  611 + $document_id = $document->getId();
  612 + $userid=$_SESSION['userID'];
  613 + if (empty($userid)) $userid=1;
  614 +
  615 + // we dequeue the document so that there are no issues when enqueuing
  616 + Indexer::unqueueDocument($document_id);
  617 +
  618 + // enqueue item
  619 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  620 + DBUtil::runQuery($sql);
  621 +
  622 + $default->log->debug("index: Queuing indexing of $document_id");
  623 +
  624 + }
  625 +
  626 + private static function incrementCount()
  627 + {
  628 + // Get count from system settings
  629 + $count = Indexer::getIndexedDocumentCount();
  630 + $count = (int)$count + 1;
  631 + Indexer::updateIndexedDocumentCount($count);
  632 + }
  633 +
  634 + public static function getIndexedDocumentCount()
  635 + {
  636 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  637 + return (int) $count;
  638 + }
  639 +
  640 + public static function updateIndexedDocumentCount($cnt = 0)
  641 + {
  642 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  643 + }
  644 +
  645 + public static function reindexQueue()
  646 + {
  647 + $sql = "UPDATE index_files SET processdate = null";
  648 + DBUtil::runQuery($sql);
  649 + }
  650 +
  651 + public static function reindexDocument($documentId)
  652 + {
  653 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  654 + DBUtil::runQuery($sql);
  655 + }
  656 +
  657 +
  658 +
  659 + public static function indexAll()
  660 + {
  661 + $userid=$_SESSION['userID'];
  662 + if (empty($userid)) $userid=1;
  663 +
  664 + $sql = "DELETE FROM index_files";
  665 + DBUtil::runQuery($sql);
  666 +
  667 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  668 + DBUtil::runQuery($sql);
  669 + }
  670 +
  671 + public static function indexFolder($folder)
  672 + {
  673 + $userid=$_SESSION['userID'];
  674 + if (empty($userid)) $userid=1;
  675 +
  676 + if (!$folder instanceof Folder && !$folder instanceof FolderProxy)
  677 + {
  678 + throw new Exception('Folder expected');
  679 + }
  680 +
  681 + $full_path = $folder->getFullPath();
  682 +
  683 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)";
  684 + DBUtil::runQuery($sql);
  685 + }
  686 +
  687 + /**
  688 + * Clearout the scheduling of documents that no longer exist.
  689 + *
  690 + */
  691 + public static function clearoutDeleted()
  692 + {
  693 + global $default;
  694 +
  695 + $sql = 'DELETE FROM
  696 + index_files
  697 + WHERE
  698 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  699 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  700 + DBUtil::runQuery($sql);
  701 +
  702 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  703 + }
  704 +
  705 +
  706 + /**
  707 + * Check if a document is scheduled to be indexed
  708 + *
  709 + * @param mixed $document This may be a document or document id
  710 + * @return boolean
  711 + */
  712 + public static function isDocumentScheduled($document)
  713 + {
  714 + if (is_numeric($document))
  715 + {
  716 + $docid = $document;
  717 + }
  718 + else if ($document instanceof Document)
  719 + {
  720 + $docid = $document->getId();
  721 + }
  722 + else
  723 + {
  724 + return false;
  725 + }
  726 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  727 + $result = DBUtil::getResultArray($sql);
  728 + return count($result) > 0;
  729 + }
  730 +
  731 + /**
  732 + * Filters text removing redundant characters such as continuous newlines and spaces.
  733 + *
  734 + * @param string $filename
  735 + */
  736 + private function filterText($filename)
  737 + {
  738 + $content = file_get_contents($filename);
  739 +
  740 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  741 + $tgt = array("\n","\n",' ',' ',' ');
  742 +
  743 + // shrink what is being stored.
  744 + do
  745 + {
  746 + $orig = $content;
  747 + $content = preg_replace($src, $tgt, $content);
  748 + } while ($content != $orig);
  749 +
  750 + return file_put_contents($filename, $content) !== false;
  751 + }
  752 +
  753 + /**
  754 + * Load hooks for text extraction process.
  755 + *
  756 + */
  757 + private function loadExtractorHooks()
  758 + {
  759 + $this->generalHookCache = array();
  760 + $this->mimeHookCache = array();
  761 +
  762 +
  763 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  764 + while (($file = readdir($dir)) !== false)
  765 + {
  766 + if (substr($file,-12) == 'Hook.inc.php')
  767 + {
  768 + require_once($this->hookPath . '/' . $file);
  769 + $class = substr($file, 0, -8);
  770 +
  771 + if (!class_exists($class))
  772 + {
  773 + continue;
  774 + }
  775 +
  776 + $hook = new $class;
  777 + if (!($class instanceof ExtractorHook))
  778 + {
  779 + continue;
  780 + }
  781 +
  782 + $mimeTypes = $hook->registerMimeTypes();
  783 + if (is_null($mimeTypes))
  784 + {
  785 + $this->generalHookCache[] = & $hook;
  786 + }
  787 + else
  788 + {
  789 + foreach($mimeTypes as $type)
  790 + {
  791 + $this->mimeHookCache[$type][] = & $hook;
  792 + }
  793 + }
  794 +
  795 + }
  796 + }
  797 + closedir($dir);
  798 + }
  799 +
  800 + /**
  801 + * This is a refactored function to execute the hooks.
  802 + *
  803 + * @param DocumentExtractor $extractor
  804 + * @param string $phase
  805 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  806 + */
  807 + private function executeHook($extractor, $phase, $mimeType = null)
  808 + {
  809 + $hooks = array();
  810 + if (is_null($mimeType))
  811 + {
  812 + $hooks = $this->generalHookCache;
  813 + }
  814 + else
  815 + {
  816 + if (array_key_exists($mimeType, $this->mimeHookCache))
  817 + {
  818 + $hooks = $this->mimeHookCache[$mimeType];
  819 + }
  820 + }
  821 + if (empty($hooks))
  822 + {
  823 + return;
  824 + }
  825 +
  826 + foreach($hooks as $hook)
  827 + {
  828 + $hook->$phase($extractor);
  829 + }
  830 + }
  831 +
  832 + private function doesDiagnosticsPass($simple=false)
  833 + {
  834 + global $default;
  835 +
  836 + $config =& KTConfig::getSingleton();
  837 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  838 + // this function will create the lockfile if an error is detected. It will be removed as soon
  839 + // as the problems with the indexer are removed.
  840 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  841 +
  842 + $diagnosis = $this->diagnose();
  843 + if (!is_null($diagnosis))
  844 + {
  845 + if (!is_file($lockFile))
  846 + {
  847 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  848 + }
  849 + touch($lockFile);
  850 + return false;
  851 + }
  852 +
  853 + if ($simple)
  854 + {
  855 + return true;
  856 + }
  857 +
  858 + $diagnosis = $this->diagnoseExtractors();
  859 + if (!empty($diagnosis))
  860 + {
  861 + if (!is_file($lockFile))
  862 + {
  863 + foreach($diagnosis as $diag)
  864 + {
  865 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  866 + }
  867 + }
  868 + touch($lockFile);
  869 + return false;
  870 + }
  871 +
  872 + if (is_file($lockFile))
  873 + {
  874 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  875 + unlink($lockFile);
  876 + }
  877 +
  878 + return true;
  879 + }
  880 +
  881 + /**
  882 + * This does the initial mime type association between mime types and text extractors
  883 + *
  884 + */
  885 + public function checkForRegisteredTypes()
  886 + {
  887 + global $default;
  888 +
  889 + // we are only doing this once!
  890 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  891 + if ($initRegistered)
  892 + {
  893 + return;
  894 + }
  895 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  896 +
  897 + $date = date('Y-m-d H:i');
  898 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  899 + DBUtil::runQuery($sql);
  900 +
  901 + $this->registerTypes(true);
  902 +
  903 + $disable = array(
  904 + 'windows'=>array('PSExtractor'),
  905 + 'unix' => array()
  906 + );
  907 +
  908 + $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];
  909 +
  910 + if (!empty($disableForOS))
  911 + {
  912 + $disableForOS = '\'' . implode("','", $disableForOS) .'\'';
  913 +
  914 + $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)";
  915 + DBUtil::runQuery($sql);
  916 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  917 + }
  918 + $this->loadExtractorStatus();
  919 +
  920 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  921 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  922 + }
  923 +
  924 + private function updatePendingDocumentStatus($documentId, $message, $level)
  925 + {
  926 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  927 + $message = sanitizeForSQL($this->indexingHistory);
  928 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  929 + DBUtil::runQuery($sql);
  930 + }
  931 +
  932 + private $restartCurrentBatch = false;
  933 +
  934 + public function restartBatch()
  935 + {
  936 + $this->restartCurrentBatch = true;
  937 + }
  938 +
  939 + /**
  940 + *
  941 + * @param int $documentId
  942 + * @param string $message
  943 + * @param string $level This may be info, error, debug
  944 + */
  945 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  946 + {
  947 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  948 + global $default;
  949 +
  950 + switch ($level)
  951 + {
  952 + case 'debug':
  953 + if ($this->debug)
  954 + {
  955 + $default->log->debug($message);
  956 + }
  957 + break;
  958 + default:
  959 + $default->log->$level($message);
  960 + }
  961 + }
  962 +
  963 +
  964 +
  965 + public function getExtractor($extractorClass)
  966 + {
  967 + if (empty($extractorClass))
  968 + {
  969 + return null;
  970 + }
  971 +
  972 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  973 + if (!file_exists($includeFile))
  974 + {
  975 + throw new Exception("Extractor file does not exist: $includeFile");
  976 + }
  977 +
  978 + require_once($includeFile);
  979 +
  980 + if (!class_exists($extractorClass))
  981 + {
  982 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  983 + }
  984 +
  985 + $extractor = new $extractorClass();
  986 +
  987 + if (!($extractor instanceof DocumentExtractor))
  988 + {
  989 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  990 + }
  991 +
  992 + return $extractor;
  993 + }
  994 +
  995 + public static function getIndexingQueue($problemItemsOnly=true)
  996 + {
  997 +
  998 + if ($problemItemsOnly)
  999 + {
  1000 + $sql = "SELECT
  1001 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  1002 + FROM
  1003 + index_files iff
  1004 + INNER JOIN documents d ON iff.document_id=d.id
  1005 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1006 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1007 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1008 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1009 + WHERE
  1010 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  1011 + ORDER BY indexdate ";
  1012 + }
  1013 + else
  1014 + {
  1015 + $sql = "SELECT
  1016 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  1017 + FROM
  1018 + index_files iff
  1019 + INNER JOIN documents d ON iff.document_id=d.id
  1020 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1021 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1022 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1023 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1024 + WHERE
  1025 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  1026 + ORDER BY indexdate ";
  1027 + }
  1028 + $aResult = DBUtil::getResultArray($sql);
  1029 +
  1030 + return $aResult;
  1031 + }
  1032 +
  1033 + public static function getPendingIndexingQueue()
  1034 + {
  1035 + return Indexer::getIndexingQueue(false);
  1036 + }
  1037 +
  1038 + public function updateIndexStats()
  1039 + {
  1040 + $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
  1041 +
  1042 + $noOptimisation = false;
  1043 + if ($optimisationDate == '')
  1044 + {
  1045 + $optimisationDate = _kt('N/A');
  1046 + $optimisationPeriod = $optimisationDate;
  1047 + }
  1048 + else
  1049 + {
  1050 + $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
  1051 + $noOptimisation = $optimisationPeriod['days'] > 2;
  1052 + $optimisationPeriod = $optimisationPeriod['str'];
  1053 + $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
  1054 + }
  1055 +
  1056 + $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
  1057 + if ($indexingDate == '')
  1058 + {
  1059 + $indexingDate = _kt('N/A');
  1060 + $indexingPeriod = $indexingDate;
  1061 + }
  1062 + else
  1063 + {
  1064 + $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
  1065 + $indexingDate = date('Y-m-d H:i:s', $indexingDate);
  1066 + }
  1067 +
  1068 + $index = Indexer::get();
  1069 + $docsInIndex = $index->getDocumentsInIndex();
  1070 +
  1071 + // we are only interested in documents that are active
  1072 + $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
  1073 + $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');
  1074 +
  1075 + $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null or i.status_msg <> '') and d.status_id=1";
  1076 + $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');
  1077 +
  1078 + $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
  1079 + $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');
  1080 +
  1081 + if ($docsInRepository == 0)
  1082 + {
  1083 + $indexingCoverage = '0.00%';
  1084 + $queueCoverage = $indexingCoverage;
  1085 + }
  1086 + else
  1087 + {
  1088 + // compute indexing coverage
  1089 + $indexingCoverage = _kt('Not Available');
  1090 + if (is_numeric($docsInIndex))
  1091 + {
  1092 + $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
  1093 + $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
  1094 + }
  1095 +
  1096 + // compute queue coverage
  1097 + $queueCoverage = _kt('Not Available');
  1098 + if (is_numeric($docsInQueue))
  1099 + {
  1100 + $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
  1101 + $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
  1102 + }
  1103 + }
  1104 +
  1105 +
  1106 + $stats = array(
  1107 + 'optimisationDate'=>$optimisationDate,
  1108 + 'optimisationPeriod'=>$optimisationPeriod,
  1109 + 'indexingDate'=>$indexingDate,
  1110 + 'indexingPeriod'=>$indexingPeriod,
  1111 + 'docsInIndex'=>$docsInIndex,
  1112 + 'docsInQueue'=>$docsInQueue,
  1113 + 'errorsInQueue'=>$errorsInQueue,
  1114 + 'docsInRepository'=>$docsInRepository,
  1115 + 'indexingCoverage'=>$indexingCoverage,
  1116 + 'queueCoverage'=>$queueCoverage,
  1117 + 'noOptimisation'=>$noOptimisation
  1118 + );
  1119 +
  1120 + KTUtil::setSystemSetting('indexerStats', serialize($stats));
  1121 +
  1122 + $indexer = Indexer::get();
  1123 +
  1124 + $diagnosis = $indexer->diagnose();
  1125 + KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));
  1126 +
  1127 + $extractorDiagnosis = $indexer->diagnoseExtractors();
  1128 +
  1129 + KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
  1130 + }
  1131 +
  1132 + /**
  1133 + * The main function that may be called repeatedly to index documents.
  1134 + *
  1135 + * @param int $max Default 20
  1136 + */
  1137 + public function indexDocuments($max=null)
  1138 + {
  1139 + global $default;
  1140 + $config =& KTConfig::getSingleton();
  1141 +
  1142 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  1143 + if (is_file($indexLockFile))
  1144 + {
  1145 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  1146 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  1147 + return;
  1148 + }
  1149 + touch($indexLockFile);*/
  1150 +
  1151 +
  1152 + $this->checkForRegisteredTypes();
  1153 +
  1154 + if ($this->debug) $default->log->debug('indexDocuments: start');
  1155 + if (!$this->doesDiagnosticsPass())
  1156 + {
  1157 + //unlink($indexLockFile);
  1158 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  1159 + return;
  1160 + }
  1161 +
  1162 + if (is_null($max))
  1163 + {
  1164 + $max = $config->get('indexer/batchDocuments',20);
  1165 + }
  1166 +
  1167 + $this->loadExtractorHooks();
  1168 +
  1169 + Indexer::clearoutDeleted();
  1170 +
  1171 + $date = date('Y-m-d H:i:s');
  1172 + // identify the indexers that must run
  1173 + // mysql specific limit!
  1174 + $sql = "SELECT
  1175 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  1176 + FROM
  1177 + index_files iff
  1178 + INNER JOIN documents d ON iff.document_id=d.id
  1179 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1180 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1181 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1182 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1183 + WHERE
  1184 + (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
  1185 + ORDER BY indexdate
  1186 + LIMIT $max";
  1187 + $result = DBUtil::getResultArray($sql);
  1188 + if (PEAR::isError($result))
  1189 + {
  1190 + //unlink($indexLockFile);
  1191 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  1192 + return;
  1193 + }
  1194 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  1195 +
  1196 + // bail if no work to do
  1197 + if (count($result) == 0)
  1198 + {
  1199 + //unlink($indexLockFile);
  1200 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  1201 + return;
  1202 + }
  1203 +
  1204 + // identify any documents that need indexing and mark them
  1205 + // so they are not taken in a followup run
  1206 + $ids = array();
  1207 + foreach($result as $docinfo)
  1208 + {
  1209 + $ids[] = $docinfo['document_id'];
  1210 + }
  1211 +
  1212 + // mark the documents as being processed
  1213 +
  1214 + $ids=implode(',',$ids);
  1215 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  1216 + DBUtil::runQuery($sql);
  1217 +
  1218 + $extractorCache = array();
  1219 + $storageManager = KTStorageManagerUtil::getSingleton();
  1220 +
  1221 + $tempPath = $config->get("urls/tmpDirectory");
  1222 +
  1223 + foreach($result as $docinfo)
  1224 + {
  1225 + // increment indexed documents count
  1226 + Indexer::incrementCount();
  1227 +
  1228 + $docId=$docinfo['document_id'];
  1229 + $extension=$docinfo['filetypes'];
  1230 + $mimeType=$docinfo['mimetypes'];
  1231 + $extractorClass=$docinfo['extractor'];
  1232 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  1233 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  1234 + $this->indexingHistory = '';
  1235 +
  1236 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1237 +
  1238 + if (empty($extractorClass))
  1239 + {
  1240 + /*
  1241 +
  1242 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1243 +
  1244 + */
  1245 + if ($indexDiscussion)
  1246 + {
  1247 + $indexDocument = false;
  1248 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1249 + }
  1250 + else
  1251 + {
  1252 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1253 + continue;
  1254 + }
  1255 + }
  1256 + else
  1257 + {
  1258 + /*
  1259 +
  1260 + If an extractor is available, we must ensure it is enabled.
  1261 +
  1262 + */
  1263 +
  1264 + if (!$this->isExtractorEnabled($extractorClass))
  1265 + {
  1266 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1267 + continue;
  1268 + }
  1269 + }
  1270 +
  1271 + if ($this->debug)
  1272 + {
  1273 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1274 + }
  1275 +
  1276 + $document = Document::get($docId);
  1277 + if (PEAR::isError($document))
  1278 + {
  1279 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1280 + continue;
  1281 + }
  1282 +
  1283 + if ($this->restartCurrentBatch)
  1284 + {
  1285 + Indexer::unqueueDocument($docId);
  1286 + Indexer::index($docId, 'A');
  1287 + continue;
  1288 + }
  1289 +
  1290 +
  1291 + $filename = $document->getFileName();
  1292 + if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
  1293 + {
  1294 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1295 + continue;
  1296 + }
  1297 +
  1298 + $removeFromQueue = true;
  1299 + if ($indexDocument)
  1300 + {
  1301 + if (array_key_exists($extractorClass, $extractorCache))
  1302 + {
  1303 + $extractor = $extractorCache[$extractorClass];
  1304 + }
  1305 + else
  1306 + {
  1307 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1308 + }
  1309 +
  1310 + if (!($extractor instanceof DocumentExtractor))
  1311 + {
  1312 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1313 + continue;
  1314 + }
  1315 +
  1316 +
  1317 +
  1318 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1319 + $sourceFile = $storageManager->temporaryFile($document);
  1320 +
  1321 + if (empty($sourceFile) || !is_file($sourceFile))
  1322 + {
  1323 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1324 + continue;
  1325 + }
  1326 +
  1327 + if ($extractor->needsIntermediateSourceFile())
  1328 + {
  1329 + //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1330 +
  1331 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1332 + $result = @copy($sourceFile, $intermediate);
  1333 + if ($result === false)
  1334 + {
  1335 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1336 + // problem. lets try again later. probably permission related. log the issue.
  1337 + continue;
  1338 + }
  1339 + $sourceFile = $intermediate;
  1340 + }
  1341 +
  1342 + $targetFile = tempnam($tempPath, 'ktindexer');
  1343 +
  1344 + $extractor->setSourceFile($sourceFile);
  1345 + $extractor->setMimeType($mimeType);
  1346 + $extractor->setExtension($extension);
  1347 + $extractor->setTargetFile($targetFile);
  1348 + $extractor->setDocument($document);
  1349 + $extractor->setIndexingStatus(null);
  1350 + $extractor->setExtractionStatus(null);
  1351 +
  1352 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1353 +
  1354 + $this->executeHook($extractor, 'pre_extract');
  1355 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1356 + $removeFromQueue = false;
  1357 +
  1358 + if ($extractor->extractTextContent())
  1359 + {
  1360 + // the extractor may need to create another target file
  1361 + $targetFile = $extractor->getTargetFile();
  1362 +
  1363 + $extractor->setExtractionStatus(true);
  1364 + $this->executeHook($extractor, 'pre_index');
  1365 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1366 +
  1367 + $title = $document->getName();
  1368 + if ($indexDiscussion)
  1369 + {
  1370 + if (!$this->filterText($targetFile))
  1371 + {
  1372 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1373 + }
  1374 + else
  1375 + {
  1376 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1377 + $removeFromQueue = $indexStatus;
  1378 + if (!$indexStatus)
  1379 + {
  1380 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1381 + }
  1382 +
  1383 + $extractor->setIndexingStatus($indexStatus);
  1384 + }
  1385 + }
  1386 + else
  1387 + {
  1388 + if (!$this->filterText($targetFile))
  1389 + {
  1390 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1391 + }
  1392 + else
  1393 + {
  1394 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1395 + $removeFromQueue = $indexStatus;
  1396 +
  1397 + if (!$indexStatus)
  1398 + {
  1399 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1400 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1401 + }
  1402 +
  1403 + $extractor->setIndexingStatus($indexStatus);
  1404 + }
  1405 + }
  1406 +
  1407 + $this->executeHook($extractor, 'post_index', $mimeType);
  1408 + $this->executeHook($extractor, 'post_index');
  1409 + }
  1410 + else
  1411 + {
  1412 + $extractor->setExtractionStatus(false);
  1413 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1414 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1415 + }
  1416 +
  1417 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1418 + $this->executeHook($extractor, 'post_extract');
  1419 +
  1420 + if ($extractor->needsIntermediateSourceFile())
  1421 + {
  1422 + @unlink($sourceFile);
  1423 + }
  1424 +
  1425 + @unlink($targetFile);
  1426 +
  1427 + }
  1428 + else
  1429 + {
  1430 + $indexStatus = $this->indexDiscussion($docId);
  1431 + $removeFromQueue = $indexStatus;
  1432 + }
  1433 +
  1434 + if ($removeFromQueue)
  1435 + {
  1436 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1437 + }
  1438 + else
  1439 + {
  1440 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1441 + }
  1442 + }
  1443 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1444 + //unlink($indexLockFile);
  1445 + }
  1446 +
  1447 + public function migrateDocuments($max=null)
  1448 + {
  1449 + global $default;
  1450 +
  1451 + $default->log->info(_kt('migrateDocuments: starting'));
  1452 +
  1453 + if (!$this->doesDiagnosticsPass(true))
  1454 + {
  1455 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1456 + return;
  1457 + }
  1458 +
  1459 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1460 + {
  1461 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1462 + return;
  1463 + }
  1464 +
  1465 + $config =& KTConfig::getSingleton();
  1466 + if (is_null($max))
  1467 + {
  1468 + $max = $config->get('indexer/batchMigrateDocument',500);
  1469 + }
  1470 +
  1471 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1472 + if (is_file($lockFile))
  1473 + {
  1474 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1475 + return;
  1476 + }
  1477 + touch($lockFile);
  1478 +
  1479 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1480 + if (is_null($startTime))
  1481 + {
  1482 + KTUtil::setSystemSetting('migrationStarted', time());
  1483 + }
  1484 +
  1485 + $maxLoops = 5;
  1486 +
  1487 + $max = ceil($max / $maxLoops);
  1488 +
  1489 + $start =KTUtil::getBenchmarkTime();
  1490 + $noDocs = false;
  1491 + $numDocs = 0;
  1492 +
  1493 + for($loop=0;$loop<$maxLoops;$loop++)
  1494 + {
  1495 +
  1496 + $sql = "SELECT
  1497 + document_id, document_text
  1498 + FROM
  1499 + document_text
  1500 + ORDER BY document_id
  1501 + LIMIT $max";
  1502 + $result = DBUtil::getResultArray($sql);
  1503 + if (PEAR::isError($result))
  1504 + {
  1505 + $default->log->info(_kt('migrateDocuments: db error'));
  1506 + break;
  1507 + }
  1508 +
  1509 + $docs = count($result);
  1510 + if ($docs == 0)
  1511 + {
  1512 + $noDocs = true;
  1513 + break;
  1514 + }
  1515 + $numDocs += $docs;
  1516 +
  1517 + foreach($result as $docinfo)
  1518 + {
  1519 + $docId = $docinfo['document_id'];
  1520 +
  1521 + $document = Document::get($docId);
  1522 + if (PEAR::isError($document) || is_null($document))
  1523 + {
  1524 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1525 + DBUtil::runQuery($sql);
  1526 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1527 + continue;
  1528 + }
  1529 +
  1530 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1531 +
  1532 + $targetFile = tempnam($tempPath, 'ktindexer');
  1533 +
  1534 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1535 + {
  1536 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1537 + continue;
  1538 + }
  1539 + // free memory asap ;)
  1540 + unset($docinfo['document_text']);
  1541 +
  1542 + $title = $document->getName();
  1543 +
  1544 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1545 +
  1546 + if ($indexStatus)
  1547 + {
  1548 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1549 + DBUtil::runQuery($sql);
  1550 + }
  1551 + else
  1552 + {
  1553 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1554 + }
  1555 +
  1556 + @unlink($targetFile);
  1557 + }
  1558 + }
  1559 +
  1560 + @unlink($lockFile);
  1561 +
  1562 + $time = KTUtil::getBenchmarkTime() - $start;
  1563 +
  1564 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1565 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1566 +
  1567 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1568 + if ($noDocs)
  1569 + {
  1570 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1571 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1572 + schedulerUtil::deleteByName('Index Migration');
  1573 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1574 + }
  1575 + }
  1576 +
  1577 + /**
  1578 + * Index a document. The base class must override this function.
  1579 + *
  1580 + * @param int $docId
  1581 + * @param string $textFile
  1582 + */
  1583 + protected abstract function indexDocument($docId, $textFile, $title, $version);
  1584 +
  1585 +
  1586 + public function updateDocumentIndex($docId, $text)
  1587 + {
  1588 + $config = KTConfig::getSingleton();
  1589 + $tempPath = $config->get("urls/tmpDirectory");
  1590 + $tempFile = tempnam($tempPath,'ud_');
  1591 +
  1592 + file_put_contents($tempFile, $text);
  1593 +
  1594 + $document = Document::get($docId);
  1595 + $title = $document->getDescription();
  1596 + $version = $document->getVersion();
  1597 +
  1598 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1599 +
  1600 + if (file_exists($tempFile))
  1601 + {
  1602 + unlink($tempFile);
  1603 + }
  1604 +
  1605 + return $result;
  1606 + }
  1607 +
  1608 + /**
  1609 + * Index a discussion. The base class must override this function.
  1610 + *
  1611 + * @param int $docId
  1612 + */
  1613 + protected abstract function indexDiscussion($docId);
  1614 +
  1615 + /**
  1616 + * Diagnose the indexer. e.g. Check that the indexing server is running.
  1617 + *
  1618 + */
  1619 + public abstract function diagnose();
  1620 +
  1621 + /**
  1622 + * Diagnose the extractors.
  1623 + *
  1624 + * @return array
  1625 + */
  1626 + public function diagnoseExtractors()
  1627 + {
  1628 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1629 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1630 +
  1631 + return $diagnosis;
  1632 + }
  1633 +
  1634 + /**
  1635 + * This is a refactored diagnose function.
  1636 + *
  1637 + * @param string $path
  1638 + * @param string $class
  1639 + * @param string $extension
  1640 + * @return array
  1641 + */
  1642 + private function _diagnose($path, $baseclass, $extension)
  1643 + {
  1644 + global $default;
  1645 +
  1646 + $diagnoses = array();
  1647 +
  1648 + $dir = opendir(SearchHelper::correctPath($path));
  1649 + $extlen = - strlen($extension);
  1650 +
  1651 + while (($file = readdir($dir)) !== false)
  1652 + {
  1653 + if (substr($file,0,1) == '.')
  1654 + {
  1655 + continue;
  1656 + }
  1657 + if (substr($file,$extlen) != $extension)
  1658 + {
  1659 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1660 + continue;
  1661 + }
  1662 +
  1663 + require_once($path . '/' . $file);
  1664 +
  1665 + $class = substr($file, 0, -8);
  1666 + if (!class_exists($class))
  1667 + {
  1668 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1669 + continue;
  1670 + }
  1671 +
  1672 + if (!$this->isExtractorEnabled($class))
  1673 + {
  1674 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1675 + continue;
  1676 + }
  1677 +
  1678 + $extractor = new $class();
  1679 + if (!is_a($extractor, $baseclass))
  1680 + {
  1681 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1682 + continue;
  1683 + }
  1684 +
  1685 + $types = $extractor->getSupportedMimeTypes();
  1686 + if (empty($types))
  1687 + {
  1688 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1689 + continue;
  1690 + }
  1691 +
  1692 + $diagnosis=$extractor->diagnose();
  1693 + if (empty($diagnosis))
  1694 + {
  1695 + continue;
  1696 + }
  1697 + $diagnoses[$class] = array(
  1698 + 'name'=>$extractor->getDisplayName(),
  1699 + 'diagnosis'=>$diagnosis
  1700 + );
  1701 +
  1702 + }
  1703 + closedir($dir);
  1704 +
  1705 + return $diagnoses;
  1706 + }
  1707 +
  1708 +
  1709 + /**
  1710 + * Register the extractor types.
  1711 + *
  1712 + * @param boolean $clear. Optional. Defaults to false.
  1713 + */
  1714 + public function registerTypes($clear=false)
  1715 + {
  1716 + if ($clear)
  1717 + {
  1718 + $this->clearExtractors();
  1719 + }
  1720 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1721 + while (($file = readdir($dir)) !== false)
  1722 + {
  1723 + if (substr($file,-17) == 'Extractor.inc.php')
  1724 + {
  1725 + require_once($this->extractorPath . '/' . $file);
  1726 + $class = substr($file, 0, -8);
  1727 +
  1728 + if (!class_exists($class))
  1729 + {
  1730 + // if the class does not exist, we can't do anything.
  1731 + continue;
  1732 + }
  1733 +
  1734 + $extractor = new $class;
  1735 + if ($extractor instanceof DocumentExtractor)
  1736 + {
  1737 + $extractor->registerMimeTypes();
  1738 + }
  1739 + }
  1740 + }
  1741 + closedir($dir);
  1742 + }
  1743 +
  1744 + /**
  1745 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1746 + *
  1747 + * @param int $docId
  1748 + * @param string $textFile
  1749 + */
  1750 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1751 + {
  1752 + $this->indexDocument($docId, $textFile, $title, $version);
  1753 + $this->indexDiscussion($docId);
  1754 + }
  1755 +
  1756 + /**
  1757 + * Remove the document from the queue. This is normally called when it has been processed.
  1758 + *
  1759 + * @param int $docid
  1760 + */
  1761 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1762 + {
  1763 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1764 + DBUtil::runQuery($sql);
  1765 + if ($reason !== false)
  1766 + {
  1767 + global $default;
  1768 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1769 + }
  1770 + }
  1771 +
  1772 + /**
  1773 + * Run a query on the index.
  1774 + *
  1775 + * @param string $query
  1776 + * @return array
  1777 + */
  1778 + public abstract function query($query);
  1779 +
  1780 + /**
  1781 + * Converts an integer to a string that can be easily compared and reversed.
  1782 + *
  1783 + * @param int $int
  1784 + * @return string
  1785 + */
  1786 + public static function longToString($int)
  1787 + {
  1788 + $maxlen = 14;
  1789 +
  1790 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1791 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1792 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1793 +
  1794 + return str_replace($o29, $a2z, $l);
  1795 + }
  1796 +
  1797 + /**
  1798 + * Converts a string to an integer.
  1799 + *
  1800 + * @param string $str
  1801 + * @return int
  1802 + */
  1803 + public static function stringToLong($str)
  1804 + {
  1805 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1806 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1807 +
  1808 + $int = str_replace($a2z, $o29, $str) + 0;
  1809 +
  1810 + return $int;
  1811 + }
  1812 +
  1813 + /**
  1814 + * Possibly we can optimise indexes. This method must be overriden.
  1815 + * The new function must call the parent!
  1816 + *
  1817 + */
  1818 + public function optimise()
  1819 + {
  1820 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1821 + }
  1822 +
  1823 + /**
  1824 + * Shuts down the indexer
  1825 + *
  1826 + */
  1827 + public function shutdown()
  1828 + {
  1829 + // do nothing generally
  1830 + }
  1831 +
  1832 + /**
  1833 + * Returns the name of the indexer.
  1834 + *
  1835 + * @return string
  1836 + */
  1837 + public abstract function getDisplayName();
  1838 +
  1839 +
  1840 + /**
  1841 + * Returns the number of non-deleted documents in the index.
  1842 + *
  1843 + * @return int
  1844 + */
  1845 + public abstract function getDocumentsInIndex();
  1846 +
  1847 + public abstract function isDocumentIndexed($documentId);
  1848 +
  1849 + /**
  1850 + * Returns the path to the index directory
  1851 + *
  1852 + * @return string
  1853 + */
  1854 + public function getIndexDirectory()
  1855 + {
  1856 + $config = KTConfig::getSingleton();
  1857 + $directory = $config->get('indexer/luceneDirectory');
  1858 + return $directory;
  1859 + }
  1860 +}
  1861 +
  1862 +?>