Commit 4f809981a4726dd0053e7670ca28066005f21e94

Authored by Megan Watson
1 parent 06bfc0d7

KTS-3806

"The number of documents in the indexing queue is incorrect on the Document Indexer Statistics dashlet"
Fixed. Adjusted sql to ignore problem documents.

Committed by: Megan Watson
Reviewed by: Conrad Vermeulen



git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@9511 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 1 changed file with 1862 additions and 1862 deletions
search2/indexing/indexerCore.inc.php
1   -<?php
2   -
3   -/**
4   - * $Id:$
5   - *
6   - * KnowledgeTree Community Edition
7   - * Document Management Made Simple
8   - * Copyright (C) 2008 KnowledgeTree Inc.
9   - * Portions copyright The Jam Warehouse Software (Pty) Limited
10   - *
11   - * This program is free software; you can redistribute it and/or modify it under
12   - * the terms of the GNU General Public License version 3 as published by the
13   - * Free Software Foundation.
14   - *
15   - * This program is distributed in the hope that it will be useful, but WITHOUT
16   - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18   - * details.
19   - *
20   - * You should have received a copy of the GNU General Public License
21   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
22   - *
23   - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
24   - * California 94120-7775, or email info@knowledgetree.com.
25   - *
26   - * The interactive user interfaces in modified source and object code versions
27   - * of this program must display Appropriate Legal Notices, as required under
28   - * Section 5 of the GNU General Public License version 3.
29   - *
30   - * In accordance with Section 7(b) of the GNU General Public License version 3,
31   - * these Appropriate Legal Notices must retain the display of the "Powered by
32   - * KnowledgeTree" logo and retain the original copyright notice. If the display of the
33   - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
34   - * must display the words "Powered by KnowledgeTree" and retain the original
35   - * copyright notice.
36   - * Contributor( s): ______________________________________
37   - *
38   - */
39   -
40   -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
41   -require_once('indexing/extractorCore.inc.php');
42   -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
43   -require_once(KT_DIR . '/ktapi/ktapi.inc.php');
44   -
/**
 * Raised when a search result refers to a document for which the repository
 * database has no matching record (see DocumentResultItem::loadDocumentInfo()).
 */
class IndexerInconsistencyException extends Exception
{
}
46   -
47   -// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
48   -
/**
 * Base class for a single search result.
 *
 * Holds the id, title, rank, matched text and full path of a hit and exposes
 * them through getters. Unknown properties read through __get() resolve to a
 * "get<Property>" method when one exists, otherwise to getUnknown().
 */
class QueryResultItem
{
    protected $id;
    protected $title;
    protected $rank;
    protected $text;
    protected $fullpath;

    /**
     * @param mixed  $id
     * @param string $title
     * @param mixed  $rank
     * @param string $text
     * @param string $fullpath
     */
    public function __construct($id, $title, $rank, $text, $fullpath)
    {
        $this->id = $id;
        $this->title = $title;
        $this->rank = $rank;
        $this->text = $text;
        $this->fullpath = $fullpath;
    }

    public function getId() { return $this->id; }
    public function getRealId() { return $this->id; }

    public function getIsProxy() { return $this instanceof ProxyResultItem; }

    // The item kind is derived from the class name prefix.
    public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder'; }
    public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document'; }

    /**
     * Store the rank rounded to two decimals.
     *
     * No thousands separator is used: getRelevance() casts the stored string
     * back to float, and a value such as "1,234.50" would be cut off at the
     * comma and read back as 1.0.
     *
     * @param mixed $value numeric rank
     */
    public function setRank($value)
    {
        $this->rank = number_format($value, 2, '.', '');
    }

    public function getIsLive()
    {
        return true;
    }

    public function setTitle($value)
    {
        $this->title = $value;
    }

    public function setText($value)
    {
        $this->text = $value;
    }

    public function getRelevance() { return (float) $this->rank; }
    public function getRank() { return $this->getRelevance(); }
    public function getText() { return (string) $this->text; }
    public function getTitle() { return (string) $this->title; }
    public function getFullPath() { return (string) $this->fullpath; }

    /**
     * Map a property read onto the matching "get<Property>" method.
     *
     * @param string $property
     * @return mixed '' for an empty name, getUnknown() when no getter exists
     */
    protected function __get($property)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'get' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method();
        }
        return $this->getUnknown();
    }

    /**
     * Placeholder returned for properties that have no getter.
     */
    protected function getUnknown()
    {
        return _kt('n/a');
    }

    /**
     * Map a property write onto the matching "set<Property>" method.
     *
     * @param string $property
     * @param mixed  $value
     * @throws Exception when no setter exists for the property
     */
    protected function __set($property, $value)
    {
        if (empty($property))
        {
            return '';
        }

        $method = 'set' . $property;
        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }
        throw new Exception("Unknown property '$property' to set on QueryResultItem");
    }
}
134   -
/**
 * A result item that stands in for another result item (the proxy target).
 *
 * The proxy keeps its own id but forwards the title, the real id and any
 * accessor it does not define itself to the wrapped item.
 */
class ProxyResultItem extends QueryResultItem
{
    protected $proxy;
    protected $proxyId;

    /**
     * @param mixed           $proxyId id of the proxy itself
     * @param QueryResultItem $proxy   the item being wrapped
     */
    public function __construct($proxyId, $proxy)
    {
        // getTitle must be called: the bare "->getTitle" was a property read
        // that never returned the wrapped item's title.
        parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
        $this->proxyId = $proxyId;
        $this->proxy = $proxy;
    }

    public function getId() { return $this->proxyId; }
    public function getTitle() { return $this->proxy->getTitle(); }
    public function getRealId() { return $this->proxy->getId(); }

    /**
     * Resolve a property read against this object first, then the wrapped item.
     *
     * @param string $property
     * @return mixed
     */
    protected function __get($property)
    {
        $method = 'get' . $property;

        if (method_exists($this, $method))
        {
            return $this->$method();
        }

        return $this->proxy->$method();
    }

    /**
     * Resolve a property write against this object first, then the wrapped item.
     *
     * @param string $property
     * @param mixed  $value
     * @return mixed
     */
    protected function __set($property, $value)
    {
        $method = 'set' . $property;

        if (method_exists($this, $method))
        {
            return $this->$method($value);
        }

        return $this->proxy->$method($value);
    }
}
178   -
/**
 * Search result item describing a document.
 *
 * On construction the item loads the document's details (version, mime type,
 * workflow, users, folder path, ...) from the database with a single query.
 */
class DocumentResultItem extends QueryResultItem
{
    protected $filesize;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;

    /**
     * @param int    $document_id
     * @param mixed  $rank
     * @param string $title
     * @param string $text
     * @param string $fullpath
     * @throws IndexerInconsistencyException when the document is not in the database
     * @throws Exception on a database error
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($document_id, $title, $rank, $text, $fullpath);
        $this->live = true;
        $this->loadDocumentInfo();
    }

    /**
     * Populate this item from the database.
     *
     * TODO: this is bad. must refactor to do the query on the group of documents.
     *
     * @throws IndexerInconsistencyException when no record matches the id
     * @throws Exception on a database error
     */
    public function loadDocumentInfo()
    {
        global $default;

        // The id is placed directly in the SQL text, so force it to an integer.
        $documentId = (int) $this->id;

        $sql = "SELECT
                    d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
                    dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
                    mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
                    cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
                    mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
                FROM
                    documents d
                    INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
                    INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
                    INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                    LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
                    LEFT JOIN folders f ON f.id=d.folder_id
                    LEFT JOIN users cou ON d.checked_out_user_id=cou.id
                    LEFT JOIN workflows w ON dmv.workflow_id=w.id
                    LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
                    LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
                    LEFT JOIN users mbu ON d.modified_user_id=mbu.id
                    LEFT JOIN users cbu ON d.creator_id=cbu.id
                    LEFT JOIN users ou ON d.owner_id=ou.id
                WHERE
                    d.id=$documentId";

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(_kt($msg));
        }

        // document_id, relevance, text, title

        $this->documentType = $result['document_type'];
        $this->filename = $result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // The folder is LEFT JOINed: a missing folder name means the document has no folder row.
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        if (empty($this->mimeIconPath))
        {
            $this->mimeIconPath = 'unspecified_type';
        }
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    public function getDocumentID() { return $this->getId(); }
    public function getIsLive() { return (bool) $this->live; }
    public function getFilesize() { return $this->filesize; }
    public function getVersion() { return (string) $this->version; }
    public function getFilename() { return (string)$this->filename; }
    public function getFolderId() { return (int)$this->folderId; }
    public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
    public function getDocument() { return Document::get($this->id); }
    // $this->Document is resolved through __get() to getDocument().
    public function getIsAvailable() { return $this->Document->isLive(); }
    public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
    public function getCheckedOutBy() { return $this->getCheckedOutUser(); }
    // Misspelled name kept so existing callers keep working; prefer getCheckedOutBy().
    public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
    public function getWorkflowOnly() { return (string)$this->workflow; }
    // Previously called itself and recursed without end.
    public function getWorkflow() { return $this->getWorkflowOnly(); }
    public function getWorkflowStateOnly() { return (string)$this->workflowState; }
    public function getWorkflowState() { return $this->getWorkflowStateOnly(); }

    /**
     * @return string "<workflow> - <state>", or '' when the document has no workflow
     */
    public function getWorkflowAndState()
    {
        if (is_null($this->workflow))
        {
            return '';
        }
        return "$this->workflow - $this->workflowState";
    }

    public function getMimeType() { return (string) $this->mimeType; }
    public function getMimeIconPath() { return (string) $this->mimeIconPath; }
    public function getMimeDisplay() { return (string) $this->mimeDisplay; }
    public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
    public function getModifiedBy() { return (string) $this->modifiedBy; }
    public function getDateModified() { return (string) $this->dateModified; }
    public function getCreatedBy() { return (string) $this->createdBy; }
    public function getDateCreated() { return (string) $this->dateCreated; }
    public function getOwner() { return (string) $this->owner; }
    public function getOwnedBy() { return $this->getOwner(); }
    public function getIsImmutable() { return (bool) $this->immutable; }
    public function getImmutable() { return $this->getIsImmutable(); }
    public function getStatus() { return $this->status; }
    public function getStoragePath() { return $this->storagePath; }
    public function getDocumentType() { return $this->documentType; }
    public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }

    /**
     * @return boolean true when the item is live and either the user has read
     *                 permission on the document or admin mode is active
     */
    public function getCanBeReadByUser()
    {
        if (!$this->live)
        {
            return false;
        }
        if (Permission::userHasDocumentReadPermission($this->Document))
        {
            return true;
        }
        if (Permission::adminIsInAdminMode())
        {
            return true;
        }
        return false;
    }
}
357   -
/**
 * Search result item describing a folder.
 *
 * On construction the folder is loaded and its name, path, parent id and
 * creator name are copied onto the item.
 */
class FolderResultItem extends QueryResultItem
{
    protected $folder;
    protected $createdBy;
    protected $parentId;

    /**
     * @param int    $folder_id
     * @param mixed  $rank
     * @param string $title
     * @param string $text
     * @param string $fullpath
     * @throws Exception when the folder cannot be loaded
     */
    public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
    {
        parent::__construct($folder_id, $title, $rank, $text, $fullpath);
        $this->loadFolderInfo();
    }

    public function getFolderID() { return $this->getId(); }
    public function getParentID() { return $this->parentId; }
    public function getCreatedBy() { return $this->createdBy; }
    public function getMimeIconPath() { return 'folder'; }
    public function getFolder() { return Folder::get($this->getFolderID()); }
    // $this->Folder is resolved through __get() to getFolder().
    public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }

    /**
     * Populate this item from the folder record.
     *
     * @throws Exception when Folder::get() reports an error
     */
    public function loadFolderInfo()
    {
        $folder = $this->getFolder();
        if (PEAR::isError($folder))
        {
            // The error object is $folder; the message used to be read from an undefined $result.
            throw new Exception('Database exception! There appears to be an error in the system: ' .$folder->getMessage());
        }
        $this->title = $folder->getName();
        $this->fullpath = '/' . $folder->getFullPath();
        $this->parentId = $folder->getParentId();

        $user = User::get($folder->getCreatorID());
        $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
    }
}
394   -
/**
 * Result item for a shortcut that points at a document result item.
 */
class DocumentShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed the id of the shortcut itself
     */
    public function getDocumentID()
    {
        return $this->getId();
    }

    /**
     * @return string the wrapped item's icon path with '_shortcut' appended
     */
    public function getMimeIconPath()
    {
        $targetIcon = $this->proxy->getMimeIconPath();

        return $targetIcon . '_shortcut';
    }
}
401   -
/**
 * Result item for a shortcut that points at a folder result item.
 */
class FolderShortcutResultItem extends ProxyResultItem
{
    /**
     * @return mixed the id of the shortcut itself
     */
    public function getFolderID()
    {
        return $this->getId();
    }

    /**
     * @return string fixed icon name used for folder shortcuts
     */
    public function getMimeIconPath()
    {
        return 'folder_shortcut';
    }
}
408   -
/**
 * Comparison callback ordering two result items by ascending Rank.
 *
 * @param object $a
 * @param object $b
 * @return int 0 when the ranks are equal, -1 when $a ranks lower, 1 otherwise
 */
function MatchResultCompare($a, $b)
{
    if ($a->Rank == $b->Rank)
    {
        return 0;
    }

    if ($a->Rank < $b->Rank)
    {
        return -1;
    }

    return 1;
}
416   -
417   -abstract class Indexer
418   -{
419   - /**
420   - * Cache of extractors
421   - *
422   - * @var array
423   - */
424   - private $extractorCache;
425   -
426   - /**
427   - * Indicates if the indexer will do logging.
428   - *
429   - * @var boolean
430   - */
431   - private $debug;
432   - /**
433   - * Cache on mime related hooks
434   - *
435   - * @var unknown_type
436   - */
437   - private $mimeHookCache;
438   - /**
439   - * Cache on general hooks.
440   - *
441   - * @var array
442   - */
443   - private $generalHookCache;
444   -
445   - /**
446   - * This is a path to the extractors.
447   - *
448   - * @var string
449   - */
450   - private $extractorPath;
451   - /**
452   - * This is a path to the hooks.
453   - *
454   - * @var string
455   - */
456   - private $hookPath;
457   -
458   - private $enabledExtractors;
459   -
/**
 * Initialise the indexer.
 *
 * Resets the extractor and hook caches, reads the debug flag and the
 * extractor/hook paths from the configuration, and loads the list of
 * enabled extractors.
 */
protected function __construct()
{
    $config = KTConfig::getSingleton();

    $this->extractorCache = array();
    $this->debug = $config->get('indexer/debug', true);
    // This used to initialise an undeclared $this->hookCache, which left the
    // declared mime hook cache unset until loadExtractorHooks() ran.
    $this->mimeHookCache = array();
    $this->generalHookCache = array();
    $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
    $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');

    $this->loadExtractorStatus();
}
477   -
/**
 * Load the names of the enabled extractors.
 *
 * Reads the active rows of mime_extractors into $this->enabledExtractors.
 * When the query fails the list is left empty and the error is logged.
 */
private function loadExtractorStatus()
{
    global $default;

    $this->enabledExtractors = array();

    $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
    $rs = DBUtil::getResultArray($sql);
    if (PEAR::isError($rs))
    {
        // Iterating the error object would fill the list with garbage.
        $default->log->error('loadExtractorStatus: ' . $rs->getMessage());
        return;
    }
    if (empty($rs))
    {
        return;
    }

    foreach ($rs as $item)
    {
        $this->enabledExtractors[] = $item['name'];
    }
}
492   -
/**
 * Tell whether an extractor name is in the list of enabled extractors.
 *
 * @param string $extractor
 * @return boolean
 */
private function isExtractorEnabled($extractor)
{
    $enabled = $this->enabledExtractors;

    return in_array($extractor, $enabled);
}
497   -
/**
 * Return the shared indexer instance, creating it on first use.
 *
 * The concrete class name comes from the 'indexer/coreClass' configuration
 * value and is loaded from indexing/indexers/<class>.inc.php.
 *
 * @return Indexer
 * @throws Exception when the configured class is not defined after the include
 */
public static function get()
{
    static $singleton = null;

    if (!is_null($singleton))
    {
        return $singleton;
    }

    $config = KTConfig::getSingleton();
    $classname = $config->get('indexer/coreClass');

    require_once('indexing/indexers/' . $classname . '.inc.php');

    if (!class_exists($classname))
    {
        throw new Exception("Class '$classname' does not exist.");
    }

    $singleton = new $classname;

    return $singleton;
}
524   -
/**
 * Delete a document; to be implemented by the concrete indexer.
 * NOTE(review): presumably removes the document from the search index - confirm against the implementations.
 *
 * @param int $docid
 */
abstract public function deleteDocument($docid);
526   -
/**
 * Detach every extractor from the mime types and delete all extractor rows.
 */
public function clearExtractors()
{
    global $default;

    $statements = array(
        "update mime_types set extractor_id=null",
        "delete from mime_extractors",
    );

    foreach ($statements as $statement)
    {
        DBUtil::runQuery($statement);
    }

    if ($this->debug)
    {
        $default->log->debug('clearExtractors');
    }
}
543   -
/**
 * Look up the name of the extractor class for a file type.
 *
 * @param string $type value matched against mime_types.filetypes
 * @return mixed the extractor name, or the PEAR error returned by the lookup
 */
public static function resolveExtractor($type)
{
    global $default;

    // Escape the value with the same helper updatePendingDocumentStatus() uses,
    // because it is embedded in a quoted SQL literal.
    $safeType = sanitizeForSQL($type);
    $sql = "select extractor from mime_types where filetypes='$safeType'";
    $class = DBUtil::getOneResultKey($sql,'extractor');
    if (PEAR::isError($class))
    {
        $default->log->error("resolveExtractor: cannot resolve $type");
        return $class;
    }

    // This method is static, so there is no $this->debug to consult; referring
    // to $this here was a fatal error. The message is sent at debug level.
    $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
    return $class;
}
563   -
/**
 * Return the text of all discussion comments on a document.
 *
 * Each comment contributes its subject and body, each followed by a newline.
 *
 * @param int $docid
 * @return string '' when there are no comments or the query fails
 */
public static function getDiscussionText($docid)
{
    // The id is placed directly in the SQL text, so force it to an integer.
    $docid = (int) $docid;

    $sql = "SELECT
                dc.subject, dc.body
            FROM
                discussion_threads dt
                INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
            WHERE
                dt.document_id=$docid";
    $result = DBUtil::getResultArray($sql);

    // A failed query yields an error object, which must not be iterated.
    if (PEAR::isError($result) || empty($result))
    {
        return '';
    }

    $text = '';
    foreach ($result as $record)
    {
        $text .= $record['subject'] . "\n" . $record['body'] . "\n";
    }

    return $text;
}
589   -
/**
 * Schedule the indexing of a document.
 *
 * Any existing queue entry for the document is removed first, then a new row
 * is inserted into index_files.
 *
 * @param mixed  $document a Document object or a numeric document id
 * @param string $what     value stored in the queue row's "what" column
 */
public static function index($document, $what='A')
{
    global $default;

    if (is_numeric($document))
    {
        $document = Document::get($document+0);
    }

    if (PEAR::isError($document))
    {
        $default->log->error("index: Could not index document: " .$document->getMessage());
        return;
    }

    if (!is_object($document))
    {
        // Neither an id nor a document object: nothing can be queued.
        $default->log->error("index: Could not index document: invalid document reference");
        return;
    }

    $document_id = (int) $document->getId();

    // empty() does not raise a notice when there is no session (e.g. background tasks).
    $userid = empty($_SESSION['userID']) ? 1 : (int) $_SESSION['userID'];

    // we dequeue the document so that there are no issues when enqueuing
    Indexer::unqueueDocument($document_id);

    // enqueue item
    $what = sanitizeForSQL($what);
    $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
    DBUtil::runQuery($sql);

    $default->log->debug("index: Queuing indexing of $document_id");
}
625   -
/**
 * Add one to the stored count of indexed documents.
 */
private static function incrementCount()
{
    // Read the current value from the system settings and write back value + 1.
    $current = (int) Indexer::getIndexedDocumentCount();

    Indexer::updateIndexedDocumentCount($current + 1);
}
633   -
/**
 * Read the 'indexedDocumentCount' system setting.
 *
 * @return int the stored count, 0 when the setting is absent
 */
public static function getIndexedDocumentCount()
{
    return (int) KTUtil::getSystemSetting('indexedDocumentCount', 0);
}
639   -
/**
 * Store a new value in the 'indexedDocumentCount' system setting.
 *
 * @param int $cnt
 */
public static function updateIndexedDocumentCount($cnt = 0)
{
    $settingName = 'indexedDocumentCount';

    KTUtil::setSystemSetting($settingName, $cnt);
}
644   -
/**
 * Clear the process date of every entry in the indexing queue.
 */
public static function reindexQueue()
{
    DBUtil::runQuery("UPDATE index_files SET processdate = null");
}
650   -
/**
 * Clear the process date and status message of one document's queue entry.
 *
 * @param int $documentId
 */
public static function reindexDocument($documentId)
{
    $update = 'UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=' . $documentId;

    DBUtil::runQuery($update);
}
656   -
657   -
658   -
/**
 * Rebuild the indexing queue from scratch.
 *
 * Empties index_files and queues every document whose status_id is 1.
 */
public static function indexAll()
{
    // empty() does not raise a notice when there is no session (e.g. background tasks).
    $userid = empty($_SESSION['userID']) ? 1 : (int) $_SESSION['userID'];

    $sql = "DELETE FROM index_files";
    DBUtil::runQuery($sql);

    // The queue was just emptied, so no "not already queued" filter is needed.
    $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1";
    DBUtil::runQuery($sql);
}
670   -
/**
 * Queue the documents found under a folder's path that are not queued yet.
 *
 * @param Folder|FolderProxy $folder
 * @throws Exception when $folder is not a Folder or FolderProxy
 */
public static function indexFolder($folder)
{
    if (!$folder instanceof Folder && !$folder instanceof FolderProxy)
    {
        throw new Exception('Folder expected');
    }

    // empty() does not raise a notice when there is no session (e.g. background tasks).
    $userid = empty($_SESSION['userID']) ? 1 : (int) $_SESSION['userID'];

    // Folder names may contain quotes; escape the path before embedding it in the LIKE literal.
    $full_path = sanitizeForSQL($folder->getFullPath());

    $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)";
    DBUtil::runQuery($sql);
}
686   -
/**
 * Remove queue entries for documents with status_id 3 and for documents that
 * no longer have a row in the documents table.
 */
public static function clearoutDeleted()
{
    global $default;

    $cleanup = 'DELETE FROM
                    index_files
                WHERE
                    document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
                    NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';

    DBUtil::runQuery($cleanup);

    $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
}
704   -
705   -
/**
 * Tell whether a document has an entry in the indexing queue.
 *
 * @param mixed $document a Document object or a numeric document id
 * @return boolean false for any other kind of argument
 */
public static function isDocumentScheduled($document)
{
    if ($document instanceof Document)
    {
        $docid = $document->getId();
    }
    elseif (is_numeric($document))
    {
        $docid = $document;
    }
    else
    {
        return false;
    }

    $rows = DBUtil::getResultArray("SELECT 1 FROM index_files WHERE document_id=$docid");

    return count($rows) > 0;
}
730   -
/**
 * Compact the text stored in a file, in place.
 *
 * The replacements are applied repeatedly until the content stops changing:
 * carriage returns, newlines and tabs end up as spaces and runs of spaces
 * collapse into one.
 *
 * @param string $filename
 * @return boolean true when the file was read and rewritten, false otherwise
 */
private function filterText($filename)
{
    $content = file_get_contents($filename);
    if ($content === false)
    {
        // Without this check an unreadable file was treated as empty content
        // and written back, wiping or creating the target file.
        return false;
    }

    $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
    $tgt = array("\n","\n",' ',' ',' ');

    // shrink what is being stored.
    do
    {
        $orig = $content;
        $content = preg_replace($src, $tgt, $content);
        if (is_null($content))
        {
            // preg_replace() signals an error with null; keep the last good text.
            $content = $orig;
            break;
        }
    } while ($content != $orig);

    return file_put_contents($filename, $content) !== false;
}
752   -
/**
 * Load hooks for the text extraction process.
 *
 * Every "*Hook.inc.php" file in the hook directory is included. The class
 * named after the file is instantiated and, when it is an ExtractorHook,
 * registered either as a general hook (registerMimeTypes() returned null)
 * or under each mime type it returned.
 */
private function loadExtractorHooks()
{
    $this->generalHookCache = array();
    $this->mimeHookCache = array();

    $dir = opendir(SearchHelper::correctPath($this->hookPath));
    if ($dir === false)
    {
        // Hook directory is not readable: leave both caches empty.
        return;
    }

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,-12) != 'Hook.inc.php')
        {
            continue;
        }

        require_once($this->hookPath . '/' . $file);
        $class = substr($file, 0, -8);

        if (!class_exists($class))
        {
            continue;
        }

        $hook = new $class;
        // Test the instance: the class-name string is never an ExtractorHook,
        // so the old check skipped every hook.
        if (!($hook instanceof ExtractorHook))
        {
            continue;
        }

        // Hooks are stored by plain assignment. Storing references to $hook made
        // every cached entry follow the loop variable to the last hook created.
        $mimeTypes = $hook->registerMimeTypes();
        if (is_null($mimeTypes))
        {
            $this->generalHookCache[] = $hook;
        }
        else
        {
            foreach($mimeTypes as $type)
            {
                $this->mimeHookCache[$type][] = $hook;
            }
        }
    }
    closedir($dir);
}
799   -
/**
 * Run one phase of the registered hooks against an extractor.
 *
 * @param DocumentExtractor $extractor
 * @param string $phase name of the hook method to call
 * @param string $mimeType Optional. When given, only the hooks registered for
 *                         that mime type run; otherwise the general hooks run.
 */
private function executeHook($extractor, $phase, $mimeType = null)
{
    if (is_null($mimeType))
    {
        $selected = $this->generalHookCache;
    }
    elseif (array_key_exists($mimeType, $this->mimeHookCache))
    {
        $selected = $this->mimeHookCache[$mimeType];
    }
    else
    {
        $selected = array();
    }

    if (empty($selected))
    {
        return;
    }

    foreach ($selected as $registeredHook)
    {
        $registeredHook->$phase($extractor);
    }
}
831   -
/**
 * Run the indexer diagnostics, then (unless $simple) the extractor diagnostics.
 *
 * A lock file in the cache directory records that problems were already
 * logged, so they are written to the log only once. The lock file is removed
 * again when the full check passes.
 *
 * @param boolean $simple when true, only the indexer diagnosis is performed
 * @return boolean true when no problem was reported
 */
private function doesDiagnosticsPass($simple=false)
{
    global $default;

    $config =& KTConfig::getSingleton();
    // Lock file marking that indexer problems have already been logged.
    $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';

    $indexerProblem = $this->diagnose();
    if (!is_null($indexerProblem))
    {
        $alreadyLogged = is_file($lockFile);
        if (!$alreadyLogged)
        {
            $default->log->error(_kt('Indexer problem: ') . $indexerProblem);
        }
        touch($lockFile);
        return false;
    }

    if ($simple)
    {
        return true;
    }

    $extractorProblems = $this->diagnoseExtractors();
    if (!empty($extractorProblems))
    {
        $alreadyLogged = is_file($lockFile);
        if (!$alreadyLogged)
        {
            foreach ($extractorProblems as $problem)
            {
                $default->log->error(sprintf(_kt('%s problem: %s'), $problem['name'],$problem['diagnosis']));
            }
        }
        touch($lockFile);
        return false;
    }

    // Everything passed: clear the marker left behind by an earlier failure.
    if (is_file($lockFile))
    {
        $default->log->info(_kt('Issues with the indexer have been resolved!'));
        unlink($lockFile);
    }

    return true;
}
880   -
/**
 * Do the initial association between mime types and text extractors.
 *
 * Runs only while the 'mimeTypesRegistered' system setting is unset: it resets
 * the run time of the scheduler tasks, registers the types, deactivates the
 * extractors listed for the current operating system, reloads the extractor
 * status and finally sets the system setting.
 */
public function checkForRegisteredTypes()
{
    global $default;

    // we are only doing this once!
    $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
    if ($initRegistered)
    {
        return;
    }
    if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');

    $date = date('Y-m-d H:i');
    $sql = "UPDATE scheduler_tasks SET run_time='$date'";
    DBUtil::runQuery($sql);

    $this->registerTypes(true);

    $disable = array(
        'windows' => array('PSExtractor'),
        'unix' => array()
    );

    $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];

    if (!empty($disableForOS))
    {
        $quotedNames = '\'' . implode("','", $disableForOS) .'\'';

        $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($quotedNames)";
        DBUtil::runQuery($sql);
        // Log the names that were disabled; the message used to interpolate an undefined variable.
        $default->log->info("checkForRegisteredTypes: disabled $quotedNames");
    }
    $this->loadExtractorStatus();

    if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
    KTUtil::setSystemSetting('mimeTypesRegistered', true);
}
923   -
/**
 * Append a line to the running indexing history and store the whole history
 * as the status message of the document's queue entry.
 *
 * @param int    $documentId
 * @param string $message
 * @param string $level prefix written in front of the message
 */
private function updatePendingDocumentStatus($documentId, $message, $level)
{
    $this->indexingHistory .= "\n" . $level . ': ' . $message;

    $escapedHistory = sanitizeForSQL($this->indexingHistory);

    DBUtil::runQuery("UPDATE index_files SET status_msg='$escapedHistory' WHERE document_id=$documentId");
}
931   -
932   - private $restartCurrentBatch = false;
933   -
/**
 * Set the flag requesting that the current batch be restarted.
 */
public function restartBatch()
{
    $this->restartCurrentBatch = true;
}
938   -
/**
 * Record a status message for a queued document and write it to the log.
 *
 * Debug messages reach the log only when the indexer's debug flag is set;
 * any other level is passed straight to the logger method of that name.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    $this->updatePendingDocumentStatus($documentId, $message, $level);
    global $default;

    if ($level == 'debug')
    {
        if ($this->debug)
        {
            $default->log->debug($message);
        }
        return;
    }

    $default->log->$level($message);
}
962   -
963   -
964   -
/**
 * Load and instantiate an extractor class.
 *
 * The class is expected in extractors/<class>.inc.php below the indexer
 * directory.
 *
 * @param string $extractorClass
 * @return DocumentExtractor|null Null when no class name is given.
 * @throws Exception When the file or class is missing, or when the class is
 *                   not a DocumentExtractor.
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass))
    {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile))
    {
        throw new Exception("Extractor file does not exist: $includeFile");
    }

    require_once($includeFile);

    if (!class_exists($extractorClass))
    {
        // The messages below used an undefined variable, leaving the class name blank.
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();

    if (!($extractor instanceof DocumentExtractor))
    {
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}
994   -
/**
 * Fetch the queue entries of active documents (documents.status_id=1),
 * oldest index date first.
 *
 * @param boolean $problemItemsOnly True selects entries that carry a status
 *                                  message, false selects entries without one.
 * @return array Result of DBUtil::getResultArray().
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // Both variants share the same query and only differ in the status filter.
    if ($problemItemsOnly)
    {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    }
    else
    {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $sql = "SELECT
            iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            $statusFilter AND d.status_id=1
        ORDER BY indexdate ";

    return DBUtil::getResultArray($sql);
}
1032   -
/**
 * Fetch the queue entries that have no status message.
 *
 * @return array
 */
public static function getPendingIndexingQueue()
{
    $pendingItems = Indexer::getIndexingQueue(false);

    return $pendingItems;
}
1037   -
/**
 * Collect the indexer statistics and diagnostics and store them, serialized,
 * in the 'indexerStats', 'indexerDiagnostics' and 'extractorDiagnostics'
 * system settings.
 */
public function updateIndexStats()
{
    $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');

    $noOptimisation = false;
    if ($optimisationDate == '')
    {
        $optimisationDate = _kt('N/A');
        $optimisationPeriod = $optimisationDate;
    }
    else
    {
        $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
        // Flag an index that has not been optimised for more than two days.
        $noOptimisation = $optimisationPeriod['days'] > 2;
        $optimisationPeriod = $optimisationPeriod['str'];
        $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
    }

    $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
    if ($indexingDate == '')
    {
        $indexingDate = _kt('N/A');
        $indexingPeriod = $indexingDate;
    }
    else
    {
        $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
        $indexingDate = date('Y-m-d H:i:s', $indexingDate);
    }

    $indexer = Indexer::get();
    $docsInIndex = $indexer->getDocumentsInIndex();

    // Only active documents are of interest. Entries with a status message
    // are problem documents: they are counted as errors and excluded from
    // the queue count, matching getIndexingQueue().
    $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
    $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

    // Both conditions must hold: a message has to be present and non-empty.
    $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
    $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

    $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
    $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

    if ($docsInRepository == 0)
    {
        // Avoid dividing by zero on an empty repository.
        $indexingCoverage = '0.00%';
        $queueCoverage = $indexingCoverage;
    }
    else
    {
        // compute indexing coverage
        $indexingCoverage = _kt('Not Available');
        if (is_numeric($docsInIndex))
        {
            $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
            $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
        }

        // compute queue coverage
        $queueCoverage = _kt('Not Available');
        if (is_numeric($docsInQueue))
        {
            $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
            $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
        }
    }

    $stats = array(
        'optimisationDate'=>$optimisationDate,
        'optimisationPeriod'=>$optimisationPeriod,
        'indexingDate'=>$indexingDate,
        'indexingPeriod'=>$indexingPeriod,
        'docsInIndex'=>$docsInIndex,
        'docsInQueue'=>$docsInQueue,
        'errorsInQueue'=>$errorsInQueue,
        'docsInRepository'=>$docsInRepository,
        'indexingCoverage'=>$indexingCoverage,
        'queueCoverage'=>$queueCoverage,
        'noOptimisation'=>$noOptimisation
    );

    KTUtil::setSystemSetting('indexerStats', serialize($stats));

    $diagnosis = $indexer->diagnose();
    KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));

    $extractorDiagnosis = $indexer->diagnoseExtractors();

    KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
}
1131   -
/**
 * The main function that may be called repeatedly to index documents.
 *
 * One call processes one batch: it selects queue entries that were never
 * picked up or were picked up more than a day ago, stamps them with the
 * current time, and then runs extraction and indexing for each entry.
 * Entries that were handled, or that can never be handled, are removed from
 * the queue; entries that failed stay queued with their status message.
 *
 * @param int $max Batch size. Null uses the 'indexer/batchDocuments' config
 *                 value, which defaults to 20.
 */
public function indexDocuments($max=null)
{
    global $default;
    $config =& KTConfig::getSingleton();

    // NOTE: a lock file (main.index.lock in the cache directory) used to
    // guard this method. That code is disabled.

    $this->checkForRegisteredTypes();

    if ($this->debug) $default->log->debug('indexDocuments: start');
    if (!$this->doesDiagnosticsPass())
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
        return;
    }

    if (is_null($max))
    {
        $max = $config->get('indexer/batchDocuments',20);
    }

    $this->loadExtractorHooks();

    Indexer::clearoutDeleted();

    $now = date('Y-m-d H:i:s');
    // identify the queue entries that must be processed
    // the LIMIT clause is mysql specific!
    $sql = "SELECT
            iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            (iff.processdate IS NULL or iff.processdate < date_sub('$now', interval 1 day)) AND dmv.status_id=1
        ORDER BY indexdate
        LIMIT $max";
    $queue = DBUtil::getResultArray($sql);
    if (PEAR::isError($queue))
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
        return;
    }
    KTUtil::setSystemSetting('luceneIndexingDate', time());

    // nothing queued, nothing to do
    if (count($queue) == 0)
    {
        if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
        return;
    }

    // stamp the selected entries as being processed so that a follow-up run
    // does not pick them up again
    $docIds = array();
    foreach ($queue as $entry)
    {
        $docIds[] = $entry['document_id'];
    }
    $idList = implode(',', $docIds);
    DBUtil::runQuery("UPDATE index_files SET processdate='$now' WHERE document_id in ($idList)");

    $extractorCache = array();
    $storageManager = KTStorageManagerUtil::getSingleton();

    $tempPath = $config->get("urls/tmpDirectory");

    foreach ($queue as $entry)
    {
        // increment indexed documents count
        Indexer::incrementCount();

        $docId = $entry['document_id'];
        $extension = $entry['filetypes'];
        $mimeType = $entry['mimetypes'];
        $extractorClass = $entry['extractor'];
        // 'what' is A (content and discussion), C (content) or D (discussion)
        $indexDocument = in_array($entry['what'], array('A','C'));
        $indexDiscussion = in_array($entry['what'], array('A','D'));
        $this->indexingHistory = '';

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');

        if (empty($extractorClass))
        {
            // Without an extractor the content cannot be indexed. The entry
            // is only kept when the discussion still has to be indexed.
            if (!$indexDiscussion)
            {
                Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
                continue;
            }

            $indexDocument = false;
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
        }
        else if (!$this->isExtractorEnabled($extractorClass))
        {
            // An extractor is available, but it must also be enabled.
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
            continue;
        }

        if ($this->debug)
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
        }

        $document = Document::get($docId);
        if (PEAR::isError($document))
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
            continue;
        }

        if ($this->restartCurrentBatch)
        {
            // A restart was requested: put the document back as a fresh entry.
            Indexer::unqueueDocument($docId);
            Indexer::index($docId, 'A');
            continue;
        }

        $filename = $document->getFileName();
        $looksTemporary = (substr($filename,0,1) == '~' || substr($filename,-1) == '~');
        if ($looksTemporary)
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
            continue;
        }

        $removeFromQueue = true;
        if (!$indexDocument)
        {
            // discussion only
            $removeFromQueue = $this->indexDiscussion($docId);
        }
        else
        {
            if (!array_key_exists($extractorClass, $extractorCache))
            {
                $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
            }
            $extractor = $extractorCache[$extractorClass];

            if (!($extractor instanceof DocumentExtractor))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
            $sourceFile = $storageManager->temporaryFile($document);

            if (empty($sourceFile) || !is_file($sourceFile))
            {
                Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
                continue;
            }

            if ($extractor->needsIntermediateSourceFile())
            {
                // work on a copy in the temp directory named <docid>.<extension>
                $intermediate = $tempPath . '/'. $docId . '.' . $extension;
                $copied = @copy($sourceFile, $intermediate);
                if ($copied === false)
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
                    // probably permission related. the issue is logged and the entry is tried again later.
                    continue;
                }
                $sourceFile = $intermediate;
            }

            $targetFile = tempnam($tempPath, 'ktindexer');

            $extractor->setSourceFile($sourceFile);
            $extractor->setMimeType($mimeType);
            $extractor->setExtension($extension);
            $extractor->setTargetFile($targetFile);
            $extractor->setDocument($document);
            $extractor->setIndexingStatus(null);
            $extractor->setExtractionStatus(null);

            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');

            $this->executeHook($extractor, 'pre_extract');
            $this->executeHook($extractor, 'pre_extract', $mimeType);
            // from here on the entry only leaves the queue when indexing succeeds
            $removeFromQueue = false;

            if (!$extractor->extractTextContent())
            {
                $extractor->setExtractionStatus(false);
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
                $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
            }
            else
            {
                // the extractor may have created another target file
                $targetFile = $extractor->getTargetFile();

                $extractor->setExtractionStatus(true);
                $this->executeHook($extractor, 'pre_index');
                $this->executeHook($extractor, 'pre_index', $mimeType);

                $title = $document->getName();
                if (!$this->filterText($targetFile))
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                }
                else if ($indexDiscussion)
                {
                    $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus)
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
                    }

                    $extractor->setIndexingStatus($indexStatus);
                }
                else
                {
                    $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                    $removeFromQueue = $indexStatus;
                    if (!$indexStatus)
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
                        $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                    }

                    $extractor->setIndexingStatus($indexStatus);
                }

                $this->executeHook($extractor, 'post_index', $mimeType);
                $this->executeHook($extractor, 'post_index');
            }

            $this->executeHook($extractor, 'post_extract', $mimeType);
            $this->executeHook($extractor, 'post_extract');

            if ($extractor->needsIntermediateSourceFile())
            {
                @unlink($sourceFile);
            }

            @unlink($targetFile);
        }

        if ($removeFromQueue)
        {
            Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
        }
        else if ($this->debug)
        {
            $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
        }
    }
    if ($this->debug) $default->log->debug('indexDocuments: done');
}
1446   -
/**
 * Move stored document text from the document_text table into the index.
 *
 * Rows are read in up to five rounds per call. A row is deleted once its
 * document is indexed, or when its document cannot be loaded. When no rows
 * are left the migration is marked complete and the 'Index Migration'
 * scheduler task is deleted.
 *
 * @param int $max Rows to handle per call. Null uses the
 *                 'indexer/batchMigrateDocument' config value (default 500).
 */
public function migrateDocuments($max=null)
{
    global $default;

    $default->log->info(_kt('migrateDocuments: starting'));

    if (!$this->doesDiagnosticsPass(true))
    {
        $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
        return;
    }

    if (KTUtil::getSystemSetting('migrationComplete') == 'true')
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    // The temp directory was previously read from an undefined variable.
    $tempPath = $config->get("urls/tmpDirectory");

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
        return;
    }
    touch($lockFile);

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    $maxLoops = 5;

    // split the batch evenly over the rounds
    $max = ceil($max / $maxLoops);

    $start = KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for ($loop = 0; $loop < $maxLoops; $loop++)
    {
        $sql = "SELECT
                document_id, document_text
            FROM
                document_text
            ORDER BY document_id
            LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->info(_kt('migrateDocuments: db error'));
            break;
        }

        $docs = count($result);
        if ($docs == 0)
        {
            $noDocs = true;
            break;
        }
        $numDocs += $docs;

        foreach ($result as $docinfo)
        {
            $docId = $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if (file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                // tempnam() has already created the file, so remove it again
                @unlink($targetFile);
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', 'true');
        schedulerUtil::deleteByName('Index Migration');
        $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
    }
}
1576   -
/**
 * Index the content of a document. Subclasses must implement this function.
 *
 * Callers in this class treat the return value as the indexing status
 * (true on success).
 *
 * @param int $docId
 * @param string $textFile File holding the text to index.
 * @param string $title
 * @param string $version
 */
abstract protected function indexDocument($docId, $textFile, $title, $version);
1584   -
1585   -
/**
 * Index the given text as the content of a document.
 *
 * The text is written to a temporary file that is handed to indexDocument()
 * and removed afterwards.
 *
 * @param int $docId
 * @param string $text
 * @return mixed Result of indexDocument(), or false when the document cannot
 *               be loaded or the temporary file cannot be written.
 */
public function updateDocumentIndex($docId, $text)
{
    // Resolve the document first so that a failed lookup neither calls
    // methods on an error object nor leaves a temporary file behind.
    $document = Document::get($docId);
    if (PEAR::isError($document) || is_null($document))
    {
        return false;
    }

    $config = KTConfig::getSingleton();
    $tempPath = $config->get("urls/tmpDirectory");
    $tempFile = tempnam($tempPath,'ud_');
    if ($tempFile === false)
    {
        return false;
    }

    if (file_put_contents($tempFile, $text) === false)
    {
        @unlink($tempFile);
        return false;
    }

    $title = $document->getDescription();
    $version = $document->getVersion();

    $result = $this->indexDocument($docId, $tempFile, $title, $version);

    if (file_exists($tempFile))
    {
        unlink($tempFile);
    }

    return $result;
}
1607   -
/**
 * Index the discussion of a document. Subclasses must implement this function.
 *
 * Callers in this class treat the return value as the indexing status
 * (true on success).
 *
 * @param int $docId
 */
abstract protected function indexDiscussion($docId);
1614   -
/**
 * Diagnose the indexer, e.g. check that the indexing server is running.
 * Subclasses must implement this function.
 */
abstract public function diagnose();
1620   -
/**
 * Diagnose the extractors and the hooks.
 *
 * @return array Findings of both directories merged, extractors first.
 */
public function diagnoseExtractors()
{
    $extractorFindings = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
    $hookFindings = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');

    return array_merge($extractorFindings, $hookFindings);
}
1633   -
/**
 * Load every class file in a directory and collect the problems the classes
 * report about themselves.
 *
 * Files are skipped when they are hidden, do not end in $extension, do not
 * define the expected class, are disabled, are not of type $baseclass, or
 * support no mime types.
 *
 * @param string $path Directory to scan.
 * @param string $baseclass Type every loaded class must have.
 * @param string $extension File name suffix, e.g. 'Extractor.inc.php'.
 * @return array Maps class name to array('name' => display name,
 *               'diagnosis' => non-empty diagnosis).
 */
private function _diagnose($path, $baseclass, $extension)
{
    global $default;

    $diagnoses = array();

    $dir = opendir(SearchHelper::correctPath($path));
    if ($dir === false)
    {
        // Without this guard readdir() would be called on a failed handle.
        $default->log->error(sprintf(_kt("diagnose: cannot open directory '%s'."), $path));
        return $diagnoses;
    }
    $extlen = - strlen($extension);

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,0,1) == '.')
        {
            continue;
        }
        if (substr($file,$extlen) != $extension)
        {
            $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
            continue;
        }

        require_once($path . '/' . $file);

        // the class name is the file name without the 8 characters of '.inc.php'
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
            continue;
        }

        if (!$this->isExtractorEnabled($class))
        {
            $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
            continue;
        }

        $extractor = new $class();
        if (!is_a($extractor, $baseclass))
        {
            // Report the type that was actually expected; this method also checks hooks.
            $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type %s"), $class, $baseclass));
            continue;
        }

        $types = $extractor->getSupportedMimeTypes();
        if (empty($types))
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
            continue;
        }

        $diagnosis = $extractor->diagnose();
        if (empty($diagnosis))
        {
            continue;
        }
        $diagnoses[$class] = array(
            'name'=>$extractor->getDisplayName(),
            'diagnosis'=>$diagnosis
        );
    }
    closedir($dir);

    return $diagnoses;
}
1707   -
1708   -
/**
 * Register the extractor types.
 *
 * Every file in the extractor directory whose name ends in
 * 'Extractor.inc.php' is loaded, and each DocumentExtractor found is asked to
 * register its mime types.
 *
 * @param boolean $clear Optional. Defaults to false. True clears the
 *                       extractors before registering.
 */
public function registerTypes($clear=false)
{
    if ($clear)
    {
        $this->clearExtractors();
    }

    $handle = opendir(SearchHelper::correctPath($this->extractorPath));
    while (($entry = readdir($handle)) !== false)
    {
        if (substr($entry,-17) != 'Extractor.inc.php')
        {
            continue;
        }

        require_once($this->extractorPath . '/' . $entry);

        // the class name is the file name without '.inc.php'
        $class = substr($entry, 0, -8);
        if (!class_exists($class))
        {
            // nothing can be done with a file that does not define its class
            continue;
        }

        $extractor = new $class;
        if ($extractor instanceof DocumentExtractor)
        {
            $extractor->registerMimeTypes();
        }
    }
    closedir($handle);
}
1743   -
/**
 * Index a document together with its discussion.
 *
 * This exists as a possible optimisation point and may be overridden by a
 * subclass that can do both in one step.
 *
 * @param int $docId
 * @param string $textFile
 * @param string $title
 * @param string $version
 * @return boolean True only when both the document and the discussion were
 *                 indexed.
 */
protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
{
    // Both steps always run, as before. The combined status is now returned:
    // callers use it to decide whether the document is done, and without a
    // return value every call looked like a failure.
    $documentIndexed = $this->indexDocument($docId, $textFile, $title, $version);
    $discussionIndexed = $this->indexDiscussion($docId);

    return $documentIndexed && $discussionIndexed;
}
1755   -
/**
 * Remove the document from the queue. This is normally called when it has
 * been processed.
 *
 * @param int $docid
 * @param string|false $reason Optional. When given, the removal is logged
 *                             with this reason.
 * @param string $level Optional. Logger method used for the message.
 */
public static function unqueueDocument($docid, $reason=false, $level='debug')
{
    DBUtil::runQuery("DELETE FROM index_files WHERE document_id=$docid");

    if ($reason === false)
    {
        return;
    }

    global $default;
    $default->log->$level("Indexer: removing document $docid from the queue - $reason");
}
1771   -
/**
 * Run a query on the index. Subclasses must implement this function.
 *
 * @param string $query
 * @return array
 */
abstract public function query($query);
1779   -
/**
 * Converts an integer to a string that can be easily compared and reversed.
 *
 * The number is left-padded with zeros to 14 characters and every digit is
 * replaced by a letter: 0 becomes a, 1 becomes b, ... 9 becomes j.
 *
 * @param int $int
 * @return string
 */
public static function longToString($int)
{
    $width = 14;

    $padded = str_pad("$int", $width, '0', STR_PAD_LEFT);

    return strtr($padded, '0123456789', 'abcdefghij');
}
1796   -
/**
 * Converts a string to an integer.
 *
 * The letters a to j are turned back into the digits 0 to 9 and the result
 * is converted to a number.
 *
 * @param string $str
 * @return int
 */
public static function stringToLong($str)
{
    $digits = strtr($str, 'abcdefghij', '0123456789');

    return $digits + 0;
}
1812   -
/**
 * Hook for optimising the index. Subclasses that can optimise must override
 * this method and must call the parent, which records the time of the
 * optimisation in the 'luceneOptimisationDate' system setting.
 */
public function optimise()
{
    $optimisedAt = time();
    KTUtil::setSystemSetting('luceneOptimisationDate', $optimisedAt);
}
1822   -
/**
 * Shuts down the indexer. The default implementation has nothing to do.
 */
public function shutdown()
{
    // intentionally empty
}
1831   -
/**
 * Returns the name of the indexer. Subclasses must implement this function.
 *
 * @return string
 */
abstract public function getDisplayName();
1838   -
1839   -
/**
 * Returns the number of non-deleted documents in the index.
 * Subclasses must implement this function.
 *
 * @return int
 */
abstract public function getDocumentsInIndex();

/**
 * Subclasses must implement this function.
 * NOTE(review): presumably reports whether the document is in the index - confirm against the implementations.
 *
 * @param int $documentId
 */
abstract public function isDocumentIndexed($documentId);
1848   -
/**
 * Returns the path to the index directory, read from the
 * 'indexer/luceneDirectory' config value.
 *
 * @return string
 */
public function getIndexDirectory()
{
    return KTConfig::getSingleton()->get('indexer/luceneDirectory');
}
1860   -}
1861   -
1862   -?>
  1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
  40 +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
  41 +require_once('indexing/extractorCore.inc.php');
  42 +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +require_once(KT_DIR . '/ktapi/ktapi.inc.php');
  44 +
  45 +class IndexerInconsistencyException extends Exception {};
  46 +
  47 +// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
  48 +
  49 +class QueryResultItem
  50 +{
  51 + protected $id;
  52 + protected $title;
  53 + protected $rank;
  54 + protected $text;
  55 + protected $fullpath;
  56 +
  57 + public function __construct($id, $title, $rank, $text, $fullpath)
  58 + {
  59 + $this->id = $id;
  60 + $this->title = $title;
  61 + $this->rank = $rank;
  62 + $this->text = $text;
  63 + $this->fullpath = $fullpath;
  64 + }
  65 +
  66 + public function getId() { return $this->id; }
  67 + public function getRealId() { return $this->id; }
  68 +
  69 + public function getIsProxy() { return $this instanceof ProxyResultItem; }
  70 + public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder' ; }
  71 + public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document' ; }
  72 +
  73 + public function setRank($value)
  74 + {
  75 + $this->rank = number_format($value,2,'.',',');
  76 + }
  77 +
  78 + public function getIsLive()
  79 + {
  80 + return true;
  81 + }
  82 +
  83 + public function setTitle($value)
  84 + {
  85 + $this->title = $value;
  86 + }
  87 +
  88 + public function setText($value)
  89 + {
  90 + $this->text = $value;
  91 + }
  92 +
  93 + public function getRelevance() { return (float) $this->rank; }
  94 + public function getRank() { return $this->getRelevance(); }
  95 + public function getText() { return (string) $this->text; }
  96 + public function getTitle() { return (string) $this->title; }
  97 + public function getFullPath() { return (string) $this->fullpath; }
  98 +
  99 + protected function __get($property)
  100 + {
  101 + if (empty($property))
  102 + {
  103 + return '';
  104 + }
  105 +
  106 + $method = 'get' . $property;
  107 + if (method_exists($this, $method))
  108 + {
  109 + return $this->$method();
  110 + }
  111 + return $this->getUnknown();
  112 + }
  113 +
  114 + protected function getUnknown()
  115 + {
  116 + return _kt('n/a');
  117 + }
  118 +
  119 + protected function __set($property, $value)
  120 + {
  121 + if (empty($property))
  122 + {
  123 + return '';
  124 + }
  125 +
  126 + $method = 'set' . $property;
  127 + if (method_exists($this, $method))
  128 + {
  129 + return $this->$method($value);
  130 + }
  131 + throw new Exception("Unknown property '$property' to set on QueryResultItem");
  132 + }
  133 +}
  134 +
  135 +class ProxyResultItem extends QueryResultItem
  136 +{
  137 + protected $proxy;
  138 + protected $proxyId;
  139 +
  140 + public function __construct($proxyId, $proxy)
  141 + {
  142 + parent::__construct($proxyId, $proxy->getTitle, $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
  143 + $this->proxyId = $proxyId;
  144 + $this->proxy = $proxy;
  145 + }
  146 +
  147 + public function getId() { return $this->proxyId; }
  148 + public function getTitle() { return $this->proxy->getTitle(); }
  149 + public function getRealId() { return $this->proxy->getId(); }
  150 +
  151 + protected function __get($property)
  152 + {
  153 + $method = 'get' . $property;
  154 +
  155 + if (method_exists($this, $method))
  156 + {
  157 + return $this->$method();
  158 + }
  159 + else
  160 + {
  161 + return $this->proxy->$method();
  162 + }
  163 + }
  164 +
  165 + protected function __set($property, $value)
  166 + {
  167 + $method = 'set' . $property;
  168 + if (method_exists($this, $method))
  169 + {
  170 + return $this->$method($value);
  171 + }
  172 + else
  173 + {
  174 + return $this->proxy->$method($value);
  175 + }
  176 + }
  177 +}
  178 +
  179 +class DocumentResultItem extends QueryResultItem
  180 +{
  181 + protected $filesize;
  182 + protected $live;
  183 + protected $version;
  184 + protected $mimeType;
  185 + protected $filename;
  186 + protected $thumbnail; // TODO: if not null, gui can display a thumbnail
  187 + protected $viewer; // TODO: if not null, a viewer can be used to view the document
  188 + protected $document;
  189 + protected $checkedOutUser;
  190 + protected $dateCheckedout;
  191 + protected $workflowState;
  192 + protected $workflow;
  193 + protected $modifiedBy;
  194 + protected $dateModified;
  195 + protected $createdBy;
  196 + protected $dateCreated;
  197 + protected $owner;
  198 + protected $immutable;
  199 + protected $deleted;
  200 + protected $status;
  201 + protected $folderId;
  202 + protected $storagePath;
  203 + protected $documentType;
  204 + protected $mimeIconPath;
  205 + protected $mimeDisplay;
  206 + protected $oemDocumentNo;
  207 +
  208 + public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
  209 + {
  210 + parent::__construct($document_id, $title, $rank, $text, $fullpath);
  211 + $this->live = true;
  212 + $this->loadDocumentInfo();
  213 + }
  214 +
  215 + // TODO: this is bad. must refactor to do the query on the group of documents.
  216 + public function loadDocumentInfo()
  217 + {
  218 + global $default;
  219 + $sql = "SELECT
  220 + d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
  221 + dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
  222 + mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
  223 + cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
  224 + mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
  225 + FROM
  226 + documents d
  227 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
  228 + INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
  229 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  230 + LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
  231 + LEFT JOIN folders f ON f.id=d.folder_id
  232 + LEFT JOIN users cou ON d.checked_out_user_id=cou.id
  233 + LEFT JOIN workflows w ON dmv.workflow_id=w.id
  234 + LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
  235 + LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
  236 + LEFT JOIN users mbu ON d.modified_user_id=mbu.id
  237 + LEFT JOIN users cbu ON d.creator_id=cbu.id
  238 + LEFT JOIN users ou ON d.owner_id=ou.id
  239 + WHERE
  240 + d.id=$this->id";
  241 +
  242 + $result = DBUtil::getOneResult($sql);
  243 +
  244 + if (PEAR::isError($result) || empty($result))
  245 + {
  246 + $this->live = false;
  247 + if (PEAR::isError($result))
  248 + {
  249 + throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
  250 + }
  251 +
  252 + $default->log->error('QueryResultItem: $result is null');
  253 + $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
  254 + $default->log->error('QueryResultItem: ' . $msg);
  255 + // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
  256 + throw new IndexerInconsistencyException(_kt($msg));
  257 + }
  258 +
  259 + // document_id, relevance, text, title
  260 +
  261 + $this->documentType = $result['document_type'];
  262 + $this->filename=$result['filename'];
  263 + $this->filesize = KTUtil::filesizeToString($result['filesize']);
  264 + $this->folderId = $result['folder_id'];
  265 + $this->title = $result['title'];
  266 +
  267 + $this->createdBy = $result['createdbyuser'];
  268 + $this->dateCreated = $result['created'];
  269 +
  270 + $this->modifiedBy = $result['modifiedbyuser'];
  271 + $this->dateModified = $result['modified'];
  272 +
  273 + $this->checkedOutUser = $result['checkoutuser'];
  274 + $this->dateCheckedout = $result['checkedout'];
  275 +
  276 + $this->owner = $result['owneruser'];
  277 +
  278 + $this->version = $result['major_version'] . '.' . $result['minor_version'];
  279 +
  280 + $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
  281 +
  282 + $this->workflow = $result['workflow'];
  283 + $this->workflowState = $result['workflowstate'];
  284 +
  285 + $this->oemDocumentNo = $result['oem_no'];
  286 + if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
  287 +
  288 + if (is_null($result['name']))
  289 + {
  290 + $this->fullpath = '(orphaned)';
  291 + }
  292 + else
  293 + {
  294 + $this->fullpath = $result['full_path'];
  295 + }
  296 +
  297 + $this->mimeType = $result['mimetype'];
  298 + $this->mimeIconPath = $result['mime_icon_path'];
  299 + if (empty($this->mimeIconPath))
  300 + {
  301 + $this->mimeIconPath = 'unspecified_type';
  302 + }
  303 + $this->mimeDisplay = $result['mime_display'];
  304 +
  305 + $this->storagePath = $result['storage_path'];
  306 + $this->status = Document::getStatusString($result['status_id']);
  307 + }
  308 +
  309 + public function getDocumentID() { return $this->getId(); }
  310 + public function getIsLive() { return (bool) $this->live; }
  311 + public function getFilesize() { return $this->filesize; }
  312 + public function getVersion() { return (string) $this->version; }
  313 + public function getFilename() { return (string)$this->filename; }
  314 + public function getFolderId() { return (int)$this->folderId; }
  315 + public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
  316 + public function getDocument() { return Document::get($this->id); }
  317 + public function getIsAvailable() { return $this->Document->isLive(); }
  318 + public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
  319 + public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
  320 + public function getWorkflowOnly() { return (string)$this->workflow; }
  321 + public function getWorkflow() { return $this->getWorkflow(); }
  322 + public function getWorkflowStateOnly() { return (string)$this->workflowState; }
  323 + public function getWorkflowState() { return $this->getWorkflowStateOnly(); }
  324 + public function getWorkflowAndState() {
  325 + if (is_null($this->workflow))
  326 + {
  327 + return '';
  328 + }
  329 + return "$this->workflow - $this->workflowState";
  330 + }
  331 + public function getMimeType() { return (string) $this->mimeType; }
  332 + public function getMimeIconPath() { return (string) $this->mimeIconPath; }
  333 + public function getMimeDisplay() { return (string) $this->mimeDisplay; }
  334 + public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
  335 + public function getModifiedBy() { return (string) $this->modifiedBy; }
  336 + public function getDateModified() { return (string) $this->dateModified; }
  337 + public function getCreatedBy() { return (string) $this->createdBy; }
  338 + public function getDateCreated() { return (string) $this->dateCreated; }
  339 + public function getOwner() { return (string) $this->owner; }
  340 + public function getOwnedBy() { return $this->getOwner(); }
  341 + public function getIsImmutable() { return (bool) $this->immutable; }
  342 + public function getImmutable() { return $this->getIsImmutable(); }
  343 + public function getStatus() { return $this->status; }
  344 + public function getStoragePath() { return $this->storagePath; }
  345 + public function getDocumentType() { return $this->documentType; }
  346 + public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }
  347 + public function getCanBeReadByUser() {
  348 + if (!$this->live)
  349 + return false;
  350 + if (Permission::userHasDocumentReadPermission($this->Document))
  351 + return true;
  352 + if (Permission::adminIsInAdminMode())
  353 + return true;
  354 + return false;
  355 + }
  356 +}
  357 +
  358 +class FolderResultItem extends QueryResultItem
  359 +{
  360 + protected $folder;
  361 + protected $createdBy;
  362 + protected $parentId;
  363 +
  364 + public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
  365 + {
  366 + parent::__construct($folder_id, $title, $rank, $text, $fullpath);
  367 + $this->loadFolderInfo();
  368 + }
  369 +
  370 + public function getFolderID() { return $this->getId(); }
  371 + public function getParentID() { return $this->parentId; }
  372 + public function getCreatedBy() { return $this->createdBy; }
  373 + public function getMimeIconPath() { return 'folder'; }
  374 + public function getFolder() { return Folder::get($this->getFolderID()); }
  375 + public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }
  376 +
  377 + public function loadFolderInfo()
  378 + {
  379 + global $default;
  380 + $folder = $this->getFolder();
  381 + if (PEAR::isError($folder))
  382 + {
  383 + throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
  384 + }
  385 + $this->title = $folder->getName();
  386 + $this->fullpath = '/' . $folder->getFullPath();
  387 + $this->parentId = $folder->getParentId();
  388 +
  389 + $user = User::get($folder->getCreatorID());
  390 + $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
  391 + }
  392 +
  393 +}
  394 +
  395 +class DocumentShortcutResultItem extends ProxyResultItem
  396 +{
  397 + public function getDocumentID() { return $this->getId(); }
  398 + public function getMimeIconPath() { return $this->proxy->getMimeIconPath() . '_shortcut'; }
  399 +
  400 +}
  401 +
  402 +class FolderShortcutResultItem extends ProxyResultItem
  403 +{
  404 + public function getFolderID() { return $this->getId(); }
  405 + public function getMimeIconPath() { return 'folder_shortcut'; }
  406 +
  407 +}
  408 +
  409 +function MatchResultCompare($a, $b)
  410 +{
  411 + if ($a->Rank == $b->Rank) {
  412 + return 0;
  413 + }
  414 + return ($a->Rank < $b->Rank) ? -1 : 1;
  415 +}
  416 +
  417 +abstract class Indexer
  418 +{
  419 + /**
  420 + * Cache of extractors
  421 + *
  422 + * @var array
  423 + */
  424 + private $extractorCache;
  425 +
  426 + /**
  427 + * Indicates if the indexer will do logging.
  428 + *
  429 + * @var boolean
  430 + */
  431 + private $debug;
  432 + /**
  433 + * Cache on mime related hooks
  434 + *
  435 + * @var unknown_type
  436 + */
  437 + private $mimeHookCache;
  438 + /**
  439 + * Cache on general hooks.
  440 + *
  441 + * @var array
  442 + */
  443 + private $generalHookCache;
  444 +
  445 + /**
  446 + * This is a path to the extractors.
  447 + *
  448 + * @var string
  449 + */
  450 + private $extractorPath;
  451 + /**
  452 + * This is a path to the hooks.
  453 + *
  454 + * @var string
  455 + */
  456 + private $hookPath;
  457 +
  458 + private $enabledExtractors;
  459 +
  460 + /**
  461 + * Initialise the indexer
  462 + *
  463 + */
  464 + protected function __construct()
  465 + {
  466 + $config = KTConfig::getSingleton();
  467 +
  468 + $this->extractorCache = array();
  469 + $this->debug = $config->get('indexer/debug', true);
  470 + $this->hookCache = array();
  471 + $this->generalHookCache = array();
  472 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  473 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  474 +
  475 + $this->loadExtractorStatus();
  476 + }
  477 +
  478 + /**
  479 + * Get the list if enabled extractors
  480 + *
  481 + */
  482 + private function loadExtractorStatus()
  483 + {
  484 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  485 + $rs = DBUtil::getResultArray($sql);
  486 + $this->enabledExtractors = array();
  487 + foreach($rs as $item)
  488 + {
  489 + $this->enabledExtractors[] = $item['name'];
  490 + }
  491 + }
  492 +
  493 + private function isExtractorEnabled($extractor)
  494 + {
  495 + return in_array($extractor, $this->enabledExtractors);
  496 + }
  497 +
  498 + /**
  499 + * Returns a reference to the main class
  500 + *
  501 + * @return Indexer
  502 + */
  503 + public static function get()
  504 + {
  505 + static $singleton = null;
  506 +
  507 + if (is_null($singleton))
  508 + {
  509 + $config = KTConfig::getSingleton();
  510 + $classname = $config->get('indexer/coreClass');
  511 +
  512 + require_once('indexing/indexers/' . $classname . '.inc.php');
  513 +
  514 + if (!class_exists($classname))
  515 + {
  516 + throw new Exception("Class '$classname' does not exist.");
  517 + }
  518 +
  519 + $singleton = new $classname;
  520 + }
  521 +
  522 + return $singleton;
  523 + }
  524 +
  525 + public abstract function deleteDocument($docid);
  526 +
  527 + /**
  528 + * Remove the association of all extractors to mime types on the database.
  529 + *
  530 + */
  531 + public function clearExtractors()
  532 + {
  533 + global $default;
  534 +
  535 + $sql = "update mime_types set extractor_id=null";
  536 + DBUtil::runQuery($sql);
  537 +
  538 + $sql = "delete from mime_extractors";
  539 + DBUtil::runQuery($sql);
  540 +
  541 + if ($this->debug) $default->log->debug('clearExtractors');
  542 + }
  543 +
  544 + /**
  545 + * lookup the name of the extractor class based on the mime type.
  546 + *
  547 + * @param string $type
  548 + * @return string
  549 + */
  550 + public static function resolveExtractor($type)
  551 + {
  552 + global $default;
  553 + $sql = "select extractor from mime_types where filetypes='$type'";
  554 + $class = DBUtil::getOneResultKey($sql,'extractor');
  555 + if (PEAR::isError($class))
  556 + {
  557 + $default->log->error("resolveExtractor: cannot resolve $type");
  558 + return $class;
  559 + }
  560 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  561 + return $class;
  562 + }
  563 +
  564 + /**
  565 + * Return all the discussion text.
  566 + *
  567 + * @param int $docid
  568 + * @return string
  569 + */
  570 + public static function getDiscussionText($docid)
  571 + {
  572 + $sql = "SELECT
  573 + dc.subject, dc.body
  574 + FROM
  575 + discussion_threads dt
  576 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  577 + WHERE
  578 + dt.document_id=$docid";
  579 + $result = DBUtil::getResultArray($sql);
  580 + $text = '';
  581 +
  582 + foreach($result as $record)
  583 + {
  584 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  585 + }
  586 +
  587 + return $text;
  588 + }
  589 +
  590 + /**
  591 + * Schedule the indexing of a document.
  592 + *
  593 + * @param string $document
  594 + * @param string $what
  595 + */
  596 + public static function index($document, $what='A')
  597 + {
  598 + global $default;
  599 +
  600 + if (is_numeric($document))
  601 + {
  602 + $document = Document::get($document+0);
  603 + }
  604 +
  605 + if (PEAR::isError($document))
  606 + {
  607 + $default->log->error("index: Could not index document: " .$document->getMessage());
  608 + return;
  609 + }
  610 +
  611 + $document_id = $document->getId();
  612 + $userid=$_SESSION['userID'];
  613 + if (empty($userid)) $userid=1;
  614 +
  615 + // we dequeue the document so that there are no issues when enqueuing
  616 + Indexer::unqueueDocument($document_id);
  617 +
  618 + // enqueue item
  619 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  620 + DBUtil::runQuery($sql);
  621 +
  622 + $default->log->debug("index: Queuing indexing of $document_id");
  623 +
  624 + }
  625 +
  626 + private static function incrementCount()
  627 + {
  628 + // Get count from system settings
  629 + $count = Indexer::getIndexedDocumentCount();
  630 + $count = (int)$count + 1;
  631 + Indexer::updateIndexedDocumentCount($count);
  632 + }
  633 +
  634 + public static function getIndexedDocumentCount()
  635 + {
  636 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  637 + return (int) $count;
  638 + }
  639 +
  640 + public static function updateIndexedDocumentCount($cnt = 0)
  641 + {
  642 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  643 + }
  644 +
  645 + public static function reindexQueue()
  646 + {
  647 + $sql = "UPDATE index_files SET processdate = null";
  648 + DBUtil::runQuery($sql);
  649 + }
  650 +
  651 + public static function reindexDocument($documentId)
  652 + {
  653 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  654 + DBUtil::runQuery($sql);
  655 + }
  656 +
  657 +
  658 +
  659 + public static function indexAll()
  660 + {
  661 + $userid=$_SESSION['userID'];
  662 + if (empty($userid)) $userid=1;
  663 +
  664 + $sql = "DELETE FROM index_files";
  665 + DBUtil::runQuery($sql);
  666 +
  667 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  668 + DBUtil::runQuery($sql);
  669 + }
  670 +
  671 + public static function indexFolder($folder)
  672 + {
  673 + $userid=$_SESSION['userID'];
  674 + if (empty($userid)) $userid=1;
  675 +
  676 + if (!$folder instanceof Folder && !$folder instanceof FolderProxy)
  677 + {
  678 + throw new Exception('Folder expected');
  679 + }
  680 +
  681 + $full_path = $folder->getFullPath();
  682 +
  683 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)";
  684 + DBUtil::runQuery($sql);
  685 + }
  686 +
  687 + /**
  688 + * Clearout the scheduling of documents that no longer exist.
  689 + *
  690 + */
  691 + public static function clearoutDeleted()
  692 + {
  693 + global $default;
  694 +
  695 + $sql = 'DELETE FROM
  696 + index_files
  697 + WHERE
  698 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  699 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  700 + DBUtil::runQuery($sql);
  701 +
  702 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  703 + }
  704 +
  705 +
  706 + /**
  707 + * Check if a document is scheduled to be indexed
  708 + *
  709 + * @param mixed $document This may be a document or document id
  710 + * @return boolean
  711 + */
  712 + public static function isDocumentScheduled($document)
  713 + {
  714 + if (is_numeric($document))
  715 + {
  716 + $docid = $document;
  717 + }
  718 + else if ($document instanceof Document)
  719 + {
  720 + $docid = $document->getId();
  721 + }
  722 + else
  723 + {
  724 + return false;
  725 + }
  726 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  727 + $result = DBUtil::getResultArray($sql);
  728 + return count($result) > 0;
  729 + }
  730 +
  731 + /**
  732 + * Filters text removing redundant characters such as continuous newlines and spaces.
  733 + *
  734 + * @param string $filename
  735 + */
  736 + private function filterText($filename)
  737 + {
  738 + $content = file_get_contents($filename);
  739 +
  740 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  741 + $tgt = array("\n","\n",' ',' ',' ');
  742 +
  743 + // shrink what is being stored.
  744 + do
  745 + {
  746 + $orig = $content;
  747 + $content = preg_replace($src, $tgt, $content);
  748 + } while ($content != $orig);
  749 +
  750 + return file_put_contents($filename, $content) !== false;
  751 + }
  752 +
  753 + /**
  754 + * Load hooks for text extraction process.
  755 + *
  756 + */
  757 + private function loadExtractorHooks()
  758 + {
  759 + $this->generalHookCache = array();
  760 + $this->mimeHookCache = array();
  761 +
  762 +
  763 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  764 + while (($file = readdir($dir)) !== false)
  765 + {
  766 + if (substr($file,-12) == 'Hook.inc.php')
  767 + {
  768 + require_once($this->hookPath . '/' . $file);
  769 + $class = substr($file, 0, -8);
  770 +
  771 + if (!class_exists($class))
  772 + {
  773 + continue;
  774 + }
  775 +
  776 + $hook = new $class;
  777 + if (!($class instanceof ExtractorHook))
  778 + {
  779 + continue;
  780 + }
  781 +
  782 + $mimeTypes = $hook->registerMimeTypes();
  783 + if (is_null($mimeTypes))
  784 + {
  785 + $this->generalHookCache[] = & $hook;
  786 + }
  787 + else
  788 + {
  789 + foreach($mimeTypes as $type)
  790 + {
  791 + $this->mimeHookCache[$type][] = & $hook;
  792 + }
  793 + }
  794 +
  795 + }
  796 + }
  797 + closedir($dir);
  798 + }
  799 +
  800 + /**
  801 + * This is a refactored function to execute the hooks.
  802 + *
  803 + * @param DocumentExtractor $extractor
  804 + * @param string $phase
  805 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  806 + */
  807 + private function executeHook($extractor, $phase, $mimeType = null)
  808 + {
  809 + $hooks = array();
  810 + if (is_null($mimeType))
  811 + {
  812 + $hooks = $this->generalHookCache;
  813 + }
  814 + else
  815 + {
  816 + if (array_key_exists($mimeType, $this->mimeHookCache))
  817 + {
  818 + $hooks = $this->mimeHookCache[$mimeType];
  819 + }
  820 + }
  821 + if (empty($hooks))
  822 + {
  823 + return;
  824 + }
  825 +
  826 + foreach($hooks as $hook)
  827 + {
  828 + $hook->$phase($extractor);
  829 + }
  830 + }
  831 +
  832 + private function doesDiagnosticsPass($simple=false)
  833 + {
  834 + global $default;
  835 +
  836 + $config =& KTConfig::getSingleton();
  837 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  838 + // this function will create the lockfile if an error is detected. It will be removed as soon
  839 + // as the problems with the indexer are removed.
  840 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  841 +
  842 + $diagnosis = $this->diagnose();
  843 + if (!is_null($diagnosis))
  844 + {
  845 + if (!is_file($lockFile))
  846 + {
  847 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  848 + }
  849 + touch($lockFile);
  850 + return false;
  851 + }
  852 +
  853 + if ($simple)
  854 + {
  855 + return true;
  856 + }
  857 +
  858 + $diagnosis = $this->diagnoseExtractors();
  859 + if (!empty($diagnosis))
  860 + {
  861 + if (!is_file($lockFile))
  862 + {
  863 + foreach($diagnosis as $diag)
  864 + {
  865 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  866 + }
  867 + }
  868 + touch($lockFile);
  869 + return false;
  870 + }
  871 +
  872 + if (is_file($lockFile))
  873 + {
  874 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  875 + unlink($lockFile);
  876 + }
  877 +
  878 + return true;
  879 + }
  880 +
  881 + /**
  882 + * This does the initial mime type association between mime types and text extractors
  883 + *
  884 + */
  885 + public function checkForRegisteredTypes()
  886 + {
  887 + global $default;
  888 +
  889 + // we are only doing this once!
  890 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  891 + if ($initRegistered)
  892 + {
  893 + return;
  894 + }
  895 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  896 +
  897 + $date = date('Y-m-d H:i');
  898 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  899 + DBUtil::runQuery($sql);
  900 +
  901 + $this->registerTypes(true);
  902 +
  903 + $disable = array(
  904 + 'windows'=>array('PSExtractor'),
  905 + 'unix' => array()
  906 + );
  907 +
  908 + $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];
  909 +
  910 + if (!empty($disableForOS))
  911 + {
  912 + $disableForOS = '\'' . implode("','", $disableForOS) .'\'';
  913 +
  914 + $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)";
  915 + DBUtil::runQuery($sql);
  916 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  917 + }
  918 + $this->loadExtractorStatus();
  919 +
  920 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  921 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  922 + }
  923 +
  924 + private function updatePendingDocumentStatus($documentId, $message, $level)
  925 + {
  926 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  927 + $message = sanitizeForSQL($this->indexingHistory);
  928 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  929 + DBUtil::runQuery($sql);
  930 + }
  931 +
  932 + private $restartCurrentBatch = false;
  933 +
  934 + public function restartBatch()
  935 + {
  936 + $this->restartCurrentBatch = true;
  937 + }
  938 +
  939 + /**
  940 + *
  941 + * @param int $documentId
  942 + * @param string $message
  943 + * @param string $level This may be info, error, debug
  944 + */
  945 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  946 + {
  947 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  948 + global $default;
  949 +
  950 + switch ($level)
  951 + {
  952 + case 'debug':
  953 + if ($this->debug)
  954 + {
  955 + $default->log->debug($message);
  956 + }
  957 + break;
  958 + default:
  959 + $default->log->$level($message);
  960 + }
  961 + }
  962 +
  963 +
  964 +
  965 + public function getExtractor($extractorClass)
  966 + {
  967 + if (empty($extractorClass))
  968 + {
  969 + return null;
  970 + }
  971 +
  972 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  973 + if (!file_exists($includeFile))
  974 + {
  975 + throw new Exception("Extractor file does not exist: $includeFile");
  976 + }
  977 +
  978 + require_once($includeFile);
  979 +
  980 + if (!class_exists($extractorClass))
  981 + {
  982 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  983 + }
  984 +
  985 + $extractor = new $extractorClass();
  986 +
  987 + if (!($extractor instanceof DocumentExtractor))
  988 + {
  989 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  990 + }
  991 +
  992 + return $extractor;
  993 + }
  994 +
  995 + public static function getIndexingQueue($problemItemsOnly=true)
  996 + {
  997 +
  998 + if ($problemItemsOnly)
  999 + {
  1000 + $sql = "SELECT
  1001 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  1002 + FROM
  1003 + index_files iff
  1004 + INNER JOIN documents d ON iff.document_id=d.id
  1005 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1006 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1007 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1008 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1009 + WHERE
  1010 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  1011 + ORDER BY indexdate ";
  1012 + }
  1013 + else
  1014 + {
  1015 + $sql = "SELECT
  1016 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  1017 + FROM
  1018 + index_files iff
  1019 + INNER JOIN documents d ON iff.document_id=d.id
  1020 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1021 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1022 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1023 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1024 + WHERE
  1025 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  1026 + ORDER BY indexdate ";
  1027 + }
  1028 + $aResult = DBUtil::getResultArray($sql);
  1029 +
  1030 + return $aResult;
  1031 + }
  1032 +
  1033 + public static function getPendingIndexingQueue()
  1034 + {
  1035 + return Indexer::getIndexingQueue(false);
  1036 + }
  1037 +
  1038 + public function updateIndexStats()
  1039 + {
  1040 + $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
  1041 +
  1042 + $noOptimisation = false;
  1043 + if ($optimisationDate == '')
  1044 + {
  1045 + $optimisationDate = _kt('N/A');
  1046 + $optimisationPeriod = $optimisationDate;
  1047 + }
  1048 + else
  1049 + {
  1050 + $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
  1051 + $noOptimisation = $optimisationPeriod['days'] > 2;
  1052 + $optimisationPeriod = $optimisationPeriod['str'];
  1053 + $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
  1054 + }
  1055 +
  1056 + $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
  1057 + if ($indexingDate == '')
  1058 + {
  1059 + $indexingDate = _kt('N/A');
  1060 + $indexingPeriod = $indexingDate;
  1061 + }
  1062 + else
  1063 + {
  1064 + $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
  1065 + $indexingDate = date('Y-m-d H:i:s', $indexingDate);
  1066 + }
  1067 +
  1068 + $index = Indexer::get();
  1069 + $docsInIndex = $index->getDocumentsInIndex();
  1070 +
  1071 + // we are only interested in documents that are active
  1072 + $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
  1073 + $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');
  1074 +
  1075 + $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null or i.status_msg <> '') and d.status_id=1";
  1076 + $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');
  1077 +
  1078 + $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
  1079 + $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');
  1080 +
  1081 + if ($docsInRepository == 0)
  1082 + {
  1083 + $indexingCoverage = '0.00%';
  1084 + $queueCoverage = $indexingCoverage;
  1085 + }
  1086 + else
  1087 + {
  1088 + // compute indexing coverage
  1089 + $indexingCoverage = _kt('Not Available');
  1090 + if (is_numeric($docsInIndex))
  1091 + {
  1092 + $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
  1093 + $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
  1094 + }
  1095 +
  1096 + // compute queue coverage
  1097 + $queueCoverage = _kt('Not Available');
  1098 + if (is_numeric($docsInQueue))
  1099 + {
  1100 + $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
  1101 + $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
  1102 + }
  1103 + }
  1104 +
  1105 +
  1106 + $stats = array(
  1107 + 'optimisationDate'=>$optimisationDate,
  1108 + 'optimisationPeriod'=>$optimisationPeriod,
  1109 + 'indexingDate'=>$indexingDate,
  1110 + 'indexingPeriod'=>$indexingPeriod,
  1111 + 'docsInIndex'=>$docsInIndex,
  1112 + 'docsInQueue'=>$docsInQueue,
  1113 + 'errorsInQueue'=>$errorsInQueue,
  1114 + 'docsInRepository'=>$docsInRepository,
  1115 + 'indexingCoverage'=>$indexingCoverage,
  1116 + 'queueCoverage'=>$queueCoverage,
  1117 + 'noOptimisation'=>$noOptimisation
  1118 + );
  1119 +
  1120 + KTUtil::setSystemSetting('indexerStats', serialize($stats));
  1121 +
  1122 + $indexer = Indexer::get();
  1123 +
  1124 + $diagnosis = $indexer->diagnose();
  1125 + KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));
  1126 +
  1127 + $extractorDiagnosis = $indexer->diagnoseExtractors();
  1128 +
  1129 + KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
  1130 + }
  1131 +
  1132 + /**
  1133 + * The main function that may be called repeatedly to index documents.
  1134 + *
  1135 + * @param int $max Default 20
  1136 + */
  1137 + public function indexDocuments($max=null)
  1138 + {
  1139 + global $default;
  1140 + $config =& KTConfig::getSingleton();
  1141 +
  1142 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  1143 + if (is_file($indexLockFile))
  1144 + {
  1145 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  1146 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  1147 + return;
  1148 + }
  1149 + touch($indexLockFile);*/
  1150 +
  1151 +
  1152 + $this->checkForRegisteredTypes();
  1153 +
  1154 + if ($this->debug) $default->log->debug('indexDocuments: start');
  1155 + if (!$this->doesDiagnosticsPass())
  1156 + {
  1157 + //unlink($indexLockFile);
  1158 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  1159 + return;
  1160 + }
  1161 +
  1162 + if (is_null($max))
  1163 + {
  1164 + $max = $config->get('indexer/batchDocuments',20);
  1165 + }
  1166 +
  1167 + $this->loadExtractorHooks();
  1168 +
  1169 + Indexer::clearoutDeleted();
  1170 +
  1171 + $date = date('Y-m-d H:i:s');
  1172 + // identify the indexers that must run
  1173 + // mysql specific limit!
  1174 + $sql = "SELECT
  1175 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  1176 + FROM
  1177 + index_files iff
  1178 + INNER JOIN documents d ON iff.document_id=d.id
  1179 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  1180 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  1181 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  1182 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  1183 + WHERE
  1184 + (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
  1185 + ORDER BY indexdate
  1186 + LIMIT $max";
  1187 + $result = DBUtil::getResultArray($sql);
  1188 + if (PEAR::isError($result))
  1189 + {
  1190 + //unlink($indexLockFile);
  1191 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  1192 + return;
  1193 + }
  1194 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  1195 +
  1196 + // bail if no work to do
  1197 + if (count($result) == 0)
  1198 + {
  1199 + //unlink($indexLockFile);
  1200 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  1201 + return;
  1202 + }
  1203 +
  1204 + // identify any documents that need indexing and mark them
  1205 + // so they are not taken in a followup run
  1206 + $ids = array();
  1207 + foreach($result as $docinfo)
  1208 + {
  1209 + $ids[] = $docinfo['document_id'];
  1210 + }
  1211 +
  1212 + // mark the documents as being processed
  1213 +
  1214 + $ids=implode(',',$ids);
  1215 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  1216 + DBUtil::runQuery($sql);
  1217 +
  1218 + $extractorCache = array();
  1219 + $storageManager = KTStorageManagerUtil::getSingleton();
  1220 +
  1221 + $tempPath = $config->get("urls/tmpDirectory");
  1222 +
  1223 + foreach($result as $docinfo)
  1224 + {
  1225 + // increment indexed documents count
  1226 + Indexer::incrementCount();
  1227 +
  1228 + $docId=$docinfo['document_id'];
  1229 + $extension=$docinfo['filetypes'];
  1230 + $mimeType=$docinfo['mimetypes'];
  1231 + $extractorClass=$docinfo['extractor'];
  1232 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  1233 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  1234 + $this->indexingHistory = '';
  1235 +
  1236 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1237 +
  1238 + if (empty($extractorClass))
  1239 + {
  1240 + /*
  1241 +
  1242 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1243 +
  1244 + */
  1245 + if ($indexDiscussion)
  1246 + {
  1247 + $indexDocument = false;
  1248 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1249 + }
  1250 + else
  1251 + {
  1252 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1253 + continue;
  1254 + }
  1255 + }
  1256 + else
  1257 + {
  1258 + /*
  1259 +
  1260 + If an extractor is available, we must ensure it is enabled.
  1261 +
  1262 + */
  1263 +
  1264 + if (!$this->isExtractorEnabled($extractorClass))
  1265 + {
  1266 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1267 + continue;
  1268 + }
  1269 + }
  1270 +
  1271 + if ($this->debug)
  1272 + {
  1273 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1274 + }
  1275 +
  1276 + $document = Document::get($docId);
  1277 + if (PEAR::isError($document))
  1278 + {
  1279 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1280 + continue;
  1281 + }
  1282 +
  1283 + if ($this->restartCurrentBatch)
  1284 + {
  1285 + Indexer::unqueueDocument($docId);
  1286 + Indexer::index($docId, 'A');
  1287 + continue;
  1288 + }
  1289 +
  1290 +
  1291 + $filename = $document->getFileName();
  1292 + if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
  1293 + {
  1294 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1295 + continue;
  1296 + }
  1297 +
  1298 + $removeFromQueue = true;
  1299 + if ($indexDocument)
  1300 + {
  1301 + if (array_key_exists($extractorClass, $extractorCache))
  1302 + {
  1303 + $extractor = $extractorCache[$extractorClass];
  1304 + }
  1305 + else
  1306 + {
  1307 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1308 + }
  1309 +
  1310 + if (!($extractor instanceof DocumentExtractor))
  1311 + {
  1312 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1313 + continue;
  1314 + }
  1315 +
  1316 +
  1317 +
  1318 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1319 + $sourceFile = $storageManager->temporaryFile($document);
  1320 +
  1321 + if (empty($sourceFile) || !is_file($sourceFile))
  1322 + {
  1323 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1324 + continue;
  1325 + }
  1326 +
  1327 + if ($extractor->needsIntermediateSourceFile())
  1328 + {
  1329 + //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1330 +
  1331 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1332 + $result = @copy($sourceFile, $intermediate);
  1333 + if ($result === false)
  1334 + {
  1335 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1336 + // problem. lets try again later. probably permission related. log the issue.
  1337 + continue;
  1338 + }
  1339 + $sourceFile = $intermediate;
  1340 + }
  1341 +
  1342 + $targetFile = tempnam($tempPath, 'ktindexer');
  1343 +
  1344 + $extractor->setSourceFile($sourceFile);
  1345 + $extractor->setMimeType($mimeType);
  1346 + $extractor->setExtension($extension);
  1347 + $extractor->setTargetFile($targetFile);
  1348 + $extractor->setDocument($document);
  1349 + $extractor->setIndexingStatus(null);
  1350 + $extractor->setExtractionStatus(null);
  1351 +
  1352 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1353 +
  1354 + $this->executeHook($extractor, 'pre_extract');
  1355 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1356 + $removeFromQueue = false;
  1357 +
  1358 + if ($extractor->extractTextContent())
  1359 + {
  1360 + // the extractor may need to create another target file
  1361 + $targetFile = $extractor->getTargetFile();
  1362 +
  1363 + $extractor->setExtractionStatus(true);
  1364 + $this->executeHook($extractor, 'pre_index');
  1365 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1366 +
  1367 + $title = $document->getName();
  1368 + if ($indexDiscussion)
  1369 + {
  1370 + if (!$this->filterText($targetFile))
  1371 + {
  1372 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1373 + }
  1374 + else
  1375 + {
  1376 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1377 + $removeFromQueue = $indexStatus;
  1378 + if (!$indexStatus)
  1379 + {
  1380 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1381 + }
  1382 +
  1383 + $extractor->setIndexingStatus($indexStatus);
  1384 + }
  1385 + }
  1386 + else
  1387 + {
  1388 + if (!$this->filterText($targetFile))
  1389 + {
  1390 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1391 + }
  1392 + else
  1393 + {
  1394 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1395 + $removeFromQueue = $indexStatus;
  1396 +
  1397 + if (!$indexStatus)
  1398 + {
  1399 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1400 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1401 + }
  1402 +
  1403 + $extractor->setIndexingStatus($indexStatus);
  1404 + }
  1405 + }
  1406 +
  1407 + $this->executeHook($extractor, 'post_index', $mimeType);
  1408 + $this->executeHook($extractor, 'post_index');
  1409 + }
  1410 + else
  1411 + {
  1412 + $extractor->setExtractionStatus(false);
  1413 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1414 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1415 + }
  1416 +
  1417 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1418 + $this->executeHook($extractor, 'post_extract');
  1419 +
  1420 + if ($extractor->needsIntermediateSourceFile())
  1421 + {
  1422 + @unlink($sourceFile);
  1423 + }
  1424 +
  1425 + @unlink($targetFile);
  1426 +
  1427 + }
  1428 + else
  1429 + {
  1430 + $indexStatus = $this->indexDiscussion($docId);
  1431 + $removeFromQueue = $indexStatus;
  1432 + }
  1433 +
  1434 + if ($removeFromQueue)
  1435 + {
  1436 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1437 + }
  1438 + else
  1439 + {
  1440 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1441 + }
  1442 + }
  1443 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1444 + //unlink($indexLockFile);
  1445 + }
  1446 +
  1447 + public function migrateDocuments($max=null)
  1448 + {
  1449 + global $default;
  1450 +
  1451 + $default->log->info(_kt('migrateDocuments: starting'));
  1452 +
  1453 + if (!$this->doesDiagnosticsPass(true))
  1454 + {
  1455 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1456 + return;
  1457 + }
  1458 +
  1459 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1460 + {
  1461 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1462 + return;
  1463 + }
  1464 +
  1465 + $config =& KTConfig::getSingleton();
  1466 + if (is_null($max))
  1467 + {
  1468 + $max = $config->get('indexer/batchMigrateDocument',500);
  1469 + }
  1470 +
  1471 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1472 + if (is_file($lockFile))
  1473 + {
  1474 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1475 + return;
  1476 + }
  1477 + touch($lockFile);
  1478 +
  1479 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1480 + if (is_null($startTime))
  1481 + {
  1482 + KTUtil::setSystemSetting('migrationStarted', time());
  1483 + }
  1484 +
  1485 + $maxLoops = 5;
  1486 +
  1487 + $max = ceil($max / $maxLoops);
  1488 +
  1489 + $start =KTUtil::getBenchmarkTime();
  1490 + $noDocs = false;
  1491 + $numDocs = 0;
  1492 +
  1493 + for($loop=0;$loop<$maxLoops;$loop++)
  1494 + {
  1495 +
  1496 + $sql = "SELECT
  1497 + document_id, document_text
  1498 + FROM
  1499 + document_text
  1500 + ORDER BY document_id
  1501 + LIMIT $max";
  1502 + $result = DBUtil::getResultArray($sql);
  1503 + if (PEAR::isError($result))
  1504 + {
  1505 + $default->log->info(_kt('migrateDocuments: db error'));
  1506 + break;
  1507 + }
  1508 +
  1509 + $docs = count($result);
  1510 + if ($docs == 0)
  1511 + {
  1512 + $noDocs = true;
  1513 + break;
  1514 + }
  1515 + $numDocs += $docs;
  1516 +
  1517 + foreach($result as $docinfo)
  1518 + {
  1519 + $docId = $docinfo['document_id'];
  1520 +
  1521 + $document = Document::get($docId);
  1522 + if (PEAR::isError($document) || is_null($document))
  1523 + {
  1524 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1525 + DBUtil::runQuery($sql);
  1526 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1527 + continue;
  1528 + }
  1529 +
  1530 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1531 +
  1532 + $targetFile = tempnam($tempPath, 'ktindexer');
  1533 +
  1534 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1535 + {
  1536 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1537 + continue;
  1538 + }
  1539 + // free memory asap ;)
  1540 + unset($docinfo['document_text']);
  1541 +
  1542 + $title = $document->getName();
  1543 +
  1544 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1545 +
  1546 + if ($indexStatus)
  1547 + {
  1548 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1549 + DBUtil::runQuery($sql);
  1550 + }
  1551 + else
  1552 + {
  1553 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1554 + }
  1555 +
  1556 + @unlink($targetFile);
  1557 + }
  1558 + }
  1559 +
  1560 + @unlink($lockFile);
  1561 +
  1562 + $time = KTUtil::getBenchmarkTime() - $start;
  1563 +
  1564 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1565 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1566 +
  1567 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1568 + if ($noDocs)
  1569 + {
  1570 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1571 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1572 + schedulerUtil::deleteByName('Index Migration');
  1573 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1574 + }
  1575 + }
  1576 +
  1577 + /**
  1578 + * Index a document. The base class must override this function.
  1579 + *
  1580 + * @param int $docId
  1581 + * @param string $textFile
  1582 + */
  1583 + protected abstract function indexDocument($docId, $textFile, $title, $version);
  1584 +
  1585 +
  1586 + public function updateDocumentIndex($docId, $text)
  1587 + {
  1588 + $config = KTConfig::getSingleton();
  1589 + $tempPath = $config->get("urls/tmpDirectory");
  1590 + $tempFile = tempnam($tempPath,'ud_');
  1591 +
  1592 + file_put_contents($tempFile, $text);
  1593 +
  1594 + $document = Document::get($docId);
  1595 + $title = $document->getDescription();
  1596 + $version = $document->getVersion();
  1597 +
  1598 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1599 +
  1600 + if (file_exists($tempFile))
  1601 + {
  1602 + unlink($tempFile);
  1603 + }
  1604 +
  1605 + return $result;
  1606 + }
  1607 +
  1608 + /**
  1609 + * Index a discussion. The base class must override this function.
  1610 + *
  1611 + * @param int $docId
  1612 + */
  1613 + protected abstract function indexDiscussion($docId);
  1614 +
  1615 + /**
  1616 + * Diagnose the indexer. e.g. Check that the indexing server is running.
  1617 + *
  1618 + */
  1619 + public abstract function diagnose();
  1620 +
  1621 + /**
  1622 + * Diagnose the extractors.
  1623 + *
  1624 + * @return array
  1625 + */
  1626 + public function diagnoseExtractors()
  1627 + {
  1628 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1629 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1630 +
  1631 + return $diagnosis;
  1632 + }
  1633 +
  1634 + /**
  1635 + * This is a refactored diagnose function.
  1636 + *
  1637 + * @param string $path
  1638 + * @param string $class
  1639 + * @param string $extension
  1640 + * @return array
  1641 + */
  1642 + private function _diagnose($path, $baseclass, $extension)
  1643 + {
  1644 + global $default;
  1645 +
  1646 + $diagnoses = array();
  1647 +
  1648 + $dir = opendir(SearchHelper::correctPath($path));
  1649 + $extlen = - strlen($extension);
  1650 +
  1651 + while (($file = readdir($dir)) !== false)
  1652 + {
  1653 + if (substr($file,0,1) == '.')
  1654 + {
  1655 + continue;
  1656 + }
  1657 + if (substr($file,$extlen) != $extension)
  1658 + {
  1659 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1660 + continue;
  1661 + }
  1662 +
  1663 + require_once($path . '/' . $file);
  1664 +
  1665 + $class = substr($file, 0, -8);
  1666 + if (!class_exists($class))
  1667 + {
  1668 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1669 + continue;
  1670 + }
  1671 +
  1672 + if (!$this->isExtractorEnabled($class))
  1673 + {
  1674 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1675 + continue;
  1676 + }
  1677 +
  1678 + $extractor = new $class();
  1679 + if (!is_a($extractor, $baseclass))
  1680 + {
  1681 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1682 + continue;
  1683 + }
  1684 +
  1685 + $types = $extractor->getSupportedMimeTypes();
  1686 + if (empty($types))
  1687 + {
  1688 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1689 + continue;
  1690 + }
  1691 +
  1692 + $diagnosis=$extractor->diagnose();
  1693 + if (empty($diagnosis))
  1694 + {
  1695 + continue;
  1696 + }
  1697 + $diagnoses[$class] = array(
  1698 + 'name'=>$extractor->getDisplayName(),
  1699 + 'diagnosis'=>$diagnosis
  1700 + );
  1701 +
  1702 + }
  1703 + closedir($dir);
  1704 +
  1705 + return $diagnoses;
  1706 + }
  1707 +
  1708 +
  1709 + /**
  1710 + * Register the extractor types.
  1711 + *
  1712 + * @param boolean $clear. Optional. Defaults to false.
  1713 + */
  1714 + public function registerTypes($clear=false)
  1715 + {
  1716 + if ($clear)
  1717 + {
  1718 + $this->clearExtractors();
  1719 + }
  1720 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1721 + while (($file = readdir($dir)) !== false)
  1722 + {
  1723 + if (substr($file,-17) == 'Extractor.inc.php')
  1724 + {
  1725 + require_once($this->extractorPath . '/' . $file);
  1726 + $class = substr($file, 0, -8);
  1727 +
  1728 + if (!class_exists($class))
  1729 + {
  1730 + // if the class does not exist, we can't do anything.
  1731 + continue;
  1732 + }
  1733 +
  1734 + $extractor = new $class;
  1735 + if ($extractor instanceof DocumentExtractor)
  1736 + {
  1737 + $extractor->registerMimeTypes();
  1738 + }
  1739 + }
  1740 + }
  1741 + closedir($dir);
  1742 + }
  1743 +
  1744 + /**
  1745 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1746 + *
  1747 + * @param int $docId
  1748 + * @param string $textFile
  1749 + */
  1750 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1751 + {
  1752 + $this->indexDocument($docId, $textFile, $title, $version);
  1753 + $this->indexDiscussion($docId);
  1754 + }
  1755 +
  1756 + /**
  1757 + * Remove the document from the queue. This is normally called when it has been processed.
  1758 + *
  1759 + * @param int $docid
  1760 + */
  1761 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1762 + {
  1763 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1764 + DBUtil::runQuery($sql);
  1765 + if ($reason !== false)
  1766 + {
  1767 + global $default;
  1768 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1769 + }
  1770 + }
  1771 +
  1772 + /**
  1773 + * Run a query on the index.
  1774 + *
  1775 + * @param string $query
  1776 + * @return array
  1777 + */
  1778 + public abstract function query($query);
  1779 +
  1780 + /**
  1781 + * Converts an integer to a string that can be easily compared and reversed.
  1782 + *
  1783 + * @param int $int
  1784 + * @return string
  1785 + */
  1786 + public static function longToString($int)
  1787 + {
  1788 + $maxlen = 14;
  1789 +
  1790 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1791 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1792 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1793 +
  1794 + return str_replace($o29, $a2z, $l);
  1795 + }
  1796 +
  1797 + /**
  1798 + * Converts a string to an integer.
  1799 + *
  1800 + * @param string $str
  1801 + * @return int
  1802 + */
  1803 + public static function stringToLong($str)
  1804 + {
  1805 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1806 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1807 +
  1808 + $int = str_replace($a2z, $o29, $str) + 0;
  1809 +
  1810 + return $int;
  1811 + }
  1812 +
  1813 + /**
  1814 + * Possibly we can optimise indexes. This method must be overriden.
  1815 + * The new function must call the parent!
  1816 + *
  1817 + */
  1818 + public function optimise()
  1819 + {
  1820 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1821 + }
  1822 +
  1823 + /**
  1824 + * Shuts down the indexer
  1825 + *
  1826 + */
  1827 + public function shutdown()
  1828 + {
  1829 + // do nothing generally
  1830 + }
  1831 +
  1832 + /**
  1833 + * Returns the name of the indexer.
  1834 + *
  1835 + * @return string
  1836 + */
  1837 + public abstract function getDisplayName();
  1838 +
  1839 +
  1840 + /**
  1841 + * Returns the number of non-deleted documents in the index.
  1842 + *
  1843 + * @return int
  1844 + */
  1845 + public abstract function getDocumentsInIndex();
  1846 +
  1847 + public abstract function isDocumentIndexed($documentId);
  1848 +
  1849 + /**
  1850 + * Returns the path to the index directory
  1851 + *
  1852 + * @return string
  1853 + */
  1854 + public function getIndexDirectory()
  1855 + {
  1856 + $config = KTConfig::getSingleton();
  1857 + $directory = $config->get('indexer/luceneDirectory');
  1858 + return $directory;
  1859 + }
  1860 +}
  1861 +
  1862 +?>
... ...