Commit ca9e23dd6281155fc86bcd97be9d1a3022b94d50

Authored by kevin_fourie
1 parent f4eb662e

Merged in from DEV trunk...

KTS-3444
"Indexing needs to be more resilient when encountering errors with open office"
Fixed.

Committed By: Conrad Vermeulen
Reviewed By: Philip Arkoll


git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/branches/3.5.3-Branch@8655 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/extractorCore.inc.php
... ... @@ -557,7 +557,10 @@ abstract class TextExtractor extends DocumentExtractor
557 557 */
558 558 public function extractTextContent()
559 559 {
560   - $content = file_get_contents($this->sourcefile);
  560 +
  561 + $config = KTConfig::getSingleton();
  562 + $maxTextSize = $config->get('indexer/maxTextSize', 1024 * 1024 * 10); // we'll only take 10 meg by default
  563 + $content = substr(file_get_contents($this->sourcefile), 0, $maxTextSize);
561 564 if (false === $content)
562 565 {
563 566 return false;
... ...
search2/indexing/extractors/OOTextExtractor.inc.php
... ... @@ -123,13 +123,53 @@ class OOTextExtractor extends ExternalDocumentExtractor
123 123  
124 124 public function extractTextContent()
125 125 {
126   - if (false === parent::extractTextContent())
  126 + global $default;
  127 +
  128 + $docId = $this->document->getId();
  129 +
  130 + if (empty($this->extension))
  131 + {
  132 + $default->log->info("DocumentId: $docId - Document does not have an extension");
  133 + Indexer::unqueueDocument($docId, sprintf(("Removing document from queue: documentId %d"),$docId));
  134 + return false;
  135 + }
  136 +
  137 + // Open Office does not support the following files
  138 + if (in_array($this->extension, array('xlt')))
  139 + {
  140 + $default->log->info("DocumentId: $docId - Document does not have an extension");
  141 + Indexer::unqueueDocument($docId, sprintf(("Removing document from queue: documentId %d"),$docId));
  142 + return false;
  143 + }
  144 +
  145 + if (false === parent::extractTextContent())
127 146 {
  147 + if (strpos($this->output, 'OpenOffice process not found or not listening') !== false)
  148 + {
  149 + $indexer = Indexer::get();
  150 + $indexer->restartBatch();
  151 + return false;
  152 + }
  153 + elseif (strpos($this->output, 'Unexpected connection closure') !== false
  154 + || strpos($this->output, '\'NoneType\' object has no attribute \'storeToURL\'') !== false
  155 + || strpos($this->output, 'The document could not be opened for conversion. This could indicate an unsupported mimetype.') !== false
  156 + || strpos($this->output, 'URL seems to be an unsupported one.') !== false
  157 + || strpos($this->output, '__main__.com.sun.star.task.ErrorCodeIOException') !== false)
  158 + {
  159 + $default->log->info("DocumentId: $docId - Suspect the file cannot be indexed by Open Office.");
  160 + file_put_contents($this->targetfile, '');
  161 + $indexer = Indexer::get();
  162 + $indexer->restartBatch();
  163 +
  164 + Indexer::unqueueDocument($docId, sprintf(_kt("Removing document from queue: documentId %d"),$docId));
  165 + return true;
  166 + }
128 167 return false;
129 168 }
130 169  
131 170 if ($this->targetExtension != 'html')
132 171 {
  172 + file_put_contents($this->targetfile, '');
133 173 return true;
134 174 }
135 175 $content = file_get_contents($this->targetfile);
... ...
search2/indexing/extractors/PDFExtractor.inc.php
... ... @@ -59,15 +59,26 @@ class PDFExtractor extends ApplicationExtractor
59 59  
60 60 protected function exec($cmd)
61 61 {
  62 + global $default;
62 63 $res = parent::exec($cmd);
63 64  
64   - if (false === $res && (strpos($this->output, 'Copying of text from this document is not allowed') !== false))
  65 + if (false === $res && ((strpos($this->output, 'Copying of text from this document is not allowed') !== false) ||
  66 + (strpos($this->output, 'Incorrect password') !== false)))
65 67 {
66 68 $this->output = '';
67 69 file_put_contents($this->targetfile, _kt('Security properties on the PDF document prevent text from being extracted.'));
  70 + $default->log->info('Security properties on the PDF document prevent text from being extracted.');
68 71 return true;
69 72 }
70 73  
  74 + if (false === $res && (strpos($this->output, 'PDF file is damaged') !== false))
  75 + {
  76 + $this->output = '';
  77 + $default->log->info('PDF file is damaged');
  78 + return true;
  79 + }
  80 +
  81 +
71 82 if (false === $res && (strpos($this->output, '(continuing anyway)') !== false))
72 83 {
73 84 $this->output = '';
... ...
search2/indexing/indexerCore.inc.php
1   -<?php
2   -
3   -/**
4   - * $Id:$
5   - *
6   - * KnowledgeTree Community Edition
7   - * Document Management Made Simple
8   - * Copyright (C) 2008 KnowledgeTree Inc.
9   - * Portions copyright The Jam Warehouse Software (Pty) Limited
10   - *
11   - * This program is free software; you can redistribute it and/or modify it under
12   - * the terms of the GNU General Public License version 3 as published by the
13   - * Free Software Foundation.
14   - *
15   - * This program is distributed in the hope that it will be useful, but WITHOUT
16   - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18   - * details.
19   - *
20   - * You should have received a copy of the GNU General Public License
21   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
22   - *
23   - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
24   - * California 94120-7775, or email info@knowledgetree.com.
25   - *
26   - * The interactive user interfaces in modified source and object code versions
27   - * of this program must display Appropriate Legal Notices, as required under
28   - * Section 5 of the GNU General Public License version 3.
29   - *
30   - * In accordance with Section 7(b) of the GNU General Public License version 3,
31   - * these Appropriate Legal Notices must retain the display of the "Powered by
32   - * KnowledgeTree" logo and retain the original copyright notice. If the display of the
33   - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
34   - * must display the words "Powered by KnowledgeTree" and retain the original
35   - * copyright notice.
36   - * Contributor( s): ______________________________________
37   - *
38   - */
39   -
40   -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
41   -require_once('indexing/extractorCore.inc.php');
42   -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
43   -
44   -
45   -class IndexerInconsistencyException extends Exception {};
46   -
47   -class QueryResultItem
48   -{
49   - protected $document_id;
50   - protected $title;
51   - protected $rank;
52   - protected $text;
53   - protected $filesize;
54   - protected $fullpath;
55   - protected $live;
56   - protected $version;
57   - protected $mimeType;
58   - protected $filename;
59   - protected $thumbnail; // TODO: if not null, gui can display a thumbnail
60   - protected $viewer; // TODO: if not null, a viewer can be used to view the document
61   - protected $document;
62   - protected $checkedOutUser;
63   - protected $dateCheckedout;
64   - protected $workflowState;
65   - protected $workflow;
66   - protected $modifiedBy;
67   - protected $dateModified;
68   - protected $createdBy;
69   - protected $dateCreated;
70   - protected $owner;
71   - protected $immutable;
72   - protected $deleted;
73   - protected $status;
74   - protected $folderId;
75   - protected $storagePath;
76   - protected $documentType;
77   - protected $mimeIconPath;
78   - protected $mimeDisplay;
79   - protected $oemDocumentNo;
80   -
81   - public function __construct($document_id, $rank=null, $title=null, $text=null)
82   - {
83   - $this->document_id=(int) $document_id;
84   - $this->rank= $rank;
85   - $this->title=$title;
86   - $this->text = $text;
87   - $this->live = true;
88   - $this->loadDocumentInfo();
89   - }
90   -
91   - protected function __isset($property)
92   - {
93   - switch($property)
94   - {
95   - case 'DocumentID': return isset($this->document_id);
96   - case 'Rank': return isset($this->rank);
97   - case 'Text': return isset($this->text);
98   - case 'Title': return isset($this->title);
99   - case null: break;
100   - default:
101   - throw new Exception("Unknown property '$property' to get on QueryResultItem");
102   - }
103   - return true; // should not be reached
104   - }
105   -
106   - public function loadDocumentInfo()
107   - {
108   - global $default;
109   - $sql = "SELECT
110   - d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
111   - dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
112   - mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
113   - cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
114   - mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
115   - FROM
116   - documents d
117   - INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
118   - INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
119   - INNER JOIN mime_types mt ON dcv.mime_id=mt.id
120   - LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
121   - LEFT JOIN folders f ON f.id=d.folder_id
122   - LEFT JOIN users cou ON d.checked_out_user_id=cou.id
123   - LEFT JOIN workflows w ON dmv.workflow_id=w.id
124   - LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
125   - LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
126   - LEFT JOIN users mbu ON d.modified_user_id=mbu.id
127   - LEFT JOIN users cbu ON d.creator_id=cbu.id
128   - LEFT JOIN users ou ON d.owner_id=ou.id
129   - WHERE
130   - d.id=$this->document_id";
131   -
132   - $result = DBUtil::getOneResult($sql);
133   -
134   - if (PEAR::isError($result) || empty($result))
135   - {
136   - $this->live = false;
137   - if (PEAR::isError($result))
138   - {
139   - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
140   - }
141   -
142   - $default->log->error('QueryResultItem: $result is null');
143   - $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
144   - $default->log->error('QueryResultItem: ' . $msg);
145   - // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
146   - throw new IndexerInconsistencyException(_kt($msg));
147   - }
148   -
149   - // document_id, relevance, text, title
150   -
151   - $this->documentType = $result['document_type'];
152   - $this->filename=$result['filename'];
153   - $this->filesize = KTUtil::filesizeToString($result['filesize']);
154   - $this->folderId = $result['folder_id'];
155   - $this->title = $result['title'];
156   -
157   - $this->createdBy = $result['createdbyuser'];
158   - $this->dateCreated = $result['created'];
159   -
160   - $this->modifiedBy = $result['modifiedbyuser'];
161   - $this->dateModified = $result['modified'];
162   -
163   - $this->checkedOutUser = $result['checkoutuser'];
164   - $this->dateCheckedout = $result['checkedout'];
165   -
166   - $this->owner = $result['owneruser'];
167   -
168   - $this->version = $result['major_version'] . '.' . $result['minor_version'];
169   -
170   - $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
171   -
172   - $this->workflow = $result['workflow'];
173   - $this->workflowState = $result['workflowstate'];
174   -
175   - $this->oemDocumentNo = $result['oem_no'];
176   - if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
177   -
178   - if (is_null($result['name']))
179   - {
180   - $this->fullpath = '(orphaned)';
181   - }
182   - else
183   - {
184   - $this->fullpath = $result['full_path'];
185   - }
186   -
187   - $this->mimeType = $result['mimetype'];
188   - $this->mimeIconPath = $result['mime_icon_path'];
189   - $this->mimeDisplay = $result['mime_display'];
190   -
191   - $this->storagePath = $result['storage_path'];
192   - $this->status = Document::getStatusString($result['status_id']);
193   - }
194   -
195   - protected function __get($property)
196   - {
197   - switch($property)
198   - {
199   - case null: return '';
200   - case 'DocumentID': return (int) $this->document_id;
201   - case 'Relevance':
202   - case 'Rank': return (float) $this->rank;
203   - case 'Text': return (string) $this->text;
204   - case 'Title': return (string) $this->title;
205   - case 'FullPath': return (string) $this->fullpath;
206   - case 'IsLive': return (bool) $this->live;
207   - case 'Filesize': return $this->filesize;
208   - case 'Version': return (string) $this->version;
209   - case 'Filename': return (string)$this->filename;
210   - case 'FolderId': return (int)$this->folderId;
211   - case 'OemDocumentNo': return (string) $this->oemDocumentNo;
212   - case 'Document':
213   - if (is_null($this->document))
214   - {
215   - $this->document = Document::get($this->document_id);
216   - }
217   - return $this->document;
218   - case 'IsAvailable':
219   - return $this->Document->isLive();
220   - case 'CheckedOutUser':
221   - case 'CheckedOutBy':
222   - return (string) $this->checkedOutUser;
223   - case 'WorkflowOnly':
224   - case 'Workflow':
225   - return (string)$this->workflow;
226   - case 'WorkflowStateOnly':
227   - case 'WorkflowState':
228   - return (string)$this->workflowState;
229   - case 'WorkflowAndState':
230   - if (is_null($this->workflow))
231   - {
232   - return '';
233   - }
234   - return "$this->workflow - $this->workflowState";
235   - case 'MimeType':
236   - return (string) $this->mimeType;
237   - case 'MimeIconPath':
238   - return (string) $this->mimeIconPath;
239   - case 'MimeDisplay':
240   - return (string) $this->mimeDisplay;
241   - case 'DateCheckedOut':
242   - return (string) $this->dateCheckedout;
243   - case 'ModifiedBy':
244   - return (string) $this->modifiedBy;
245   - case 'DateModified':
246   - return (string) $this->dateModified;
247   - case 'CreatedBy':
248   - return (string) $this->createdBy;
249   - case 'DateCreated':
250   - return (string) $this->dateCreated;
251   - case 'Owner':
252   - case 'OwnedBy':
253   - return (string) $this->owner;
254   - case 'IsImmutable':
255   - case 'Immutable':
256   - return (bool) $this->immutable;
257   - case 'Status':
258   - return $this->status;
259   - case 'StoragePath':
260   - return $this->storagePath;
261   - case 'DocumentType':
262   - return $this->documentType;
263   - case 'Permissions':
264   - return 'not available';
265   - case 'CanBeReadByUser':
266   - if (!$this->live)
267   - return false;
268   - if (Permission::userHasDocumentReadPermission($this->Document))
269   - return true;
270   - if (Permission::adminIsInAdminMode())
271   - return true;
272   - return false;
273   - default:
274   - throw new Exception("Unknown property '$property' to get on QueryResultItem");
275   - }
276   - return ''; // Should not be reached
277   - }
278   -
279   - protected function __set($property, $value)
280   - {
281   - switch($property)
282   - {
283   - case 'Rank': $this->rank = number_format($value,2,'.',','); break;
284   - case 'Title': $this->title = $value; break;
285   - case 'Text': $this->text = $value; break;
286   - default:
287   - throw new Exception("Unknown property '$property' to set on QueryResultItem");
288   - }
289   - }
290   -}
291   -
292   -function MatchResultCompare($a, $b)
293   -{
294   - if ($a->Rank == $b->Rank) {
295   - return 0;
296   - }
297   - return ($a->Rank < $b->Rank) ? -1 : 1;
298   -}
299   -
300   -abstract class Indexer
301   -{
302   - /**
303   - * Cache of extractors
304   - *
305   - * @var array
306   - */
307   - private $extractorCache;
308   -
309   - /**
310   - * Indicates if the indexer will do logging.
311   - *
312   - * @var boolean
313   - */
314   - private $debug;
315   - /**
316   - * Cache on mime related hooks
317   - *
318   - * @var unknown_type
319   - */
320   - private $mimeHookCache;
321   - /**
322   - * Cache on general hooks.
323   - *
324   - * @var array
325   - */
326   - private $generalHookCache;
327   -
328   - /**
329   - * This is a path to the extractors.
330   - *
331   - * @var string
332   - */
333   - private $extractorPath;
334   - /**
335   - * This is a path to the hooks.
336   - *
337   - * @var string
338   - */
339   - private $hookPath;
340   -
341   - private $enabledExtractors;
342   -
343   - /**
344   - * Initialise the indexer
345   - *
346   - */
347   - protected function __construct()
348   - {
349   - $config = KTConfig::getSingleton();
350   -
351   - $this->extractorCache = array();
352   - $this->debug = $config->get('indexer/debug', true);
353   - $this->hookCache = array();
354   - $this->generalHookCache = array();
355   - $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
356   - $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
357   -
358   - $this->loadExtractorStatus();
359   - }
360   -
361   - /**
362   - * Get the list if enabled extractors
363   - *
364   - */
365   - private function loadExtractorStatus()
366   - {
367   - $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
368   - $rs = DBUtil::getResultArray($sql);
369   - $this->enabledExtractors = array();
370   - foreach($rs as $item)
371   - {
372   - $this->enabledExtractors[] = $item['name'];
373   - }
374   - }
375   -
376   - private function isExtractorEnabled($extractor)
377   - {
378   - return in_array($extractor, $this->enabledExtractors);
379   - }
380   -
381   - /**
382   - * Returns a reference to the main class
383   - *
384   - * @return Indexer
385   - */
386   - public static function get()
387   - {
388   - static $singleton = null;
389   -
390   - if (is_null($singleton))
391   - {
392   - $config = KTConfig::getSingleton();
393   - $classname = $config->get('indexer/coreClass');
394   -
395   - require_once('indexing/indexers/' . $classname . '.inc.php');
396   -
397   - if (!class_exists($classname))
398   - {
399   - throw new Exception("Class '$classname' does not exist.");
400   - }
401   -
402   - $singleton = new $classname;
403   - }
404   -
405   - return $singleton;
406   - }
407   -
408   - public abstract function deleteDocument($docid);
409   -
410   - /**
411   - * Remove the association of all extractors to mime types on the database.
412   - *
413   - */
414   - public function clearExtractors()
415   - {
416   - global $default;
417   -
418   - $sql = "update mime_types set extractor_id=null";
419   - DBUtil::runQuery($sql);
420   -
421   - $sql = "delete from mime_extractors";
422   - DBUtil::runQuery($sql);
423   -
424   - if ($this->debug) $default->log->debug('clearExtractors');
425   - }
426   -
427   - /**
428   - * lookup the name of the extractor class based on the mime type.
429   - *
430   - * @param string $type
431   - * @return string
432   - */
433   - public static function resolveExtractor($type)
434   - {
435   - global $default;
436   - $sql = "select extractor from mime_types where filetypes='$type'";
437   - $class = DBUtil::getOneResultKey($sql,'extractor');
438   - if (PEAR::isError($class))
439   - {
440   - $default->log->error("resolveExtractor: cannot resolve $type");
441   - return $class;
442   - }
443   - if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
444   - return $class;
445   - }
446   -
447   - /**
448   - * Return all the discussion text.
449   - *
450   - * @param int $docid
451   - * @return string
452   - */
453   - public static function getDiscussionText($docid)
454   - {
455   - $sql = "SELECT
456   - dc.subject, dc.body
457   - FROM
458   - discussion_threads dt
459   - INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
460   - WHERE
461   - dt.document_id=$docid";
462   - $result = DBUtil::getResultArray($sql);
463   - $text = '';
464   -
465   - foreach($result as $record)
466   - {
467   - $text .= $record['subject'] . "\n" . $record['body'] . "\n";
468   - }
469   -
470   - return $text;
471   - }
472   -
473   - /**
474   - * Schedule the indexing of a document.
475   - *
476   - * @param string $document
477   - * @param string $what
478   - */
479   - public static function index($document, $what='A')
480   - {
481   - global $default;
482   -
483   - if (is_numeric($document))
484   - {
485   - $document = Document::get($document+0);
486   - }
487   -
488   - if (PEAR::isError($document))
489   - {
490   - $default->log->error("index: Could not index document: " .$document->getMessage());
491   - return;
492   - }
493   -
494   - $document_id = $document->getId();
495   - $userid=$_SESSION['userID'];
496   - if (empty($userid)) $userid=1;
497   -
498   - // we dequeue the document so that there are no issues when enqueuing
499   - Indexer::unqueueDocument($document_id);
500   -
501   - // enqueue item
502   - $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
503   - DBUtil::runQuery($sql);
504   -
505   - $default->log->debug("index: Queuing indexing of $document_id");
506   -
507   - }
508   -
509   - private static function incrementCount()
510   - {
511   - // Get count from system settings
512   - $count = Indexer::getIndexedDocumentCount();
513   - $count = (int)$count + 1;
514   - Indexer::updateIndexedDocumentCount($count);
515   - }
516   -
517   - public static function getIndexedDocumentCount()
518   - {
519   - $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
520   - return (int) $count;
521   - }
522   -
523   - public static function updateIndexedDocumentCount($cnt = 0)
524   - {
525   - KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
526   - }
527   -
528   - public static function reindexQueue()
529   - {
530   - $sql = "UPDATE index_files SET processdate = null";
531   - DBUtil::runQuery($sql);
532   - }
533   -
534   - public static function reindexDocument($documentId)
535   - {
536   - $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
537   - DBUtil::runQuery($sql);
538   - }
539   -
540   -
541   -
542   - public static function indexAll()
543   - {
544   - $userid=$_SESSION['userID'];
545   - if (empty($userid)) $userid=1;
546   -
547   - $sql = "DELETE FROM index_files";
548   - DBUtil::runQuery($sql);
549   -
550   - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
551   - DBUtil::runQuery($sql);
552   - }
553   -
554   - /**
555   - * Clearout the scheduling of documents that no longer exist.
556   - *
557   - */
558   - public static function clearoutDeleted()
559   - {
560   - global $default;
561   -
562   - $sql = 'DELETE FROM
563   - index_files
564   - WHERE
565   - document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
566   - NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
567   - DBUtil::runQuery($sql);
568   -
569   - $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
570   - }
571   -
572   -
573   - /**
574   - * Check if a document is scheduled to be indexed
575   - *
576   - * @param mixed $document This may be a document or document id
577   - * @return boolean
578   - */
579   - public static function isDocumentScheduled($document)
580   - {
581   - if (is_numeric($document))
582   - {
583   - $docid = $document;
584   - }
585   - else if ($document instanceof Document)
586   - {
587   - $docid = $document->getId();
588   - }
589   - else
590   - {
591   - return false;
592   - }
593   - $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
594   - $result = DBUtil::getResultArray($sql);
595   - return count($result) > 0;
596   - }
597   -
598   - /**
599   - * Filters text removing redundant characters such as continuous newlines and spaces.
600   - *
601   - * @param string $filename
602   - */
603   - private function filterText($filename)
604   - {
605   - $content = file_get_contents($filename);
606   -
607   - $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
608   - $tgt = array("\n","\n",' ',' ',' ');
609   -
610   - // shrink what is being stored.
611   - do
612   - {
613   - $orig = $content;
614   - $content = preg_replace($src, $tgt, $content);
615   - } while ($content != $orig);
616   -
617   - return file_put_contents($filename, $content) !== false;
618   - }
619   -
620   - /**
621   - * Load hooks for text extraction process.
622   - *
623   - */
624   - private function loadExtractorHooks()
625   - {
626   - $this->generalHookCache = array();
627   - $this->mimeHookCache = array();
628   -
629   -
630   - $dir = opendir(SearchHelper::correctPath($this->hookPath));
631   - while (($file = readdir($dir)) !== false)
632   - {
633   - if (substr($file,-12) == 'Hook.inc.php')
634   - {
635   - require_once($this->hookPath . '/' . $file);
636   - $class = substr($file, 0, -8);
637   -
638   - if (!class_exists($class))
639   - {
640   - continue;
641   - }
642   -
643   - $hook = new $class;
644   - if (!($class instanceof ExtractorHook))
645   - {
646   - continue;
647   - }
648   -
649   - $mimeTypes = $hook->registerMimeTypes();
650   - if (is_null($mimeTypes))
651   - {
652   - $this->generalHookCache[] = & $hook;
653   - }
654   - else
655   - {
656   - foreach($mimeTypes as $type)
657   - {
658   - $this->mimeHookCache[$type][] = & $hook;
659   - }
660   - }
661   -
662   - }
663   - }
664   - closedir($dir);
665   - }
666   -
667   - /**
668   - * This is a refactored function to execute the hooks.
669   - *
670   - * @param DocumentExtractor $extractor
671   - * @param string $phase
672   - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
673   - */
674   - private function executeHook($extractor, $phase, $mimeType = null)
675   - {
676   - $hooks = array();
677   - if (is_null($mimeType))
678   - {
679   - $hooks = $this->generalHookCache;
680   - }
681   - else
682   - {
683   - if (array_key_exists($mimeType, $this->mimeHookCache))
684   - {
685   - $hooks = $this->mimeHookCache[$mimeType];
686   - }
687   - }
688   - if (empty($hooks))
689   - {
690   - return;
691   - }
692   -
693   - foreach($hooks as $hook)
694   - {
695   - $hook->$phase($extractor);
696   - }
697   - }
698   -
699   - private function doesDiagnosticsPass($simple=false)
700   - {
701   - global $default;
702   -
703   - $config =& KTConfig::getSingleton();
704   - // create a index log lock file in case there are errors, and we don't need to log them forever!
705   - // this function will create the lockfile if an error is detected. It will be removed as soon
706   - // as the problems with the indexer are removed.
707   - $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
708   -
709   - $diagnosis = $this->diagnose();
710   - if (!is_null($diagnosis))
711   - {
712   - if (!is_file($lockFile))
713   - {
714   - $default->log->error(_kt('Indexer problem: ') . $diagnosis);
715   - }
716   - touch($lockFile);
717   - return false;
718   - }
719   -
720   - if ($simple)
721   - {
722   - return true;
723   - }
724   -
725   - $diagnosis = $this->diagnoseExtractors();
726   - if (!empty($diagnosis))
727   - {
728   - if (!is_file($lockFile))
729   - {
730   - foreach($diagnosis as $diag)
731   - {
732   - $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
733   - }
734   - }
735   - touch($lockFile);
736   - return false;
737   - }
738   -
739   - if (is_file($lockFile))
740   - {
741   - $default->log->info(_kt('Issues with the indexer have been resolved!'));
742   - unlink($lockFile);
743   - }
744   -
745   - return true;
746   - }
747   -
748   - /**
749   - * This does the initial mime type association between mime types and text extractors
750   - *
751   - */
752   - public function checkForRegisteredTypes()
753   - {
754   - global $default;
755   -
756   - // we are only doing this once!
757   - $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
758   - if ($initRegistered)
759   - {
760   - return;
761   - }
762   - if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
763   -
764   - $date = date('Y-m-d H:i');
765   - $sql = "UPDATE scheduler_tasks SET run_time='$date'";
766   - DBUtil::runQuery($sql);
767   -
768   - $this->registerTypes(true);
769   -
770   - $disable = array(
771   - OS_WINDOWS=>array('PSExtractor'),
772   - OS_UNIX => array()
773   - );
774   -
775   - $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];
776   -
777   - foreach($disableForOS as $extractor)
778   - {
779   - $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
780   - DBUtil::runQuery($sql);
781   - $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
782   - }
783   -
784   - if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
785   - KTUtil::setSystemSetting('mimeTypesRegistered', true);
786   - }
787   -
788   - private function updatePendingDocumentStatus($documentId, $message, $level)
789   - {
790   - $this->indexingHistory .= "\n" . $level . ': ' . $message;
791   - $message = sanitizeForSQL($this->indexingHistory);
792   - $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
793   - DBUtil::runQuery($sql);
794   - }
795   -
796   - /**
797   - *
798   - * @param int $documentId
799   - * @param string $message
800   - * @param string $level This may be info, error, debug
801   - */
802   - private function logPendingDocumentInfoStatus($documentId, $message, $level)
803   - {
804   - $this->updatePendingDocumentStatus($documentId, $message, $level);
805   - global $default;
806   -
807   - switch ($level)
808   - {
809   - case 'debug':
810   - if ($this->debug)
811   - {
812   - $default->log->debug($message);
813   - }
814   - break;
815   - default:
816   - $default->log->$level($message);
817   - }
818   - }
819   -
820   -
821   -
822   - public function getExtractor($extractorClass)
823   - {
824   - if (empty($extractorClass))
825   - {
826   - return null;
827   - }
828   -
829   - $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
830   - if (!file_exists($includeFile))
831   - {
832   - throw new Exception("Extractor file does not exist: $includeFile");
833   - }
834   -
835   - require_once($includeFile);
836   -
837   - if (!class_exists($extractorClass))
838   - {
839   - throw new Exception("Extractor '$classname' not defined in file: $includeFile");
840   - }
841   -
842   - $extractor = new $extractorClass();
843   -
844   - if (!($extractor instanceof DocumentExtractor))
845   - {
846   - throw new Exception("Class $classname was expected to be of type DocumentExtractor");
847   - }
848   -
849   - return $extractor;
850   - }
851   -
/**
 * Fetch queued documents (status_id=1) from index_files, oldest indexdate first.
 *
 * @param boolean $problemItemsOnly When true, only entries that carry a status
 *                                  message are returned; when false, only entries
 *                                  without one.
 * @return mixed Whatever DBUtil::getResultArray() yields for the query
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // the two variants of the query differ only in how status_msg is filtered
    if ($problemItemsOnly)
    {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    }
    else
    {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $sql = "SELECT
            iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            $statusFilter AND d.status_id=1
        ORDER BY indexdate ";

    return DBUtil::getResultArray($sql);
}
889   -
/**
 * Convenience wrapper: the indexing queue restricted to entries without a status message.
 */
public static function getPendingIndexingQueue()
{
    $problemItemsOnly = false;

    return Indexer::getIndexingQueue($problemItemsOnly);
}
894   -
/**
 * The main function that may be called repeatedly to index documents.
 *
 * Selects a batch of queued documents, stamps them with a processdate so a
 * follow-up run does not pick them up again, and then runs extraction and
 * indexing for each one. Documents that index successfully are removed from
 * the queue; the others stay queued with a status message.
 *
 * @param int $max Maximum number of documents to process. When null, the
 *                 'indexer/batchDocuments' setting is used (default 20).
 */
public function indexDocuments($max=null)
{
    global $default;
    $config =& KTConfig::getSingleton();

    /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
    if (is_file($indexLockFile))
    {
        $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
        $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
        return;
    }
    touch($indexLockFile);*/


    $this->checkForRegisteredTypes();

    if ($this->debug) $default->log->debug('indexDocuments: start');
    if (!$this->doesDiagnosticsPass())
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
        return;
    }

    if (is_null($max))
    {
        $max = $config->get('indexer/batchDocuments',20);
    }
    // the value is interpolated into the LIMIT clause below, so force it to an integer
    $max = (int) $max;

    $this->loadExtractorHooks();

    Indexer::clearoutDeleted();

    $date = date('Y-m-d H:i:s');
    // identify the indexers that must run
    // mysql specific limit!
    $sql = "SELECT
            iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
        FROM
            index_files iff
            INNER JOIN documents d ON iff.document_id=d.id
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
        WHERE
            (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1
        ORDER BY indexdate
        LIMIT $max";
    $result = DBUtil::getResultArray($sql);
    if (PEAR::isError($result))
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
        return;
    }
    KTUtil::setSystemSetting('luceneIndexingDate', time());

    // bail if no work to do
    if (count($result) == 0)
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
        return;
    }

    // identify any documents that need indexing and mark them
    // so they are not taken in a followup run
    $ids = array();
    foreach($result as $docinfo)
    {
        $ids[] = $docinfo['document_id'];
    }

    // mark the documents as being processed

    $ids=implode(',',$ids);
    $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
    DBUtil::runQuery($sql);

    $extractorCache = array();
    $storageManager = KTStorageManagerUtil::getSingleton();

    $tempPath = $config->get("urls/tmpDirectory");

    foreach($result as $docinfo)
    {
        // increment indexed documents count
        Indexer::incrementCount();

        $docId=$docinfo['document_id'];
        $extension=$docinfo['filetypes'];
        $mimeType=$docinfo['mimetypes'];
        $extractorClass=$docinfo['extractor'];
        // 'what' selects the work to do: A = content and discussion, C = content only, D = discussion only
        $indexDocument = in_array($docinfo['what'], array('A','C'));
        $indexDiscussion = in_array($docinfo['what'], array('A','D'));
        $this->indexingHistory = '';

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');

        if (empty($extractorClass))
        {
            /*

            if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.

            */
            if ($indexDiscussion)
            {
                $indexDocument = false;
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
            }
            else
            {
                Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
                continue;
            }
        }
        else
        {
            /*

            If an extractor is available, we must ensure it is enabled.

            */

            if (!$this->isExtractorEnabled($extractorClass))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
                continue;
            }
        }

        if ($this->debug)
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
        }

        $document = Document::get($docId);
        if (PEAR::isError($document))
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
            continue;
        }

        $filename = $document->getFileName();
        if (substr($filename,0,1) == '~')
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
            continue;
        }

        $removeFromQueue = true;
        if ($indexDocument)
        {
            if (array_key_exists($extractorClass, $extractorCache))
            {
                $extractor = $extractorCache[$extractorClass];
            }
            else
            {
                try
                {
                    $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
                }
                catch (Exception $e)
                {
                    // one extractor that cannot be loaded must not abort the rest of the
                    // batch, which has already been stamped with a processdate
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' could not be loaded: %s"),$extractorClass, $e->getMessage()), 'error');
                    continue;
                }
            }

            if (!($extractor instanceof DocumentExtractor))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
                continue;
            }



            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
            $sourceFile = $storageManager->temporaryFile($document);

            if (empty($sourceFile) || !is_file($sourceFile))
            {
                Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
                continue;
            }

            if ($extractor->needsIntermediateSourceFile())
            {
                $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);

                $intermediate = $tempPath . '/'. $docId . '.' . $extension;
                // kept in its own variable so the batch array in $result is not overwritten
                $copied = @copy($sourceFile, $intermediate);
                if ($copied === false)
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
                    // problem. lets try again later. probably permission related. log the issue.
                    continue;
                }
                $sourceFile = $intermediate;
            }

            $targetFile = tempnam($tempPath, 'ktindexer');

            $extractor->setSourceFile($sourceFile);
            $extractor->setMimeType($mimeType);
            $extractor->setExtension($extension);
            $extractor->setTargetFile($targetFile);
            $extractor->setDocument($document);
            $extractor->setIndexingStatus(null);
            $extractor->setExtractionStatus(null);

            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');

            $this->executeHook($extractor, 'pre_extract');
            $this->executeHook($extractor, 'pre_extract', $mimeType);
            // from here on the document only leaves the queue if indexing reports success
            $removeFromQueue = false;

            if ($extractor->extractTextContent())
            {
                // the extractor may need to create another target file
                $targetFile = $extractor->getTargetFile();

                $extractor->setExtractionStatus(true);
                $this->executeHook($extractor, 'pre_index');
                $this->executeHook($extractor, 'pre_index', $mimeType);

                $title = $document->getName();
                if ($indexDiscussion)
                {
                    if (!$this->filterText($targetFile))
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                    }
                    else
                    {
                        $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                        $removeFromQueue = $indexStatus;
                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
                        }

                        $extractor->setIndexingStatus($indexStatus);
                    }
                }
                else
                {
                    if (!$this->filterText($targetFile))
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                    }
                    else
                    {
                        $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                        $removeFromQueue = $indexStatus;

                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
                            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                        }

                        $extractor->setIndexingStatus($indexStatus);
                    }
                }

                $this->executeHook($extractor, 'post_index', $mimeType);
                $this->executeHook($extractor, 'post_index');
            }
            else
            {
                $extractor->setExtractionStatus(false);
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
                $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
            }

            $this->executeHook($extractor, 'post_extract', $mimeType);
            $this->executeHook($extractor, 'post_extract');

            if ($extractor->needsIntermediateSourceFile())
            {
                // $sourceFile is the intermediate copy in this case
                @unlink($sourceFile);
            }

            @unlink($targetFile);

        }
        else
        {
            $indexStatus = $this->indexDiscussion($docId);
            $removeFromQueue = $indexStatus;
        }

        if ($removeFromQueue)
        {
            Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
        }
        else
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
        }
    }
    if ($this->debug) $default->log->debug('indexDocuments: done');
    //unlink($indexLockFile);
}
1201   -
/**
 * Move text stored in the document_text table into the index.
 *
 * Runs up to five passes, each reading a slice of document_text, writing the
 * text to a temporary file and indexing it. Rows are deleted once indexed (or
 * when their document can no longer be resolved). When a pass finds no rows
 * left, the migration is flagged complete and the 'Index Migration' scheduler
 * entry is removed. A lock file in the cache directory prevents overlapping runs.
 *
 * @param int $max Maximum number of documents per call. When null, the
 *                 'indexer/batchMigrateDocument' setting is used (default 500).
 */
public function migrateDocuments($max=null)
{
    global $default;

    $default->log->info(_kt('migrateDocuments: starting'));

    if (!$this->doesDiagnosticsPass(true))
    {
        $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
        return;
    }

    if (KTUtil::getSystemSetting('migrationComplete') == 'true')
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
        return;
    }
    touch($lockFile);

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    // directory for the temporary text files (previously used without ever being defined)
    $tempPath = $config->get("urls/tmpDirectory");

    $maxLoops = 5;

    // split the batch over the passes; cast because the value goes into a LIMIT clause
    $max = (int) ceil($max / $maxLoops);

    $start =KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for($loop=0;$loop<$maxLoops;$loop++)
    {

        $sql = "SELECT
                document_id, document_text
            FROM
                document_text
            ORDER BY document_id
            LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->info(_kt('migrateDocuments: db error'));
            break;
        }

        $docs = count($result);
        if ($docs == 0)
        {
            $noDocs = true;
            break;
        }
        $numDocs += $docs;

        foreach($result as $docinfo)
        {
            $docId = $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if ($targetFile === false || file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                if ($targetFile !== false)
                {
                    // tempnam() already created the file; do not leave it behind
                    @unlink($targetFile);
                }
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', 'true');
        schedulerUtil::deleteByName('Index Migration');
        $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
    }
}
1331   -
/**
 * Index a document. Concrete indexer subclasses must implement this function.
 *
 * Callers in this class treat the return value as a success flag: a falsy
 * result keeps the document in the indexing queue.
 *
 * @param int $docId
 * @param string $textFile Path of the file that holds the text to index
 * @param string $title
 * @param string $version Passed as "major.minor" by indexDocuments()
 */
protected abstract function indexDocument($docId, $textFile, $title, $version);
1339   -
1340   -
/**
 * Index the supplied text for a document.
 *
 * The text is written to a temporary file in the configured tmp directory,
 * handed to indexDocument(), and the temporary file is removed afterwards.
 *
 * @param int $docId
 * @param string $text
 * @return mixed The result of indexDocument(), or false when the document
 *               cannot be resolved or the temporary file cannot be written
 */
public function updateDocumentIndex($docId, $text)
{
    // resolve the document first so nothing is written for an unknown id
    $document = Document::get($docId);
    if (PEAR::isError($document) || is_null($document))
    {
        return false;
    }

    $config = KTConfig::getSingleton();
    $tempPath = $config->get("urls/tmpDirectory");
    $tempFile = tempnam($tempPath,'ud_');
    if ($tempFile === false)
    {
        return false;
    }

    if (file_put_contents($tempFile, $text) === false)
    {
        @unlink($tempFile);
        return false;
    }

    // NOTE(review): other callers pass getName() and "major.minor" here - confirm
    // that getDescription()/getVersion() are the intended title and version.
    $title = $document->getDescription();
    $version = $document->getVersion();

    $result = $this->indexDocument($docId, $tempFile, $title, $version);

    if (file_exists($tempFile))
    {
        unlink($tempFile);
    }

    return $result;
}
1362   -
/**
 * Index a discussion. Concrete indexer subclasses must implement this function.
 *
 * indexDocuments() uses the return value as a success flag when deciding
 * whether the document may be removed from the queue.
 *
 * @param int $docId
 */
protected abstract function indexDiscussion($docId);
1369   -
/**
 * Diagnose the indexer, e.g. check that the indexing server is running.
 * Concrete indexer subclasses must implement this function.
 *
 */
public abstract function diagnose();
1375   -
/**
 * Run the diagnostics of every extractor and every hook.
 *
 * @return array Extractor diagnoses merged with hook diagnoses, keyed by class name
 */
public function diagnoseExtractors()
{
    $extractorProblems = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
    $hookProblems = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');

    return array_merge($extractorProblems, $hookProblems);
}
1388   -
/**
 * Scan a directory for classes of a given type and collect their diagnoses.
 *
 * Every file ending in $extension is included; the class name is the file
 * name without its '.inc.php' suffix. Classes that are missing, disabled,
 * of the wrong type, or that support no mime types are skipped.
 *
 * @param string $path Directory to scan
 * @param string $baseclass Class or interface each candidate must be an instance of
 * @param string $extension File name suffix a candidate file must have
 * @return array Map of class name => array('name' => display name, 'diagnosis' => diagnosis),
 *               containing only the classes that reported a non-empty diagnosis
 */
private function _diagnose($path, $baseclass, $extension)
{
    global $default;

    $diagnoses = array();

    $dir = opendir(SearchHelper::correctPath($path));
    if ($dir === false)
    {
        // readdir() on a failed handle would never yield a usable entry
        $default->log->error(sprintf(_kt("diagnose: cannot open directory '%s'."), $path));
        return $diagnoses;
    }
    $extlen = - strlen($extension);

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,0,1) == '.')
        {
            continue;
        }
        if (substr($file,$extlen) != $extension)
        {
            $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
            continue;
        }

        require_once($path . '/' . $file);

        // strip the 8 character '.inc.php' suffix to get the class name
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
            continue;
        }

        if (!$this->isExtractorEnabled($class))
        {
            $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
            continue;
        }

        $extractor = new $class();
        if (!($extractor instanceof $baseclass))
        {
            // name the type that was actually required (this method is also used for hooks)
            $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type %s"), $class, $baseclass));
            continue;
        }

        $types = $extractor->getSupportedMimeTypes();
        if (empty($types))
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
            continue;
        }

        $diagnosis=$extractor->diagnose();
        if (empty($diagnosis))
        {
            continue;
        }
        $diagnoses[$class] = array(
            'name'=>$extractor->getDisplayName(),
            'diagnosis'=>$diagnosis
        );

    }
    closedir($dir);

    return $diagnoses;
}
1462   -
1463   -
/**
 * Register the extractor types.
 *
 * Includes every '*Extractor.inc.php' file in the extractor directory,
 * instantiates the class named after the file and lets each
 * DocumentExtractor register its mime types.
 *
 * @param boolean $clear Optional. Defaults to false. When true, the
 *                       registered extractors are cleared first.
 */
public function registerTypes($clear=false)
{
    global $default;

    if ($clear)
    {
        $this->clearExtractors();
    }
    $dir = opendir(SearchHelper::correctPath($this->extractorPath));
    if ($dir === false)
    {
        // nothing can be registered without the directory; report instead of reading a bad handle
        $default->log->error(sprintf(_kt("registerTypes: cannot open directory '%s'."), $this->extractorPath));
        return;
    }
    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,-17) == 'Extractor.inc.php')
        {
            require_once($this->extractorPath . '/' . $file);
            // strip the 8 character '.inc.php' suffix to get the class name
            $class = substr($file, 0, -8);

            if (!class_exists($class))
            {
                // if the class does not exist, we can't do anything.
                continue;
            }

            $extractor = new $class;
            if ($extractor instanceof DocumentExtractor)
            {
                $extractor->registerMimeTypes();
            }
        }
    }
    closedir($dir);
}
1498   -
/**
 * Index a document together with its discussion.
 *
 * This exists as a possible optimisation point and may be overridden.
 * Both steps are always attempted.
 *
 * @param int $docId
 * @param string $textFile
 * @param string $title
 * @param string $version
 * @return boolean true only when both the document and the discussion were
 *                 indexed; callers use this to decide whether the document
 *                 may leave the queue
 */
protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
{
    $documentIndexed = $this->indexDocument($docId, $textFile, $title, $version);
    $discussionIndexed = $this->indexDiscussion($docId);

    // previously nothing was returned, so callers always saw a failure
    return $documentIndexed && $discussionIndexed;
}
1510   -
/**
 * Delete a document's entry from the index_files queue, optionally logging why.
 * This is normally called once the document has been processed.
 *
 * @param int $docid
 * @param string|false $reason Text appended to the log line; false suppresses logging
 * @param string $level Name of the logger method used for the log line
 */
public static function unqueueDocument($docid, $reason=false, $level='debug')
{
    global $default;

    DBUtil::runQuery("DELETE FROM index_files WHERE document_id=$docid");

    if ($reason === false)
    {
        return;
    }

    $default->log->$level("Indexer: removing document $docid from the queue - $reason");
}
1526   -
/**
 * Run a query on the index. Concrete indexer subclasses must implement this function.
 *
 * @param string $query
 * @return array
 */
public abstract function query($query);
1534   -
/**
 * Encode an integer as a fixed-width string that sorts like the number and
 * can be decoded again: the value is left-padded with zeros to 14 characters
 * and every digit 0-9 is replaced by the letter a-j.
 *
 * @param int $int
 * @return string
 */
public static function longToString($int)
{
    $padded = str_pad("$int", 14, '0', STR_PAD_LEFT);

    return strtr($padded, '0123456789', 'abcdefghij');
}
1551   -
/**
 * Decode a string of letters a-j back into a number by mapping each letter
 * to the digit 0-9 and converting the result numerically.
 *
 * @param string $str
 * @return int
 */
public static function stringToLong($str)
{
    $digits = strtr($str, 'abcdefghij', '0123456789');

    return $digits + 0;
}
1567   -
/**
 * Hook for optimising the index. Overriding implementations must call the
 * parent, which records the time of the optimisation run.
 *
 */
public function optimise()
{
    $optimisedAt = time();

    KTUtil::setSystemSetting('luceneOptimisationDate', $optimisedAt);
}
1577   -
/**
 * Shuts down the indexer. The default implementation has nothing to release.
 *
 */
public function shutdown()
{
    // do nothing generally
}
1586   -
/**
 * Returns the name of the indexer. Concrete indexer subclasses must implement this function.
 *
 * @return string
 */
public abstract function getDisplayName();
1593   -
1594   -
/**
 * Returns the number of non-deleted documents in the index.
 * Concrete indexer subclasses must implement this function.
 *
 * @return int
 */
public abstract function getDocumentsInIndex();
1601   -
/**
 * Returns the index directory as configured under 'indexer/luceneDirectory'.
 *
 * @return string
 */
public function getIndexDirectory()
{
    return KTConfig::getSingleton()->get('indexer/luceneDirectory');
}
1613   -}
1614   -
1615   -?>
  1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
  40 +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
  41 +require_once('indexing/extractorCore.inc.php');
  42 +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +
  44 +
/**
 * Thrown when the indexer returns a document for which the database has no
 * matching record.
 */
class IndexerInconsistencyException extends Exception
{
}
  46 +
  47 +class QueryResultItem
  48 +{
  49 + protected $document_id;
  50 + protected $title;
  51 + protected $rank;
  52 + protected $text;
  53 + protected $filesize;
  54 + protected $fullpath;
  55 + protected $live;
  56 + protected $version;
  57 + protected $mimeType;
  58 + protected $filename;
  59 + protected $thumbnail; // TODO: if not null, gui can display a thumbnail
  60 + protected $viewer; // TODO: if not null, a viewer can be used to view the document
  61 + protected $document;
  62 + protected $checkedOutUser;
  63 + protected $dateCheckedout;
  64 + protected $workflowState;
  65 + protected $workflow;
  66 + protected $modifiedBy;
  67 + protected $dateModified;
  68 + protected $createdBy;
  69 + protected $dateCreated;
  70 + protected $owner;
  71 + protected $immutable;
  72 + protected $deleted;
  73 + protected $status;
  74 + protected $folderId;
  75 + protected $storagePath;
  76 + protected $documentType;
  77 + protected $mimeIconPath;
  78 + protected $mimeDisplay;
  79 + protected $oemDocumentNo;
  80 +
/**
 * Create a result item and immediately load the document's details from the database.
 *
 * @param int $document_id
 * @param float $rank
 * @param string $title Replaced by the title stored in the database once the details are loaded
 * @param string $text
 */
public function __construct($document_id, $rank=null, $title=null, $text=null)
{
    $this->live = true;
    $this->document_id = (int) $document_id;
    $this->rank = $rank;
    $this->title = $title;
    $this->text = $text;

    $this->loadDocumentInfo();
}
  90 +
/**
 * Support isset() on the virtual properties DocumentID, Rank, Text and Title.
 *
 * Declared public because PHP requires magic methods to have public visibility.
 *
 * @param string $property
 * @return boolean
 * @throws Exception when the property name is not one of the supported names
 */
public function __isset($property)
{
    switch($property)
    {
        case 'DocumentID': return isset($this->document_id);
        case 'Rank': return isset($this->rank);
        case 'Text': return isset($this->text);
        case 'Title': return isset($this->title);
        case null: break;
        default:
            throw new Exception("Unknown property '$property' to check on QueryResultItem");
    }
    return true; // only reached for a null/empty property name
}
  105 +
/**
 * Populate this item from the repository record of its document.
 *
 * @throws Exception when the database lookup returns an error
 * @throws IndexerInconsistencyException when no record exists for the document id
 */
public function loadDocumentInfo()
{
    global $default;

    $documentId = $this->document_id;
    $sql = "SELECT
            d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
            dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
            mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
            cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
            mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
        FROM
            documents d
            INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
            INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
            INNER JOIN mime_types mt ON dcv.mime_id=mt.id
            LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
            LEFT JOIN folders f ON f.id=d.folder_id
            LEFT JOIN users cou ON d.checked_out_user_id=cou.id
            LEFT JOIN workflows w ON dmv.workflow_id=w.id
            LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
            LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
            LEFT JOIN users mbu ON d.modified_user_id=mbu.id
            LEFT JOIN users cbu ON d.creator_id=cbu.id
            LEFT JOIN users ou ON d.owner_id=ou.id
        WHERE
            d.id=$documentId";

    $row = DBUtil::getOneResult($sql);

    // a database error is reported as a plain exception
    if (PEAR::isError($row))
    {
        $this->live = false;
        throw new Exception('Database exception! There appears to be an error in the system: ' .$row->getMessage());
    }

    // the index returned a document that the repository does not have
    if (empty($row))
    {
        $this->live = false;

        $default->log->error('QueryResultItem: $result is null');
        $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
        $default->log->error('QueryResultItem: ' . $msg);
        // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
        throw new IndexerInconsistencyException(_kt($msg));
    }

    // document_id, relevance, text, title

    $this->documentType = $row['document_type'];
    $this->filename = $row['filename'];
    $this->filesize = KTUtil::filesizeToString($row['filesize']);
    $this->folderId = $row['folder_id'];
    $this->title = $row['title'];

    $this->createdBy = $row['createdbyuser'];
    $this->dateCreated = $row['created'];

    $this->modifiedBy = $row['modifiedbyuser'];
    $this->dateModified = $row['modified'];

    $this->checkedOutUser = $row['checkoutuser'];
    $this->dateCheckedout = $row['checkedout'];

    $this->owner = $row['owneruser'];

    $this->version = $row['major_version'] . '.' . $row['minor_version'];

    // a non-zero immutable flag is shown as a translated label
    if ($row['immutable'] + 0)
    {
        $this->immutable = _kt('Immutable');
    }
    else
    {
        $this->immutable = '';
    }

    $this->workflow = $row['workflow'];
    $this->workflowState = $row['workflowstate'];

    $oemNo = $row['oem_no'];
    $this->oemDocumentNo = empty($oemNo) ? 'n/a' : $oemNo;

    // no folder name means the folder join found nothing for this document
    $this->fullpath = is_null($row['name']) ? '(orphaned)' : $row['full_path'];

    $this->mimeType = $row['mimetype'];
    $this->mimeIconPath = $row['mime_icon_path'];
    $this->mimeDisplay = $row['mime_display'];

    $this->storagePath = $row['storage_path'];
    $this->status = Document::getStatusString($row['status_id']);
}
  194 +
  195 + protected function __get($property)
  196 + {
  197 + switch($property)
  198 + {
  199 + case null: return '';
  200 + case 'DocumentID': return (int) $this->document_id;
  201 + case 'Relevance':
  202 + case 'Rank': return (float) $this->rank;
  203 + case 'Text': return (string) $this->text;
  204 + case 'Title': return (string) $this->title;
  205 + case 'FullPath': return (string) $this->fullpath;
  206 + case 'IsLive': return (bool) $this->live;
  207 + case 'Filesize': return $this->filesize;
  208 + case 'Version': return (string) $this->version;
  209 + case 'Filename': return (string)$this->filename;
  210 + case 'FolderId': return (int)$this->folderId;
  211 + case 'OemDocumentNo': return (string) $this->oemDocumentNo;
  212 + case 'Document':
  213 + if (is_null($this->document))
  214 + {
  215 + $this->document = Document::get($this->document_id);
  216 + }
  217 + return $this->document;
  218 + case 'IsAvailable':
  219 + return $this->Document->isLive();
  220 + case 'CheckedOutUser':
  221 + case 'CheckedOutBy':
  222 + return (string) $this->checkedOutUser;
  223 + case 'WorkflowOnly':
  224 + case 'Workflow':
  225 + return (string)$this->workflow;
  226 + case 'WorkflowStateOnly':
  227 + case 'WorkflowState':
  228 + return (string)$this->workflowState;
  229 + case 'WorkflowAndState':
  230 + if (is_null($this->workflow))
  231 + {
  232 + return '';
  233 + }
  234 + return "$this->workflow - $this->workflowState";
  235 + case 'MimeType':
  236 + return (string) $this->mimeType;
  237 + case 'MimeIconPath':
  238 + return (string) $this->mimeIconPath;
  239 + case 'MimeDisplay':
  240 + return (string) $this->mimeDisplay;
  241 + case 'DateCheckedOut':
  242 + return (string) $this->dateCheckedout;
  243 + case 'ModifiedBy':
  244 + return (string) $this->modifiedBy;
  245 + case 'DateModified':
  246 + return (string) $this->dateModified;
  247 + case 'CreatedBy':
  248 + return (string) $this->createdBy;
  249 + case 'DateCreated':
  250 + return (string) $this->dateCreated;
  251 + case 'Owner':
  252 + case 'OwnedBy':
  253 + return (string) $this->owner;
  254 + case 'IsImmutable':
  255 + case 'Immutable':
  256 + return (bool) $this->immutable;
  257 + case 'Status':
  258 + return $this->status;
  259 + case 'StoragePath':
  260 + return $this->storagePath;
  261 + case 'DocumentType':
  262 + return $this->documentType;
  263 + case 'Permissions':
  264 + return 'not available';
  265 + case 'CanBeReadByUser':
  266 + if (!$this->live)
  267 + return false;
  268 + if (Permission::userHasDocumentReadPermission($this->Document))
  269 + return true;
  270 + if (Permission::adminIsInAdminMode())
  271 + return true;
  272 + return false;
  273 + default:
  274 + throw new Exception("Unknown property '$property' to get on QueryResultItem");
  275 + }
  276 + return ''; // Should not be reached
  277 + }
  278 +
  279 + protected function __set($property, $value)
  280 + {
  281 + switch($property)
  282 + {
  283 + case 'Rank': $this->rank = number_format($value,2,'.',','); break;
  284 + case 'Title': $this->title = $value; break;
  285 + case 'Text': $this->text = $value; break;
  286 + default:
  287 + throw new Exception("Unknown property '$property' to set on QueryResultItem");
  288 + }
  289 + }
  290 +}
  291 +
/**
 * Comparison callback ordering two result items by ascending Rank.
 *
 * @param object $a Item exposing a Rank property
 * @param object $b Item exposing a Rank property
 * @return int -1, 0 or 1 when $a ranks below, equal to or above $b
 */
function MatchResultCompare($a, $b)
{
    $left = $a->Rank;
    $right = $b->Rank;

    if ($left != $right)
    {
        return ($left < $right) ? -1 : 1;
    }

    return 0;
}
  299 +
  300 +abstract class Indexer
  301 +{
  302 + /**
  303 + * Cache of extractors
  304 + *
  305 + * @var array
  306 + */
  307 + private $extractorCache;
  308 +
  309 + /**
  310 + * Indicates if the indexer will do logging.
  311 + *
  312 + * @var boolean
  313 + */
  314 + private $debug;
  315 + /**
  316 + * Cache on mime related hooks
  317 + *
  318 + * @var unknown_type
  319 + */
  320 + private $mimeHookCache;
  321 + /**
  322 + * Cache on general hooks.
  323 + *
  324 + * @var array
  325 + */
  326 + private $generalHookCache;
  327 +
  328 + /**
  329 + * This is a path to the extractors.
  330 + *
  331 + * @var string
  332 + */
  333 + private $extractorPath;
  334 + /**
  335 + * This is a path to the hooks.
  336 + *
  337 + * @var string
  338 + */
  339 + private $hookPath;
  340 +
  341 + private $enabledExtractors;
  342 +
  343 + /**
  344 + * Initialise the indexer
  345 + *
  346 + */
  347 + protected function __construct()
  348 + {
  349 + $config = KTConfig::getSingleton();
  350 +
  351 + $this->extractorCache = array();
  352 + $this->debug = $config->get('indexer/debug', true);
  353 + $this->hookCache = array();
  354 + $this->generalHookCache = array();
  355 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  356 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  357 +
  358 + $this->loadExtractorStatus();
  359 + }
  360 +
  361 + /**
  362 + * Get the list if enabled extractors
  363 + *
  364 + */
  365 + private function loadExtractorStatus()
  366 + {
  367 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  368 + $rs = DBUtil::getResultArray($sql);
  369 + $this->enabledExtractors = array();
  370 + foreach($rs as $item)
  371 + {
  372 + $this->enabledExtractors[] = $item['name'];
  373 + }
  374 + }
  375 +
  376 + private function isExtractorEnabled($extractor)
  377 + {
  378 + return in_array($extractor, $this->enabledExtractors);
  379 + }
  380 +
  381 + /**
  382 + * Returns a reference to the main class
  383 + *
  384 + * @return Indexer
  385 + */
  386 + public static function get()
  387 + {
  388 + static $singleton = null;
  389 +
  390 + if (is_null($singleton))
  391 + {
  392 + $config = KTConfig::getSingleton();
  393 + $classname = $config->get('indexer/coreClass');
  394 +
  395 + require_once('indexing/indexers/' . $classname . '.inc.php');
  396 +
  397 + if (!class_exists($classname))
  398 + {
  399 + throw new Exception("Class '$classname' does not exist.");
  400 + }
  401 +
  402 + $singleton = new $classname;
  403 + }
  404 +
  405 + return $singleton;
  406 + }
  407 +
  408 + public abstract function deleteDocument($docid);
  409 +
  410 + /**
  411 + * Remove the association of all extractors to mime types on the database.
  412 + *
  413 + */
  414 + public function clearExtractors()
  415 + {
  416 + global $default;
  417 +
  418 + $sql = "update mime_types set extractor_id=null";
  419 + DBUtil::runQuery($sql);
  420 +
  421 + $sql = "delete from mime_extractors";
  422 + DBUtil::runQuery($sql);
  423 +
  424 + if ($this->debug) $default->log->debug('clearExtractors');
  425 + }
  426 +
  427 + /**
  428 + * lookup the name of the extractor class based on the mime type.
  429 + *
  430 + * @param string $type
  431 + * @return string
  432 + */
  433 + public static function resolveExtractor($type)
  434 + {
  435 + global $default;
  436 + $sql = "select extractor from mime_types where filetypes='$type'";
  437 + $class = DBUtil::getOneResultKey($sql,'extractor');
  438 + if (PEAR::isError($class))
  439 + {
  440 + $default->log->error("resolveExtractor: cannot resolve $type");
  441 + return $class;
  442 + }
  443 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  444 + return $class;
  445 + }
  446 +
  447 + /**
  448 + * Return all the discussion text.
  449 + *
  450 + * @param int $docid
  451 + * @return string
  452 + */
  453 + public static function getDiscussionText($docid)
  454 + {
  455 + $sql = "SELECT
  456 + dc.subject, dc.body
  457 + FROM
  458 + discussion_threads dt
  459 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  460 + WHERE
  461 + dt.document_id=$docid";
  462 + $result = DBUtil::getResultArray($sql);
  463 + $text = '';
  464 +
  465 + foreach($result as $record)
  466 + {
  467 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  468 + }
  469 +
  470 + return $text;
  471 + }
  472 +
  473 + /**
  474 + * Schedule the indexing of a document.
  475 + *
  476 + * @param string $document
  477 + * @param string $what
  478 + */
  479 + public static function index($document, $what='A')
  480 + {
  481 + global $default;
  482 +
  483 + if (is_numeric($document))
  484 + {
  485 + $document = Document::get($document+0);
  486 + }
  487 +
  488 + if (PEAR::isError($document))
  489 + {
  490 + $default->log->error("index: Could not index document: " .$document->getMessage());
  491 + return;
  492 + }
  493 +
  494 + $document_id = $document->getId();
  495 + $userid=$_SESSION['userID'];
  496 + if (empty($userid)) $userid=1;
  497 +
  498 + // we dequeue the document so that there are no issues when enqueuing
  499 + Indexer::unqueueDocument($document_id);
  500 +
  501 + // enqueue item
  502 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  503 + DBUtil::runQuery($sql);
  504 +
  505 + $default->log->debug("index: Queuing indexing of $document_id");
  506 +
  507 + }
  508 +
  509 + private static function incrementCount()
  510 + {
  511 + // Get count from system settings
  512 + $count = Indexer::getIndexedDocumentCount();
  513 + $count = (int)$count + 1;
  514 + Indexer::updateIndexedDocumentCount($count);
  515 + }
  516 +
  517 + public static function getIndexedDocumentCount()
  518 + {
  519 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  520 + return (int) $count;
  521 + }
  522 +
  523 + public static function updateIndexedDocumentCount($cnt = 0)
  524 + {
  525 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  526 + }
  527 +
  528 + public static function reindexQueue()
  529 + {
  530 + $sql = "UPDATE index_files SET processdate = null";
  531 + DBUtil::runQuery($sql);
  532 + }
  533 +
  534 + public static function reindexDocument($documentId)
  535 + {
  536 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  537 + DBUtil::runQuery($sql);
  538 + }
  539 +
  540 +
  541 +
  542 + public static function indexAll()
  543 + {
  544 + $userid=$_SESSION['userID'];
  545 + if (empty($userid)) $userid=1;
  546 +
  547 + $sql = "DELETE FROM index_files";
  548 + DBUtil::runQuery($sql);
  549 +
  550 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  551 + DBUtil::runQuery($sql);
  552 + }
  553 +
  554 + /**
  555 + * Clearout the scheduling of documents that no longer exist.
  556 + *
  557 + */
  558 + public static function clearoutDeleted()
  559 + {
  560 + global $default;
  561 +
  562 + $sql = 'DELETE FROM
  563 + index_files
  564 + WHERE
  565 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  566 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  567 + DBUtil::runQuery($sql);
  568 +
  569 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  570 + }
  571 +
  572 +
  573 + /**
  574 + * Check if a document is scheduled to be indexed
  575 + *
  576 + * @param mixed $document This may be a document or document id
  577 + * @return boolean
  578 + */
  579 + public static function isDocumentScheduled($document)
  580 + {
  581 + if (is_numeric($document))
  582 + {
  583 + $docid = $document;
  584 + }
  585 + else if ($document instanceof Document)
  586 + {
  587 + $docid = $document->getId();
  588 + }
  589 + else
  590 + {
  591 + return false;
  592 + }
  593 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  594 + $result = DBUtil::getResultArray($sql);
  595 + return count($result) > 0;
  596 + }
  597 +
  598 + /**
  599 + * Filters text removing redundant characters such as continuous newlines and spaces.
  600 + *
  601 + * @param string $filename
  602 + */
  603 + private function filterText($filename)
  604 + {
  605 + $content = file_get_contents($filename);
  606 +
  607 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  608 + $tgt = array("\n","\n",' ',' ',' ');
  609 +
  610 + // shrink what is being stored.
  611 + do
  612 + {
  613 + $orig = $content;
  614 + $content = preg_replace($src, $tgt, $content);
  615 + } while ($content != $orig);
  616 +
  617 + return file_put_contents($filename, $content) !== false;
  618 + }
  619 +
  620 + /**
  621 + * Load hooks for text extraction process.
  622 + *
  623 + */
  624 + private function loadExtractorHooks()
  625 + {
  626 + $this->generalHookCache = array();
  627 + $this->mimeHookCache = array();
  628 +
  629 +
  630 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  631 + while (($file = readdir($dir)) !== false)
  632 + {
  633 + if (substr($file,-12) == 'Hook.inc.php')
  634 + {
  635 + require_once($this->hookPath . '/' . $file);
  636 + $class = substr($file, 0, -8);
  637 +
  638 + if (!class_exists($class))
  639 + {
  640 + continue;
  641 + }
  642 +
  643 + $hook = new $class;
  644 + if (!($class instanceof ExtractorHook))
  645 + {
  646 + continue;
  647 + }
  648 +
  649 + $mimeTypes = $hook->registerMimeTypes();
  650 + if (is_null($mimeTypes))
  651 + {
  652 + $this->generalHookCache[] = & $hook;
  653 + }
  654 + else
  655 + {
  656 + foreach($mimeTypes as $type)
  657 + {
  658 + $this->mimeHookCache[$type][] = & $hook;
  659 + }
  660 + }
  661 +
  662 + }
  663 + }
  664 + closedir($dir);
  665 + }
  666 +
  667 + /**
  668 + * This is a refactored function to execute the hooks.
  669 + *
  670 + * @param DocumentExtractor $extractor
  671 + * @param string $phase
  672 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  673 + */
  674 + private function executeHook($extractor, $phase, $mimeType = null)
  675 + {
  676 + $hooks = array();
  677 + if (is_null($mimeType))
  678 + {
  679 + $hooks = $this->generalHookCache;
  680 + }
  681 + else
  682 + {
  683 + if (array_key_exists($mimeType, $this->mimeHookCache))
  684 + {
  685 + $hooks = $this->mimeHookCache[$mimeType];
  686 + }
  687 + }
  688 + if (empty($hooks))
  689 + {
  690 + return;
  691 + }
  692 +
  693 + foreach($hooks as $hook)
  694 + {
  695 + $hook->$phase($extractor);
  696 + }
  697 + }
  698 +
  699 + private function doesDiagnosticsPass($simple=false)
  700 + {
  701 + global $default;
  702 +
  703 + $config =& KTConfig::getSingleton();
  704 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  705 + // this function will create the lockfile if an error is detected. It will be removed as soon
  706 + // as the problems with the indexer are removed.
  707 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  708 +
  709 + $diagnosis = $this->diagnose();
  710 + if (!is_null($diagnosis))
  711 + {
  712 + if (!is_file($lockFile))
  713 + {
  714 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  715 + }
  716 + touch($lockFile);
  717 + return false;
  718 + }
  719 +
  720 + if ($simple)
  721 + {
  722 + return true;
  723 + }
  724 +
  725 + $diagnosis = $this->diagnoseExtractors();
  726 + if (!empty($diagnosis))
  727 + {
  728 + if (!is_file($lockFile))
  729 + {
  730 + foreach($diagnosis as $diag)
  731 + {
  732 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  733 + }
  734 + }
  735 + touch($lockFile);
  736 + return false;
  737 + }
  738 +
  739 + if (is_file($lockFile))
  740 + {
  741 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  742 + unlink($lockFile);
  743 + }
  744 +
  745 + return true;
  746 + }
  747 +
  748 + /**
  749 + * This does the initial mime type association between mime types and text extractors
  750 + *
  751 + */
  752 + public function checkForRegisteredTypes()
  753 + {
  754 + global $default;
  755 +
  756 + // we are only doing this once!
  757 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  758 + if ($initRegistered)
  759 + {
  760 + return;
  761 + }
  762 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  763 +
  764 + $date = date('Y-m-d H:i');
  765 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  766 + DBUtil::runQuery($sql);
  767 +
  768 + $this->registerTypes(true);
  769 +
  770 + $disable = array(
  771 + OS_WINDOWS=>array('PSExtractor'),
  772 + OS_UNIX => array()
  773 + );
  774 +
  775 + $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];
  776 +
  777 + foreach($disableForOS as $extractor)
  778 + {
  779 + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
  780 + DBUtil::runQuery($sql);
  781 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  782 + }
  783 +
  784 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  785 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  786 + }
  787 +
  788 + private function updatePendingDocumentStatus($documentId, $message, $level)
  789 + {
  790 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  791 + $message = sanitizeForSQL($this->indexingHistory);
  792 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  793 + DBUtil::runQuery($sql);
  794 + }
  795 +
  796 + private $restartCurrentBatch = false;
  797 +
  798 + public function restartBatch()
  799 + {
  800 + $this->restartCurrentBatch = true;
  801 + }
  802 +
  803 + /**
  804 + *
  805 + * @param int $documentId
  806 + * @param string $message
  807 + * @param string $level This may be info, error, debug
  808 + */
  809 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  810 + {
  811 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  812 + global $default;
  813 +
  814 + switch ($level)
  815 + {
  816 + case 'debug':
  817 + if ($this->debug)
  818 + {
  819 + $default->log->debug($message);
  820 + }
  821 + break;
  822 + default:
  823 + $default->log->$level($message);
  824 + }
  825 + }
  826 +
  827 +
  828 +
  829 + public function getExtractor($extractorClass)
  830 + {
  831 + if (empty($extractorClass))
  832 + {
  833 + return null;
  834 + }
  835 +
  836 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  837 + if (!file_exists($includeFile))
  838 + {
  839 + throw new Exception("Extractor file does not exist: $includeFile");
  840 + }
  841 +
  842 + require_once($includeFile);
  843 +
  844 + if (!class_exists($extractorClass))
  845 + {
  846 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  847 + }
  848 +
  849 + $extractor = new $extractorClass();
  850 +
  851 + if (!($extractor instanceof DocumentExtractor))
  852 + {
  853 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  854 + }
  855 +
  856 + return $extractor;
  857 + }
  858 +
  859 + public static function getIndexingQueue($problemItemsOnly=true)
  860 + {
  861 +
  862 + if ($problemItemsOnly)
  863 + {
  864 + $sql = "SELECT
  865 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  866 + FROM
  867 + index_files iff
  868 + INNER JOIN documents d ON iff.document_id=d.id
  869 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  870 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  871 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  872 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  873 + WHERE
  874 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  875 + ORDER BY indexdate ";
  876 + }
  877 + else
  878 + {
  879 + $sql = "SELECT
  880 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  881 + FROM
  882 + index_files iff
  883 + INNER JOIN documents d ON iff.document_id=d.id
  884 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  885 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  886 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  887 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  888 + WHERE
  889 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  890 + ORDER BY indexdate ";
  891 + }
  892 + $aResult = DBUtil::getResultArray($sql);
  893 +
  894 + return $aResult;
  895 + }
  896 +
  897 + public static function getPendingIndexingQueue()
  898 + {
  899 + return Indexer::getIndexingQueue(false);
  900 + }
  901 +
  902 + /**
  903 + * The main function that may be called repeatedly to index documents.
  904 + *
  905 + * @param int $max Default 20
  906 + */
  907 + public function indexDocuments($max=null)
  908 + {
  909 + global $default;
  910 + $config =& KTConfig::getSingleton();
  911 +
  912 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  913 + if (is_file($indexLockFile))
  914 + {
  915 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  916 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  917 + return;
  918 + }
  919 + touch($indexLockFile);*/
  920 +
  921 +
  922 + $this->checkForRegisteredTypes();
  923 +
  924 + if ($this->debug) $default->log->debug('indexDocuments: start');
  925 + if (!$this->doesDiagnosticsPass())
  926 + {
  927 + //unlink($indexLockFile);
  928 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  929 + return;
  930 + }
  931 +
  932 + if (is_null($max))
  933 + {
  934 + $max = $config->get('indexer/batchDocuments',20);
  935 + }
  936 +
  937 + $this->loadExtractorHooks();
  938 +
  939 + Indexer::clearoutDeleted();
  940 +
  941 + $date = date('Y-m-d H:i:s');
  942 + // identify the indexers that must run
  943 + // mysql specific limit!
  944 + $sql = "SELECT
  945 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  946 + FROM
  947 + index_files iff
  948 + INNER JOIN documents d ON iff.document_id=d.id
  949 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  950 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  951 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  952 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  953 + WHERE
  954 + (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
  955 + ORDER BY indexdate
  956 + LIMIT $max";
  957 + $result = DBUtil::getResultArray($sql);
  958 + if (PEAR::isError($result))
  959 + {
  960 + //unlink($indexLockFile);
  961 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  962 + return;
  963 + }
  964 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  965 +
  966 + // bail if no work to do
  967 + if (count($result) == 0)
  968 + {
  969 + //unlink($indexLockFile);
  970 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  971 + return;
  972 + }
  973 +
  974 + // identify any documents that need indexing and mark them
  975 + // so they are not taken in a followup run
  976 + $ids = array();
  977 + foreach($result as $docinfo)
  978 + {
  979 + $ids[] = $docinfo['document_id'];
  980 + }
  981 +
  982 + // mark the documents as being processed
  983 +
  984 + $ids=implode(',',$ids);
  985 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  986 + DBUtil::runQuery($sql);
  987 +
  988 + $extractorCache = array();
  989 + $storageManager = KTStorageManagerUtil::getSingleton();
  990 +
  991 + $tempPath = $config->get("urls/tmpDirectory");
  992 +
  993 + foreach($result as $docinfo)
  994 + {
  995 + // increment indexed documents count
  996 + Indexer::incrementCount();
  997 +
  998 + $docId=$docinfo['document_id'];
  999 + $extension=$docinfo['filetypes'];
  1000 + $mimeType=$docinfo['mimetypes'];
  1001 + $extractorClass=$docinfo['extractor'];
  1002 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  1003 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  1004 + $this->indexingHistory = '';
  1005 +
  1006 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1007 +
  1008 + if (empty($extractorClass))
  1009 + {
  1010 + /*
  1011 +
  1012 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1013 +
  1014 + */
  1015 + if ($indexDiscussion)
  1016 + {
  1017 + $indexDocument = false;
  1018 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1019 + }
  1020 + else
  1021 + {
  1022 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1023 + continue;
  1024 + }
  1025 + }
  1026 + else
  1027 + {
  1028 + /*
  1029 +
  1030 + If an extractor is available, we must ensure it is enabled.
  1031 +
  1032 + */
  1033 +
  1034 + if (!$this->isExtractorEnabled($extractorClass))
  1035 + {
  1036 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1037 + continue;
  1038 + }
  1039 + }
  1040 +
  1041 + if ($this->debug)
  1042 + {
  1043 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1044 + }
  1045 +
  1046 + $document = Document::get($docId);
  1047 + if (PEAR::isError($document))
  1048 + {
  1049 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1050 + continue;
  1051 + }
  1052 +
  1053 + if ($this->restartCurrentBatch)
  1054 + {
  1055 + Indexer::unqueueDocument($docId);
  1056 + Indexer::index($docId, 'A');
  1057 + continue;
  1058 + }
  1059 +
  1060 +
  1061 + $filename = $document->getFileName();
  1062 + if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
  1063 + {
  1064 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1065 + continue;
  1066 + }
  1067 +
  1068 + $removeFromQueue = true;
  1069 + if ($indexDocument)
  1070 + {
  1071 + if (array_key_exists($extractorClass, $extractorCache))
  1072 + {
  1073 + $extractor = $extractorCache[$extractorClass];
  1074 + }
  1075 + else
  1076 + {
  1077 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1078 + }
  1079 +
  1080 + if (!($extractor instanceof DocumentExtractor))
  1081 + {
  1082 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1083 + continue;
  1084 + }
  1085 +
  1086 +
  1087 +
  1088 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1089 + $sourceFile = $storageManager->temporaryFile($document);
  1090 +
  1091 + if (empty($sourceFile) || !is_file($sourceFile))
  1092 + {
  1093 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1094 + continue;
  1095 + }
  1096 +
  1097 + if ($extractor->needsIntermediateSourceFile())
  1098 + {
  1099 + //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1100 +
  1101 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1102 + $result = @copy($sourceFile, $intermediate);
  1103 + if ($result === false)
  1104 + {
  1105 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1106 + // problem. lets try again later. probably permission related. log the issue.
  1107 + continue;
  1108 + }
  1109 + $sourceFile = $intermediate;
  1110 + }
  1111 +
  1112 + $targetFile = tempnam($tempPath, 'ktindexer');
  1113 +
  1114 + $extractor->setSourceFile($sourceFile);
  1115 + $extractor->setMimeType($mimeType);
  1116 + $extractor->setExtension($extension);
  1117 + $extractor->setTargetFile($targetFile);
  1118 + $extractor->setDocument($document);
  1119 + $extractor->setIndexingStatus(null);
  1120 + $extractor->setExtractionStatus(null);
  1121 +
  1122 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1123 +
  1124 + $this->executeHook($extractor, 'pre_extract');
  1125 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1126 + $removeFromQueue = false;
  1127 +
  1128 + if ($extractor->extractTextContent())
  1129 + {
  1130 + // the extractor may need to create another target file
  1131 + $targetFile = $extractor->getTargetFile();
  1132 +
  1133 + $extractor->setExtractionStatus(true);
  1134 + $this->executeHook($extractor, 'pre_index');
  1135 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1136 +
  1137 + $title = $document->getName();
  1138 + if ($indexDiscussion)
  1139 + {
  1140 + if (!$this->filterText($targetFile))
  1141 + {
  1142 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1143 + }
  1144 + else
  1145 + {
  1146 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1147 + $removeFromQueue = $indexStatus;
  1148 + if (!$indexStatus)
  1149 + {
  1150 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1151 + }
  1152 +
  1153 + $extractor->setIndexingStatus($indexStatus);
  1154 + }
  1155 + }
  1156 + else
  1157 + {
  1158 + if (!$this->filterText($targetFile))
  1159 + {
  1160 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1161 + }
  1162 + else
  1163 + {
  1164 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1165 + $removeFromQueue = $indexStatus;
  1166 +
  1167 + if (!$indexStatus)
  1168 + {
  1169 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1170 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1171 + }
  1172 +
  1173 + $extractor->setIndexingStatus($indexStatus);
  1174 + }
  1175 + }
  1176 +
  1177 + $this->executeHook($extractor, 'post_index', $mimeType);
  1178 + $this->executeHook($extractor, 'post_index');
  1179 + }
  1180 + else
  1181 + {
  1182 + $extractor->setExtractionStatus(false);
  1183 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1184 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1185 + }
  1186 +
  1187 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1188 + $this->executeHook($extractor, 'post_extract');
  1189 +
  1190 + if ($extractor->needsIntermediateSourceFile())
  1191 + {
  1192 + @unlink($sourceFile);
  1193 + }
  1194 +
  1195 + @unlink($targetFile);
  1196 +
  1197 + }
  1198 + else
  1199 + {
  1200 + $indexStatus = $this->indexDiscussion($docId);
  1201 + $removeFromQueue = $indexStatus;
  1202 + }
  1203 +
  1204 + if ($removeFromQueue)
  1205 + {
  1206 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1207 + }
  1208 + else
  1209 + {
  1210 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1211 + }
  1212 + }
  1213 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1214 + //unlink($indexLockFile);
  1215 + }
  1216 +
  1217 + public function migrateDocuments($max=null)
  1218 + {
  1219 + global $default;
  1220 +
  1221 + $default->log->info(_kt('migrateDocuments: starting'));
  1222 +
  1223 + if (!$this->doesDiagnosticsPass(true))
  1224 + {
  1225 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1226 + return;
  1227 + }
  1228 +
  1229 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1230 + {
  1231 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1232 + return;
  1233 + }
  1234 +
  1235 + $config =& KTConfig::getSingleton();
  1236 + if (is_null($max))
  1237 + {
  1238 + $max = $config->get('indexer/batchMigrateDocument',500);
  1239 + }
  1240 +
  1241 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1242 + if (is_file($lockFile))
  1243 + {
  1244 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1245 + return;
  1246 + }
  1247 + touch($lockFile);
  1248 +
  1249 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1250 + if (is_null($startTime))
  1251 + {
  1252 + KTUtil::setSystemSetting('migrationStarted', time());
  1253 + }
  1254 +
  1255 + $maxLoops = 5;
  1256 +
  1257 + $max = ceil($max / $maxLoops);
  1258 +
  1259 + $start =KTUtil::getBenchmarkTime();
  1260 + $noDocs = false;
  1261 + $numDocs = 0;
  1262 +
  1263 + for($loop=0;$loop<$maxLoops;$loop++)
  1264 + {
  1265 +
  1266 + $sql = "SELECT
  1267 + document_id, document_text
  1268 + FROM
  1269 + document_text
  1270 + ORDER BY document_id
  1271 + LIMIT $max";
  1272 + $result = DBUtil::getResultArray($sql);
  1273 + if (PEAR::isError($result))
  1274 + {
  1275 + $default->log->info(_kt('migrateDocuments: db error'));
  1276 + break;
  1277 + }
  1278 +
  1279 + $docs = count($result);
  1280 + if ($docs == 0)
  1281 + {
  1282 + $noDocs = true;
  1283 + break;
  1284 + }
  1285 + $numDocs += $docs;
  1286 +
  1287 + foreach($result as $docinfo)
  1288 + {
  1289 + $docId = $docinfo['document_id'];
  1290 +
  1291 + $document = Document::get($docId);
  1292 + if (PEAR::isError($document) || is_null($document))
  1293 + {
  1294 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1295 + DBUtil::runQuery($sql);
  1296 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1297 + continue;
  1298 + }
  1299 +
  1300 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1301 +
  1302 + $targetFile = tempnam($tempPath, 'ktindexer');
  1303 +
  1304 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1305 + {
  1306 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1307 + continue;
  1308 + }
  1309 + // free memory asap ;)
  1310 + unset($docinfo['document_text']);
  1311 +
  1312 + $title = $document->getName();
  1313 +
  1314 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1315 +
  1316 + if ($indexStatus)
  1317 + {
  1318 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1319 + DBUtil::runQuery($sql);
  1320 + }
  1321 + else
  1322 + {
  1323 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1324 + }
  1325 +
  1326 + @unlink($targetFile);
  1327 + }
  1328 + }
  1329 +
  1330 + @unlink($lockFile);
  1331 +
  1332 + $time = KTUtil::getBenchmarkTime() - $start;
  1333 +
  1334 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1335 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1336 +
  1337 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1338 + if ($noDocs)
  1339 + {
  1340 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1341 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1342 + schedulerUtil::deleteByName('Index Migration');
  1343 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1344 + }
  1345 + }
  1346 +
    /**
     * Index a document. Concrete indexer classes must implement this method.
     *
     * Callers in this class treat the return value as a success flag.
     *
     * @param int $docId
     * @param string $textFile Path of the file holding the text to index.
     * @param string $title
     * @param string $version
     */
    protected abstract function indexDocument($docId, $textFile, $title, $version);
  1354 +
  1355 +
  1356 + public function updateDocumentIndex($docId, $text)
  1357 + {
  1358 + $config = KTConfig::getSingleton();
  1359 + $tempPath = $config->get("urls/tmpDirectory");
  1360 + $tempFile = tempnam($tempPath,'ud_');
  1361 +
  1362 + file_put_contents($tempFile, $text);
  1363 +
  1364 + $document = Document::get($docId);
  1365 + $title = $document->getDescription();
  1366 + $version = $document->getVersion();
  1367 +
  1368 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1369 +
  1370 + if (file_exists($tempFile))
  1371 + {
  1372 + unlink($tempFile);
  1373 + }
  1374 +
  1375 + return $result;
  1376 + }
  1377 +
    /**
     * Index a discussion. Concrete indexer classes must implement this method.
     *
     * Callers in this class treat the return value as a success flag.
     *
     * @param int $docId
     */
    protected abstract function indexDiscussion($docId);
  1384 +
    /**
     * Diagnose the indexer, e.g. check that the indexing server is running.
     * Concrete indexer classes must implement this method.
     *
     */
    public abstract function diagnose();
  1390 +
  1391 + /**
  1392 + * Diagnose the extractors.
  1393 + *
  1394 + * @return array
  1395 + */
  1396 + public function diagnoseExtractors()
  1397 + {
  1398 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1399 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1400 +
  1401 + return $diagnosis;
  1402 + }
  1403 +
  1404 + /**
  1405 + * This is a refactored diagnose function.
  1406 + *
  1407 + * @param string $path
  1408 + * @param string $class
  1409 + * @param string $extension
  1410 + * @return array
  1411 + */
  1412 + private function _diagnose($path, $baseclass, $extension)
  1413 + {
  1414 + global $default;
  1415 +
  1416 + $diagnoses = array();
  1417 +
  1418 + $dir = opendir(SearchHelper::correctPath($path));
  1419 + $extlen = - strlen($extension);
  1420 +
  1421 + while (($file = readdir($dir)) !== false)
  1422 + {
  1423 + if (substr($file,0,1) == '.')
  1424 + {
  1425 + continue;
  1426 + }
  1427 + if (substr($file,$extlen) != $extension)
  1428 + {
  1429 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1430 + continue;
  1431 + }
  1432 +
  1433 + require_once($path . '/' . $file);
  1434 +
  1435 + $class = substr($file, 0, -8);
  1436 + if (!class_exists($class))
  1437 + {
  1438 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1439 + continue;
  1440 + }
  1441 +
  1442 + if (!$this->isExtractorEnabled($class))
  1443 + {
  1444 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1445 + continue;
  1446 + }
  1447 +
  1448 + $extractor = new $class();
  1449 + if (!is_a($extractor, $baseclass))
  1450 + {
  1451 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1452 + continue;
  1453 + }
  1454 +
  1455 + $types = $extractor->getSupportedMimeTypes();
  1456 + if (empty($types))
  1457 + {
  1458 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1459 + continue;
  1460 + }
  1461 +
  1462 + $diagnosis=$extractor->diagnose();
  1463 + if (empty($diagnosis))
  1464 + {
  1465 + continue;
  1466 + }
  1467 + $diagnoses[$class] = array(
  1468 + 'name'=>$extractor->getDisplayName(),
  1469 + 'diagnosis'=>$diagnosis
  1470 + );
  1471 +
  1472 + }
  1473 + closedir($dir);
  1474 +
  1475 + return $diagnoses;
  1476 + }
  1477 +
  1478 +
  1479 + /**
  1480 + * Register the extractor types.
  1481 + *
  1482 + * @param boolean $clear. Optional. Defaults to false.
  1483 + */
  1484 + public function registerTypes($clear=false)
  1485 + {
  1486 + if ($clear)
  1487 + {
  1488 + $this->clearExtractors();
  1489 + }
  1490 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1491 + while (($file = readdir($dir)) !== false)
  1492 + {
  1493 + if (substr($file,-17) == 'Extractor.inc.php')
  1494 + {
  1495 + require_once($this->extractorPath . '/' . $file);
  1496 + $class = substr($file, 0, -8);
  1497 +
  1498 + if (!class_exists($class))
  1499 + {
  1500 + // if the class does not exist, we can't do anything.
  1501 + continue;
  1502 + }
  1503 +
  1504 + $extractor = new $class;
  1505 + if ($extractor instanceof DocumentExtractor)
  1506 + {
  1507 + $extractor->registerMimeTypes();
  1508 + }
  1509 + }
  1510 + }
  1511 + closedir($dir);
  1512 + }
  1513 +
  1514 + /**
  1515 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1516 + *
  1517 + * @param int $docId
  1518 + * @param string $textFile
  1519 + */
  1520 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1521 + {
  1522 + $this->indexDocument($docId, $textFile, $title, $version);
  1523 + $this->indexDiscussion($docId);
  1524 + }
  1525 +
  1526 + /**
  1527 + * Remove the document from the queue. This is normally called when it has been processed.
  1528 + *
  1529 + * @param int $docid
  1530 + */
  1531 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1532 + {
  1533 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1534 + DBUtil::runQuery($sql);
  1535 + if ($reason !== false)
  1536 + {
  1537 + global $default;
  1538 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1539 + }
  1540 + }
  1541 +
    /**
     * Run a query on the index. Concrete indexer classes must implement this method.
     *
     * @param string $query
     * @return array
     */
    public abstract function query($query);
  1549 +
  1550 + /**
  1551 + * Converts an integer to a string that can be easily compared and reversed.
  1552 + *
  1553 + * @param int $int
  1554 + * @return string
  1555 + */
  1556 + public static function longToString($int)
  1557 + {
  1558 + $maxlen = 14;
  1559 +
  1560 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1561 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1562 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1563 +
  1564 + return str_replace($o29, $a2z, $l);
  1565 + }
  1566 +
  1567 + /**
  1568 + * Converts a string to an integer.
  1569 + *
  1570 + * @param string $str
  1571 + * @return int
  1572 + */
  1573 + public static function stringToLong($str)
  1574 + {
  1575 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1576 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1577 +
  1578 + $int = str_replace($a2z, $o29, $str) + 0;
  1579 +
  1580 + return $int;
  1581 + }
  1582 +
  1583 + /**
  1584 + * Possibly we can optimise indexes. This method must be overriden.
  1585 + * The new function must call the parent!
  1586 + *
  1587 + */
  1588 + public function optimise()
  1589 + {
  1590 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1591 + }
  1592 +
  1593 + /**
  1594 + * Shuts down the indexer
  1595 + *
  1596 + */
  1597 + public function shutdown()
  1598 + {
  1599 + // do nothing generally
  1600 + }
  1601 +
    /**
     * Returns the name of the indexer. Concrete indexer classes must implement this method.
     *
     * @return string
     */
    public abstract function getDisplayName();
  1608 +
  1609 +
    /**
     * Returns the number of non-deleted documents in the index.
     * Concrete indexer classes must implement this method.
     *
     * @return int
     */
    public abstract function getDocumentsInIndex();
  1616 +
  1617 + /**
  1618 + * Returns the path to the index directory
  1619 + *
  1620 + * @return string
  1621 + */
  1622 + public function getIndexDirectory()
  1623 + {
  1624 + $config = KTConfig::getSingleton();
  1625 + $directory = $config->get('indexer/luceneDirectory');
  1626 + return $directory;
  1627 + }
  1628 +}
  1629 +
  1630 +?>
... ...