Commit 734ac2d6e429d79895c964d297ed21ff41f27059

Authored by Megan Watson
1 parent 2a2dc506

KTS-3440

"Restart open office periodically"
Fixed. A counter is incremented each time a document is indexed (successfully or not); once the count exceeds 50 documents, Open Office is restarted and the counter is reset to zero.

Committed by: Megan Watson
Reviewed by: Conrad Vermuelen




git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@8634 c91229c3-7414-0410-bfa2-8a42b809f60b
bin/checkopenoffice.php
... ... @@ -8,31 +8,31 @@
8 8 * Document Management Made Simple
9 9 * Copyright (C) 2008 KnowledgeTree Inc.
10 10 * Portions copyright The Jam Warehouse Software (Pty) Limited
11   - *
  11 + *
12 12 * This program is free software; you can redistribute it and/or modify it under
13 13 * the terms of the GNU General Public License version 3 as published by the
14 14 * Free Software Foundation.
15   - *
  15 + *
16 16 * This program is distributed in the hope that it will be useful, but WITHOUT
17 17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 18 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19 19 * details.
20   - *
  20 + *
21 21 * You should have received a copy of the GNU General Public License
22 22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23   - *
24   - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  23 + *
  24 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
25 25 * California 94120-7775, or email info@knowledgetree.com.
26   - *
  26 + *
27 27 * The interactive user interfaces in modified source and object code versions
28 28 * of this program must display Appropriate Legal Notices, as required under
29 29 * Section 5 of the GNU General Public License version 3.
30   - *
  30 + *
31 31 * In accordance with Section 7(b) of the GNU General Public License version 3,
32 32 * these Appropriate Legal Notices must retain the display of the "Powered by
33   - * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
34 34 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
35   - * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * must display the words "Powered by KnowledgeTree" and retain the original
36 36 * copyright notice.
37 37 * Contributor( s): ______________________________________
38 38 */
... ... @@ -51,10 +51,25 @@ so for windows we use the win32 service status checks.
51 51 // Check if the calling function requires a return value
52 52 $sGiveOutput = (isset($argv[1]) && $argv[1] == 'output') ? true : false;
53 53  
  54 +// Check indexed document count
  55 +// If the number of indexed documents is greater than the set amount, restart open office
  56 +// this clears open office's memory usage
  57 +$resetPoint = 50; // todo: put in config
  58 +$count = Indexer::getIndexedDocumentCount();
  59 +
  60 +$restartOO = false;
  61 +if($count > $resetPoint){
  62 + $restartOO = true;
  63 +
  64 + // reset the count
  65 + Indexer::updateIndexedDocumentCount(0);
  66 + $default->log->debug('Check Open Office Task: Restarting open office.');
  67 +}
  68 +
54 69 // First we check the host:port to see if open office is running
55 70 $sCheckOO = SearchHelper::checkOpenOfficeAvailablity();
56 71  
57   -if(empty($sCheckOO)){
  72 +if(empty($sCheckOO) && !$restartOO){
58 73 // If the check returns empty then it is available on that port so we exit
59 74 if($sGiveOutput){
60 75 echo 1;
... ... @@ -62,21 +77,44 @@ if(empty($sCheckOO)){
62 77 exit;
63 78 }
64 79  
65   -// Open office appears not to be running.
66   -
  80 +// Open office appears not to be running or requires a restart
67 81 if(OS_WINDOWS){
68   - // If this is vista, it might be being blocked, so we query the service
69 82 $OOService = 'ktopenoffice';
70   - $result = win32_query_service_status($OOService);
71   -
72   - if(is_array($result)){
73   - $iProcessId = $result['ProcessId'];
74   - if(!empty($iProcessId) && $iProcessId != 0){
75   - // If there is a process id (PID) then open office is running so we exit
76   - if($sGiveOutput){
77   - echo 1;
  83 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  84 +
  85 + if($restartOO){
  86 + // If Open office needs to be restarted - stop it here
  87 + $result_stop = win32_stop_service($OOService);
  88 +
  89 +
  90 + // Wait for the service to stop fully before trying to restart it
  91 + $continue = false;
  92 + $cnt = 0;
  93 + while($continue === false && $cnt < 15){
  94 + $result = win32_query_service_status($OOService);
  95 +
  96 + if(isset($result['ProcessId']) && $result['ProcessId'] != 0){
  97 + // If there is still a process id then the service has not stopped yet.
  98 + sleep(2);
  99 + $continue = false;
  100 + $cnt++;
  101 + }else{
  102 + $continue = true;
  103 + }
  104 + }
  105 + }else{
  106 + // If this is vista, checking the port may not work so we query the service
  107 + $result = win32_query_service_status($OOService);
  108 +
  109 + if(is_array($result)){
  110 + $iProcessId = $result['ProcessId'];
  111 + if(!empty($iProcessId) && $iProcessId != 0){
  112 + // If there is a process id (PID) then open office is running so we exit
  113 + if($sGiveOutput){
  114 + echo 1;
  115 + }
  116 + exit;
78 117 }
79   - exit;
80 118 }
81 119 }
82 120  
... ... @@ -97,11 +135,14 @@ if(OS_WINDOWS){
97 135  
98 136 $default->log->debug('Check Open Office Task: Open office service could not be started. Error code '.$result2);
99 137  
100   -
101 138 // Attempt using the dmsctl batch script
102 139 $sPath = realpath('../../bin/dmsctl.bat');
  140 +
103 141 if(file_exists($sPath)){
104 142 $sCmd = "\"$sPath\" start";
  143 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  144 + $default->log->debug('Check Open Office Task: ' . $sCmd);
  145 +
105 146 $res = KTUtil::pexec($sCmd);
106 147  
107 148 $default->log->debug('Check Open Office Task: Attempted start using dmsctl.bat.');
... ... @@ -120,25 +161,39 @@ if(OS_WINDOWS){
120 161 // If the OS is Unix or Linux
121 162 $sPath = realpath('../../dmsctl.sh');
122 163 if(file_exists($sPath)){
123   - $sCmd = "\"$sPath\" start";
124   - KTUtil::pexec($sCmd);
  164 + // If Open office needs to be restarted - stop it here
  165 + if($restartOO){
  166 + $sCmd = "\"$sPath\" restart soffice";
  167 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  168 + $default->log->debug('Check Open Office Task: ' . $sCmd);
125 169  
126   - $default->log->debug('Check Open Office Task: Attempted start using dmsctl.sh.');
127   - if($sGiveOutput){
  170 + KTUtil::pexec($sCmd);
  171 +
  172 + $default->log->debug('Check Open Office Task: Attempted restart using dmsctl.sh.');
  173 + }else{
  174 + $sCmd = "\"$sPath\" start soffice";
  175 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  176 + $default->log->debug('Check Open Office Task: ' . $sCmd);
  177 +
  178 + KTUtil::pexec($sCmd);
  179 +
  180 + $default->log->debug('Check Open Office Task: Attempted start using dmsctl.sh.');
  181 + }
  182 + if($sGiveOutput){
128 183 echo 2;
129 184 }
130 185 exit;
131 186 }else{
132   - $default->log->debug('Check Open Office Task: Can\'t find dmsctl.sh, this may be a source install.');
133   - if($sGiveOutput){
  187 + $default->log->debug('Check Open Office Task: Can\'t find dmsctl.sh, this may be a source install.');
  188 + if($sGiveOutput){
134 189 echo 0;
135 190 }
136 191 exit;
137   - }
  192 + }
138 193 }
139 194 $default->log->debug('Check Open Office Task: Can\'t start Open office, this may be a source install.');
140 195 if($sGiveOutput){
141 196 echo 0;
142 197 }
143 198 exit;
144   -?>
145 199 \ No newline at end of file
  200 +?>
... ...
search2/indexing/indexerCore.inc.php
1   -<?php
2   -
3   -/**
4   - * $Id:$
5   - *
6   - * KnowledgeTree Community Edition
7   - * Document Management Made Simple
8   - * Copyright (C) 2008 KnowledgeTree Inc.
9   - * Portions copyright The Jam Warehouse Software (Pty) Limited
10   - *
11   - * This program is free software; you can redistribute it and/or modify it under
12   - * the terms of the GNU General Public License version 3 as published by the
13   - * Free Software Foundation.
14   - *
15   - * This program is distributed in the hope that it will be useful, but WITHOUT
16   - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18   - * details.
19   - *
20   - * You should have received a copy of the GNU General Public License
21   - * along with this program. If not, see <http://www.gnu.org/licenses/>.
22   - *
23   - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
24   - * California 94120-7775, or email info@knowledgetree.com.
25   - *
26   - * The interactive user interfaces in modified source and object code versions
27   - * of this program must display Appropriate Legal Notices, as required under
28   - * Section 5 of the GNU General Public License version 3.
29   - *
30   - * In accordance with Section 7(b) of the GNU General Public License version 3,
31   - * these Appropriate Legal Notices must retain the display of the "Powered by
32   - * KnowledgeTree" logo and retain the original copyright notice. If the display of the
33   - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
34   - * must display the words "Powered by KnowledgeTree" and retain the original
35   - * copyright notice.
36   - * Contributor( s): ______________________________________
37   - *
38   - */
39   -
40   -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
41   -require_once('indexing/extractorCore.inc.php');
42   -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
43   -
44   -
45   -class IndexerInconsistencyException extends Exception {};
46   -
47   -class QueryResultItem
48   -{
49   - protected $document_id;
50   - protected $title;
51   - protected $rank;
52   - protected $text;
53   - protected $filesize;
54   - protected $fullpath;
55   - protected $live;
56   - protected $version;
57   - protected $mimeType;
58   - protected $filename;
59   - protected $thumbnail; // TODO: if not null, gui can display a thumbnail
60   - protected $viewer; // TODO: if not null, a viewer can be used to view the document
61   - protected $document;
62   - protected $checkedOutUser;
63   - protected $dateCheckedout;
64   - protected $workflowState;
65   - protected $workflow;
66   - protected $modifiedBy;
67   - protected $dateModified;
68   - protected $createdBy;
69   - protected $dateCreated;
70   - protected $owner;
71   - protected $immutable;
72   - protected $deleted;
73   - protected $status;
74   - protected $folderId;
75   - protected $storagePath;
76   - protected $documentType;
77   - protected $mimeIconPath;
78   - protected $mimeDisplay;
79   - protected $oemDocumentNo;
80   -
81   - public function __construct($document_id, $rank=null, $title=null, $text=null)
82   - {
83   - $this->document_id=(int) $document_id;
84   - $this->rank= $rank;
85   - $this->title=$title;
86   - $this->text = $text;
87   - $this->live = true;
88   - $this->loadDocumentInfo();
89   - }
90   -
91   - protected function __isset($property)
92   - {
93   - switch($property)
94   - {
95   - case 'DocumentID': return isset($this->document_id);
96   - case 'Rank': return isset($this->rank);
97   - case 'Text': return isset($this->text);
98   - case 'Title': return isset($this->title);
99   - case null: break;
100   - default:
101   - throw new Exception("Unknown property '$property' to get on QueryResultItem");
102   - }
103   - return true; // should not be reached
104   - }
105   -
106   - public function loadDocumentInfo()
107   - {
108   - global $default;
109   - $sql = "SELECT
110   - d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
111   - dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
112   - mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
113   - cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
114   - mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
115   - FROM
116   - documents d
117   - INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
118   - INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
119   - INNER JOIN mime_types mt ON dcv.mime_id=mt.id
120   - LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
121   - LEFT JOIN folders f ON f.id=d.folder_id
122   - LEFT JOIN users cou ON d.checked_out_user_id=cou.id
123   - LEFT JOIN workflows w ON dmv.workflow_id=w.id
124   - LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
125   - LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
126   - LEFT JOIN users mbu ON d.modified_user_id=mbu.id
127   - LEFT JOIN users cbu ON d.creator_id=cbu.id
128   - LEFT JOIN users ou ON d.owner_id=ou.id
129   - WHERE
130   - d.id=$this->document_id";
131   -
132   - $result = DBUtil::getOneResult($sql);
133   -
134   - if (PEAR::isError($result) || empty($result))
135   - {
136   - $this->live = false;
137   - if (PEAR::isError($result))
138   - {
139   - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
140   - }
141   -
142   - $default->log->error('QueryResultItem: $result is null');
143   - $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
144   - $default->log->error('QueryResultItem: ' . $msg);
145   - // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
146   - throw new IndexerInconsistencyException(_kt($msg));
147   - }
148   -
149   - // document_id, relevance, text, title
150   -
151   - $this->documentType = $result['document_type'];
152   - $this->filename=$result['filename'];
153   - $this->filesize = KTUtil::filesizeToString($result['filesize']);
154   - $this->folderId = $result['folder_id'];
155   - $this->title = $result['title'];
156   -
157   - $this->createdBy = $result['createdbyuser'];
158   - $this->dateCreated = $result['created'];
159   -
160   - $this->modifiedBy = $result['modifiedbyuser'];
161   - $this->dateModified = $result['modified'];
162   -
163   - $this->checkedOutUser = $result['checkoutuser'];
164   - $this->dateCheckedout = $result['checkedout'];
165   -
166   - $this->owner = $result['owneruser'];
167   -
168   - $this->version = $result['major_version'] . '.' . $result['minor_version'];
169   -
170   - $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
171   -
172   - $this->workflow = $result['workflow'];
173   - $this->workflowState = $result['workflowstate'];
174   -
175   - $this->oemDocumentNo = $result['oem_no'];
176   - if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
177   -
178   - if (is_null($result['name']))
179   - {
180   - $this->fullpath = '(orphaned)';
181   - }
182   - else
183   - {
184   - $this->fullpath = $result['full_path'];
185   - }
186   -
187   - $this->mimeType = $result['mimetype'];
188   - $this->mimeIconPath = $result['mime_icon_path'];
189   - $this->mimeDisplay = $result['mime_display'];
190   -
191   - $this->storagePath = $result['storage_path'];
192   - $this->status = Document::getStatusString($result['status_id']);
193   - }
194   -
195   - protected function __get($property)
196   - {
197   - switch($property)
198   - {
199   - case null: return '';
200   - case 'DocumentID': return (int) $this->document_id;
201   - case 'Relevance':
202   - case 'Rank': return (float) $this->rank;
203   - case 'Text': return (string) $this->text;
204   - case 'Title': return (string) $this->title;
205   - case 'FullPath': return (string) $this->fullpath;
206   - case 'IsLive': return (bool) $this->live;
207   - case 'Filesize': return $this->filesize;
208   - case 'Version': return (string) $this->version;
209   - case 'Filename': return (string)$this->filename;
210   - case 'FolderId': return (int)$this->folderId;
211   - case 'OemDocumentNo': return (string) $this->oemDocumentNo;
212   - case 'Document':
213   - if (is_null($this->document))
214   - {
215   - $this->document = Document::get($this->document_id);
216   - }
217   - return $this->document;
218   - case 'IsAvailable':
219   - return $this->Document->isLive();
220   - case 'CheckedOutUser':
221   - case 'CheckedOutBy':
222   - return (string) $this->checkedOutUser;
223   - case 'WorkflowOnly':
224   - case 'Workflow':
225   - return (string)$this->workflow;
226   - case 'WorkflowStateOnly':
227   - case 'WorkflowState':
228   - return (string)$this->workflowState;
229   - case 'WorkflowAndState':
230   - if (is_null($this->workflow))
231   - {
232   - return '';
233   - }
234   - return "$this->workflow - $this->workflowState";
235   - case 'MimeType':
236   - return (string) $this->mimeType;
237   - case 'MimeIconPath':
238   - return (string) $this->mimeIconPath;
239   - case 'MimeDisplay':
240   - return (string) $this->mimeDisplay;
241   - case 'DateCheckedOut':
242   - return (string) $this->dateCheckedout;
243   - case 'ModifiedBy':
244   - return (string) $this->modifiedBy;
245   - case 'DateModified':
246   - return (string) $this->dateModified;
247   - case 'CreatedBy':
248   - return (string) $this->createdBy;
249   - case 'DateCreated':
250   - return (string) $this->dateCreated;
251   - case 'Owner':
252   - case 'OwnedBy':
253   - return (string) $this->owner;
254   - case 'IsImmutable':
255   - case 'Immutable':
256   - return (bool) $this->immutable;
257   - case 'Status':
258   - return $this->status;
259   - case 'StoragePath':
260   - return $this->storagePath;
261   - case 'DocumentType':
262   - return $this->documentType;
263   - case 'Permissions':
264   - return 'not available';
265   - case 'CanBeReadByUser':
266   - if (!$this->live)
267   - return false;
268   - if (Permission::userHasDocumentReadPermission($this->Document))
269   - return true;
270   - if (Permission::adminIsInAdminMode())
271   - return true;
272   - return false;
273   - default:
274   - throw new Exception("Unknown property '$property' to get on QueryResultItem");
275   - }
276   - return ''; // Should not be reached
277   - }
278   -
279   - protected function __set($property, $value)
280   - {
281   - switch($property)
282   - {
283   - case 'Rank': $this->rank = number_format($value,2,'.',','); break;
284   - case 'Title': $this->title = $value; break;
285   - case 'Text': $this->text = $value; break;
286   - default:
287   - throw new Exception("Unknown property '$property' to set on QueryResultItem");
288   - }
289   - }
290   -}
291   -
292   -function MatchResultCompare($a, $b)
293   -{
294   - if ($a->Rank == $b->Rank) {
295   - return 0;
296   - }
297   - return ($a->Rank < $b->Rank) ? -1 : 1;
298   -}
299   -
300   -abstract class Indexer
301   -{
302   - /**
303   - * Cache of extractors
304   - *
305   - * @var array
306   - */
307   - private $extractorCache;
308   -
309   - /**
310   - * Indicates if the indexer will do logging.
311   - *
312   - * @var boolean
313   - */
314   - private $debug;
315   - /**
316   - * Cache on mime related hooks
317   - *
318   - * @var unknown_type
319   - */
320   - private $mimeHookCache;
321   - /**
322   - * Cache on general hooks.
323   - *
324   - * @var array
325   - */
326   - private $generalHookCache;
327   -
328   - /**
329   - * This is a path to the extractors.
330   - *
331   - * @var string
332   - */
333   - private $extractorPath;
334   - /**
335   - * This is a path to the hooks.
336   - *
337   - * @var string
338   - */
339   - private $hookPath;
340   -
341   - private $enabledExtractors;
342   -
343   - /**
344   - * Initialise the indexer
345   - *
346   - */
347   - protected function __construct()
348   - {
349   - $config = KTConfig::getSingleton();
350   -
351   - $this->extractorCache = array();
352   - $this->debug = $config->get('indexer/debug', true);
353   - $this->hookCache = array();
354   - $this->generalHookCache = array();
355   - $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
356   - $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
357   -
358   - $this->loadExtractorStatus();
359   - }
360   -
361   - /**
362   - * Get the list if enabled extractors
363   - *
364   - */
365   - private function loadExtractorStatus()
366   - {
367   - $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
368   - $rs = DBUtil::getResultArray($sql);
369   - $this->enabledExtractors = array();
370   - foreach($rs as $item)
371   - {
372   - $this->enabledExtractors[] = $item['name'];
373   - }
374   - }
375   -
376   - private function isExtractorEnabled($extractor)
377   - {
378   - return in_array($extractor, $this->enabledExtractors);
379   - }
380   -
381   - /**
382   - * Returns a reference to the main class
383   - *
384   - * @return Indexer
385   - */
386   - public static function get()
387   - {
388   - static $singleton = null;
389   -
390   - if (is_null($singleton))
391   - {
392   - $config = KTConfig::getSingleton();
393   - $classname = $config->get('indexer/coreClass');
394   -
395   - require_once('indexing/indexers/' . $classname . '.inc.php');
396   -
397   - if (!class_exists($classname))
398   - {
399   - throw new Exception("Class '$classname' does not exist.");
400   - }
401   -
402   - $singleton = new $classname;
403   - }
404   -
405   - return $singleton;
406   - }
407   -
408   - public abstract function deleteDocument($docid);
409   -
410   - /**
411   - * Remove the association of all extractors to mime types on the database.
412   - *
413   - */
414   - public function clearExtractors()
415   - {
416   - global $default;
417   -
418   - $sql = "update mime_types set extractor_id=null";
419   - DBUtil::runQuery($sql);
420   -
421   - $sql = "delete from mime_extractors";
422   - DBUtil::runQuery($sql);
423   -
424   - if ($this->debug) $default->log->debug('clearExtractors');
425   - }
426   -
427   - /**
428   - * lookup the name of the extractor class based on the mime type.
429   - *
430   - * @param string $type
431   - * @return string
432   - */
433   - public static function resolveExtractor($type)
434   - {
435   - global $default;
436   - $sql = "select extractor from mime_types where filetypes='$type'";
437   - $class = DBUtil::getOneResultKey($sql,'extractor');
438   - if (PEAR::isError($class))
439   - {
440   - $default->log->error("resolveExtractor: cannot resolve $type");
441   - return $class;
442   - }
443   - if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
444   - return $class;
445   - }
446   -
447   - /**
448   - * Return all the discussion text.
449   - *
450   - * @param int $docid
451   - * @return string
452   - */
453   - public static function getDiscussionText($docid)
454   - {
455   - $sql = "SELECT
456   - dc.subject, dc.body
457   - FROM
458   - discussion_threads dt
459   - INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
460   - WHERE
461   - dt.document_id=$docid";
462   - $result = DBUtil::getResultArray($sql);
463   - $text = '';
464   -
465   - foreach($result as $record)
466   - {
467   - $text .= $record['subject'] . "\n" . $record['body'] . "\n";
468   - }
469   -
470   - return $text;
471   - }
472   -
473   - /**
474   - * Schedule the indexing of a document.
475   - *
476   - * @param string $document
477   - * @param string $what
478   - */
479   - public static function index($document, $what='A')
480   - {
481   - global $default;
482   -
483   - if (is_numeric($document))
484   - {
485   - $document = Document::get($document+0);
486   - }
487   -
488   - if (PEAR::isError($document))
489   - {
490   - $default->log->error("index: Could not index document: " .$document->getMessage());
491   - return;
492   - }
493   -
494   - $document_id = $document->getId();
495   - $userid=$_SESSION['userID'];
496   - if (empty($userid)) $userid=1;
497   -
498   - // we dequeue the document so that there are no issues when enqueuing
499   - Indexer::unqueueDocument($document_id);
500   -
501   - // enqueue item
502   - $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
503   - DBUtil::runQuery($sql);
504   -
505   - $default->log->debug("index: Queuing indexing of $document_id");
506   - }
507   -
508   - public static function reindexQueue()
509   - {
510   - $sql = "UPDATE index_files SET processdate = null";
511   - DBUtil::runQuery($sql);
512   - }
513   -
514   - public static function reindexDocument($documentId)
515   - {
516   - $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
517   - DBUtil::runQuery($sql);
518   - }
519   -
520   -
521   -
522   - public static function indexAll()
523   - {
524   - $userid=$_SESSION['userID'];
525   - if (empty($userid)) $userid=1;
526   -
527   - $sql = "DELETE FROM index_files";
528   - DBUtil::runQuery($sql);
529   -
530   - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
531   - DBUtil::runQuery($sql);
532   - }
533   -
534   - /**
535   - * Clearout the scheduling of documents that no longer exist.
536   - *
537   - */
538   - public static function clearoutDeleted()
539   - {
540   - global $default;
541   -
542   - $sql = 'DELETE FROM
543   - index_files
544   - WHERE
545   - document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
546   - NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
547   - DBUtil::runQuery($sql);
548   -
549   - $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
550   - }
551   -
552   -
553   - /**
554   - * Check if a document is scheduled to be indexed
555   - *
556   - * @param mixed $document This may be a document or document id
557   - * @return boolean
558   - */
559   - public static function isDocumentScheduled($document)
560   - {
561   - if (is_numeric($document))
562   - {
563   - $docid = $document;
564   - }
565   - else if ($document instanceof Document)
566   - {
567   - $docid = $document->getId();
568   - }
569   - else
570   - {
571   - return false;
572   - }
573   - $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
574   - $result = DBUtil::getResultArray($sql);
575   - return count($result) > 0;
576   - }
577   -
578   - /**
579   - * Filters text removing redundant characters such as continuous newlines and spaces.
580   - *
581   - * @param string $filename
582   - */
583   - private function filterText($filename)
584   - {
585   - $content = file_get_contents($filename);
586   -
587   - $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
588   - $tgt = array("\n","\n",' ',' ',' ');
589   -
590   - // shrink what is being stored.
591   - do
592   - {
593   - $orig = $content;
594   - $content = preg_replace($src, $tgt, $content);
595   - } while ($content != $orig);
596   -
597   - return file_put_contents($filename, $content) !== false;
598   - }
599   -
600   - /**
601   - * Load hooks for text extraction process.
602   - *
603   - */
604   - private function loadExtractorHooks()
605   - {
606   - $this->generalHookCache = array();
607   - $this->mimeHookCache = array();
608   -
609   -
610   - $dir = opendir(SearchHelper::correctPath($this->hookPath));
611   - while (($file = readdir($dir)) !== false)
612   - {
613   - if (substr($file,-12) == 'Hook.inc.php')
614   - {
615   - require_once($this->hookPath . '/' . $file);
616   - $class = substr($file, 0, -8);
617   -
618   - if (!class_exists($class))
619   - {
620   - continue;
621   - }
622   -
623   - $hook = new $class;
624   - if (!($class instanceof ExtractorHook))
625   - {
626   - continue;
627   - }
628   -
629   - $mimeTypes = $hook->registerMimeTypes();
630   - if (is_null($mimeTypes))
631   - {
632   - $this->generalHookCache[] = & $hook;
633   - }
634   - else
635   - {
636   - foreach($mimeTypes as $type)
637   - {
638   - $this->mimeHookCache[$type][] = & $hook;
639   - }
640   - }
641   -
642   - }
643   - }
644   - closedir($dir);
645   - }
646   -
647   - /**
648   - * This is a refactored function to execute the hooks.
649   - *
650   - * @param DocumentExtractor $extractor
651   - * @param string $phase
652   - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
653   - */
654   - private function executeHook($extractor, $phase, $mimeType = null)
655   - {
656   - $hooks = array();
657   - if (is_null($mimeType))
658   - {
659   - $hooks = $this->generalHookCache;
660   - }
661   - else
662   - {
663   - if (array_key_exists($mimeType, $this->mimeHookCache))
664   - {
665   - $hooks = $this->mimeHookCache[$mimeType];
666   - }
667   - }
668   - if (empty($hooks))
669   - {
670   - return;
671   - }
672   -
673   - foreach($hooks as $hook)
674   - {
675   - $hook->$phase($extractor);
676   - }
677   - }
678   -
679   - private function doesDiagnosticsPass($simple=false)
680   - {
681   - global $default;
682   -
683   - $config =& KTConfig::getSingleton();
684   - // create a index log lock file in case there are errors, and we don't need to log them forever!
685   - // this function will create the lockfile if an error is detected. It will be removed as soon
686   - // as the problems with the indexer are removed.
687   - $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
688   -
689   - $diagnosis = $this->diagnose();
690   - if (!is_null($diagnosis))
691   - {
692   - if (!is_file($lockFile))
693   - {
694   - $default->log->error(_kt('Indexer problem: ') . $diagnosis);
695   - }
696   - touch($lockFile);
697   - return false;
698   - }
699   -
700   - if ($simple)
701   - {
702   - return true;
703   - }
704   -
705   - $diagnosis = $this->diagnoseExtractors();
706   - if (!empty($diagnosis))
707   - {
708   - if (!is_file($lockFile))
709   - {
710   - foreach($diagnosis as $diag)
711   - {
712   - $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
713   - }
714   - }
715   - touch($lockFile);
716   - return false;
717   - }
718   -
719   - if (is_file($lockFile))
720   - {
721   - $default->log->info(_kt('Issues with the indexer have been resolved!'));
722   - unlink($lockFile);
723   - }
724   -
725   - return true;
726   - }
727   -
/**
 * Performs the one-off association between mime types and text extractors.
 *
 * The work is guarded by the 'mimeTypesRegistered' system setting: once that
 * setting is truthy the method returns immediately. On the first run it resets
 * the run time of every scheduler task, re-registers all extractor types and
 * deactivates the extractors listed for the current operating system.
 */
public function checkForRegisteredTypes()
{
    global $default;

    // This initialisation happens only once per installation.
    if (KTUtil::getSystemSetting('mimeTypesRegistered', false))
    {
        return;
    }

    $debugEnabled = $this->debug;
    if ($debugEnabled) $default->log->debug('checkForRegisteredTypes: start');

    // Set run_time on all scheduler tasks to the current minute.
    $date = date('Y-m-d H:i');
    DBUtil::runQuery("UPDATE scheduler_tasks SET run_time='$date'");

    $this->registerTypes(true);

    // Extractors that must be switched off, keyed by OS flag.
    $disable = array(
        OS_WINDOWS => array('PSExtractor'),
        OS_UNIX => array()
    );

    if (OS_WINDOWS)
    {
        $disableForOS = $disable[OS_WINDOWS];
    }
    else
    {
        $disableForOS = $disable[OS_UNIX];
    }

    foreach ($disableForOS as $extractor)
    {
        DBUtil::runQuery("UPDATE mime_extractors SET active=0 WHERE name='$extractor'");
        $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
    }

    if ($debugEnabled) $default->log->debug('checkForRegisteredTypes: done');
    KTUtil::setSystemSetting('mimeTypesRegistered', true);
}
767   -
/**
 * Appends a status line to the running indexing history and stores the whole
 * history in the status_msg column of the document's index_files row.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level Used as the prefix of the appended history line.
 */
private function updatePendingDocumentStatus($documentId, $message, $level)
{
    $this->indexingHistory = $this->indexingHistory . "\n" . $level . ': ' . $message;

    // The complete history (not just the new line) is persisted each time.
    $escapedHistory = sanitizeForSQL($this->indexingHistory);
    DBUtil::runQuery("UPDATE index_files SET status_msg='$escapedHistory' WHERE document_id=$documentId");
}
775   -
/**
 * Records a status message against a queued document and writes it to the log.
 *
 * The message is always appended to the document's stored indexing history.
 * 'debug' messages reach the log only when debugging is enabled on the
 * indexer; every other level is forwarded to the logger method of that name.
 *
 * @param int $documentId
 * @param string $message
 * @param string $level This may be info, error, debug
 */
private function logPendingDocumentInfoStatus($documentId, $message, $level)
{
    $this->updatePendingDocumentStatus($documentId, $message, $level);
    global $default;

    if ($level == 'debug')
    {
        if ($this->debug)
        {
            $default->log->debug($message);
        }
        return;
    }

    // Any non-debug level maps directly onto the logger method with that name.
    $default->log->$level($message);
}
799   -
800   -
801   -
/**
 * Loads and instantiates a document extractor by class name.
 *
 * The class is expected in SEARCH2_INDEXER_DIR . 'extractors/<class>.inc.php'.
 *
 * @param string $extractorClass Name of the extractor class.
 * @return DocumentExtractor|null The extractor, or null when no class name is given.
 * @throws Exception When the include file is missing, the class is not defined
 *                   in it, or the class is not a DocumentExtractor.
 */
public function getExtractor($extractorClass)
{
    if (empty($extractorClass))
    {
        return null;
    }

    $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
    if (!file_exists($includeFile))
    {
        throw new Exception("Extractor file does not exist: $includeFile");
    }

    require_once($includeFile);

    if (!class_exists($extractorClass))
    {
        // The message previously interpolated an undefined variable ($classname).
        throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
    }

    $extractor = new $extractorClass();

    if (!($extractor instanceof DocumentExtractor))
    {
        // The message previously interpolated an undefined variable ($classname).
        throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
    }

    return $extractor;
}
831   -
/**
 * Returns the rows of the indexing queue for live documents (d.status_id=1).
 *
 * @param boolean $problemItemsOnly When true (default), only queue entries that
 *                                  carry a non-empty status message are returned;
 *                                  when false, only entries without one.
 * @return mixed Whatever DBUtil::getResultArray() yields for the query.
 */
public static function getIndexingQueue($problemItemsOnly=true)
{
    // The two variants of the query differ only in this predicate.
    if ($problemItemsOnly)
    {
        $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
    }
    else
    {
        $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
    }

    $sql = "SELECT
                iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
            FROM
                index_files iff
                INNER JOIN documents d ON iff.document_id=d.id
                INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
                INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
                INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
            WHERE
                $statusFilter AND d.status_id=1
            ORDER BY indexdate ";

    return DBUtil::getResultArray($sql);
}
869   -
/**
 * Returns the queue entries that have no status message recorded yet.
 *
 * @return mixed Result of getIndexingQueue() with problem items excluded.
 */
public static function getPendingIndexingQueue()
{
    $pendingItems = Indexer::getIndexingQueue(false);

    return $pendingItems;
}
874   -
/**
 * The main function that may be called repeatedly to index documents.
 *
 * Each call selects a batch of rows from index_files, stamps them with a
 * processdate, and then for every row extracts the document text with the
 * configured extractor and passes it to the indexer implementation
 * (indexDocument / indexDocumentAndDiscussion / indexDiscussion). Rows that
 * index successfully are removed from the queue; rows that fail stay queued
 * with their status history stored in index_files.status_msg.
 *
 * NOTE(review): getExtractor() can throw an Exception; it is not caught here,
 * so a missing or invalid extractor class aborts the rest of the batch.
 *
 * @param int $max Maximum number of queue entries to process. When null the
 *                 'indexer/batchDocuments' config value is used (default 20).
 */
public function indexDocuments($max=null)
{
    global $default;
    $config =& KTConfig::getSingleton();

    /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
    if (is_file($indexLockFile))
    {
        $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
        $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
        return;
    }
    touch($indexLockFile);*/


    // One-off mime type / extractor registration (no-op after the first run).
    $this->checkForRegisteredTypes();

    if ($this->debug) $default->log->debug('indexDocuments: start');
    if (!$this->doesDiagnosticsPass())
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
        return;
    }

    if (is_null($max))
    {
        $max = $config->get('indexer/batchDocuments',20);
    }

    $this->loadExtractorHooks();

    Indexer::clearoutDeleted();

    $date = date('Y-m-d H:i:s');
    // identify the indexers that must run
    // mysql specific limit!
    // Rows qualify when they have never been processed or their processdate is
    // older than the cut-off computed from $date (presumably one day - confirm
    // against the database's date arithmetic).
    $sql = "SELECT
                iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
            FROM
                index_files iff
                INNER JOIN documents d ON iff.document_id=d.id
                INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
                INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
                INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
            WHERE
                (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1
            ORDER BY indexdate
            LIMIT $max";
    $result = DBUtil::getResultArray($sql);
    if (PEAR::isError($result))
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
        return;
    }
    // Record when the indexer last ran, even if there turns out to be no work.
    KTUtil::setSystemSetting('luceneIndexingDate', time());

    // bail if no work to do
    if (count($result) == 0)
    {
        //unlink($indexLockFile);
        if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
        return;
    }

    // identify any documents that need indexing and mark them
    // so they are not taken in a followup run
    $ids = array();
    foreach($result as $docinfo)
    {
        $ids[] = $docinfo['document_id'];
    }

    // mark the documents as being processed

    $ids=implode(',',$ids);
    $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
    DBUtil::runQuery($sql);

    // Extractor instances are reused across documents within this batch.
    $extractorCache = array();
    $storageManager = KTStorageManagerUtil::getSingleton();

    $tempPath = $config->get("urls/tmpDirectory");

    foreach($result as $docinfo)
    {
        $docId=$docinfo['document_id'];
        $extension=$docinfo['filetypes'];
        $mimeType=$docinfo['mimetypes'];
        $extractorClass=$docinfo['extractor'];
        // 'what' selects the work: 'A' = content and discussion, 'C' = content only, 'D' = discussion only.
        $indexDocument = in_array($docinfo['what'], array('A','C'));
        $indexDiscussion = in_array($docinfo['what'], array('A','D'));
        // The status history is kept per document, so start it afresh.
        $this->indexingHistory = '';

        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');

        if (empty($extractorClass))
        {
            /*

            if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.

            */
            if ($indexDiscussion)
            {
                $indexDocument = false;
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
            }
            else
            {
                Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
                continue;
            }
        }
        else
        {
            /*

            If an extractor is available, we must ensure it is enabled.

            */

            if (!$this->isExtractorEnabled($extractorClass))
            {
                // The row stays in the queue; only its status message is updated.
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
                continue;
            }
        }

        if ($this->debug)
        {
            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
        }

        $document = Document::get($docId);
        if (PEAR::isError($document))
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
            continue;
        }

        $filename = $document->getFileName();
        if (substr($filename,0,1) == '~')
        {
            Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
            continue;
        }

        $removeFromQueue = true;
        if ($indexDocument)
        {
            if (array_key_exists($extractorClass, $extractorCache))
            {
                $extractor = $extractorCache[$extractorClass];
            }
            else
            {
                $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
            }

            if (!($extractor instanceof DocumentExtractor))
            {
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
                continue;
            }



            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
            $sourceFile = $storageManager->temporaryFile($document);

            if (empty($sourceFile) || !is_file($sourceFile))
            {
                Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
                continue;
            }

            if ($extractor->needsIntermediateSourceFile())
            {
                // Work on a copy in the temp directory named after the document id and the real file extension.
                $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);

                $intermediate = $tempPath . '/'. $docId . '.' . $extension;
                // NOTE: this reuses the name $result for the copy status while the
                // surrounding foreach is iterating the query result of the same name.
                $result = @copy($sourceFile, $intermediate);
                if ($result === false)
                {
                    $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
                    // problem. lets try again later. probably permission related. log the issue.
                    continue;
                }
                $sourceFile = $intermediate;
            }

            // The extractor writes the extracted text to this file.
            $targetFile = tempnam($tempPath, 'ktindexer');

            $extractor->setSourceFile($sourceFile);
            $extractor->setMimeType($mimeType);
            $extractor->setExtension($extension);
            $extractor->setTargetFile($targetFile);
            $extractor->setDocument($document);
            $extractor->setIndexingStatus(null);
            $extractor->setExtractionStatus(null);

            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');

            // Hooks are run twice: once without a mime type and once with the document's mime type.
            $this->executeHook($extractor, 'pre_extract');
            $this->executeHook($extractor, 'pre_extract', $mimeType);
            // From here on the document is only unqueued if indexing reports success.
            $removeFromQueue = false;

            if ($extractor->extractTextContent())
            {
                // the extractor may need to create another target file
                $targetFile = $extractor->getTargetFile();

                $extractor->setExtractionStatus(true);
                $this->executeHook($extractor, 'pre_index');
                $this->executeHook($extractor, 'pre_index', $mimeType);

                $title = $document->getName();
                if ($indexDiscussion)
                {
                    if (!$this->filterText($targetFile))
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                    }
                    else
                    {
                        $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
                        $removeFromQueue = $indexStatus;
                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
                        }

                        $extractor->setIndexingStatus($indexStatus);
                    }
                }
                else
                {
                    if (!$this->filterText($targetFile))
                    {
                        $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
                    }
                    else
                    {
                        $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
                        $removeFromQueue = $indexStatus;

                        if (!$indexStatus)
                        {
                            $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
                            $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
                        }

                        $extractor->setIndexingStatus($indexStatus);
                    }
                }

                // Post hooks run in the opposite order to the pre hooks: mime-specific first.
                $this->executeHook($extractor, 'post_index', $mimeType);
                $this->executeHook($extractor, 'post_index');
            }
            else
            {
                $extractor->setExtractionStatus(false);
                $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
                $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
            }

            $this->executeHook($extractor, 'post_extract', $mimeType);
            $this->executeHook($extractor, 'post_extract');

            // Clean up the intermediate copy (if one was made) and the extracted text file.
            if ($extractor->needsIntermediateSourceFile())
            {
                @unlink($sourceFile);
            }

            @unlink($targetFile);

        }
        else
        {
            // No content indexing requested or possible: index the discussion only.
            $indexStatus = $this->indexDiscussion($docId);
            $removeFromQueue = $indexStatus;
        }

        if ($removeFromQueue)
        {
            Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
        }
        else
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
        }
    }
    if ($this->debug) $default->log->debug('indexDocuments: done');
    //unlink($indexLockFile);
}
1178   -
/**
 * Migrates text stored in the document_text table into the index.
 *
 * Documents are processed in up to five batches per call. A row is deleted
 * from document_text once its text has been indexed successfully, or when its
 * document can no longer be resolved. A lock file in the cache directory stops
 * concurrent runs. When a batch query returns no rows the migration is marked
 * complete and the 'Index Migration' scheduler entry is removed.
 *
 * @param int $max Approximate number of documents to migrate in this call.
 *                 When null the 'indexer/batchMigrateDocument' config value
 *                 is used (default 500).
 */
public function migrateDocuments($max=null)
{
    global $default;

    $default->log->info(_kt('migrateDocuments: starting'));

    if (!$this->doesDiagnosticsPass(true))
    {
        $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
        return;
    }

    if (KTUtil::getSystemSetting('migrationComplete') == 'true')
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
        return;
    }

    $config =& KTConfig::getSingleton();
    if (is_null($max))
    {
        $max = $config->get('indexer/batchMigrateDocument',500);
    }

    // $tempPath was previously never initialised in this method, so tempnam()
    // below was called with an undefined directory. Use the same temp directory
    // as indexDocuments().
    $tempPath = $config->get("urls/tmpDirectory");

    $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
    if (is_file($lockFile))
    {
        $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
        return;
    }
    touch($lockFile);

    $startTime = KTUtil::getSystemSetting('migrationStarted');
    if (is_null($startTime))
    {
        KTUtil::setSystemSetting('migrationStarted', time());
    }

    // The requested total is spread over a fixed number of smaller queries.
    $maxLoops = 5;

    $max = ceil($max / $maxLoops);

    $start =KTUtil::getBenchmarkTime();
    $noDocs = false;
    $numDocs = 0;

    for($loop=0;$loop<$maxLoops;$loop++)
    {

        $sql = "SELECT
                    document_id, document_text
                FROM
                    document_text
                ORDER BY document_id
                LIMIT $max";
        $result = DBUtil::getResultArray($sql);
        if (PEAR::isError($result))
        {
            $default->log->info(_kt('migrateDocuments: db error'));
            break;
        }

        $docs = count($result);
        if ($docs == 0)
        {
            // Nothing left in document_text: the migration has finished.
            $noDocs = true;
            break;
        }
        $numDocs += $docs;

        foreach($result as $docinfo)
        {
            $docId = $docinfo['document_id'];

            $document = Document::get($docId);
            if (PEAR::isError($document) || is_null($document))
            {
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
                $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                continue;
            }

            $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

            $targetFile = tempnam($tempPath, 'ktindexer');

            if (file_put_contents($targetFile, $docinfo['document_text']) === false)
            {
                $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                // Do not leave the (possibly empty) temp file behind.
                @unlink($targetFile);
                continue;
            }
            // free memory asap ;)
            unset($docinfo['document_text']);

            $title = $document->getName();

            $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

            if ($indexStatus)
            {
                // Only drop the stored text once it is safely in the index.
                $sql = "DELETE FROM document_text WHERE document_id=$docId";
                DBUtil::runQuery($sql);
            }
            else
            {
                $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
            }

            @unlink($targetFile);
        }
    }

    @unlink($lockFile);

    $time = KTUtil::getBenchmarkTime() - $start;

    // Accumulate running totals across calls.
    KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
    KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

    $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
    if ($noDocs)
    {
        $default->log->info(_kt('migrateDocuments: Completed!'));
        KTUtil::setSystemSetting('migrationComplete', 'true');
        schedulerUtil::deleteByName('Index Migration');
        $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
    }
}
1308   -
/**
 * Index a document. Concrete indexer subclasses must implement this method.
 *
 * Callers in this class treat the return value as a success flag: a truthy
 * result means the document was indexed.
 *
 * @param int $docId
 * @param string $textFile Path of the file holding the extracted text.
 * @param string $title Title to index the document under.
 * @param string $version Version string; indexDocuments() passes "major.minor".
 */
protected abstract function indexDocument($docId, $textFile, $title, $version);
1316   -
1317   -
/**
 * Re-indexes a document from text supplied by the caller.
 *
 * The text is written to a temporary file, passed to indexDocument() and the
 * temporary file is removed afterwards.
 *
 * @param int $docId
 * @param string $text Text content to index for the document.
 * @return mixed The result of indexDocument(), or false when the document
 *               cannot be resolved or the temporary file cannot be written.
 */
public function updateDocumentIndex($docId, $text)
{
    // Resolve the document first so a lookup failure neither creates a temp
    // file nor ends in a method call on a PEAR error object.
    $document = Document::get($docId);
    if (PEAR::isError($document) || is_null($document))
    {
        return false;
    }

    $config = KTConfig::getSingleton();
    $tempPath = $config->get("urls/tmpDirectory");
    $tempFile = tempnam($tempPath,'ud_');
    if ($tempFile === false)
    {
        return false;
    }

    if (file_put_contents($tempFile, $text) === false)
    {
        @unlink($tempFile);
        return false;
    }

    $title = $document->getDescription();
    $version = $document->getVersion();

    $result = $this->indexDocument($docId, $tempFile, $title, $version);

    if (file_exists($tempFile))
    {
        unlink($tempFile);
    }

    return $result;
}
1339   -
/**
 * Index a discussion. Concrete indexer subclasses must implement this method.
 *
 * indexDocuments() treats the return value as a success flag when deciding
 * whether to remove the document from the queue.
 *
 * @param int $docId
 */
protected abstract function indexDiscussion($docId);
1346   -
/**
 * Diagnose the indexer. e.g. Check that the indexing server is running.
 *
 * The diagnostics check in this class treats a null return as "no problem";
 * any other value is logged as the problem description.
 */
public abstract function diagnose();
1352   -
/**
 * Runs the diagnostics of all extractors and hooks.
 *
 * @return array Problems found in the extractor directory followed by those
 *               found in the hook directory, merged into one array.
 */
public function diagnoseExtractors()
{
    $extractorProblems = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
    $hookProblems = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');

    return array_merge($extractorProblems, $hookProblems);
}
1365   -
/**
 * Shared diagnose routine for extractors and hooks.
 *
 * Every file in $path whose name ends in $extension is included; the class
 * named after the file (file name minus '.inc.php') is instantiated and, if it
 * is enabled, of the expected base class and supports at least one mime type,
 * its own diagnose() is called. Non-empty diagnoses are collected.
 *
 * @param string $path Directory to scan.
 * @param string $baseclass Class or interface each discovered class must be an instance of.
 * @param string $extension File name suffix that identifies candidate files.
 * @return array Map of class name => array('name' => display name, 'diagnosis' => diagnosis).
 */
private function _diagnose($path, $baseclass, $extension)
{
    global $default;

    $diagnoses = array();

    $dir = opendir(SearchHelper::correctPath($path));
    if ($dir === false)
    {
        // Without this guard readdir()/closedir() would be handed a non-resource.
        $default->log->error(sprintf(_kt("diagnose: could not open directory '%s'."), $path));
        return $diagnoses;
    }

    $extlen = - strlen($extension);

    while (($file = readdir($dir)) !== false)
    {
        // Skip '.', '..' and hidden files.
        if (substr($file,0,1) == '.')
        {
            continue;
        }
        if (substr($file,$extlen) != $extension)
        {
            $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
            continue;
        }

        require_once($path . '/' . $file);

        // Strip the trailing '.inc.php' (8 characters) to get the class name.
        $class = substr($file, 0, -8);
        if (!class_exists($class))
        {
            $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
            continue;
        }

        if (!$this->isExtractorEnabled($class))
        {
            $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
            continue;
        }

        $extractor = new $class();
        if (!is_a($extractor, $baseclass))
        {
            // Report the base class actually being checked; this routine is
            // also used for hooks, not only for DocumentExtractor.
            $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type %s"), $class, $baseclass));
            continue;
        }

        $types = $extractor->getSupportedMimeTypes();
        if (empty($types))
        {
            if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
            continue;
        }

        $diagnosis=$extractor->diagnose();
        if (empty($diagnosis))
        {
            // Nothing to report for this class.
            continue;
        }
        $diagnoses[$class] = array(
            'name'=>$extractor->getDisplayName(),
            'diagnosis'=>$diagnosis
        );

    }
    closedir($dir);

    return $diagnoses;
}
1439   -
1440   -
/**
 * Register the extractor types.
 *
 * Scans the extractor directory for files ending in 'Extractor.inc.php',
 * includes each one and asks every DocumentExtractor found to register its
 * mime types.
 *
 * @param boolean $clear Optional. Defaults to false. When true the existing
 *                       extractor registrations are cleared first.
 */
public function registerTypes($clear=false)
{
    if ($clear)
    {
        $this->clearExtractors();
    }

    $dir = opendir(SearchHelper::correctPath($this->extractorPath));
    if ($dir === false)
    {
        // Without this guard readdir()/closedir() would be handed a non-resource.
        global $default;
        $default->log->error(sprintf(_kt("registerTypes: could not open directory '%s'."), $this->extractorPath));
        return;
    }

    while (($file = readdir($dir)) !== false)
    {
        if (substr($file,-17) == 'Extractor.inc.php')
        {
            require_once($this->extractorPath . '/' . $file);
            // Strip the trailing '.inc.php' (8 characters) to get the class name.
            $class = substr($file, 0, -8);

            if (!class_exists($class))
            {
                // if the class does not exist, we can't do anything.
                continue;
            }

            $extractor = new $class;
            if ($extractor instanceof DocumentExtractor)
            {
                $extractor->registerMimeTypes();
            }
        }
    }
    closedir($dir);
}
1475   -
/**
 * Indexes both the document content and its discussion.
 *
 * This default implementation simply calls indexDocument() and then
 * indexDiscussion(); subclasses may override it as an optimisation.
 *
 * Callers (indexDocuments, migrateDocuments) use the return value as the
 * indexing status, so the combined result is returned. Previously nothing was
 * returned, which made every call look like a failure.
 *
 * @param int $docId
 * @param string $textFile Path of the file holding the extracted text.
 * @param string $title
 * @param string $version
 * @return boolean True only when both the document and the discussion were indexed.
 */
protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
{
    // Both steps are always attempted, as before; only the result is new.
    $documentIndexed = $this->indexDocument($docId, $textFile, $title, $version);
    $discussionIndexed = $this->indexDiscussion($docId);

    return $documentIndexed && $discussionIndexed;
}
1487   -
/**
 * Deletes a document's entry from the indexing queue (index_files table).
 * Normally called once the document has been processed.
 *
 * @param int $docid
 * @param string|boolean $reason Optional text to log; false (default) logs nothing.
 * @param string $level Name of the logger method used for the message. Defaults to 'debug'.
 */
public static function unqueueDocument($docid, $reason=false, $level='debug')
{
    DBUtil::runQuery("DELETE FROM index_files WHERE document_id=$docid");

    if ($reason === false)
    {
        return;
    }

    global $default;
    $default->log->$level("Indexer: removing document $docid from the queue - $reason");
}
1503   -
/**
 * Run a query on the index. Concrete indexer subclasses must implement this method.
 *
 * @param string $query
 * @return array
 */
public abstract function query($query);
1511   -
/**
 * Encodes an integer as a fixed-width string that compares in numeric order
 * and can be decoded again with stringToLong().
 *
 * The value is left-padded with zeros to 14 characters and every digit 0-9 is
 * then replaced by the letter a-j.
 *
 * @param int $int
 * @return string
 */
public static function longToString($int)
{
    $width = 14;

    // Values already $width characters or longer are left unpadded.
    $padded = str_pad("$int", $width, '0', STR_PAD_LEFT);

    return strtr($padded, '0123456789', 'abcdefghij');
}
1528   -
/**
 * Decodes a string produced by longToString() back into a number.
 *
 * Every letter a-j is mapped back to the digit 0-9 and the result is coerced
 * to a number.
 *
 * @param string $str
 * @return int
 */
public static function stringToLong($str)
{
    $digits = strtr($str, 'abcdefghij', '0123456789');

    // Adding zero turns the numeric string into a number (dropping leading zeros).
    return $digits + 0;
}
1544   -
/**
 * Hook for optimising the index. This base implementation only records the
 * current time in the 'luceneOptimisationDate' system setting.
 * Overriding implementations must call the parent.
 */
public function optimise()
{
    $optimisedAt = time();
    KTUtil::setSystemSetting('luceneOptimisationDate', $optimisedAt);
}
1554   -
/**
 * Shuts down the indexer.
 *
 * The base implementation is intentionally empty; it only provides the method
 * so that callers can invoke shutdown() on any indexer.
 */
public function shutdown()
{
    // do nothing generally
}
1563   -
/**
 * Returns the name of the indexer. Concrete indexer subclasses must implement this method.
 *
 * @return string
 */
public abstract function getDisplayName();
1570   -
1571   -
/**
 * Returns the number of non-deleted documents in the index.
 * Concrete indexer subclasses must implement this method.
 *
 * @return int
 */
public abstract function getDocumentsInIndex();
1578   -
/**
 * Returns the index directory, read from the 'indexer/luceneDirectory'
 * configuration entry.
 *
 * @return string
 */
public function getIndexDirectory()
{
    return KTConfig::getSingleton()->get('indexer/luceneDirectory');
}
1590   -}
1591   -
1592   -?>
  1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
  40 +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
  41 +require_once('indexing/extractorCore.inc.php');
  42 +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +
  44 +
  45 +class IndexerInconsistencyException extends Exception {};
  46 +
/**
 * One search hit, enriched with repository metadata read from the database.
 *
 * The constructor stores the values handed over by the caller and then
 * calls loadDocumentInfo(), which fills the remaining fields from the
 * documents / metadata / content version tables. All values are read through
 * the magic accessor (e.g. $item->Title, $item->Rank); only Rank, Title and
 * Text may be written.
 */
class QueryResultItem
{
    protected $document_id;
    protected $title;
    protected $rank;
    protected $text;
    protected $filesize;
    protected $fullpath;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;

    /**
     * @param int         $document_id Id of the document (cast to int).
     * @param float|null  $rank        Relevance of the hit.
     * @param string|null $title       Title; overwritten by loadDocumentInfo().
     * @param string|null $text        Text excerpt of the hit.
     * @throws IndexerInconsistencyException when no database row matches the id.
     * @throws Exception on a database error.
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null)
    {
        $this->document_id=(int) $document_id;
        $this->rank= $rank;
        $this->title=$title;
        $this->text = $text;
        $this->live = true;
        $this->loadDocumentInfo();
    }

    /**
     * isset() support for DocumentID, Rank, Text and Title.
     *
     * Magic methods must be public; PHP warns about any other visibility.
     *
     * @param string $property
     * @return boolean
     * @throws Exception for any other property name.
     */
    public function __isset($property)
    {
        switch($property)
        {
            case 'DocumentID': return isset($this->document_id);
            case 'Rank': return isset($this->rank);
            case 'Text': return isset($this->text);
            case 'Title': return isset($this->title);
            case null: break;
            default:
                throw new Exception("Unknown property '$property' to get on QueryResultItem");
        }
        return true; // should not be reached
    }

    /**
     * Load the document's repository metadata into this item.
     *
     * @throws Exception when the query returns a PEAR error.
     * @throws IndexerInconsistencyException when the query returns no row.
     */
    public function loadDocumentInfo()
    {
        global $default;
        // document_id was cast to int in the constructor, so it is safe to interpolate.
        $sql = "SELECT
                    d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
                    dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
                    mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
                    cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
                    mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
                FROM
                    documents d
                    INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
                    INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
                    INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                    LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
                    LEFT JOIN folders f ON f.id=d.folder_id
                    LEFT JOIN users cou ON d.checked_out_user_id=cou.id
                    LEFT JOIN workflows w ON dmv.workflow_id=w.id
                    LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
                    LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
                    LEFT JOIN users mbu ON d.modified_user_id=mbu.id
                    LEFT JOIN users cbu ON d.creator_id=cbu.id
                    LEFT JOIN users ou ON d.owner_id=ou.id
                WHERE
                    d.id=$this->document_id";

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(_kt($msg));
        }

        // document_id, relevance, text, title

        $this->documentType = $result['document_type'];
        $this->filename=$result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // The folder is LEFT JOINed: a null folder name means the folder row is missing.
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    /**
     * Read accessor for the virtual properties of a result item.
     *
     * Magic methods must be public; PHP warns about any other visibility.
     *
     * @param string $property
     * @return mixed
     * @throws Exception for an unknown property name.
     */
    public function __get($property)
    {
        switch($property)
        {
            case null: return '';
            case 'DocumentID': return (int) $this->document_id;
            case 'Relevance':
            case 'Rank': return (float) $this->rank;
            case 'Text': return (string) $this->text;
            case 'Title': return (string) $this->title;
            case 'FullPath': return (string) $this->fullpath;
            case 'IsLive': return (bool) $this->live;
            case 'Filesize': return $this->filesize;
            case 'Version': return (string) $this->version;
            case 'Filename': return (string)$this->filename;
            case 'FolderId': return (int)$this->folderId;
            case 'OemDocumentNo': return (string) $this->oemDocumentNo;
            case 'Document':
                // Fetched on first use and cached on the item.
                if (is_null($this->document))
                {
                    $this->document = Document::get($this->document_id);
                }
                return $this->document;
            case 'IsAvailable':
                return $this->Document->isLive();
            case 'CheckedOutUser':
            case 'CheckedOutBy':
                return (string) $this->checkedOutUser;
            case 'WorkflowOnly':
            case 'Workflow':
                return (string)$this->workflow;
            case 'WorkflowStateOnly':
            case 'WorkflowState':
                return (string)$this->workflowState;
            case 'WorkflowAndState':
                if (is_null($this->workflow))
                {
                    return '';
                }
                return "$this->workflow - $this->workflowState";
            case 'MimeType':
                return (string) $this->mimeType;
            case 'MimeIconPath':
                return (string) $this->mimeIconPath;
            case 'MimeDisplay':
                return (string) $this->mimeDisplay;
            case 'DateCheckedOut':
                return (string) $this->dateCheckedout;
            case 'ModifiedBy':
                return (string) $this->modifiedBy;
            case 'DateModified':
                return (string) $this->dateModified;
            case 'CreatedBy':
                return (string) $this->createdBy;
            case 'DateCreated':
                return (string) $this->dateCreated;
            case 'Owner':
            case 'OwnedBy':
                return (string) $this->owner;
            case 'IsImmutable':
            case 'Immutable':
                return (bool) $this->immutable;
            case 'Status':
                return $this->status;
            case 'StoragePath':
                return $this->storagePath;
            case 'DocumentType':
                return $this->documentType;
            case 'Permissions':
                return 'not available';
            case 'CanBeReadByUser':
                if (!$this->live)
                    return false;
                if (Permission::userHasDocumentReadPermission($this->Document))
                    return true;
                if (Permission::adminIsInAdminMode())
                    return true;
                return false;
            default:
                throw new Exception("Unknown property '$property' to get on QueryResultItem");
        }
        return ''; // Should not be reached
    }

    /**
     * Write accessor; only Rank, Title and Text may be set.
     *
     * Magic methods must be public; PHP warns about any other visibility.
     *
     * @param string $property
     * @param mixed  $value
     * @throws Exception for any other property name.
     */
    public function __set($property, $value)
    {
        switch($property)
        {
            // No thousands separator: __get() casts the stored string back to
            // float, and "1,234.50" would be read as 1.0.
            case 'Rank': $this->rank = number_format($value,2,'.',''); break;
            case 'Title': $this->title = $value; break;
            case 'Text': $this->text = $value; break;
            default:
                throw new Exception("Unknown property '$property' to set on QueryResultItem");
        }
    }
}
  291 +
/**
 * Comparison callback that orders two result items by ascending Rank.
 *
 * @param object $a Item exposing a Rank property.
 * @param object $b Item exposing a Rank property.
 * @return int -1, 0 or 1.
 */
function MatchResultCompare($a, $b)
{
    $left = $a->Rank;
    $right = $b->Rank;

    if ($left != $right) {
        return ($left < $right) ? -1 : 1;
    }
    return 0;
}
  299 +
  300 +abstract class Indexer
  301 +{
  302 + /**
  303 + * Cache of extractors
  304 + *
  305 + * @var array
  306 + */
  307 + private $extractorCache;
  308 +
  309 + /**
  310 + * Indicates if the indexer will do logging.
  311 + *
  312 + * @var boolean
  313 + */
  314 + private $debug;
  315 + /**
  316 + * Cache on mime related hooks
  317 + *
  318 + * @var unknown_type
  319 + */
  320 + private $mimeHookCache;
  321 + /**
  322 + * Cache on general hooks.
  323 + *
  324 + * @var array
  325 + */
  326 + private $generalHookCache;
  327 +
  328 + /**
  329 + * This is a path to the extractors.
  330 + *
  331 + * @var string
  332 + */
  333 + private $extractorPath;
  334 + /**
  335 + * This is a path to the hooks.
  336 + *
  337 + * @var string
  338 + */
  339 + private $hookPath;
  340 +
  341 + private $enabledExtractors;
  342 +
  343 + /**
  344 + * Initialise the indexer
  345 + *
  346 + */
  347 + protected function __construct()
  348 + {
  349 + $config = KTConfig::getSingleton();
  350 +
  351 + $this->extractorCache = array();
  352 + $this->debug = $config->get('indexer/debug', true);
  353 + $this->hookCache = array();
  354 + $this->generalHookCache = array();
  355 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  356 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  357 +
  358 + $this->loadExtractorStatus();
  359 + }
  360 +
  361 + /**
  362 + * Get the list if enabled extractors
  363 + *
  364 + */
  365 + private function loadExtractorStatus()
  366 + {
  367 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  368 + $rs = DBUtil::getResultArray($sql);
  369 + $this->enabledExtractors = array();
  370 + foreach($rs as $item)
  371 + {
  372 + $this->enabledExtractors[] = $item['name'];
  373 + }
  374 + }
  375 +
  376 + private function isExtractorEnabled($extractor)
  377 + {
  378 + return in_array($extractor, $this->enabledExtractors);
  379 + }
  380 +
  381 + /**
  382 + * Returns a reference to the main class
  383 + *
  384 + * @return Indexer
  385 + */
  386 + public static function get()
  387 + {
  388 + static $singleton = null;
  389 +
  390 + if (is_null($singleton))
  391 + {
  392 + $config = KTConfig::getSingleton();
  393 + $classname = $config->get('indexer/coreClass');
  394 +
  395 + require_once('indexing/indexers/' . $classname . '.inc.php');
  396 +
  397 + if (!class_exists($classname))
  398 + {
  399 + throw new Exception("Class '$classname' does not exist.");
  400 + }
  401 +
  402 + $singleton = new $classname;
  403 + }
  404 +
  405 + return $singleton;
  406 + }
  407 +
  408 + public abstract function deleteDocument($docid);
  409 +
  410 + /**
  411 + * Remove the association of all extractors to mime types on the database.
  412 + *
  413 + */
  414 + public function clearExtractors()
  415 + {
  416 + global $default;
  417 +
  418 + $sql = "update mime_types set extractor_id=null";
  419 + DBUtil::runQuery($sql);
  420 +
  421 + $sql = "delete from mime_extractors";
  422 + DBUtil::runQuery($sql);
  423 +
  424 + if ($this->debug) $default->log->debug('clearExtractors');
  425 + }
  426 +
  427 + /**
  428 + * lookup the name of the extractor class based on the mime type.
  429 + *
  430 + * @param string $type
  431 + * @return string
  432 + */
  433 + public static function resolveExtractor($type)
  434 + {
  435 + global $default;
  436 + $sql = "select extractor from mime_types where filetypes='$type'";
  437 + $class = DBUtil::getOneResultKey($sql,'extractor');
  438 + if (PEAR::isError($class))
  439 + {
  440 + $default->log->error("resolveExtractor: cannot resolve $type");
  441 + return $class;
  442 + }
  443 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  444 + return $class;
  445 + }
  446 +
  447 + /**
  448 + * Return all the discussion text.
  449 + *
  450 + * @param int $docid
  451 + * @return string
  452 + */
  453 + public static function getDiscussionText($docid)
  454 + {
  455 + $sql = "SELECT
  456 + dc.subject, dc.body
  457 + FROM
  458 + discussion_threads dt
  459 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  460 + WHERE
  461 + dt.document_id=$docid";
  462 + $result = DBUtil::getResultArray($sql);
  463 + $text = '';
  464 +
  465 + foreach($result as $record)
  466 + {
  467 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  468 + }
  469 +
  470 + return $text;
  471 + }
  472 +
  473 + /**
  474 + * Schedule the indexing of a document.
  475 + *
  476 + * @param string $document
  477 + * @param string $what
  478 + */
  479 + public static function index($document, $what='A')
  480 + {
  481 + global $default;
  482 +
  483 + if (is_numeric($document))
  484 + {
  485 + $document = Document::get($document+0);
  486 + }
  487 +
  488 + if (PEAR::isError($document))
  489 + {
  490 + $default->log->error("index: Could not index document: " .$document->getMessage());
  491 + return;
  492 + }
  493 +
  494 + $document_id = $document->getId();
  495 + $userid=$_SESSION['userID'];
  496 + if (empty($userid)) $userid=1;
  497 +
  498 + // we dequeue the document so that there are no issues when enqueuing
  499 + Indexer::unqueueDocument($document_id);
  500 +
  501 + // enqueue item
  502 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  503 + DBUtil::runQuery($sql);
  504 +
  505 + $default->log->debug("index: Queuing indexing of $document_id");
  506 +
  507 + }
  508 +
  509 + private static function incrementCount()
  510 + {
  511 + // Get count from system settings
  512 + $count = Indexer::getIndexedDocumentCount();
  513 + $count = (int)$count + 1;
  514 + Indexer::updateIndexedDocumentCount($count);
  515 + }
  516 +
  517 + public static function getIndexedDocumentCount()
  518 + {
  519 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  520 + return (int) $count;
  521 + }
  522 +
  523 + public static function updateIndexedDocumentCount($cnt = 0)
  524 + {
  525 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  526 + }
  527 +
  528 + public static function reindexQueue()
  529 + {
  530 + $sql = "UPDATE index_files SET processdate = null";
  531 + DBUtil::runQuery($sql);
  532 + }
  533 +
  534 + public static function reindexDocument($documentId)
  535 + {
  536 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  537 + DBUtil::runQuery($sql);
  538 + }
  539 +
  540 +
  541 +
  542 + public static function indexAll()
  543 + {
  544 + $userid=$_SESSION['userID'];
  545 + if (empty($userid)) $userid=1;
  546 +
  547 + $sql = "DELETE FROM index_files";
  548 + DBUtil::runQuery($sql);
  549 +
  550 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  551 + DBUtil::runQuery($sql);
  552 + }
  553 +
  554 + /**
  555 + * Clearout the scheduling of documents that no longer exist.
  556 + *
  557 + */
  558 + public static function clearoutDeleted()
  559 + {
  560 + global $default;
  561 +
  562 + $sql = 'DELETE FROM
  563 + index_files
  564 + WHERE
  565 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  566 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  567 + DBUtil::runQuery($sql);
  568 +
  569 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  570 + }
  571 +
  572 +
  573 + /**
  574 + * Check if a document is scheduled to be indexed
  575 + *
  576 + * @param mixed $document This may be a document or document id
  577 + * @return boolean
  578 + */
  579 + public static function isDocumentScheduled($document)
  580 + {
  581 + if (is_numeric($document))
  582 + {
  583 + $docid = $document;
  584 + }
  585 + else if ($document instanceof Document)
  586 + {
  587 + $docid = $document->getId();
  588 + }
  589 + else
  590 + {
  591 + return false;
  592 + }
  593 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  594 + $result = DBUtil::getResultArray($sql);
  595 + return count($result) > 0;
  596 + }
  597 +
  598 + /**
  599 + * Filters text removing redundant characters such as continuous newlines and spaces.
  600 + *
  601 + * @param string $filename
  602 + */
  603 + private function filterText($filename)
  604 + {
  605 + $content = file_get_contents($filename);
  606 +
  607 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  608 + $tgt = array("\n","\n",' ',' ',' ');
  609 +
  610 + // shrink what is being stored.
  611 + do
  612 + {
  613 + $orig = $content;
  614 + $content = preg_replace($src, $tgt, $content);
  615 + } while ($content != $orig);
  616 +
  617 + return file_put_contents($filename, $content) !== false;
  618 + }
  619 +
  620 + /**
  621 + * Load hooks for text extraction process.
  622 + *
  623 + */
  624 + private function loadExtractorHooks()
  625 + {
  626 + $this->generalHookCache = array();
  627 + $this->mimeHookCache = array();
  628 +
  629 +
  630 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  631 + while (($file = readdir($dir)) !== false)
  632 + {
  633 + if (substr($file,-12) == 'Hook.inc.php')
  634 + {
  635 + require_once($this->hookPath . '/' . $file);
  636 + $class = substr($file, 0, -8);
  637 +
  638 + if (!class_exists($class))
  639 + {
  640 + continue;
  641 + }
  642 +
  643 + $hook = new $class;
  644 + if (!($class instanceof ExtractorHook))
  645 + {
  646 + continue;
  647 + }
  648 +
  649 + $mimeTypes = $hook->registerMimeTypes();
  650 + if (is_null($mimeTypes))
  651 + {
  652 + $this->generalHookCache[] = & $hook;
  653 + }
  654 + else
  655 + {
  656 + foreach($mimeTypes as $type)
  657 + {
  658 + $this->mimeHookCache[$type][] = & $hook;
  659 + }
  660 + }
  661 +
  662 + }
  663 + }
  664 + closedir($dir);
  665 + }
  666 +
  667 + /**
  668 + * This is a refactored function to execute the hooks.
  669 + *
  670 + * @param DocumentExtractor $extractor
  671 + * @param string $phase
  672 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  673 + */
  674 + private function executeHook($extractor, $phase, $mimeType = null)
  675 + {
  676 + $hooks = array();
  677 + if (is_null($mimeType))
  678 + {
  679 + $hooks = $this->generalHookCache;
  680 + }
  681 + else
  682 + {
  683 + if (array_key_exists($mimeType, $this->mimeHookCache))
  684 + {
  685 + $hooks = $this->mimeHookCache[$mimeType];
  686 + }
  687 + }
  688 + if (empty($hooks))
  689 + {
  690 + return;
  691 + }
  692 +
  693 + foreach($hooks as $hook)
  694 + {
  695 + $hook->$phase($extractor);
  696 + }
  697 + }
  698 +
  699 + private function doesDiagnosticsPass($simple=false)
  700 + {
  701 + global $default;
  702 +
  703 + $config =& KTConfig::getSingleton();
  704 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  705 + // this function will create the lockfile if an error is detected. It will be removed as soon
  706 + // as the problems with the indexer are removed.
  707 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  708 +
  709 + $diagnosis = $this->diagnose();
  710 + if (!is_null($diagnosis))
  711 + {
  712 + if (!is_file($lockFile))
  713 + {
  714 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  715 + }
  716 + touch($lockFile);
  717 + return false;
  718 + }
  719 +
  720 + if ($simple)
  721 + {
  722 + return true;
  723 + }
  724 +
  725 + $diagnosis = $this->diagnoseExtractors();
  726 + if (!empty($diagnosis))
  727 + {
  728 + if (!is_file($lockFile))
  729 + {
  730 + foreach($diagnosis as $diag)
  731 + {
  732 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  733 + }
  734 + }
  735 + touch($lockFile);
  736 + return false;
  737 + }
  738 +
  739 + if (is_file($lockFile))
  740 + {
  741 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  742 + unlink($lockFile);
  743 + }
  744 +
  745 + return true;
  746 + }
  747 +
  748 + /**
  749 + * This does the initial mime type association between mime types and text extractors
  750 + *
  751 + */
  752 + public function checkForRegisteredTypes()
  753 + {
  754 + global $default;
  755 +
  756 + // we are only doing this once!
  757 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  758 + if ($initRegistered)
  759 + {
  760 + return;
  761 + }
  762 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  763 +
  764 + $date = date('Y-m-d H:i');
  765 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  766 + DBUtil::runQuery($sql);
  767 +
  768 + $this->registerTypes(true);
  769 +
  770 + $disable = array(
  771 + OS_WINDOWS=>array('PSExtractor'),
  772 + OS_UNIX => array()
  773 + );
  774 +
  775 + $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];
  776 +
  777 + foreach($disableForOS as $extractor)
  778 + {
  779 + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
  780 + DBUtil::runQuery($sql);
  781 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  782 + }
  783 +
  784 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  785 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  786 + }
  787 +
  788 + private function updatePendingDocumentStatus($documentId, $message, $level)
  789 + {
  790 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  791 + $message = sanitizeForSQL($this->indexingHistory);
  792 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  793 + DBUtil::runQuery($sql);
  794 + }
  795 +
  796 + /**
  797 + *
  798 + * @param int $documentId
  799 + * @param string $message
  800 + * @param string $level This may be info, error, debug
  801 + */
  802 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  803 + {
  804 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  805 + global $default;
  806 +
  807 + switch ($level)
  808 + {
  809 + case 'debug':
  810 + if ($this->debug)
  811 + {
  812 + $default->log->debug($message);
  813 + }
  814 + break;
  815 + default:
  816 + $default->log->$level($message);
  817 + }
  818 + }
  819 +
  820 +
  821 +
  822 + public function getExtractor($extractorClass)
  823 + {
  824 + if (empty($extractorClass))
  825 + {
  826 + return null;
  827 + }
  828 +
  829 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  830 + if (!file_exists($includeFile))
  831 + {
  832 + throw new Exception("Extractor file does not exist: $includeFile");
  833 + }
  834 +
  835 + require_once($includeFile);
  836 +
  837 + if (!class_exists($extractorClass))
  838 + {
  839 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  840 + }
  841 +
  842 + $extractor = new $extractorClass();
  843 +
  844 + if (!($extractor instanceof DocumentExtractor))
  845 + {
  846 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  847 + }
  848 +
  849 + return $extractor;
  850 + }
  851 +
  852 + public static function getIndexingQueue($problemItemsOnly=true)
  853 + {
  854 +
  855 + if ($problemItemsOnly)
  856 + {
  857 + $sql = "SELECT
  858 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  859 + FROM
  860 + index_files iff
  861 + INNER JOIN documents d ON iff.document_id=d.id
  862 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  863 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  864 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  865 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  866 + WHERE
  867 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  868 + ORDER BY indexdate ";
  869 + }
  870 + else
  871 + {
  872 + $sql = "SELECT
  873 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  874 + FROM
  875 + index_files iff
  876 + INNER JOIN documents d ON iff.document_id=d.id
  877 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  878 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  879 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  880 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  881 + WHERE
  882 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  883 + ORDER BY indexdate ";
  884 + }
  885 + $aResult = DBUtil::getResultArray($sql);
  886 +
  887 + return $aResult;
  888 + }
  889 +
  890 + public static function getPendingIndexingQueue()
  891 + {
  892 + return Indexer::getIndexingQueue(false);
  893 + }
  894 +
  895 + /**
  896 + * The main function that may be called repeatedly to index documents.
  897 + *
  898 + * @param int $max Default 20
  899 + */
  900 + public function indexDocuments($max=null)
  901 + {
  902 + global $default;
  903 + $config =& KTConfig::getSingleton();
  904 +
  905 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  906 + if (is_file($indexLockFile))
  907 + {
  908 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  909 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  910 + return;
  911 + }
  912 + touch($indexLockFile);*/
  913 +
  914 +
  915 + $this->checkForRegisteredTypes();
  916 +
  917 + if ($this->debug) $default->log->debug('indexDocuments: start');
  918 + if (!$this->doesDiagnosticsPass())
  919 + {
  920 + //unlink($indexLockFile);
  921 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  922 + return;
  923 + }
  924 +
  925 + if (is_null($max))
  926 + {
  927 + $max = $config->get('indexer/batchDocuments',20);
  928 + }
  929 +
  930 + $this->loadExtractorHooks();
  931 +
  932 + Indexer::clearoutDeleted();
  933 +
  934 + $date = date('Y-m-d H:i:s');
  935 + // identify the indexers that must run
  936 + // mysql specific limit!
  937 + $sql = "SELECT
  938 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  939 + FROM
  940 + index_files iff
  941 + INNER JOIN documents d ON iff.document_id=d.id
  942 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  943 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  944 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  945 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  946 + WHERE
  947 + (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1
  948 + ORDER BY indexdate
  949 + LIMIT $max";
  950 + $result = DBUtil::getResultArray($sql);
  951 + if (PEAR::isError($result))
  952 + {
  953 + //unlink($indexLockFile);
  954 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  955 + return;
  956 + }
  957 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  958 +
  959 + // bail if no work to do
  960 + if (count($result) == 0)
  961 + {
  962 + //unlink($indexLockFile);
  963 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  964 + return;
  965 + }
  966 +
  967 + // identify any documents that need indexing and mark them
  968 + // so they are not taken in a followup run
  969 + $ids = array();
  970 + foreach($result as $docinfo)
  971 + {
  972 + $ids[] = $docinfo['document_id'];
  973 + }
  974 +
  975 + // mark the documents as being processed
  976 +
  977 + $ids=implode(',',$ids);
  978 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  979 + DBUtil::runQuery($sql);
  980 +
  981 + $extractorCache = array();
  982 + $storageManager = KTStorageManagerUtil::getSingleton();
  983 +
  984 + $tempPath = $config->get("urls/tmpDirectory");
  985 +
  986 + foreach($result as $docinfo)
  987 + {
  988 + // increment indexed documents count
  989 + Indexer::incrementCount();
  990 +
  991 + $docId=$docinfo['document_id'];
  992 + $extension=$docinfo['filetypes'];
  993 + $mimeType=$docinfo['mimetypes'];
  994 + $extractorClass=$docinfo['extractor'];
  995 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  996 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  997 + $this->indexingHistory = '';
  998 +
  999 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1000 +
  1001 + if (empty($extractorClass))
  1002 + {
  1003 + /*
  1004 +
  1005 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1006 +
  1007 + */
  1008 + if ($indexDiscussion)
  1009 + {
  1010 + $indexDocument = false;
  1011 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1012 + }
  1013 + else
  1014 + {
  1015 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1016 + continue;
  1017 + }
  1018 + }
  1019 + else
  1020 + {
  1021 + /*
  1022 +
  1023 + If an extractor is available, we must ensure it is enabled.
  1024 +
  1025 + */
  1026 +
  1027 + if (!$this->isExtractorEnabled($extractorClass))
  1028 + {
  1029 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1030 + continue;
  1031 + }
  1032 + }
  1033 +
  1034 + if ($this->debug)
  1035 + {
  1036 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1037 + }
  1038 +
  1039 + $document = Document::get($docId);
  1040 + if (PEAR::isError($document))
  1041 + {
  1042 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1043 + continue;
  1044 + }
  1045 +
  1046 + $filename = $document->getFileName();
  1047 + if (substr($filename,0,1) == '~')
  1048 + {
  1049 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1050 + continue;
  1051 + }
  1052 +
  1053 + $removeFromQueue = true;
  1054 + if ($indexDocument)
  1055 + {
  1056 + if (array_key_exists($extractorClass, $extractorCache))
  1057 + {
  1058 + $extractor = $extractorCache[$extractorClass];
  1059 + }
  1060 + else
  1061 + {
  1062 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1063 + }
  1064 +
  1065 + if (!($extractor instanceof DocumentExtractor))
  1066 + {
  1067 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1068 + continue;
  1069 + }
  1070 +
  1071 +
  1072 +
  1073 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1074 + $sourceFile = $storageManager->temporaryFile($document);
  1075 +
  1076 + if (empty($sourceFile) || !is_file($sourceFile))
  1077 + {
  1078 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1079 + continue;
  1080 + }
  1081 +
  1082 + if ($extractor->needsIntermediateSourceFile())
  1083 + {
  1084 + $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1085 +
  1086 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1087 + $result = @copy($sourceFile, $intermediate);
  1088 + if ($result === false)
  1089 + {
  1090 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1091 + // problem. lets try again later. probably permission related. log the issue.
  1092 + continue;
  1093 + }
  1094 + $sourceFile = $intermediate;
  1095 + }
  1096 +
  1097 + $targetFile = tempnam($tempPath, 'ktindexer');
  1098 +
  1099 + $extractor->setSourceFile($sourceFile);
  1100 + $extractor->setMimeType($mimeType);
  1101 + $extractor->setExtension($extension);
  1102 + $extractor->setTargetFile($targetFile);
  1103 + $extractor->setDocument($document);
  1104 + $extractor->setIndexingStatus(null);
  1105 + $extractor->setExtractionStatus(null);
  1106 +
  1107 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1108 +
  1109 + $this->executeHook($extractor, 'pre_extract');
  1110 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1111 + $removeFromQueue = false;
  1112 +
  1113 + if ($extractor->extractTextContent())
  1114 + {
  1115 + // the extractor may need to create another target file
  1116 + $targetFile = $extractor->getTargetFile();
  1117 +
  1118 + $extractor->setExtractionStatus(true);
  1119 + $this->executeHook($extractor, 'pre_index');
  1120 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1121 +
  1122 + $title = $document->getName();
  1123 + if ($indexDiscussion)
  1124 + {
  1125 + if (!$this->filterText($targetFile))
  1126 + {
  1127 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1128 + }
  1129 + else
  1130 + {
  1131 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1132 + $removeFromQueue = $indexStatus;
  1133 + if (!$indexStatus)
  1134 + {
  1135 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1136 + }
  1137 +
  1138 + $extractor->setIndexingStatus($indexStatus);
  1139 + }
  1140 + }
  1141 + else
  1142 + {
  1143 + if (!$this->filterText($targetFile))
  1144 + {
  1145 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1146 + }
  1147 + else
  1148 + {
  1149 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1150 + $removeFromQueue = $indexStatus;
  1151 +
  1152 + if (!$indexStatus)
  1153 + {
  1154 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1155 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1156 + }
  1157 +
  1158 + $extractor->setIndexingStatus($indexStatus);
  1159 + }
  1160 + }
  1161 +
  1162 + $this->executeHook($extractor, 'post_index', $mimeType);
  1163 + $this->executeHook($extractor, 'post_index');
  1164 + }
  1165 + else
  1166 + {
  1167 + $extractor->setExtractionStatus(false);
  1168 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1169 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1170 + }
  1171 +
  1172 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1173 + $this->executeHook($extractor, 'post_extract');
  1174 +
  1175 + if ($extractor->needsIntermediateSourceFile())
  1176 + {
  1177 + @unlink($sourceFile);
  1178 + }
  1179 +
  1180 + @unlink($targetFile);
  1181 +
  1182 + }
  1183 + else
  1184 + {
  1185 + $indexStatus = $this->indexDiscussion($docId);
  1186 + $removeFromQueue = $indexStatus;
  1187 + }
  1188 +
  1189 + if ($removeFromQueue)
  1190 + {
  1191 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1192 + }
  1193 + else
  1194 + {
  1195 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1196 + }
  1197 + }
  1198 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1199 + //unlink($indexLockFile);
  1200 + }
  1201 +
  1202 + public function migrateDocuments($max=null)
  1203 + {
  1204 + global $default;
  1205 +
  1206 + $default->log->info(_kt('migrateDocuments: starting'));
  1207 +
  1208 + if (!$this->doesDiagnosticsPass(true))
  1209 + {
  1210 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1211 + return;
  1212 + }
  1213 +
  1214 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1215 + {
  1216 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1217 + return;
  1218 + }
  1219 +
  1220 + $config =& KTConfig::getSingleton();
  1221 + if (is_null($max))
  1222 + {
  1223 + $max = $config->get('indexer/batchMigrateDocument',500);
  1224 + }
  1225 +
  1226 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1227 + if (is_file($lockFile))
  1228 + {
  1229 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1230 + return;
  1231 + }
  1232 + touch($lockFile);
  1233 +
  1234 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1235 + if (is_null($startTime))
  1236 + {
  1237 + KTUtil::setSystemSetting('migrationStarted', time());
  1238 + }
  1239 +
  1240 + $maxLoops = 5;
  1241 +
  1242 + $max = ceil($max / $maxLoops);
  1243 +
  1244 + $start =KTUtil::getBenchmarkTime();
  1245 + $noDocs = false;
  1246 + $numDocs = 0;
  1247 +
  1248 + for($loop=0;$loop<$maxLoops;$loop++)
  1249 + {
  1250 +
  1251 + $sql = "SELECT
  1252 + document_id, document_text
  1253 + FROM
  1254 + document_text
  1255 + ORDER BY document_id
  1256 + LIMIT $max";
  1257 + $result = DBUtil::getResultArray($sql);
  1258 + if (PEAR::isError($result))
  1259 + {
  1260 + $default->log->info(_kt('migrateDocuments: db error'));
  1261 + break;
  1262 + }
  1263 +
  1264 + $docs = count($result);
  1265 + if ($docs == 0)
  1266 + {
  1267 + $noDocs = true;
  1268 + break;
  1269 + }
  1270 + $numDocs += $docs;
  1271 +
  1272 + foreach($result as $docinfo)
  1273 + {
  1274 + $docId = $docinfo['document_id'];
  1275 +
  1276 + $document = Document::get($docId);
  1277 + if (PEAR::isError($document) || is_null($document))
  1278 + {
  1279 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1280 + DBUtil::runQuery($sql);
  1281 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1282 + continue;
  1283 + }
  1284 +
  1285 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1286 +
  1287 + $targetFile = tempnam($tempPath, 'ktindexer');
  1288 +
  1289 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1290 + {
  1291 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1292 + continue;
  1293 + }
  1294 + // free memory asap ;)
  1295 + unset($docinfo['document_text']);
  1296 +
  1297 + $title = $document->getName();
  1298 +
  1299 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1300 +
  1301 + if ($indexStatus)
  1302 + {
  1303 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1304 + DBUtil::runQuery($sql);
  1305 + }
  1306 + else
  1307 + {
  1308 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1309 + }
  1310 +
  1311 + @unlink($targetFile);
  1312 + }
  1313 + }
  1314 +
  1315 + @unlink($lockFile);
  1316 +
  1317 + $time = KTUtil::getBenchmarkTime() - $start;
  1318 +
  1319 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1320 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1321 +
  1322 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1323 + if ($noDocs)
  1324 + {
  1325 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1326 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1327 + schedulerUtil::deleteByName('Index Migration');
  1328 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1329 + }
  1330 + }
  1331 +
  1332 + /**
  1333 + * Index a document. The base class must override this function.
  1334 + *
  1335 + * @param int $docId
  1336 + * @param string $textFile
  1337 + */
  1338 + protected abstract function indexDocument($docId, $textFile, $title, $version);
  1339 +
  1340 +
  1341 + public function updateDocumentIndex($docId, $text)
  1342 + {
  1343 + $config = KTConfig::getSingleton();
  1344 + $tempPath = $config->get("urls/tmpDirectory");
  1345 + $tempFile = tempnam($tempPath,'ud_');
  1346 +
  1347 + file_put_contents($tempFile, $text);
  1348 +
  1349 + $document = Document::get($docId);
  1350 + $title = $document->getDescription();
  1351 + $version = $document->getVersion();
  1352 +
  1353 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1354 +
  1355 + if (file_exists($tempFile))
  1356 + {
  1357 + unlink($tempFile);
  1358 + }
  1359 +
  1360 + return $result;
  1361 + }
  1362 +
  1363 + /**
  1364 + * Index a discussion. The base class must override this function.
  1365 + *
  1366 + * @param int $docId
  1367 + */
  1368 + protected abstract function indexDiscussion($docId);
  1369 +
  1370 + /**
  1371 + * Diagnose the indexer. e.g. Check that the indexing server is running.
  1372 + *
  1373 + */
  1374 + public abstract function diagnose();
  1375 +
  1376 + /**
  1377 + * Diagnose the extractors.
  1378 + *
  1379 + * @return array
  1380 + */
  1381 + public function diagnoseExtractors()
  1382 + {
  1383 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1384 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1385 +
  1386 + return $diagnosis;
  1387 + }
  1388 +
  1389 + /**
  1390 + * This is a refactored diagnose function.
  1391 + *
  1392 + * @param string $path
  1393 + * @param string $class
  1394 + * @param string $extension
  1395 + * @return array
  1396 + */
  1397 + private function _diagnose($path, $baseclass, $extension)
  1398 + {
  1399 + global $default;
  1400 +
  1401 + $diagnoses = array();
  1402 +
  1403 + $dir = opendir(SearchHelper::correctPath($path));
  1404 + $extlen = - strlen($extension);
  1405 +
  1406 + while (($file = readdir($dir)) !== false)
  1407 + {
  1408 + if (substr($file,0,1) == '.')
  1409 + {
  1410 + continue;
  1411 + }
  1412 + if (substr($file,$extlen) != $extension)
  1413 + {
  1414 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1415 + continue;
  1416 + }
  1417 +
  1418 + require_once($path . '/' . $file);
  1419 +
  1420 + $class = substr($file, 0, -8);
  1421 + if (!class_exists($class))
  1422 + {
  1423 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1424 + continue;
  1425 + }
  1426 +
  1427 + if (!$this->isExtractorEnabled($class))
  1428 + {
  1429 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1430 + continue;
  1431 + }
  1432 +
  1433 + $extractor = new $class();
  1434 + if (!is_a($extractor, $baseclass))
  1435 + {
  1436 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1437 + continue;
  1438 + }
  1439 +
  1440 + $types = $extractor->getSupportedMimeTypes();
  1441 + if (empty($types))
  1442 + {
  1443 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1444 + continue;
  1445 + }
  1446 +
  1447 + $diagnosis=$extractor->diagnose();
  1448 + if (empty($diagnosis))
  1449 + {
  1450 + continue;
  1451 + }
  1452 + $diagnoses[$class] = array(
  1453 + 'name'=>$extractor->getDisplayName(),
  1454 + 'diagnosis'=>$diagnosis
  1455 + );
  1456 +
  1457 + }
  1458 + closedir($dir);
  1459 +
  1460 + return $diagnoses;
  1461 + }
  1462 +
  1463 +
  1464 + /**
  1465 + * Register the extractor types.
  1466 + *
  1467 + * @param boolean $clear. Optional. Defaults to false.
  1468 + */
  1469 + public function registerTypes($clear=false)
  1470 + {
  1471 + if ($clear)
  1472 + {
  1473 + $this->clearExtractors();
  1474 + }
  1475 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1476 + while (($file = readdir($dir)) !== false)
  1477 + {
  1478 + if (substr($file,-17) == 'Extractor.inc.php')
  1479 + {
  1480 + require_once($this->extractorPath . '/' . $file);
  1481 + $class = substr($file, 0, -8);
  1482 +
  1483 + if (!class_exists($class))
  1484 + {
  1485 + // if the class does not exist, we can't do anything.
  1486 + continue;
  1487 + }
  1488 +
  1489 + $extractor = new $class;
  1490 + if ($extractor instanceof DocumentExtractor)
  1491 + {
  1492 + $extractor->registerMimeTypes();
  1493 + }
  1494 + }
  1495 + }
  1496 + closedir($dir);
  1497 + }
  1498 +
  1499 + /**
  1500 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1501 + *
  1502 + * @param int $docId
  1503 + * @param string $textFile
  1504 + */
  1505 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1506 + {
  1507 + $this->indexDocument($docId, $textFile, $title, $version);
  1508 + $this->indexDiscussion($docId);
  1509 + }
  1510 +
  1511 + /**
  1512 + * Remove the document from the queue. This is normally called when it has been processed.
  1513 + *
  1514 + * @param int $docid
  1515 + */
  1516 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1517 + {
  1518 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1519 + DBUtil::runQuery($sql);
  1520 + if ($reason !== false)
  1521 + {
  1522 + global $default;
  1523 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1524 + }
  1525 + }
  1526 +
  1527 + /**
  1528 + * Run a query on the index.
  1529 + *
  1530 + * @param string $query
  1531 + * @return array
  1532 + */
  1533 + public abstract function query($query);
  1534 +
  1535 + /**
  1536 + * Converts an integer to a string that can be easily compared and reversed.
  1537 + *
  1538 + * @param int $int
  1539 + * @return string
  1540 + */
  1541 + public static function longToString($int)
  1542 + {
  1543 + $maxlen = 14;
  1544 +
  1545 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1546 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1547 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1548 +
  1549 + return str_replace($o29, $a2z, $l);
  1550 + }
  1551 +
  1552 + /**
  1553 + * Converts a string to an integer.
  1554 + *
  1555 + * @param string $str
  1556 + * @return int
  1557 + */
  1558 + public static function stringToLong($str)
  1559 + {
  1560 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1561 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1562 +
  1563 + $int = str_replace($a2z, $o29, $str) + 0;
  1564 +
  1565 + return $int;
  1566 + }
  1567 +
  1568 + /**
  1569 + * Possibly we can optimise indexes. This method must be overriden.
  1570 + * The new function must call the parent!
  1571 + *
  1572 + */
  1573 + public function optimise()
  1574 + {
  1575 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1576 + }
  1577 +
  1578 + /**
  1579 + * Shuts down the indexer
  1580 + *
  1581 + */
  1582 + public function shutdown()
  1583 + {
  1584 + // do nothing generally
  1585 + }
  1586 +
  1587 + /**
  1588 + * Returns the name of the indexer.
  1589 + *
  1590 + * @return string
  1591 + */
  1592 + public abstract function getDisplayName();
  1593 +
  1594 +
  1595 + /**
  1596 + * Returns the number of non-deleted documents in the index.
  1597 + *
  1598 + * @return int
  1599 + */
  1600 + public abstract function getDocumentsInIndex();
  1601 +
  1602 + /**
  1603 + * Returns the path to the index directory
  1604 + *
  1605 + * @return string
  1606 + */
  1607 + public function getIndexDirectory()
  1608 + {
  1609 + $config = KTConfig::getSingleton();
  1610 + $directory = $config->get('indexer/luceneDirectory');
  1611 + return $directory;
  1612 + }
  1613 +}
  1614 +
  1615 +?>
... ...