Commit 0b6032bbb3faf46c5823084cb417ce5666dbe16f

Authored by megan_w
1 parent 3a48a0f2

KTS-3440

"Restart open office periodically"
Fixed. A count is incremented each time a document is indexed (successfully or not); once the count exceeds 50 documents, Open Office is restarted and the count is reset.

Committed by: Megan Watson
Reviewed by: Conrad Vermuelen




git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@8634 c91229c3-7414-0410-bfa2-8a42b809f60b
bin/checkopenoffice.php
@@ -8,31 +8,31 @@ @@ -8,31 +8,31 @@
8 * Document Management Made Simple 8 * Document Management Made Simple
9 * Copyright (C) 2008 KnowledgeTree Inc. 9 * Copyright (C) 2008 KnowledgeTree Inc.
10 * Portions copyright The Jam Warehouse Software (Pty) Limited 10 * Portions copyright The Jam Warehouse Software (Pty) Limited
11 - * 11 + *
12 * This program is free software; you can redistribute it and/or modify it under 12 * This program is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU General Public License version 3 as published by the 13 * the terms of the GNU General Public License version 3 as published by the
14 * Free Software Foundation. 14 * Free Software Foundation.
15 - * 15 + *
16 * This program is distributed in the hope that it will be useful, but WITHOUT 16 * This program is distributed in the hope that it will be useful, but WITHOUT
17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 17 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 18 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19 * details. 19 * details.
20 - * 20 + *
21 * You should have received a copy of the GNU General Public License 21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>. 22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 - *  
24 - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco, 23 + *
  24 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
25 * California 94120-7775, or email info@knowledgetree.com. 25 * California 94120-7775, or email info@knowledgetree.com.
26 - * 26 + *
27 * The interactive user interfaces in modified source and object code versions 27 * The interactive user interfaces in modified source and object code versions
28 * of this program must display Appropriate Legal Notices, as required under 28 * of this program must display Appropriate Legal Notices, as required under
29 * Section 5 of the GNU General Public License version 3. 29 * Section 5 of the GNU General Public License version 3.
30 - * 30 + *
31 * In accordance with Section 7(b) of the GNU General Public License version 3, 31 * In accordance with Section 7(b) of the GNU General Public License version 3,
32 * these Appropriate Legal Notices must retain the display of the "Powered by 32 * these Appropriate Legal Notices must retain the display of the "Powered by
33 - * KnowledgeTree" logo and retain the original copyright notice. If the display of the 33 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
34 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices 34 * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
35 - * must display the words "Powered by KnowledgeTree" and retain the original 35 + * must display the words "Powered by KnowledgeTree" and retain the original
36 * copyright notice. 36 * copyright notice.
37 * Contributor( s): ______________________________________ 37 * Contributor( s): ______________________________________
38 */ 38 */
@@ -51,10 +51,25 @@ so for windows we use the win32 service status checks. @@ -51,10 +51,25 @@ so for windows we use the win32 service status checks.
51 // Check if the calling function requires a return value 51 // Check if the calling function requires a return value
52 $sGiveOutput = (isset($argv[1]) && $argv[1] == 'output') ? true : false; 52 $sGiveOutput = (isset($argv[1]) && $argv[1] == 'output') ? true : false;
53 53
  54 +// Check indexed document count
  55 +// If the number of indexed documents is greater than the set amount, restart open office
  56 +// this clears open office's memory usage
  57 +$resetPoint = 50; // todo: put in config
  58 +$count = Indexer::getIndexedDocumentCount();
  59 +
  60 +$restartOO = false;
  61 +if($count > $resetPoint){
  62 + $restartOO = true;
  63 +
  64 + // reset the count
  65 + Indexer::updateIndexedDocumentCount(0);
  66 + $default->log->debug('Check Open Office Task: Restarting open office.');
  67 +}
  68 +
54 // First we check the host:port to see if open office is running 69 // First we check the host:port to see if open office is running
55 $sCheckOO = SearchHelper::checkOpenOfficeAvailablity(); 70 $sCheckOO = SearchHelper::checkOpenOfficeAvailablity();
56 71
57 -if(empty($sCheckOO)){ 72 +if(empty($sCheckOO) && !$restartOO){
58 // If the check returns empty then it is available on that port so we exit 73 // If the check returns empty then it is available on that port so we exit
59 if($sGiveOutput){ 74 if($sGiveOutput){
60 echo 1; 75 echo 1;
@@ -62,21 +77,44 @@ if(empty($sCheckOO)){ @@ -62,21 +77,44 @@ if(empty($sCheckOO)){
62 exit; 77 exit;
63 } 78 }
64 79
65 -// Open office appears not to be running.  
66 - 80 +// Open office appears not to be running or requires a restart
67 if(OS_WINDOWS){ 81 if(OS_WINDOWS){
68 - // If this is vista, it might be being blocked, so we query the service  
69 $OOService = 'ktopenoffice'; 82 $OOService = 'ktopenoffice';
70 - $result = win32_query_service_status($OOService);  
71 -  
72 - if(is_array($result)){  
73 - $iProcessId = $result['ProcessId'];  
74 - if(!empty($iProcessId) && $iProcessId != 0){  
75 - // If there is a process id (PID) then open office is running so we exit  
76 - if($sGiveOutput){  
77 - echo 1; 83 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  84 +
  85 + if($restartOO){
  86 + // If Open office needs to be restarted - stop it here
  87 + $result_stop = win32_stop_service($OOService);
  88 +
  89 +
  90 + // Wait for the service to stop fully before trying to restart it
  91 + $continue = false;
  92 + $cnt = 0;
  93 + while($continue === false && $cnt < 15){
  94 + $result = win32_query_service_status($OOService);
  95 +
  96 + if(isset($result['ProcessId']) && $result['ProcessId'] != 0){
  97 + // If there is still a process id then the service has not stopped yet.
  98 + sleep(2);
  99 + $continue = false;
  100 + $cnt++;
  101 + }else{
  102 + $continue = true;
  103 + }
  104 + }
  105 + }else{
  106 + // If this is vista, checking the port may not work so we query the service
  107 + $result = win32_query_service_status($OOService);
  108 +
  109 + if(is_array($result)){
  110 + $iProcessId = $result['ProcessId'];
  111 + if(!empty($iProcessId) && $iProcessId != 0){
  112 + // If there is a process id (PID) then open office is running so we exit
  113 + if($sGiveOutput){
  114 + echo 1;
  115 + }
  116 + exit;
78 } 117 }
79 - exit;  
80 } 118 }
81 } 119 }
82 120
@@ -97,11 +135,14 @@ if(OS_WINDOWS){ @@ -97,11 +135,14 @@ if(OS_WINDOWS){
97 135
98 $default->log->debug('Check Open Office Task: Open office service could not be started. Error code '.$result2); 136 $default->log->debug('Check Open Office Task: Open office service could not be started. Error code '.$result2);
99 137
100 -  
101 // Attempt using the dmsctl batch script 138 // Attempt using the dmsctl batch script
102 $sPath = realpath('../../bin/dmsctl.bat'); 139 $sPath = realpath('../../bin/dmsctl.bat');
  140 +
103 if(file_exists($sPath)){ 141 if(file_exists($sPath)){
104 $sCmd = "\"$sPath\" start"; 142 $sCmd = "\"$sPath\" start";
  143 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  144 + $default->log->debug('Check Open Office Task: ' . $sCmd);
  145 +
105 $res = KTUtil::pexec($sCmd); 146 $res = KTUtil::pexec($sCmd);
106 147
107 $default->log->debug('Check Open Office Task: Attempted start using dmsctl.bat.'); 148 $default->log->debug('Check Open Office Task: Attempted start using dmsctl.bat.');
@@ -120,25 +161,39 @@ if(OS_WINDOWS){ @@ -120,25 +161,39 @@ if(OS_WINDOWS){
120 // If the OS is Unix or Linux 161 // If the OS is Unix or Linux
121 $sPath = realpath('../../dmsctl.sh'); 162 $sPath = realpath('../../dmsctl.sh');
122 if(file_exists($sPath)){ 163 if(file_exists($sPath)){
123 - $sCmd = "\"$sPath\" start";  
124 - KTUtil::pexec($sCmd); 164 + // If Open office needs to be restarted - stop it here
  165 + if($restartOO){
  166 + $sCmd = "\"$sPath\" restart soffice";
  167 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  168 + $default->log->debug('Check Open Office Task: ' . $sCmd);
125 169
126 - $default->log->debug('Check Open Office Task: Attempted start using dmsctl.sh.');  
127 - if($sGiveOutput){ 170 + KTUtil::pexec($sCmd);
  171 +
  172 + $default->log->debug('Check Open Office Task: Attempted restart using dmsctl.sh.');
  173 + }else{
  174 + $sCmd = "\"$sPath\" start soffice";
  175 + $default->log->debug('Check Open Office Task: ' . get_current_user());
  176 + $default->log->debug('Check Open Office Task: ' . $sCmd);
  177 +
  178 + KTUtil::pexec($sCmd);
  179 +
  180 + $default->log->debug('Check Open Office Task: Attempted start using dmsctl.sh.');
  181 + }
  182 + if($sGiveOutput){
128 echo 2; 183 echo 2;
129 } 184 }
130 exit; 185 exit;
131 }else{ 186 }else{
132 - $default->log->debug('Check Open Office Task: Can\'t find dmsctl.sh, this may be a source install.');  
133 - if($sGiveOutput){ 187 + $default->log->debug('Check Open Office Task: Can\'t find dmsctl.sh, this may be a source install.');
  188 + if($sGiveOutput){
134 echo 0; 189 echo 0;
135 } 190 }
136 exit; 191 exit;
137 - } 192 + }
138 } 193 }
139 $default->log->debug('Check Open Office Task: Can\'t start Open office, this may be a source install.'); 194 $default->log->debug('Check Open Office Task: Can\'t start Open office, this may be a source install.');
140 if($sGiveOutput){ 195 if($sGiveOutput){
141 echo 0; 196 echo 0;
142 } 197 }
143 exit; 198 exit;
144 -?>  
145 \ No newline at end of file 199 \ No newline at end of file
  200 +?>
search2/indexing/indexerCore.inc.php
1 -<?php  
2 -  
3 -/**  
4 - * $Id:$  
5 - *  
6 - * KnowledgeTree Community Edition  
7 - * Document Management Made Simple  
8 - * Copyright (C) 2008 KnowledgeTree Inc.  
9 - * Portions copyright The Jam Warehouse Software (Pty) Limited  
10 - *  
11 - * This program is free software; you can redistribute it and/or modify it under  
12 - * the terms of the GNU General Public License version 3 as published by the  
13 - * Free Software Foundation.  
14 - *  
15 - * This program is distributed in the hope that it will be useful, but WITHOUT  
16 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS  
17 - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more  
18 - * details.  
19 - *  
20 - * You should have received a copy of the GNU General Public License  
21 - * along with this program. If not, see <http://www.gnu.org/licenses/>.  
22 - *  
23 - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,  
24 - * California 94120-7775, or email info@knowledgetree.com.  
25 - *  
26 - * The interactive user interfaces in modified source and object code versions  
27 - * of this program must display Appropriate Legal Notices, as required under  
28 - * Section 5 of the GNU General Public License version 3.  
29 - *  
30 - * In accordance with Section 7(b) of the GNU General Public License version 3,  
31 - * these Appropriate Legal Notices must retain the display of the "Powered by  
32 - * KnowledgeTree" logo and retain the original copyright notice. If the display of the  
33 - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices  
34 - * must display the words "Powered by KnowledgeTree" and retain the original  
35 - * copyright notice.  
36 - * Contributor( s): ______________________________________  
37 - *  
38 - */  
39 -  
40 -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');  
41 -require_once('indexing/extractorCore.inc.php');  
42 -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');  
43 -  
44 -  
45 -class IndexerInconsistencyException extends Exception {};  
46 -  
47 -class QueryResultItem  
48 -{  
49 - protected $document_id;  
50 - protected $title;  
51 - protected $rank;  
52 - protected $text;  
53 - protected $filesize;  
54 - protected $fullpath;  
55 - protected $live;  
56 - protected $version;  
57 - protected $mimeType;  
58 - protected $filename;  
59 - protected $thumbnail; // TODO: if not null, gui can display a thumbnail  
60 - protected $viewer; // TODO: if not null, a viewer can be used to view the document  
61 - protected $document;  
62 - protected $checkedOutUser;  
63 - protected $dateCheckedout;  
64 - protected $workflowState;  
65 - protected $workflow;  
66 - protected $modifiedBy;  
67 - protected $dateModified;  
68 - protected $createdBy;  
69 - protected $dateCreated;  
70 - protected $owner;  
71 - protected $immutable;  
72 - protected $deleted;  
73 - protected $status;  
74 - protected $folderId;  
75 - protected $storagePath;  
76 - protected $documentType;  
77 - protected $mimeIconPath;  
78 - protected $mimeDisplay;  
79 - protected $oemDocumentNo;  
80 -  
81 - public function __construct($document_id, $rank=null, $title=null, $text=null)  
82 - {  
83 - $this->document_id=(int) $document_id;  
84 - $this->rank= $rank;  
85 - $this->title=$title;  
86 - $this->text = $text;  
87 - $this->live = true;  
88 - $this->loadDocumentInfo();  
89 - }  
90 -  
91 - protected function __isset($property)  
92 - {  
93 - switch($property)  
94 - {  
95 - case 'DocumentID': return isset($this->document_id);  
96 - case 'Rank': return isset($this->rank);  
97 - case 'Text': return isset($this->text);  
98 - case 'Title': return isset($this->title);  
99 - case null: break;  
100 - default:  
101 - throw new Exception("Unknown property '$property' to get on QueryResultItem");  
102 - }  
103 - return true; // should not be reached  
104 - }  
105 -  
106 - public function loadDocumentInfo()  
107 - {  
108 - global $default;  
109 - $sql = "SELECT  
110 - d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,  
111 - dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,  
112 - mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,  
113 - cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,  
114 - mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title  
115 - FROM  
116 - documents d  
117 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id  
118 - INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id  
119 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
120 - LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id  
121 - LEFT JOIN folders f ON f.id=d.folder_id  
122 - LEFT JOIN users cou ON d.checked_out_user_id=cou.id  
123 - LEFT JOIN workflows w ON dmv.workflow_id=w.id  
124 - LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id  
125 - LEFT JOIN mime_documents md ON mt.mime_document_id = md.id  
126 - LEFT JOIN users mbu ON d.modified_user_id=mbu.id  
127 - LEFT JOIN users cbu ON d.creator_id=cbu.id  
128 - LEFT JOIN users ou ON d.owner_id=ou.id  
129 - WHERE  
130 - d.id=$this->document_id";  
131 -  
132 - $result = DBUtil::getOneResult($sql);  
133 -  
134 - if (PEAR::isError($result) || empty($result))  
135 - {  
136 - $this->live = false;  
137 - if (PEAR::isError($result))  
138 - {  
139 - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());  
140 - }  
141 -  
142 - $default->log->error('QueryResultItem: $result is null');  
143 - $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';  
144 - $default->log->error('QueryResultItem: ' . $msg);  
145 - // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository  
146 - throw new IndexerInconsistencyException(_kt($msg));  
147 - }  
148 -  
149 - // document_id, relevance, text, title  
150 -  
151 - $this->documentType = $result['document_type'];  
152 - $this->filename=$result['filename'];  
153 - $this->filesize = KTUtil::filesizeToString($result['filesize']);  
154 - $this->folderId = $result['folder_id'];  
155 - $this->title = $result['title'];  
156 -  
157 - $this->createdBy = $result['createdbyuser'];  
158 - $this->dateCreated = $result['created'];  
159 -  
160 - $this->modifiedBy = $result['modifiedbyuser'];  
161 - $this->dateModified = $result['modified'];  
162 -  
163 - $this->checkedOutUser = $result['checkoutuser'];  
164 - $this->dateCheckedout = $result['checkedout'];  
165 -  
166 - $this->owner = $result['owneruser'];  
167 -  
168 - $this->version = $result['major_version'] . '.' . $result['minor_version'];  
169 -  
170 - $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';  
171 -  
172 - $this->workflow = $result['workflow'];  
173 - $this->workflowState = $result['workflowstate'];  
174 -  
175 - $this->oemDocumentNo = $result['oem_no'];  
176 - if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';  
177 -  
178 - if (is_null($result['name']))  
179 - {  
180 - $this->fullpath = '(orphaned)';  
181 - }  
182 - else  
183 - {  
184 - $this->fullpath = $result['full_path'];  
185 - }  
186 -  
187 - $this->mimeType = $result['mimetype'];  
188 - $this->mimeIconPath = $result['mime_icon_path'];  
189 - $this->mimeDisplay = $result['mime_display'];  
190 -  
191 - $this->storagePath = $result['storage_path'];  
192 - $this->status = Document::getStatusString($result['status_id']);  
193 - }  
194 -  
195 - protected function __get($property)  
196 - {  
197 - switch($property)  
198 - {  
199 - case null: return '';  
200 - case 'DocumentID': return (int) $this->document_id;  
201 - case 'Relevance':  
202 - case 'Rank': return (float) $this->rank;  
203 - case 'Text': return (string) $this->text;  
204 - case 'Title': return (string) $this->title;  
205 - case 'FullPath': return (string) $this->fullpath;  
206 - case 'IsLive': return (bool) $this->live;  
207 - case 'Filesize': return $this->filesize;  
208 - case 'Version': return (string) $this->version;  
209 - case 'Filename': return (string)$this->filename;  
210 - case 'FolderId': return (int)$this->folderId;  
211 - case 'OemDocumentNo': return (string) $this->oemDocumentNo;  
212 - case 'Document':  
213 - if (is_null($this->document))  
214 - {  
215 - $this->document = Document::get($this->document_id);  
216 - }  
217 - return $this->document;  
218 - case 'IsAvailable':  
219 - return $this->Document->isLive();  
220 - case 'CheckedOutUser':  
221 - case 'CheckedOutBy':  
222 - return (string) $this->checkedOutUser;  
223 - case 'WorkflowOnly':  
224 - case 'Workflow':  
225 - return (string)$this->workflow;  
226 - case 'WorkflowStateOnly':  
227 - case 'WorkflowState':  
228 - return (string)$this->workflowState;  
229 - case 'WorkflowAndState':  
230 - if (is_null($this->workflow))  
231 - {  
232 - return '';  
233 - }  
234 - return "$this->workflow - $this->workflowState";  
235 - case 'MimeType':  
236 - return (string) $this->mimeType;  
237 - case 'MimeIconPath':  
238 - return (string) $this->mimeIconPath;  
239 - case 'MimeDisplay':  
240 - return (string) $this->mimeDisplay;  
241 - case 'DateCheckedOut':  
242 - return (string) $this->dateCheckedout;  
243 - case 'ModifiedBy':  
244 - return (string) $this->modifiedBy;  
245 - case 'DateModified':  
246 - return (string) $this->dateModified;  
247 - case 'CreatedBy':  
248 - return (string) $this->createdBy;  
249 - case 'DateCreated':  
250 - return (string) $this->dateCreated;  
251 - case 'Owner':  
252 - case 'OwnedBy':  
253 - return (string) $this->owner;  
254 - case 'IsImmutable':  
255 - case 'Immutable':  
256 - return (bool) $this->immutable;  
257 - case 'Status':  
258 - return $this->status;  
259 - case 'StoragePath':  
260 - return $this->storagePath;  
261 - case 'DocumentType':  
262 - return $this->documentType;  
263 - case 'Permissions':  
264 - return 'not available';  
265 - case 'CanBeReadByUser':  
266 - if (!$this->live)  
267 - return false;  
268 - if (Permission::userHasDocumentReadPermission($this->Document))  
269 - return true;  
270 - if (Permission::adminIsInAdminMode())  
271 - return true;  
272 - return false;  
273 - default:  
274 - throw new Exception("Unknown property '$property' to get on QueryResultItem");  
275 - }  
276 - return ''; // Should not be reached  
277 - }  
278 -  
279 - protected function __set($property, $value)  
280 - {  
281 - switch($property)  
282 - {  
283 - case 'Rank': $this->rank = number_format($value,2,'.',','); break;  
284 - case 'Title': $this->title = $value; break;  
285 - case 'Text': $this->text = $value; break;  
286 - default:  
287 - throw new Exception("Unknown property '$property' to set on QueryResultItem");  
288 - }  
289 - }  
290 -}  
291 -  
292 -function MatchResultCompare($a, $b)  
293 -{  
294 - if ($a->Rank == $b->Rank) {  
295 - return 0;  
296 - }  
297 - return ($a->Rank < $b->Rank) ? -1 : 1;  
298 -}  
299 -  
300 -abstract class Indexer  
301 -{  
302 - /**  
303 - * Cache of extractors  
304 - *  
305 - * @var array  
306 - */  
307 - private $extractorCache;  
308 -  
309 - /**  
310 - * Indicates if the indexer will do logging.  
311 - *  
312 - * @var boolean  
313 - */  
314 - private $debug;  
315 - /**  
316 - * Cache on mime related hooks  
317 - *  
318 - * @var unknown_type  
319 - */  
320 - private $mimeHookCache;  
321 - /**  
322 - * Cache on general hooks.  
323 - *  
324 - * @var array  
325 - */  
326 - private $generalHookCache;  
327 -  
328 - /**  
329 - * This is a path to the extractors.  
330 - *  
331 - * @var string  
332 - */  
333 - private $extractorPath;  
334 - /**  
335 - * This is a path to the hooks.  
336 - *  
337 - * @var string  
338 - */  
339 - private $hookPath;  
340 -  
341 - private $enabledExtractors;  
342 -  
343 - /**  
344 - * Initialise the indexer  
345 - *  
346 - */  
347 - protected function __construct()  
348 - {  
349 - $config = KTConfig::getSingleton();  
350 -  
351 - $this->extractorCache = array();  
352 - $this->debug = $config->get('indexer/debug', true);  
353 - $this->hookCache = array();  
354 - $this->generalHookCache = array();  
355 - $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');  
356 - $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');  
357 -  
358 - $this->loadExtractorStatus();  
359 - }  
360 -  
361 - /**  
362 - * Get the list if enabled extractors  
363 - *  
364 - */  
365 - private function loadExtractorStatus()  
366 - {  
367 - $sql = "SELECT id, name FROM mime_extractors WHERE active=1";  
368 - $rs = DBUtil::getResultArray($sql);  
369 - $this->enabledExtractors = array();  
370 - foreach($rs as $item)  
371 - {  
372 - $this->enabledExtractors[] = $item['name'];  
373 - }  
374 - }  
375 -  
376 - private function isExtractorEnabled($extractor)  
377 - {  
378 - return in_array($extractor, $this->enabledExtractors);  
379 - }  
380 -  
381 - /**  
382 - * Returns a reference to the main class  
383 - *  
384 - * @return Indexer  
385 - */  
386 - public static function get()  
387 - {  
388 - static $singleton = null;  
389 -  
390 - if (is_null($singleton))  
391 - {  
392 - $config = KTConfig::getSingleton();  
393 - $classname = $config->get('indexer/coreClass');  
394 -  
395 - require_once('indexing/indexers/' . $classname . '.inc.php');  
396 -  
397 - if (!class_exists($classname))  
398 - {  
399 - throw new Exception("Class '$classname' does not exist.");  
400 - }  
401 -  
402 - $singleton = new $classname;  
403 - }  
404 -  
405 - return $singleton;  
406 - }  
407 -  
408 - public abstract function deleteDocument($docid);  
409 -  
410 - /**  
411 - * Remove the association of all extractors to mime types on the database.  
412 - *  
413 - */  
414 - public function clearExtractors()  
415 - {  
416 - global $default;  
417 -  
418 - $sql = "update mime_types set extractor_id=null";  
419 - DBUtil::runQuery($sql);  
420 -  
421 - $sql = "delete from mime_extractors";  
422 - DBUtil::runQuery($sql);  
423 -  
424 - if ($this->debug) $default->log->debug('clearExtractors');  
425 - }  
426 -  
427 - /**  
428 - * lookup the name of the extractor class based on the mime type.  
429 - *  
430 - * @param string $type  
431 - * @return string  
432 - */  
433 - public static function resolveExtractor($type)  
434 - {  
435 - global $default;  
436 - $sql = "select extractor from mime_types where filetypes='$type'";  
437 - $class = DBUtil::getOneResultKey($sql,'extractor');  
438 - if (PEAR::isError($class))  
439 - {  
440 - $default->log->error("resolveExtractor: cannot resolve $type");  
441 - return $class;  
442 - }  
443 - if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));  
444 - return $class;  
445 - }  
446 -  
447 - /**  
448 - * Return all the discussion text.  
449 - *  
450 - * @param int $docid  
451 - * @return string  
452 - */  
453 - public static function getDiscussionText($docid)  
454 - {  
455 - $sql = "SELECT  
456 - dc.subject, dc.body  
457 - FROM  
458 - discussion_threads dt  
459 - INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id  
460 - WHERE  
461 - dt.document_id=$docid";  
462 - $result = DBUtil::getResultArray($sql);  
463 - $text = '';  
464 -  
465 - foreach($result as $record)  
466 - {  
467 - $text .= $record['subject'] . "\n" . $record['body'] . "\n";  
468 - }  
469 -  
470 - return $text;  
471 - }  
472 -  
473 - /**  
474 - * Schedule the indexing of a document.  
475 - *  
476 - * @param string $document  
477 - * @param string $what  
478 - */  
479 - public static function index($document, $what='A')  
480 - {  
481 - global $default;  
482 -  
483 - if (is_numeric($document))  
484 - {  
485 - $document = Document::get($document+0);  
486 - }  
487 -  
488 - if (PEAR::isError($document))  
489 - {  
490 - $default->log->error("index: Could not index document: " .$document->getMessage());  
491 - return;  
492 - }  
493 -  
494 - $document_id = $document->getId();  
495 - $userid=$_SESSION['userID'];  
496 - if (empty($userid)) $userid=1;  
497 -  
498 - // we dequeue the document so that there are no issues when enqueuing  
499 - Indexer::unqueueDocument($document_id);  
500 -  
501 - // enqueue item  
502 - $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";  
503 - DBUtil::runQuery($sql);  
504 -  
505 - $default->log->debug("index: Queuing indexing of $document_id");  
506 - }  
507 -  
508 - public static function reindexQueue()  
509 - {  
510 - $sql = "UPDATE index_files SET processdate = null";  
511 - DBUtil::runQuery($sql);  
512 - }  
513 -  
514 - public static function reindexDocument($documentId)  
515 - {  
516 - $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";  
517 - DBUtil::runQuery($sql);  
518 - }  
519 -  
520 -  
521 -  
522 - public static function indexAll()  
523 - {  
524 - $userid=$_SESSION['userID'];  
525 - if (empty($userid)) $userid=1;  
526 -  
527 - $sql = "DELETE FROM index_files";  
528 - DBUtil::runQuery($sql);  
529 -  
530 - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";  
531 - DBUtil::runQuery($sql);  
532 - }  
533 -  
534 - /**  
535 - * Clearout the scheduling of documents that no longer exist.  
536 - *  
537 - */  
538 - public static function clearoutDeleted()  
539 - {  
540 - global $default;  
541 -  
542 - $sql = 'DELETE FROM  
543 - index_files  
544 - WHERE  
545 - document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR  
546 - NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';  
547 - DBUtil::runQuery($sql);  
548 -  
549 - $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");  
550 - }  
551 -  
552 -  
553 - /**  
554 - * Check if a document is scheduled to be indexed  
555 - *  
556 - * @param mixed $document This may be a document or document id  
557 - * @return boolean  
558 - */  
559 - public static function isDocumentScheduled($document)  
560 - {  
561 - if (is_numeric($document))  
562 - {  
563 - $docid = $document;  
564 - }  
565 - else if ($document instanceof Document)  
566 - {  
567 - $docid = $document->getId();  
568 - }  
569 - else  
570 - {  
571 - return false;  
572 - }  
573 - $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";  
574 - $result = DBUtil::getResultArray($sql);  
575 - return count($result) > 0;  
576 - }  
577 -  
578 - /**  
579 - * Filters text removing redundant characters such as continuous newlines and spaces.  
580 - *  
581 - * @param string $filename  
582 - */  
583 - private function filterText($filename)  
584 - {  
585 - $content = file_get_contents($filename);  
586 -  
587 - $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');  
588 - $tgt = array("\n","\n",' ',' ',' ');  
589 -  
590 - // shrink what is being stored.  
591 - do  
592 - {  
593 - $orig = $content;  
594 - $content = preg_replace($src, $tgt, $content);  
595 - } while ($content != $orig);  
596 -  
597 - return file_put_contents($filename, $content) !== false;  
598 - }  
599 -  
600 - /**  
601 - * Load hooks for text extraction process.  
602 - *  
603 - */  
604 - private function loadExtractorHooks()  
605 - {  
606 - $this->generalHookCache = array();  
607 - $this->mimeHookCache = array();  
608 -  
609 -  
610 - $dir = opendir(SearchHelper::correctPath($this->hookPath));  
611 - while (($file = readdir($dir)) !== false)  
612 - {  
613 - if (substr($file,-12) == 'Hook.inc.php')  
614 - {  
615 - require_once($this->hookPath . '/' . $file);  
616 - $class = substr($file, 0, -8);  
617 -  
618 - if (!class_exists($class))  
619 - {  
620 - continue;  
621 - }  
622 -  
623 - $hook = new $class;  
624 - if (!($class instanceof ExtractorHook))  
625 - {  
626 - continue;  
627 - }  
628 -  
629 - $mimeTypes = $hook->registerMimeTypes();  
630 - if (is_null($mimeTypes))  
631 - {  
632 - $this->generalHookCache[] = & $hook;  
633 - }  
634 - else  
635 - {  
636 - foreach($mimeTypes as $type)  
637 - {  
638 - $this->mimeHookCache[$type][] = & $hook;  
639 - }  
640 - }  
641 -  
642 - }  
643 - }  
644 - closedir($dir);  
645 - }  
646 -  
647 - /**  
648 - * This is a refactored function to execute the hooks.  
649 - *  
650 - * @param DocumentExtractor $extractor  
651 - * @param string $phase  
652 - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.  
653 - */  
654 - private function executeHook($extractor, $phase, $mimeType = null)  
655 - {  
656 - $hooks = array();  
657 - if (is_null($mimeType))  
658 - {  
659 - $hooks = $this->generalHookCache;  
660 - }  
661 - else  
662 - {  
663 - if (array_key_exists($mimeType, $this->mimeHookCache))  
664 - {  
665 - $hooks = $this->mimeHookCache[$mimeType];  
666 - }  
667 - }  
668 - if (empty($hooks))  
669 - {  
670 - return;  
671 - }  
672 -  
673 - foreach($hooks as $hook)  
674 - {  
675 - $hook->$phase($extractor);  
676 - }  
677 - }  
678 -  
679 - private function doesDiagnosticsPass($simple=false)  
680 - {  
681 - global $default;  
682 -  
683 - $config =& KTConfig::getSingleton();  
684 - // create an index log lock file in case there are errors, so that we don't keep logging them forever!  
685 - // this function will create the lockfile if an error is detected. It will be removed as soon  
686 - // as the problems with the indexer are removed.  
687 - $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';  
688 -  
689 - $diagnosis = $this->diagnose();  
690 - if (!is_null($diagnosis))  
691 - {  
692 - if (!is_file($lockFile))  
693 - {  
694 - $default->log->error(_kt('Indexer problem: ') . $diagnosis);  
695 - }  
696 - touch($lockFile);  
697 - return false;  
698 - }  
699 -  
700 - if ($simple)  
701 - {  
702 - return true;  
703 - }  
704 -  
705 - $diagnosis = $this->diagnoseExtractors();  
706 - if (!empty($diagnosis))  
707 - {  
708 - if (!is_file($lockFile))  
709 - {  
710 - foreach($diagnosis as $diag)  
711 - {  
712 - $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));  
713 - }  
714 - }  
715 - touch($lockFile);  
716 - return false;  
717 - }  
718 -  
719 - if (is_file($lockFile))  
720 - {  
721 - $default->log->info(_kt('Issues with the indexer have been resolved!'));  
722 - unlink($lockFile);  
723 - }  
724 -  
725 - return true;  
726 - }  
727 -  
728 - /**  
729 - * This does the initial mime type association between mime types and text extractors  
730 - *  
731 - */  
732 - public function checkForRegisteredTypes()  
733 - {  
734 - global $default;  
735 -  
736 - // we are only doing this once!  
737 - $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);  
738 - if ($initRegistered)  
739 - {  
740 - return;  
741 - }  
742 - if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');  
743 -  
744 - $date = date('Y-m-d H:i');  
745 - $sql = "UPDATE scheduler_tasks SET run_time='$date'";  
746 - DBUtil::runQuery($sql);  
747 -  
748 - $this->registerTypes(true);  
749 -  
750 - $disable = array(  
751 - OS_WINDOWS=>array('PSExtractor'),  
752 - OS_UNIX => array()  
753 - );  
754 -  
755 - $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];  
756 -  
757 - foreach($disableForOS as $extractor)  
758 - {  
759 - $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";  
760 - DBUtil::runQuery($sql);  
761 - $default->log->info("checkForRegisteredTypes: disabled '$extractor'");  
762 - }  
763 -  
764 - if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');  
765 - KTUtil::setSystemSetting('mimeTypesRegistered', true);  
766 - }  
767 -  
768 - private function updatePendingDocumentStatus($documentId, $message, $level)  
769 - {  
770 - $this->indexingHistory .= "\n" . $level . ': ' . $message;  
771 - $message = sanitizeForSQL($this->indexingHistory);  
772 - $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";  
773 - DBUtil::runQuery($sql);  
774 - }  
775 -  
776 - /**  
777 - *  
778 - * @param int $documentId  
779 - * @param string $message  
780 - * @param string $level This may be info, error, debug  
781 - */  
782 - private function logPendingDocumentInfoStatus($documentId, $message, $level)  
783 - {  
784 - $this->updatePendingDocumentStatus($documentId, $message, $level);  
785 - global $default;  
786 -  
787 - switch ($level)  
788 - {  
789 - case 'debug':  
790 - if ($this->debug)  
791 - {  
792 - $default->log->debug($message);  
793 - }  
794 - break;  
795 - default:  
796 - $default->log->$level($message);  
797 - }  
798 - }  
799 -  
800 -  
801 -  
802 - public function getExtractor($extractorClass)  
803 - {  
804 - if (empty($extractorClass))  
805 - {  
806 - return null;  
807 - }  
808 -  
809 - $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';  
810 - if (!file_exists($includeFile))  
811 - {  
812 - throw new Exception("Extractor file does not exist: $includeFile");  
813 - }  
814 -  
815 - require_once($includeFile);  
816 -  
817 - if (!class_exists($extractorClass))  
818 - {  
819 - throw new Exception("Extractor '$classname' not defined in file: $includeFile");  
820 - }  
821 -  
822 - $extractor = new $extractorClass();  
823 -  
824 - if (!($extractor instanceof DocumentExtractor))  
825 - {  
826 - throw new Exception("Class $classname was expected to be of type DocumentExtractor");  
827 - }  
828 -  
829 - return $extractor;  
830 - }  
831 -  
832 - public static function getIndexingQueue($problemItemsOnly=true)  
833 - {  
834 -  
835 - if ($problemItemsOnly)  
836 - {  
837 - $sql = "SELECT  
838 - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename  
839 - FROM  
840 - index_files iff  
841 - INNER JOIN documents d ON iff.document_id=d.id  
842 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
843 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
844 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
845 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
846 - WHERE  
847 - (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1  
848 - ORDER BY indexdate ";  
849 - }  
850 - else  
851 - {  
852 - $sql = "SELECT  
853 - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename  
854 - FROM  
855 - index_files iff  
856 - INNER JOIN documents d ON iff.document_id=d.id  
857 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
858 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
859 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
860 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
861 - WHERE  
862 - (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1  
863 - ORDER BY indexdate ";  
864 - }  
865 - $aResult = DBUtil::getResultArray($sql);  
866 -  
867 - return $aResult;  
868 - }  
869 -  
870 - public static function getPendingIndexingQueue()  
871 - {  
872 - return Indexer::getIndexingQueue(false);  
873 - }  
874 -  
875 - /**  
876 - * The main function that may be called repeatedly to index documents.  
877 - *  
878 - * @param int $max Optional. When null, the 'indexer/batchDocuments' config value is used (default 20).  
879 - */  
880 - public function indexDocuments($max=null)  
881 - {  
882 - global $default;  
883 - $config =& KTConfig::getSingleton();  
884 -  
885 - /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';  
886 - if (is_file($indexLockFile))  
887 - {  
888 - $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');  
889 - $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');  
890 - return;  
891 - }  
892 - touch($indexLockFile);*/  
893 -  
894 -  
895 - $this->checkForRegisteredTypes();  
896 -  
897 - if ($this->debug) $default->log->debug('indexDocuments: start');  
898 - if (!$this->doesDiagnosticsPass())  
899 - {  
900 - //unlink($indexLockFile);  
901 - if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');  
902 - return;  
903 - }  
904 -  
905 - if (is_null($max))  
906 - {  
907 - $max = $config->get('indexer/batchDocuments',20);  
908 - }  
909 -  
910 - $this->loadExtractorHooks();  
911 -  
912 - Indexer::clearoutDeleted();  
913 -  
914 - $date = date('Y-m-d H:i:s');  
915 - // identify the indexers that must run  
916 - // mysql specific limit!  
917 - $sql = "SELECT  
918 - iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what  
919 - FROM  
920 - index_files iff  
921 - INNER JOIN documents d ON iff.document_id=d.id  
922 - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id  
923 - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id  
924 - INNER JOIN mime_types mt ON dcv.mime_id=mt.id  
925 - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id  
926 - WHERE  
927 - (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1  
928 - ORDER BY indexdate  
929 - LIMIT $max";  
930 - $result = DBUtil::getResultArray($sql);  
931 - if (PEAR::isError($result))  
932 - {  
933 - //unlink($indexLockFile);  
934 - if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');  
935 - return;  
936 - }  
937 - KTUtil::setSystemSetting('luceneIndexingDate', time());  
938 -  
939 - // bail if no work to do  
940 - if (count($result) == 0)  
941 - {  
942 - //unlink($indexLockFile);  
943 - if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');  
944 - return;  
945 - }  
946 -  
947 - // identify any documents that need indexing and mark them  
948 - // so they are not taken in a followup run  
949 - $ids = array();  
950 - foreach($result as $docinfo)  
951 - {  
952 - $ids[] = $docinfo['document_id'];  
953 - }  
954 -  
955 - // mark the documents as being processed  
956 -  
957 - $ids=implode(',',$ids);  
958 - $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";  
959 - DBUtil::runQuery($sql);  
960 -  
961 - $extractorCache = array();  
962 - $storageManager = KTStorageManagerUtil::getSingleton();  
963 -  
964 - $tempPath = $config->get("urls/tmpDirectory");  
965 -  
966 - foreach($result as $docinfo)  
967 - {  
968 - $docId=$docinfo['document_id'];  
969 - $extension=$docinfo['filetypes'];  
970 - $mimeType=$docinfo['mimetypes'];  
971 - $extractorClass=$docinfo['extractor'];  
972 - $indexDocument = in_array($docinfo['what'], array('A','C'));  
973 - $indexDiscussion = in_array($docinfo['what'], array('A','D'));  
974 - $this->indexingHistory = '';  
975 -  
976 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');  
977 -  
978 - if (empty($extractorClass))  
979 - {  
980 - /*  
981 -  
982 - if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.  
983 -  
984 - */  
985 - if ($indexDiscussion)  
986 - {  
987 - $indexDocument = false;  
988 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');  
989 - }  
990 - else  
991 - {  
992 - Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));  
993 - continue;  
994 - }  
995 - }  
996 - else  
997 - {  
998 - /*  
999 -  
1000 - If an extractor is available, we must ensure it is enabled.  
1001 -  
1002 - */  
1003 -  
1004 - if (!$this->isExtractorEnabled($extractorClass))  
1005 - {  
1006 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');  
1007 - continue;  
1008 - }  
1009 - }  
1010 -  
1011 - if ($this->debug)  
1012 - {  
1013 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');  
1014 - }  
1015 -  
1016 - $document = Document::get($docId);  
1017 - if (PEAR::isError($document))  
1018 - {  
1019 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');  
1020 - continue;  
1021 - }  
1022 -  
1023 - $filename = $document->getFileName();  
1024 - if (substr($filename,0,1) == '~')  
1025 - {  
1026 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');  
1027 - continue;  
1028 - }  
1029 -  
1030 - $removeFromQueue = true;  
1031 - if ($indexDocument)  
1032 - {  
1033 - if (array_key_exists($extractorClass, $extractorCache))  
1034 - {  
1035 - $extractor = $extractorCache[$extractorClass];  
1036 - }  
1037 - else  
1038 - {  
1039 - $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);  
1040 - }  
1041 -  
1042 - if (!($extractor instanceof DocumentExtractor))  
1043 - {  
1044 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');  
1045 - continue;  
1046 - }  
1047 -  
1048 -  
1049 -  
1050 - $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();  
1051 - $sourceFile = $storageManager->temporaryFile($document);  
1052 -  
1053 - if (empty($sourceFile) || !is_file($sourceFile))  
1054 - {  
1055 - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');  
1056 - continue;  
1057 - }  
1058 -  
1059 - if ($extractor->needsIntermediateSourceFile())  
1060 - {  
1061 - $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);  
1062 -  
1063 - $intermediate = $tempPath . '/'. $docId . '.' . $extension;  
1064 - $result = @copy($sourceFile, $intermediate);  
1065 - if ($result === false)  
1066 - {  
1067 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');  
1068 - // problem. lets try again later. probably permission related. log the issue.  
1069 - continue;  
1070 - }  
1071 - $sourceFile = $intermediate;  
1072 - }  
1073 -  
1074 - $targetFile = tempnam($tempPath, 'ktindexer');  
1075 -  
1076 - $extractor->setSourceFile($sourceFile);  
1077 - $extractor->setMimeType($mimeType);  
1078 - $extractor->setExtension($extension);  
1079 - $extractor->setTargetFile($targetFile);  
1080 - $extractor->setDocument($document);  
1081 - $extractor->setIndexingStatus(null);  
1082 - $extractor->setExtractionStatus(null);  
1083 -  
1084 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');  
1085 -  
1086 - $this->executeHook($extractor, 'pre_extract');  
1087 - $this->executeHook($extractor, 'pre_extract', $mimeType);  
1088 - $removeFromQueue = false;  
1089 -  
1090 - if ($extractor->extractTextContent())  
1091 - {  
1092 - // the extractor may need to create another target file  
1093 - $targetFile = $extractor->getTargetFile();  
1094 -  
1095 - $extractor->setExtractionStatus(true);  
1096 - $this->executeHook($extractor, 'pre_index');  
1097 - $this->executeHook($extractor, 'pre_index', $mimeType);  
1098 -  
1099 - $title = $document->getName();  
1100 - if ($indexDiscussion)  
1101 - {  
1102 - if (!$this->filterText($targetFile))  
1103 - {  
1104 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');  
1105 - }  
1106 - else  
1107 - {  
1108 - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);  
1109 - $removeFromQueue = $indexStatus;  
1110 - if (!$indexStatus)  
1111 - {  
1112 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');  
1113 - }  
1114 -  
1115 - $extractor->setIndexingStatus($indexStatus);  
1116 - }  
1117 - }  
1118 - else  
1119 - {  
1120 - if (!$this->filterText($targetFile))  
1121 - {  
1122 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');  
1123 - }  
1124 - else  
1125 - {  
1126 - $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);  
1127 - $removeFromQueue = $indexStatus;  
1128 -  
1129 - if (!$indexStatus)  
1130 - {  
1131 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');  
1132 - $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');  
1133 - }  
1134 -  
1135 - $extractor->setIndexingStatus($indexStatus);  
1136 - }  
1137 - }  
1138 -  
1139 - $this->executeHook($extractor, 'post_index', $mimeType);  
1140 - $this->executeHook($extractor, 'post_index');  
1141 - }  
1142 - else  
1143 - {  
1144 - $extractor->setExtractionStatus(false);  
1145 - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');  
1146 - $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');  
1147 - }  
1148 -  
1149 - $this->executeHook($extractor, 'post_extract', $mimeType);  
1150 - $this->executeHook($extractor, 'post_extract');  
1151 -  
1152 - if ($extractor->needsIntermediateSourceFile())  
1153 - {  
1154 - @unlink($sourceFile);  
1155 - }  
1156 -  
1157 - @unlink($targetFile);  
1158 -  
1159 - }  
1160 - else  
1161 - {  
1162 - $indexStatus = $this->indexDiscussion($docId);  
1163 - $removeFromQueue = $indexStatus;  
1164 - }  
1165 -  
1166 - if ($removeFromQueue)  
1167 - {  
1168 - Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));  
1169 - }  
1170 - else  
1171 - {  
1172 - if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));  
1173 - }  
1174 - }  
1175 - if ($this->debug) $default->log->debug('indexDocuments: done');  
1176 - //unlink($indexLockFile);  
1177 - }  
1178 -  
1179 - public function migrateDocuments($max=null)  
1180 - {  
1181 - global $default;  
1182 -  
1183 - $default->log->info(_kt('migrateDocuments: starting'));  
1184 -  
1185 - if (!$this->doesDiagnosticsPass(true))  
1186 - {  
1187 - $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));  
1188 - return;  
1189 - }  
1190 -  
1191 - if (KTUtil::getSystemSetting('migrationComplete') == 'true')  
1192 - {  
1193 - $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));  
1194 - return;  
1195 - }  
1196 -  
1197 - $config =& KTConfig::getSingleton();  
1198 - if (is_null($max))  
1199 - {  
1200 - $max = $config->get('indexer/batchMigrateDocument',500);  
1201 - }  
1202 -  
1203 - $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';  
1204 - if (is_file($lockFile))  
1205 - {  
1206 - $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));  
1207 - return;  
1208 - }  
1209 - touch($lockFile);  
1210 -  
1211 - $startTime = KTUtil::getSystemSetting('migrationStarted');  
1212 - if (is_null($startTime))  
1213 - {  
1214 - KTUtil::setSystemSetting('migrationStarted', time());  
1215 - }  
1216 -  
1217 - $maxLoops = 5;  
1218 -  
1219 - $max = ceil($max / $maxLoops);  
1220 -  
1221 - $start =KTUtil::getBenchmarkTime();  
1222 - $noDocs = false;  
1223 - $numDocs = 0;  
1224 -  
1225 - for($loop=0;$loop<$maxLoops;$loop++)  
1226 - {  
1227 -  
1228 - $sql = "SELECT  
1229 - document_id, document_text  
1230 - FROM  
1231 - document_text  
1232 - ORDER BY document_id  
1233 - LIMIT $max";  
1234 - $result = DBUtil::getResultArray($sql);  
1235 - if (PEAR::isError($result))  
1236 - {  
1237 - $default->log->info(_kt('migrateDocuments: db error'));  
1238 - break;  
1239 - }  
1240 -  
1241 - $docs = count($result);  
1242 - if ($docs == 0)  
1243 - {  
1244 - $noDocs = true;  
1245 - break;  
1246 - }  
1247 - $numDocs += $docs;  
1248 -  
1249 - foreach($result as $docinfo)  
1250 - {  
1251 - $docId = $docinfo['document_id'];  
1252 -  
1253 - $document = Document::get($docId);  
1254 - if (PEAR::isError($document) || is_null($document))  
1255 - {  
1256 - $sql = "DELETE FROM document_text WHERE document_id=$docId";  
1257 - DBUtil::runQuery($sql);  
1258 - $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));  
1259 - continue;  
1260 - }  
1261 -  
1262 - $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();  
1263 -  
1264 - $targetFile = tempnam($tempPath, 'ktindexer');  
1265 -  
1266 - if (file_put_contents($targetFile, $docinfo['document_text']) === false)  
1267 - {  
1268 - $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));  
1269 - continue;  
1270 - }  
1271 - // free memory asap ;)  
1272 - unset($docinfo['document_text']);  
1273 -  
1274 - $title = $document->getName();  
1275 -  
1276 - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);  
1277 -  
1278 - if ($indexStatus)  
1279 - {  
1280 - $sql = "DELETE FROM document_text WHERE document_id=$docId";  
1281 - DBUtil::runQuery($sql);  
1282 - }  
1283 - else  
1284 - {  
1285 - $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));  
1286 - }  
1287 -  
1288 - @unlink($targetFile);  
1289 - }  
1290 - }  
1291 -  
1292 - @unlink($lockFile);  
1293 -  
1294 - $time = KTUtil::getBenchmarkTime() - $start;  
1295 -  
1296 - KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);  
1297 - KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);  
1298 -  
1299 - $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));  
1300 - if ($noDocs)  
1301 - {  
1302 - $default->log->info(_kt('migrateDocuments: Completed!'));  
1303 - KTUtil::setSystemSetting('migrationComplete', 'true');  
1304 - schedulerUtil::deleteByName('Index Migration');  
1305 - $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));  
1306 - }  
1307 - }  
1308 -  
1309 - /**  
1310 - * Index a document. Subclasses must implement this function.  
1311 - *  
1312 - * @param int $docId  
1313 - * @param string $textFile  
1314 - */  
1315 - protected abstract function indexDocument($docId, $textFile, $title, $version);  
1316 -  
1317 -  
1318 - public function updateDocumentIndex($docId, $text)  
1319 - {  
1320 - $config = KTConfig::getSingleton();  
1321 - $tempPath = $config->get("urls/tmpDirectory");  
1322 - $tempFile = tempnam($tempPath,'ud_');  
1323 -  
1324 - file_put_contents($tempFile, $text);  
1325 -  
1326 - $document = Document::get($docId);  
1327 - $title = $document->getDescription();  
1328 - $version = $document->getVersion();  
1329 -  
1330 - $result = $this->indexDocument($docId, $tempFile, $title, $version);  
1331 -  
1332 - if (file_exists($tempFile))  
1333 - {  
1334 - unlink($tempFile);  
1335 - }  
1336 -  
1337 - return $result;  
1338 - }  
1339 -  
1340 - /**  
1341 - * Index a discussion. Subclasses must implement this function.  
1342 - *  
1343 - * @param int $docId  
1344 - */  
1345 - protected abstract function indexDiscussion($docId);  
1346 -  
1347 - /**  
1348 - * Diagnose the indexer. e.g. Check that the indexing server is running.  
1349 - *  
1350 - */  
1351 - public abstract function diagnose();  
1352 -  
1353 - /**  
1354 - * Diagnose the extractors.  
1355 - *  
1356 - * @return array  
1357 - */  
1358 - public function diagnoseExtractors()  
1359 - {  
1360 - $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');  
1361 - $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));  
1362 -  
1363 - return $diagnosis;  
1364 - }  
1365 -  
1366 - /**  
1367 - * This is a refactored diagnose function.  
1368 - *  
1369 - * @param string $path  
1370 - * @param string $baseclass  
1371 - * @param string $extension  
1372 - * @return array  
1373 - */  
1374 - private function _diagnose($path, $baseclass, $extension)  
1375 - {  
1376 - global $default;  
1377 -  
1378 - $diagnoses = array();  
1379 -  
1380 - $dir = opendir(SearchHelper::correctPath($path));  
1381 - $extlen = - strlen($extension);  
1382 -  
1383 - while (($file = readdir($dir)) !== false)  
1384 - {  
1385 - if (substr($file,0,1) == '.')  
1386 - {  
1387 - continue;  
1388 - }  
1389 - if (substr($file,$extlen) != $extension)  
1390 - {  
1391 - $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));  
1392 - continue;  
1393 - }  
1394 -  
1395 - require_once($path . '/' . $file);  
1396 -  
1397 - $class = substr($file, 0, -8);  
1398 - if (!class_exists($class))  
1399 - {  
1400 - $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));  
1401 - continue;  
1402 - }  
1403 -  
1404 - if (!$this->isExtractorEnabled($class))  
1405 - {  
1406 - $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));  
1407 - continue;  
1408 - }  
1409 -  
1410 - $extractor = new $class();  
1411 - if (!is_a($extractor, $baseclass))  
1412 - {  
1413 - $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));  
1414 - continue;  
1415 - }  
1416 -  
1417 - $types = $extractor->getSupportedMimeTypes();  
1418 - if (empty($types))  
1419 - {  
1420 - if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));  
1421 - continue;  
1422 - }  
1423 -  
1424 - $diagnosis=$extractor->diagnose();  
1425 - if (empty($diagnosis))  
1426 - {  
1427 - continue;  
1428 - }  
1429 - $diagnoses[$class] = array(  
1430 - 'name'=>$extractor->getDisplayName(),  
1431 - 'diagnosis'=>$diagnosis  
1432 - );  
1433 -  
1434 - }  
1435 - closedir($dir);  
1436 -  
1437 - return $diagnoses;  
1438 - }  
1439 -  
1440 -  
1441 - /**  
1442 - * Register the extractor types.  
1443 - *  
1444 - * @param boolean $clear. Optional. Defaults to false.  
1445 - */  
1446 - public function registerTypes($clear=false)  
1447 - {  
1448 - if ($clear)  
1449 - {  
1450 - $this->clearExtractors();  
1451 - }  
1452 - $dir = opendir(SearchHelper::correctPath($this->extractorPath));  
1453 - while (($file = readdir($dir)) !== false)  
1454 - {  
1455 - if (substr($file,-17) == 'Extractor.inc.php')  
1456 - {  
1457 - require_once($this->extractorPath . '/' . $file);  
1458 - $class = substr($file, 0, -8);  
1459 -  
1460 - if (!class_exists($class))  
1461 - {  
1462 - // if the class does not exist, we can't do anything.  
1463 - continue;  
1464 - }  
1465 -  
1466 - $extractor = new $class;  
1467 - if ($extractor instanceof DocumentExtractor)  
1468 - {  
1469 - $extractor->registerMimeTypes();  
1470 - }  
1471 - }  
1472 - }  
1473 - closedir($dir);  
1474 - }  
1475 -  
1476 - /**  
1477 - * This is used as a possible optimisation effort. It may be overridden in that case.  
1478 - *  
1479 - * @param int $docId  
1480 - * @param string $textFile  
1481 - */  
1482 - protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)  
1483 - {  
1484 - $this->indexDocument($docId, $textFile, $title, $version);  
1485 - $this->indexDiscussion($docId);  
1486 - }  
1487 -  
1488 - /**  
1489 - * Remove the document from the queue. This is normally called when it has been processed.  
1490 - *  
1491 - * @param int $docid  
1492 - */  
1493 - public static function unqueueDocument($docid, $reason=false, $level='debug')  
1494 - {  
1495 - $sql = "DELETE FROM index_files WHERE document_id=$docid";  
1496 - DBUtil::runQuery($sql);  
1497 - if ($reason !== false)  
1498 - {  
1499 - global $default;  
1500 - $default->log->$level("Indexer: removing document $docid from the queue - $reason");  
1501 - }  
1502 - }  
1503 -  
1504 - /**  
1505 - * Run a query on the index.  
1506 - *  
1507 - * @param string $query  
1508 - * @return array  
1509 - */  
1510 - public abstract function query($query);  
1511 -  
1512 - /**  
1513 - * Converts an integer to a string that can be easily compared and reversed.  
1514 - *  
1515 - * @param int $int  
1516 - * @return string  
1517 - */  
1518 - public static function longToString($int)  
1519 - {  
1520 - $maxlen = 14;  
1521 -  
1522 - $a2z = array('a','b','c','d','e','f','g','h','i','j');  
1523 - $o29 = array('0','1','2','3','4','5','6','7','8','9');  
1524 - $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;  
1525 -  
1526 - return str_replace($o29, $a2z, $l);  
1527 - }  
1528 -  
1529 - /**  
1530 - * Converts a string to an integer.  
1531 - *  
1532 - * @param string $str  
1533 - * @return int  
1534 - */  
1535 - public static function stringToLong($str)  
1536 - {  
1537 - $a2z = array('a','b','c','d','e','f','g','h','i','j');  
1538 - $o29 = array('0','1','2','3','4','5','6','7','8','9');  
1539 -  
1540 - $int = str_replace($a2z, $o29, $str) + 0;  
1541 -  
1542 - return $int;  
1543 - }  
1544 -  
1545 - /**  
1546 - * Possibly we can optimise indexes. This method must be overridden.  
1547 - * The new function must call the parent!  
1548 - *  
1549 - */  
1550 - public function optimise()  
1551 - {  
1552 - KTUtil::setSystemSetting('luceneOptimisationDate', time());  
1553 - }  
1554 -  
1555 - /**  
1556 - * Shuts down the indexer  
1557 - *  
1558 - */  
1559 - public function shutdown()  
1560 - {  
1561 - // do nothing generally  
1562 - }  
1563 -  
1564 - /**  
1565 - * Returns the name of the indexer.  
1566 - *  
1567 - * @return string  
1568 - */  
1569 - public abstract function getDisplayName();  
1570 -  
1571 -  
1572 - /**  
1573 - * Returns the number of non-deleted documents in the index.  
1574 - *  
1575 - * @return int  
1576 - */  
1577 - public abstract function getDocumentsInIndex();  
1578 -  
1579 - /**  
1580 - * Returns the path to the index directory  
1581 - *  
1582 - * @return string  
1583 - */  
1584 - public function getIndexDirectory()  
1585 - {  
1586 - $config = KTConfig::getSingleton();  
1587 - $directory = $config->get('indexer/luceneDirectory');  
1588 - return $directory;  
1589 - }  
1590 -}  
1591 -  
1592 -?> 1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Community Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2008 KnowledgeTree Inc.
  9 + * Portions copyright The Jam Warehouse Software (Pty) Limited
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify it under
  12 + * the terms of the GNU General Public License version 3 as published by the
  13 + * Free Software Foundation.
  14 + *
  15 + * This program is distributed in the hope that it will be useful, but WITHOUT
  16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  17 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  18 + * details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  22 + *
  23 + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
  24 + * California 94120-7775, or email info@knowledgetree.com.
  25 + *
  26 + * The interactive user interfaces in modified source and object code versions
  27 + * of this program must display Appropriate Legal Notices, as required under
  28 + * Section 5 of the GNU General Public License version 3.
  29 + *
  30 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  31 + * these Appropriate Legal Notices must retain the display of the "Powered by
  32 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  33 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  34 + * must display the words "Powered by KnowledgeTree" and retain the original
  35 + * copyright notice.
  36 + * Contributor( s): ______________________________________
  37 + *
  38 + */
  39 +
// Absolute path (with trailing slash) of the directory containing this file;
// getExtractor() builds extractor include paths from it.
define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
require_once('indexing/extractorCore.inc.php');
// NOTE(review): KT_DIR is not defined in this file; presumably set by the application bootstrap - confirm.
require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
  43 +
  44 +
/**
 * Raised when the document index refers to a document for which the
 * database has no matching record.
 */
class IndexerInconsistencyException extends Exception
{
}
  46 +
/**
 * A single search hit, enriched with document details read from the database.
 *
 * Values are exposed as virtual properties (for example $item->Title or
 * $item->Rank) through the __get/__set/__isset magic methods.
 */
class QueryResultItem
{
    protected $document_id;
    protected $title;
    protected $rank;
    protected $text;
    protected $filesize;
    protected $fullpath;
    protected $live;
    protected $version;
    protected $mimeType;
    protected $filename;
    protected $thumbnail; // TODO: if not null, gui can display a thumbnail
    protected $viewer; // TODO: if not null, a viewer can be used to view the document
    protected $document;
    protected $checkedOutUser;
    protected $dateCheckedout;
    protected $workflowState;
    protected $workflow;
    protected $modifiedBy;
    protected $dateModified;
    protected $createdBy;
    protected $dateCreated;
    protected $owner;
    protected $immutable;
    protected $deleted;
    protected $status;
    protected $folderId;
    protected $storagePath;
    protected $documentType;
    protected $mimeIconPath;
    protected $mimeDisplay;
    protected $oemDocumentNo;

    /**
     * Create the item and immediately load the document details from the database.
     *
     * @param int $document_id
     * @param mixed $rank
     * @param string $title
     * @param string $text
     * @throws IndexerInconsistencyException when no database record matches the id
     * @throws Exception on a database error
     */
    public function __construct($document_id, $rank=null, $title=null, $text=null)
    {
        $this->document_id=(int) $document_id;
        $this->rank= $rank;
        $this->title=$title;
        $this->text = $text;
        $this->live = true;
        $this->loadDocumentInfo();
    }

    /**
     * Support isset() on the virtual properties DocumentID, Rank, Text and Title.
     *
     * Magic methods must be public; the original declared this one protected.
     *
     * @param string $property
     * @return boolean
     * @throws Exception for any other property name
     */
    public function __isset($property)
    {
        switch($property)
        {
            case 'DocumentID': return isset($this->document_id);
            case 'Rank': return isset($this->rank);
            case 'Text': return isset($this->text);
            case 'Title': return isset($this->title);
            case null: break;
            default:
                throw new Exception("Unknown property '$property' to get on QueryResultItem");
        }
        return true; // should not be reached
    }

    /**
     * Read the document's details from the database into this item.
     *
     * @throws IndexerInconsistencyException when the query returns no row
     * @throws Exception when the query returns a PEAR error
     */
    public function loadDocumentInfo()
    {
        global $default;
        $sql = "SELECT
                d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
                dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
                mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
                cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
                mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
            FROM
                documents d
                INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
                INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
                INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
                LEFT JOIN folders f ON f.id=d.folder_id
                LEFT JOIN users cou ON d.checked_out_user_id=cou.id
                LEFT JOIN workflows w ON dmv.workflow_id=w.id
                LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
                LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
                LEFT JOIN users mbu ON d.modified_user_id=mbu.id
                LEFT JOIN users cbu ON d.creator_id=cbu.id
                LEFT JOIN users ou ON d.owner_id=ou.id
            WHERE
                d.id=$this->document_id";

        $result = DBUtil::getOneResult($sql);

        if (PEAR::isError($result) || empty($result))
        {
            $this->live = false;
            if (PEAR::isError($result))
            {
                throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
            }

            $default->log->error('QueryResultItem: $result is null');
            $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
            $default->log->error('QueryResultItem: ' . $msg);
            // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
            throw new IndexerInconsistencyException(_kt($msg));
        }

        // document_id, relevance, text, title

        $this->documentType = $result['document_type'];
        $this->filename=$result['filename'];
        $this->filesize = KTUtil::filesizeToString($result['filesize']);
        $this->folderId = $result['folder_id'];
        $this->title = $result['title'];

        $this->createdBy = $result['createdbyuser'];
        $this->dateCreated = $result['created'];

        $this->modifiedBy = $result['modifiedbyuser'];
        $this->dateModified = $result['modified'];

        $this->checkedOutUser = $result['checkoutuser'];
        $this->dateCheckedout = $result['checkedout'];

        $this->owner = $result['owneruser'];

        $this->version = $result['major_version'] . '.' . $result['minor_version'];

        $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';

        $this->workflow = $result['workflow'];
        $this->workflowState = $result['workflowstate'];

        $this->oemDocumentNo = $result['oem_no'];
        if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';

        // the folder join is a LEFT JOIN, so a missing folder shows up as a null name
        if (is_null($result['name']))
        {
            $this->fullpath = '(orphaned)';
        }
        else
        {
            $this->fullpath = $result['full_path'];
        }

        $this->mimeType = $result['mimetype'];
        $this->mimeIconPath = $result['mime_icon_path'];
        $this->mimeDisplay = $result['mime_display'];

        $this->storagePath = $result['storage_path'];
        $this->status = Document::getStatusString($result['status_id']);
    }

    /**
     * Read a virtual property.
     *
     * Magic methods must be public; the original declared this one protected.
     *
     * @param string $property
     * @return mixed
     * @throws Exception for an unknown property name
     */
    public function __get($property)
    {
        switch($property)
        {
            case null: return '';
            case 'DocumentID': return (int) $this->document_id;
            case 'Relevance':
            case 'Rank': return (float) $this->rank;
            case 'Text': return (string) $this->text;
            case 'Title': return (string) $this->title;
            case 'FullPath': return (string) $this->fullpath;
            case 'IsLive': return (bool) $this->live;
            case 'Filesize': return $this->filesize;
            case 'Version': return (string) $this->version;
            case 'Filename': return (string)$this->filename;
            case 'FolderId': return (int)$this->folderId;
            case 'OemDocumentNo': return (string) $this->oemDocumentNo;
            case 'Document':
                // loaded lazily on first access
                if (is_null($this->document))
                {
                    $this->document = Document::get($this->document_id);
                }
                return $this->document;
            case 'IsAvailable':
                return $this->Document->isLive();
            case 'CheckedOutUser':
            case 'CheckedOutBy':
                return (string) $this->checkedOutUser;
            case 'WorkflowOnly':
            case 'Workflow':
                return (string)$this->workflow;
            case 'WorkflowStateOnly':
            case 'WorkflowState':
                return (string)$this->workflowState;
            case 'WorkflowAndState':
                if (is_null($this->workflow))
                {
                    return '';
                }
                return "$this->workflow - $this->workflowState";
            case 'MimeType':
                return (string) $this->mimeType;
            case 'MimeIconPath':
                return (string) $this->mimeIconPath;
            case 'MimeDisplay':
                return (string) $this->mimeDisplay;
            case 'DateCheckedOut':
                return (string) $this->dateCheckedout;
            case 'ModifiedBy':
                return (string) $this->modifiedBy;
            case 'DateModified':
                return (string) $this->dateModified;
            case 'CreatedBy':
                return (string) $this->createdBy;
            case 'DateCreated':
                return (string) $this->dateCreated;
            case 'Owner':
            case 'OwnedBy':
                return (string) $this->owner;
            case 'IsImmutable':
            case 'Immutable':
                return (bool) $this->immutable;
            case 'Status':
                return $this->status;
            case 'StoragePath':
                return $this->storagePath;
            case 'DocumentType':
                return $this->documentType;
            case 'Permissions':
                return 'not available';
            case 'CanBeReadByUser':
                if (!$this->live)
                    return false;
                if (Permission::userHasDocumentReadPermission($this->Document))
                    return true;
                if (Permission::adminIsInAdminMode())
                    return true;
                return false;
            default:
                throw new Exception("Unknown property '$property' to get on QueryResultItem");
        }
        return ''; // Should not be reached
    }

    /**
     * Write one of the settable virtual properties: Rank, Title or Text.
     *
     * Magic methods must be public; the original declared this one protected.
     *
     * @param string $property
     * @param mixed $value
     * @throws Exception for any other property name
     */
    public function __set($property, $value)
    {
        switch($property)
        {
            // Rounded to two decimals. No thousands separator is used, because
            // the (float) cast in __get would stop parsing at the first comma.
            case 'Rank': $this->rank = number_format($value,2,'.',''); break;
            case 'Title': $this->title = $value; break;
            case 'Text': $this->text = $value; break;
            default:
                throw new Exception("Unknown property '$property' to set on QueryResultItem");
        }
    }
}
  291 +
/**
 * Comparison callback that orders two results by ascending Rank.
 *
 * @param object $a
 * @param object $b
 * @return int 0 when both ranks are equal, -1 when $a ranks lower, 1 otherwise
 */
function MatchResultCompare($a, $b)
{
    if ($a->Rank != $b->Rank)
    {
        return ($a->Rank < $b->Rank) ? -1 : 1;
    }

    return 0;
}
  299 +
  300 +abstract class Indexer
  301 +{
  302 + /**
  303 + * Cache of extractors
  304 + *
  305 + * @var array
  306 + */
  307 + private $extractorCache;
  308 +
  309 + /**
  310 + * Indicates if the indexer will do logging.
  311 + *
  312 + * @var boolean
  313 + */
  314 + private $debug;
  315 + /**
  316 + * Cache on mime related hooks
  317 + *
  318 + * @var unknown_type
  319 + */
  320 + private $mimeHookCache;
  321 + /**
  322 + * Cache on general hooks.
  323 + *
  324 + * @var array
  325 + */
  326 + private $generalHookCache;
  327 +
  328 + /**
  329 + * This is a path to the extractors.
  330 + *
  331 + * @var string
  332 + */
  333 + private $extractorPath;
  334 + /**
  335 + * This is a path to the hooks.
  336 + *
  337 + * @var string
  338 + */
  339 + private $hookPath;
  340 +
  341 + private $enabledExtractors;
  342 +
  343 + /**
  344 + * Initialise the indexer
  345 + *
  346 + */
  347 + protected function __construct()
  348 + {
  349 + $config = KTConfig::getSingleton();
  350 +
  351 + $this->extractorCache = array();
  352 + $this->debug = $config->get('indexer/debug', true);
  353 + $this->hookCache = array();
  354 + $this->generalHookCache = array();
  355 + $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
  356 + $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
  357 +
  358 + $this->loadExtractorStatus();
  359 + }
  360 +
  361 + /**
  362 + * Get the list if enabled extractors
  363 + *
  364 + */
  365 + private function loadExtractorStatus()
  366 + {
  367 + $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
  368 + $rs = DBUtil::getResultArray($sql);
  369 + $this->enabledExtractors = array();
  370 + foreach($rs as $item)
  371 + {
  372 + $this->enabledExtractors[] = $item['name'];
  373 + }
  374 + }
  375 +
  376 + private function isExtractorEnabled($extractor)
  377 + {
  378 + return in_array($extractor, $this->enabledExtractors);
  379 + }
  380 +
  381 + /**
  382 + * Returns a reference to the main class
  383 + *
  384 + * @return Indexer
  385 + */
  386 + public static function get()
  387 + {
  388 + static $singleton = null;
  389 +
  390 + if (is_null($singleton))
  391 + {
  392 + $config = KTConfig::getSingleton();
  393 + $classname = $config->get('indexer/coreClass');
  394 +
  395 + require_once('indexing/indexers/' . $classname . '.inc.php');
  396 +
  397 + if (!class_exists($classname))
  398 + {
  399 + throw new Exception("Class '$classname' does not exist.");
  400 + }
  401 +
  402 + $singleton = new $classname;
  403 + }
  404 +
  405 + return $singleton;
  406 + }
  407 +
  408 + public abstract function deleteDocument($docid);
  409 +
  410 + /**
  411 + * Remove the association of all extractors to mime types on the database.
  412 + *
  413 + */
  414 + public function clearExtractors()
  415 + {
  416 + global $default;
  417 +
  418 + $sql = "update mime_types set extractor_id=null";
  419 + DBUtil::runQuery($sql);
  420 +
  421 + $sql = "delete from mime_extractors";
  422 + DBUtil::runQuery($sql);
  423 +
  424 + if ($this->debug) $default->log->debug('clearExtractors');
  425 + }
  426 +
  427 + /**
  428 + * lookup the name of the extractor class based on the mime type.
  429 + *
  430 + * @param string $type
  431 + * @return string
  432 + */
  433 + public static function resolveExtractor($type)
  434 + {
  435 + global $default;
  436 + $sql = "select extractor from mime_types where filetypes='$type'";
  437 + $class = DBUtil::getOneResultKey($sql,'extractor');
  438 + if (PEAR::isError($class))
  439 + {
  440 + $default->log->error("resolveExtractor: cannot resolve $type");
  441 + return $class;
  442 + }
  443 + if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
  444 + return $class;
  445 + }
  446 +
  447 + /**
  448 + * Return all the discussion text.
  449 + *
  450 + * @param int $docid
  451 + * @return string
  452 + */
  453 + public static function getDiscussionText($docid)
  454 + {
  455 + $sql = "SELECT
  456 + dc.subject, dc.body
  457 + FROM
  458 + discussion_threads dt
  459 + INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
  460 + WHERE
  461 + dt.document_id=$docid";
  462 + $result = DBUtil::getResultArray($sql);
  463 + $text = '';
  464 +
  465 + foreach($result as $record)
  466 + {
  467 + $text .= $record['subject'] . "\n" . $record['body'] . "\n";
  468 + }
  469 +
  470 + return $text;
  471 + }
  472 +
  473 + /**
  474 + * Schedule the indexing of a document.
  475 + *
  476 + * @param string $document
  477 + * @param string $what
  478 + */
  479 + public static function index($document, $what='A')
  480 + {
  481 + global $default;
  482 +
  483 + if (is_numeric($document))
  484 + {
  485 + $document = Document::get($document+0);
  486 + }
  487 +
  488 + if (PEAR::isError($document))
  489 + {
  490 + $default->log->error("index: Could not index document: " .$document->getMessage());
  491 + return;
  492 + }
  493 +
  494 + $document_id = $document->getId();
  495 + $userid=$_SESSION['userID'];
  496 + if (empty($userid)) $userid=1;
  497 +
  498 + // we dequeue the document so that there are no issues when enqueuing
  499 + Indexer::unqueueDocument($document_id);
  500 +
  501 + // enqueue item
  502 + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
  503 + DBUtil::runQuery($sql);
  504 +
  505 + $default->log->debug("index: Queuing indexing of $document_id");
  506 +
  507 + }
  508 +
  509 + private static function incrementCount()
  510 + {
  511 + // Get count from system settings
  512 + $count = Indexer::getIndexedDocumentCount();
  513 + $count = (int)$count + 1;
  514 + Indexer::updateIndexedDocumentCount($count);
  515 + }
  516 +
  517 + public static function getIndexedDocumentCount()
  518 + {
  519 + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
  520 + return (int) $count;
  521 + }
  522 +
  523 + public static function updateIndexedDocumentCount($cnt = 0)
  524 + {
  525 + KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
  526 + }
  527 +
  528 + public static function reindexQueue()
  529 + {
  530 + $sql = "UPDATE index_files SET processdate = null";
  531 + DBUtil::runQuery($sql);
  532 + }
  533 +
  534 + public static function reindexDocument($documentId)
  535 + {
  536 + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
  537 + DBUtil::runQuery($sql);
  538 + }
  539 +
  540 +
  541 +
  542 + public static function indexAll()
  543 + {
  544 + $userid=$_SESSION['userID'];
  545 + if (empty($userid)) $userid=1;
  546 +
  547 + $sql = "DELETE FROM index_files";
  548 + DBUtil::runQuery($sql);
  549 +
  550 + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
  551 + DBUtil::runQuery($sql);
  552 + }
  553 +
  554 + /**
  555 + * Clearout the scheduling of documents that no longer exist.
  556 + *
  557 + */
  558 + public static function clearoutDeleted()
  559 + {
  560 + global $default;
  561 +
  562 + $sql = 'DELETE FROM
  563 + index_files
  564 + WHERE
  565 + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
  566 + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
  567 + DBUtil::runQuery($sql);
  568 +
  569 + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
  570 + }
  571 +
  572 +
  573 + /**
  574 + * Check if a document is scheduled to be indexed
  575 + *
  576 + * @param mixed $document This may be a document or document id
  577 + * @return boolean
  578 + */
  579 + public static function isDocumentScheduled($document)
  580 + {
  581 + if (is_numeric($document))
  582 + {
  583 + $docid = $document;
  584 + }
  585 + else if ($document instanceof Document)
  586 + {
  587 + $docid = $document->getId();
  588 + }
  589 + else
  590 + {
  591 + return false;
  592 + }
  593 + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
  594 + $result = DBUtil::getResultArray($sql);
  595 + return count($result) > 0;
  596 + }
  597 +
  598 + /**
  599 + * Filters text removing redundant characters such as continuous newlines and spaces.
  600 + *
  601 + * @param string $filename
  602 + */
  603 + private function filterText($filename)
  604 + {
  605 + $content = file_get_contents($filename);
  606 +
  607 + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
  608 + $tgt = array("\n","\n",' ',' ',' ');
  609 +
  610 + // shrink what is being stored.
  611 + do
  612 + {
  613 + $orig = $content;
  614 + $content = preg_replace($src, $tgt, $content);
  615 + } while ($content != $orig);
  616 +
  617 + return file_put_contents($filename, $content) !== false;
  618 + }
  619 +
  620 + /**
  621 + * Load hooks for text extraction process.
  622 + *
  623 + */
  624 + private function loadExtractorHooks()
  625 + {
  626 + $this->generalHookCache = array();
  627 + $this->mimeHookCache = array();
  628 +
  629 +
  630 + $dir = opendir(SearchHelper::correctPath($this->hookPath));
  631 + while (($file = readdir($dir)) !== false)
  632 + {
  633 + if (substr($file,-12) == 'Hook.inc.php')
  634 + {
  635 + require_once($this->hookPath . '/' . $file);
  636 + $class = substr($file, 0, -8);
  637 +
  638 + if (!class_exists($class))
  639 + {
  640 + continue;
  641 + }
  642 +
  643 + $hook = new $class;
  644 + if (!($class instanceof ExtractorHook))
  645 + {
  646 + continue;
  647 + }
  648 +
  649 + $mimeTypes = $hook->registerMimeTypes();
  650 + if (is_null($mimeTypes))
  651 + {
  652 + $this->generalHookCache[] = & $hook;
  653 + }
  654 + else
  655 + {
  656 + foreach($mimeTypes as $type)
  657 + {
  658 + $this->mimeHookCache[$type][] = & $hook;
  659 + }
  660 + }
  661 +
  662 + }
  663 + }
  664 + closedir($dir);
  665 + }
  666 +
  667 + /**
  668 + * This is a refactored function to execute the hooks.
  669 + *
  670 + * @param DocumentExtractor $extractor
  671 + * @param string $phase
  672 + * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
  673 + */
  674 + private function executeHook($extractor, $phase, $mimeType = null)
  675 + {
  676 + $hooks = array();
  677 + if (is_null($mimeType))
  678 + {
  679 + $hooks = $this->generalHookCache;
  680 + }
  681 + else
  682 + {
  683 + if (array_key_exists($mimeType, $this->mimeHookCache))
  684 + {
  685 + $hooks = $this->mimeHookCache[$mimeType];
  686 + }
  687 + }
  688 + if (empty($hooks))
  689 + {
  690 + return;
  691 + }
  692 +
  693 + foreach($hooks as $hook)
  694 + {
  695 + $hook->$phase($extractor);
  696 + }
  697 + }
  698 +
  699 + private function doesDiagnosticsPass($simple=false)
  700 + {
  701 + global $default;
  702 +
  703 + $config =& KTConfig::getSingleton();
  704 + // create a index log lock file in case there are errors, and we don't need to log them forever!
  705 + // this function will create the lockfile if an error is detected. It will be removed as soon
  706 + // as the problems with the indexer are removed.
  707 + $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
  708 +
  709 + $diagnosis = $this->diagnose();
  710 + if (!is_null($diagnosis))
  711 + {
  712 + if (!is_file($lockFile))
  713 + {
  714 + $default->log->error(_kt('Indexer problem: ') . $diagnosis);
  715 + }
  716 + touch($lockFile);
  717 + return false;
  718 + }
  719 +
  720 + if ($simple)
  721 + {
  722 + return true;
  723 + }
  724 +
  725 + $diagnosis = $this->diagnoseExtractors();
  726 + if (!empty($diagnosis))
  727 + {
  728 + if (!is_file($lockFile))
  729 + {
  730 + foreach($diagnosis as $diag)
  731 + {
  732 + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
  733 + }
  734 + }
  735 + touch($lockFile);
  736 + return false;
  737 + }
  738 +
  739 + if (is_file($lockFile))
  740 + {
  741 + $default->log->info(_kt('Issues with the indexer have been resolved!'));
  742 + unlink($lockFile);
  743 + }
  744 +
  745 + return true;
  746 + }
  747 +
  748 + /**
  749 + * This does the initial mime type association between mime types and text extractors
  750 + *
  751 + */
  752 + public function checkForRegisteredTypes()
  753 + {
  754 + global $default;
  755 +
  756 + // we are only doing this once!
  757 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  758 + if ($initRegistered)
  759 + {
  760 + return;
  761 + }
  762 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
  763 +
  764 + $date = date('Y-m-d H:i');
  765 + $sql = "UPDATE scheduler_tasks SET run_time='$date'";
  766 + DBUtil::runQuery($sql);
  767 +
  768 + $this->registerTypes(true);
  769 +
  770 + $disable = array(
  771 + OS_WINDOWS=>array('PSExtractor'),
  772 + OS_UNIX => array()
  773 + );
  774 +
  775 + $disableForOS = OS_WINDOWS?$disable[OS_WINDOWS]:$disable[OS_UNIX];
  776 +
  777 + foreach($disableForOS as $extractor)
  778 + {
  779 + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
  780 + DBUtil::runQuery($sql);
  781 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  782 + }
  783 +
  784 + if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
  785 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  786 + }
  787 +
  788 + private function updatePendingDocumentStatus($documentId, $message, $level)
  789 + {
  790 + $this->indexingHistory .= "\n" . $level . ': ' . $message;
  791 + $message = sanitizeForSQL($this->indexingHistory);
  792 + $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
  793 + DBUtil::runQuery($sql);
  794 + }
  795 +
  796 + /**
  797 + *
  798 + * @param int $documentId
  799 + * @param string $message
  800 + * @param string $level This may be info, error, debug
  801 + */
  802 + private function logPendingDocumentInfoStatus($documentId, $message, $level)
  803 + {
  804 + $this->updatePendingDocumentStatus($documentId, $message, $level);
  805 + global $default;
  806 +
  807 + switch ($level)
  808 + {
  809 + case 'debug':
  810 + if ($this->debug)
  811 + {
  812 + $default->log->debug($message);
  813 + }
  814 + break;
  815 + default:
  816 + $default->log->$level($message);
  817 + }
  818 + }
  819 +
  820 +
  821 +
  822 + public function getExtractor($extractorClass)
  823 + {
  824 + if (empty($extractorClass))
  825 + {
  826 + return null;
  827 + }
  828 +
  829 + $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
  830 + if (!file_exists($includeFile))
  831 + {
  832 + throw new Exception("Extractor file does not exist: $includeFile");
  833 + }
  834 +
  835 + require_once($includeFile);
  836 +
  837 + if (!class_exists($extractorClass))
  838 + {
  839 + throw new Exception("Extractor '$classname' not defined in file: $includeFile");
  840 + }
  841 +
  842 + $extractor = new $extractorClass();
  843 +
  844 + if (!($extractor instanceof DocumentExtractor))
  845 + {
  846 + throw new Exception("Class $classname was expected to be of type DocumentExtractor");
  847 + }
  848 +
  849 + return $extractor;
  850 + }
  851 +
  852 + public static function getIndexingQueue($problemItemsOnly=true)
  853 + {
  854 +
  855 + if ($problemItemsOnly)
  856 + {
  857 + $sql = "SELECT
  858 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  859 + FROM
  860 + index_files iff
  861 + INNER JOIN documents d ON iff.document_id=d.id
  862 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  863 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  864 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  865 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  866 + WHERE
  867 + (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
  868 + ORDER BY indexdate ";
  869 + }
  870 + else
  871 + {
  872 + $sql = "SELECT
  873 + iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
  874 + FROM
  875 + index_files iff
  876 + INNER JOIN documents d ON iff.document_id=d.id
  877 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  878 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  879 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  880 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  881 + WHERE
  882 + (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
  883 + ORDER BY indexdate ";
  884 + }
  885 + $aResult = DBUtil::getResultArray($sql);
  886 +
  887 + return $aResult;
  888 + }
  889 +
  890 + public static function getPendingIndexingQueue()
  891 + {
  892 + return Indexer::getIndexingQueue(false);
  893 + }
  894 +
  895 + /**
  896 + * The main function that may be called repeatedly to index documents.
  897 + *
  898 + * @param int $max Default 20
  899 + */
  900 + public function indexDocuments($max=null)
  901 + {
  902 + global $default;
  903 + $config =& KTConfig::getSingleton();
  904 +
  905 + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
  906 + if (is_file($indexLockFile))
  907 + {
  908 + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
  909 + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
  910 + return;
  911 + }
  912 + touch($indexLockFile);*/
  913 +
  914 +
  915 + $this->checkForRegisteredTypes();
  916 +
  917 + if ($this->debug) $default->log->debug('indexDocuments: start');
  918 + if (!$this->doesDiagnosticsPass())
  919 + {
  920 + //unlink($indexLockFile);
  921 + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
  922 + return;
  923 + }
  924 +
  925 + if (is_null($max))
  926 + {
  927 + $max = $config->get('indexer/batchDocuments',20);
  928 + }
  929 +
  930 + $this->loadExtractorHooks();
  931 +
  932 + Indexer::clearoutDeleted();
  933 +
  934 + $date = date('Y-m-d H:i:s');
  935 + // identify the indexers that must run
  936 + // mysql specific limit!
  937 + $sql = "SELECT
  938 + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
  939 + FROM
  940 + index_files iff
  941 + INNER JOIN documents d ON iff.document_id=d.id
  942 + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
  943 + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
  944 + INNER JOIN mime_types mt ON dcv.mime_id=mt.id
  945 + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
  946 + WHERE
  947 + (iff.processdate IS NULL or iff.processdate < cast(cast('$date' as date) -1 as date)) AND dmv.status_id=1
  948 + ORDER BY indexdate
  949 + LIMIT $max";
  950 + $result = DBUtil::getResultArray($sql);
  951 + if (PEAR::isError($result))
  952 + {
  953 + //unlink($indexLockFile);
  954 + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
  955 + return;
  956 + }
  957 + KTUtil::setSystemSetting('luceneIndexingDate', time());
  958 +
  959 + // bail if no work to do
  960 + if (count($result) == 0)
  961 + {
  962 + //unlink($indexLockFile);
  963 + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
  964 + return;
  965 + }
  966 +
  967 + // identify any documents that need indexing and mark them
  968 + // so they are not taken in a followup run
  969 + $ids = array();
  970 + foreach($result as $docinfo)
  971 + {
  972 + $ids[] = $docinfo['document_id'];
  973 + }
  974 +
  975 + // mark the documents as being processed
  976 +
  977 + $ids=implode(',',$ids);
  978 + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
  979 + DBUtil::runQuery($sql);
  980 +
  981 + $extractorCache = array();
  982 + $storageManager = KTStorageManagerUtil::getSingleton();
  983 +
  984 + $tempPath = $config->get("urls/tmpDirectory");
  985 +
  986 + foreach($result as $docinfo)
  987 + {
  988 + // increment indexed documents count
  989 + Indexer::incrementCount();
  990 +
  991 + $docId=$docinfo['document_id'];
  992 + $extension=$docinfo['filetypes'];
  993 + $mimeType=$docinfo['mimetypes'];
  994 + $extractorClass=$docinfo['extractor'];
  995 + $indexDocument = in_array($docinfo['what'], array('A','C'));
  996 + $indexDiscussion = in_array($docinfo['what'], array('A','D'));
  997 + $this->indexingHistory = '';
  998 +
  999 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
  1000 +
  1001 + if (empty($extractorClass))
  1002 + {
  1003 + /*
  1004 +
  1005 + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
  1006 +
  1007 + */
  1008 + if ($indexDiscussion)
  1009 + {
  1010 + $indexDocument = false;
  1011 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
  1012 + }
  1013 + else
  1014 + {
  1015 + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
  1016 + continue;
  1017 + }
  1018 + }
  1019 + else
  1020 + {
  1021 + /*
  1022 +
  1023 + If an extractor is available, we must ensure it is enabled.
  1024 +
  1025 + */
  1026 +
  1027 + if (!$this->isExtractorEnabled($extractorClass))
  1028 + {
  1029 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
  1030 + continue;
  1031 + }
  1032 + }
  1033 +
  1034 + if ($this->debug)
  1035 + {
  1036 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
  1037 + }
  1038 +
  1039 + $document = Document::get($docId);
  1040 + if (PEAR::isError($document))
  1041 + {
  1042 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
  1043 + continue;
  1044 + }
  1045 +
  1046 + $filename = $document->getFileName();
  1047 + if (substr($filename,0,1) == '~')
  1048 + {
  1049 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
  1050 + continue;
  1051 + }
  1052 +
  1053 + $removeFromQueue = true;
  1054 + if ($indexDocument)
  1055 + {
  1056 + if (array_key_exists($extractorClass, $extractorCache))
  1057 + {
  1058 + $extractor = $extractorCache[$extractorClass];
  1059 + }
  1060 + else
  1061 + {
  1062 + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
  1063 + }
  1064 +
  1065 + if (!($extractor instanceof DocumentExtractor))
  1066 + {
  1067 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
  1068 + continue;
  1069 + }
  1070 +
  1071 +
  1072 +
  1073 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1074 + $sourceFile = $storageManager->temporaryFile($document);
  1075 +
  1076 + if (empty($sourceFile) || !is_file($sourceFile))
  1077 + {
  1078 + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
  1079 + continue;
  1080 + }
  1081 +
  1082 + if ($extractor->needsIntermediateSourceFile())
  1083 + {
  1084 + $extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
  1085 +
  1086 + $intermediate = $tempPath . '/'. $docId . '.' . $extension;
  1087 + $result = @copy($sourceFile, $intermediate);
  1088 + if ($result === false)
  1089 + {
  1090 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
  1091 + // problem. lets try again later. probably permission related. log the issue.
  1092 + continue;
  1093 + }
  1094 + $sourceFile = $intermediate;
  1095 + }
  1096 +
  1097 + $targetFile = tempnam($tempPath, 'ktindexer');
  1098 +
  1099 + $extractor->setSourceFile($sourceFile);
  1100 + $extractor->setMimeType($mimeType);
  1101 + $extractor->setExtension($extension);
  1102 + $extractor->setTargetFile($targetFile);
  1103 + $extractor->setDocument($document);
  1104 + $extractor->setIndexingStatus(null);
  1105 + $extractor->setExtractionStatus(null);
  1106 +
  1107 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
  1108 +
  1109 + $this->executeHook($extractor, 'pre_extract');
  1110 + $this->executeHook($extractor, 'pre_extract', $mimeType);
  1111 + $removeFromQueue = false;
  1112 +
  1113 + if ($extractor->extractTextContent())
  1114 + {
  1115 + // the extractor may need to create another target file
  1116 + $targetFile = $extractor->getTargetFile();
  1117 +
  1118 + $extractor->setExtractionStatus(true);
  1119 + $this->executeHook($extractor, 'pre_index');
  1120 + $this->executeHook($extractor, 'pre_index', $mimeType);
  1121 +
  1122 + $title = $document->getName();
  1123 + if ($indexDiscussion)
  1124 + {
  1125 + if (!$this->filterText($targetFile))
  1126 + {
  1127 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1128 + }
  1129 + else
  1130 + {
  1131 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1132 + $removeFromQueue = $indexStatus;
  1133 + if (!$indexStatus)
  1134 + {
  1135 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
  1136 + }
  1137 +
  1138 + $extractor->setIndexingStatus($indexStatus);
  1139 + }
  1140 + }
  1141 + else
  1142 + {
  1143 + if (!$this->filterText($targetFile))
  1144 + {
  1145 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
  1146 + }
  1147 + else
  1148 + {
  1149 + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
  1150 + $removeFromQueue = $indexStatus;
  1151 +
  1152 + if (!$indexStatus)
  1153 + {
  1154 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
  1155 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1156 + }
  1157 +
  1158 + $extractor->setIndexingStatus($indexStatus);
  1159 + }
  1160 + }
  1161 +
  1162 + $this->executeHook($extractor, 'post_index', $mimeType);
  1163 + $this->executeHook($extractor, 'post_index');
  1164 + }
  1165 + else
  1166 + {
  1167 + $extractor->setExtractionStatus(false);
  1168 + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
  1169 + $this->logPendingDocumentInfoStatus($docId, '<output>' . $extractor->output . '</output>', 'error');
  1170 + }
  1171 +
  1172 + $this->executeHook($extractor, 'post_extract', $mimeType);
  1173 + $this->executeHook($extractor, 'post_extract');
  1174 +
  1175 + if ($extractor->needsIntermediateSourceFile())
  1176 + {
  1177 + @unlink($sourceFile);
  1178 + }
  1179 +
  1180 + @unlink($targetFile);
  1181 +
  1182 + }
  1183 + else
  1184 + {
  1185 + $indexStatus = $this->indexDiscussion($docId);
  1186 + $removeFromQueue = $indexStatus;
  1187 + }
  1188 +
  1189 + if ($removeFromQueue)
  1190 + {
  1191 + Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
  1192 + }
  1193 + else
  1194 + {
  1195 + if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
  1196 + }
  1197 + }
  1198 + if ($this->debug) $default->log->debug('indexDocuments: done');
  1199 + //unlink($indexLockFile);
  1200 + }
  1201 +
  1202 + public function migrateDocuments($max=null)
  1203 + {
  1204 + global $default;
  1205 +
  1206 + $default->log->info(_kt('migrateDocuments: starting'));
  1207 +
  1208 + if (!$this->doesDiagnosticsPass(true))
  1209 + {
  1210 + $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
  1211 + return;
  1212 + }
  1213 +
  1214 + if (KTUtil::getSystemSetting('migrationComplete') == 'true')
  1215 + {
  1216 + $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
  1217 + return;
  1218 + }
  1219 +
  1220 + $config =& KTConfig::getSingleton();
  1221 + if (is_null($max))
  1222 + {
  1223 + $max = $config->get('indexer/batchMigrateDocument',500);
  1224 + }
  1225 +
  1226 + $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
  1227 + if (is_file($lockFile))
  1228 + {
  1229 + $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
  1230 + return;
  1231 + }
  1232 + touch($lockFile);
  1233 +
  1234 + $startTime = KTUtil::getSystemSetting('migrationStarted');
  1235 + if (is_null($startTime))
  1236 + {
  1237 + KTUtil::setSystemSetting('migrationStarted', time());
  1238 + }
  1239 +
  1240 + $maxLoops = 5;
  1241 +
  1242 + $max = ceil($max / $maxLoops);
  1243 +
  1244 + $start =KTUtil::getBenchmarkTime();
  1245 + $noDocs = false;
  1246 + $numDocs = 0;
  1247 +
  1248 + for($loop=0;$loop<$maxLoops;$loop++)
  1249 + {
  1250 +
  1251 + $sql = "SELECT
  1252 + document_id, document_text
  1253 + FROM
  1254 + document_text
  1255 + ORDER BY document_id
  1256 + LIMIT $max";
  1257 + $result = DBUtil::getResultArray($sql);
  1258 + if (PEAR::isError($result))
  1259 + {
  1260 + $default->log->info(_kt('migrateDocuments: db error'));
  1261 + break;
  1262 + }
  1263 +
  1264 + $docs = count($result);
  1265 + if ($docs == 0)
  1266 + {
  1267 + $noDocs = true;
  1268 + break;
  1269 + }
  1270 + $numDocs += $docs;
  1271 +
  1272 + foreach($result as $docinfo)
  1273 + {
  1274 + $docId = $docinfo['document_id'];
  1275 +
  1276 + $document = Document::get($docId);
  1277 + if (PEAR::isError($document) || is_null($document))
  1278 + {
  1279 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1280 + DBUtil::runQuery($sql);
  1281 + $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
  1282 + continue;
  1283 + }
  1284 +
  1285 + $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
  1286 +
  1287 + $targetFile = tempnam($tempPath, 'ktindexer');
  1288 +
  1289 + if (file_put_contents($targetFile, $docinfo['document_text']) === false)
  1290 + {
  1291 + $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
  1292 + continue;
  1293 + }
  1294 + // free memory asap ;)
  1295 + unset($docinfo['document_text']);
  1296 +
  1297 + $title = $document->getName();
  1298 +
  1299 + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
  1300 +
  1301 + if ($indexStatus)
  1302 + {
  1303 + $sql = "DELETE FROM document_text WHERE document_id=$docId";
  1304 + DBUtil::runQuery($sql);
  1305 + }
  1306 + else
  1307 + {
  1308 + $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
  1309 + }
  1310 +
  1311 + @unlink($targetFile);
  1312 + }
  1313 + }
  1314 +
  1315 + @unlink($lockFile);
  1316 +
  1317 + $time = KTUtil::getBenchmarkTime() - $start;
  1318 +
  1319 + KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
  1320 + KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
  1321 +
  1322 + $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
  1323 + if ($noDocs)
  1324 + {
  1325 + $default->log->info(_kt('migrateDocuments: Completed!'));
  1326 + KTUtil::setSystemSetting('migrationComplete', 'true');
  1327 + schedulerUtil::deleteByName('Index Migration');
  1328 + $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
  1329 + }
  1330 + }
  1331 +
  1332 + /**
  1333 + * Index a document. The base class must override this function.
  1334 + *
  1335 + * @param int $docId
  1336 + * @param string $textFile
  1337 + */
  1338 + protected abstract function indexDocument($docId, $textFile, $title, $version);
  1339 +
  1340 +
  1341 + public function updateDocumentIndex($docId, $text)
  1342 + {
  1343 + $config = KTConfig::getSingleton();
  1344 + $tempPath = $config->get("urls/tmpDirectory");
  1345 + $tempFile = tempnam($tempPath,'ud_');
  1346 +
  1347 + file_put_contents($tempFile, $text);
  1348 +
  1349 + $document = Document::get($docId);
  1350 + $title = $document->getDescription();
  1351 + $version = $document->getVersion();
  1352 +
  1353 + $result = $this->indexDocument($docId, $tempFile, $title, $version);
  1354 +
  1355 + if (file_exists($tempFile))
  1356 + {
  1357 + unlink($tempFile);
  1358 + }
  1359 +
  1360 + return $result;
  1361 + }
  1362 +
  1363 + /**
  1364 + * Index a discussion. The base class must override this function.
  1365 + *
  1366 + * @param int $docId
  1367 + */
  1368 + protected abstract function indexDiscussion($docId);
  1369 +
  1370 + /**
  1371 + * Diagnose the indexer. e.g. Check that the indexing server is running.
  1372 + *
  1373 + */
  1374 + public abstract function diagnose();
  1375 +
  1376 + /**
  1377 + * Diagnose the extractors.
  1378 + *
  1379 + * @return array
  1380 + */
  1381 + public function diagnoseExtractors()
  1382 + {
  1383 + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
  1384 + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
  1385 +
  1386 + return $diagnosis;
  1387 + }
  1388 +
  1389 + /**
  1390 + * This is a refactored diagnose function.
  1391 + *
  1392 + * @param string $path
  1393 + * @param string $class
  1394 + * @param string $extension
  1395 + * @return array
  1396 + */
  1397 + private function _diagnose($path, $baseclass, $extension)
  1398 + {
  1399 + global $default;
  1400 +
  1401 + $diagnoses = array();
  1402 +
  1403 + $dir = opendir(SearchHelper::correctPath($path));
  1404 + $extlen = - strlen($extension);
  1405 +
  1406 + while (($file = readdir($dir)) !== false)
  1407 + {
  1408 + if (substr($file,0,1) == '.')
  1409 + {
  1410 + continue;
  1411 + }
  1412 + if (substr($file,$extlen) != $extension)
  1413 + {
  1414 + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
  1415 + continue;
  1416 + }
  1417 +
  1418 + require_once($path . '/' . $file);
  1419 +
  1420 + $class = substr($file, 0, -8);
  1421 + if (!class_exists($class))
  1422 + {
  1423 + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
  1424 + continue;
  1425 + }
  1426 +
  1427 + if (!$this->isExtractorEnabled($class))
  1428 + {
  1429 + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
  1430 + continue;
  1431 + }
  1432 +
  1433 + $extractor = new $class();
  1434 + if (!is_a($extractor, $baseclass))
  1435 + {
  1436 + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
  1437 + continue;
  1438 + }
  1439 +
  1440 + $types = $extractor->getSupportedMimeTypes();
  1441 + if (empty($types))
  1442 + {
  1443 + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
  1444 + continue;
  1445 + }
  1446 +
  1447 + $diagnosis=$extractor->diagnose();
  1448 + if (empty($diagnosis))
  1449 + {
  1450 + continue;
  1451 + }
  1452 + $diagnoses[$class] = array(
  1453 + 'name'=>$extractor->getDisplayName(),
  1454 + 'diagnosis'=>$diagnosis
  1455 + );
  1456 +
  1457 + }
  1458 + closedir($dir);
  1459 +
  1460 + return $diagnoses;
  1461 + }
  1462 +
  1463 +
  1464 + /**
  1465 + * Register the extractor types.
  1466 + *
  1467 + * @param boolean $clear. Optional. Defaults to false.
  1468 + */
  1469 + public function registerTypes($clear=false)
  1470 + {
  1471 + if ($clear)
  1472 + {
  1473 + $this->clearExtractors();
  1474 + }
  1475 + $dir = opendir(SearchHelper::correctPath($this->extractorPath));
  1476 + while (($file = readdir($dir)) !== false)
  1477 + {
  1478 + if (substr($file,-17) == 'Extractor.inc.php')
  1479 + {
  1480 + require_once($this->extractorPath . '/' . $file);
  1481 + $class = substr($file, 0, -8);
  1482 +
  1483 + if (!class_exists($class))
  1484 + {
  1485 + // if the class does not exist, we can't do anything.
  1486 + continue;
  1487 + }
  1488 +
  1489 + $extractor = new $class;
  1490 + if ($extractor instanceof DocumentExtractor)
  1491 + {
  1492 + $extractor->registerMimeTypes();
  1493 + }
  1494 + }
  1495 + }
  1496 + closedir($dir);
  1497 + }
  1498 +
  1499 + /**
  1500 + * This is used as a possible obtimisation effort. It may be overridden in that case.
  1501 + *
  1502 + * @param int $docId
  1503 + * @param string $textFile
  1504 + */
  1505 + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
  1506 + {
  1507 + $this->indexDocument($docId, $textFile, $title, $version);
  1508 + $this->indexDiscussion($docId);
  1509 + }
  1510 +
  1511 + /**
  1512 + * Remove the document from the queue. This is normally called when it has been processed.
  1513 + *
  1514 + * @param int $docid
  1515 + */
  1516 + public static function unqueueDocument($docid, $reason=false, $level='debug')
  1517 + {
  1518 + $sql = "DELETE FROM index_files WHERE document_id=$docid";
  1519 + DBUtil::runQuery($sql);
  1520 + if ($reason !== false)
  1521 + {
  1522 + global $default;
  1523 + $default->log->$level("Indexer: removing document $docid from the queue - $reason");
  1524 + }
  1525 + }
  1526 +
  1527 + /**
  1528 + * Run a query on the index.
  1529 + *
  1530 + * @param string $query
  1531 + * @return array
  1532 + */
  1533 + public abstract function query($query);
  1534 +
  1535 + /**
  1536 + * Converts an integer to a string that can be easily compared and reversed.
  1537 + *
  1538 + * @param int $int
  1539 + * @return string
  1540 + */
  1541 + public static function longToString($int)
  1542 + {
  1543 + $maxlen = 14;
  1544 +
  1545 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1546 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1547 + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
  1548 +
  1549 + return str_replace($o29, $a2z, $l);
  1550 + }
  1551 +
  1552 + /**
  1553 + * Converts a string to an integer.
  1554 + *
  1555 + * @param string $str
  1556 + * @return int
  1557 + */
  1558 + public static function stringToLong($str)
  1559 + {
  1560 + $a2z = array('a','b','c','d','e','f','g','h','i','j');
  1561 + $o29 = array('0','1','2','3','4','5','6','7','8','9');
  1562 +
  1563 + $int = str_replace($a2z, $o29, $str) + 0;
  1564 +
  1565 + return $int;
  1566 + }
  1567 +
  1568 + /**
  1569 + * Possibly we can optimise indexes. This method must be overriden.
  1570 + * The new function must call the parent!
  1571 + *
  1572 + */
  1573 + public function optimise()
  1574 + {
  1575 + KTUtil::setSystemSetting('luceneOptimisationDate', time());
  1576 + }
  1577 +
  1578 + /**
  1579 + * Shuts down the indexer
  1580 + *
  1581 + */
  1582 + public function shutdown()
  1583 + {
  1584 + // do nothing generally
  1585 + }
  1586 +
  1587 + /**
  1588 + * Returns the name of the indexer.
  1589 + *
  1590 + * @return string
  1591 + */
  1592 + public abstract function getDisplayName();
  1593 +
  1594 +
  1595 + /**
  1596 + * Returns the number of non-deleted documents in the index.
  1597 + *
  1598 + * @return int
  1599 + */
  1600 + public abstract function getDocumentsInIndex();
  1601 +
  1602 + /**
  1603 + * Returns the path to the index directory
  1604 + *
  1605 + * @return string
  1606 + */
  1607 + public function getIndexDirectory()
  1608 + {
  1609 + $config = KTConfig::getSingleton();
  1610 + $directory = $config->get('indexer/luceneDirectory');
  1611 + return $directory;
  1612 + }
  1613 +}
  1614 +
  1615 +?>