Commit 81865212adce4d390cca58695d3874c703c1a509

Authored by kevin_fourie
1 parent 319ec2a8

Merged in from DEV trunk...

KTS-673
"The search algorithm needs some work"
Updated. Integrated with DocumentConverter.py

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-2547
"Test RTF Indexer"
Implemented. 

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-673
"The search algorithm needs some work"
Updated.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-673
"The search algorithm needs some work"
Updated. MailMime is not tested; it will be implemented again when there is more time.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-673
"The search algorithm needs some work"
Updated. Sometimes the extractor may change the target filename. 

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-1753
" Implement Disk Usage Plugin"
Updated. Licensing.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

KTS-673
"The search algorithm needs some work"
Updated. Search requires php, python and java.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie


git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/STABLE/trunk@7477 c91229c3-7414-0410-bfa2-8a42b809f60b
config/config.ini
@@ -242,7 +242,10 @@ pdftotext = pdftotext @@ -242,7 +242,10 @@ pdftotext = pdftotext
242 catppt = catppt 242 catppt = catppt
243 pstotext = pstotext 243 pstotext = pstotext
244 catdoc = catdoc 244 catdoc = catdoc
245 -antiword = antiword.exe 245 +antiword = antiword
  246 +python = python
  247 +java = java
  248 +php = php
246 249
247 [search] 250 [search]
248 ; The number of results per page 251 ; The number of results per page
plugins/housekeeper/DiskUsageDashlet.inc.php
1 <?php 1 <?php
2 2
3 /** 3 /**
  4 + * $Id
4 * 5 *
5 - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com 6 + * The contents of this file are subject to the KnowledgeTree Public
  7 + * License Version 1.1.2 ("License"); You may not use this file except in
  8 + * compliance with the License. You may obtain a copy of the License at
  9 + * http://www.knowledgetree.com/KPL
6 * 10 *
7 - * This program is free software; you can redistribute it and/or modify  
8 - * it under the terms of the GNU General Public License as published by  
9 - * the Free Software Foundation; using version 2 of the License. 11 + * Software distributed under the License is distributed on an "AS IS"
  12 + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing rights and
  14 + * limitations under the License.
10 * 15 *
11 - * This program is distributed in the hope that it will be useful,  
12 - * but WITHOUT ANY WARRANTY; without even the implied warranty of  
13 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
14 - * GNU General Public License for more details. 16 + * All copies of the Covered Code must include on each user interface screen:
  17 + * (i) the "Powered by KnowledgeTree" logo and
  18 + * (ii) the KnowledgeTree copyright notice
  19 + * in the same form as they appear in the distribution. See the License for
  20 + * requirements.
15 * 21 *
16 - * You should have received a copy of the GNU General Public License  
17 - * along with this program; if not, write to the Free Software  
18 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 + * The Original Code is: KnowledgeTree Open Source
19 * 23 *
20 - * -------------------------------------------------------------------------  
21 - *  
22 - * You can contact the copyright owner regarding licensing via the contact  
23 - * details that can be found on the KnowledgeTree web site:  
24 - *  
25 - * http://www.knowledgetree.com/ 24 + * The Initial Developer of the Original Code is The Jam Warehouse Software
  25 + * (Pty) Ltd, trading as KnowledgeTree.
  26 + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright
  27 + * (C) 2007 The Jam Warehouse Software (Pty) Ltd;
  28 + * All Rights Reserved.
  29 + * Contributor( s): ______________________________________
26 */ 30 */
27 31
28 class DiskUsageDashlet extends KTBaseDashlet 32 class DiskUsageDashlet extends KTBaseDashlet
@@ -34,7 +38,7 @@ class DiskUsageDashlet extends KTBaseDashlet @@ -34,7 +38,7 @@ class DiskUsageDashlet extends KTBaseDashlet
34 38
35 function DiskUsageDashlet() 39 function DiskUsageDashlet()
36 { 40 {
37 - $this->sTitle = _kt('Disk Usage'); 41 + $this->sTitle = _kt('Storage Utilization');
38 $this->sClass = "ktInfo"; 42 $this->sClass = "ktInfo";
39 } 43 }
40 44
plugins/housekeeper/FolderUsageDashlet.inc.php
1 <?php 1 <?php
2 2
3 /** 3 /**
  4 + * $Id
4 * 5 *
5 - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com 6 + * The contents of this file are subject to the KnowledgeTree Public
  7 + * License Version 1.1.2 ("License"); You may not use this file except in
  8 + * compliance with the License. You may obtain a copy of the License at
  9 + * http://www.knowledgetree.com/KPL
6 * 10 *
7 - * This program is free software; you can redistribute it and/or modify  
8 - * it under the terms of the GNU General Public License as published by  
9 - * the Free Software Foundation; using version 2 of the License. 11 + * Software distributed under the License is distributed on an "AS IS"
  12 + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing rights and
  14 + * limitations under the License.
10 * 15 *
11 - * This program is distributed in the hope that it will be useful,  
12 - * but WITHOUT ANY WARRANTY; without even the implied warranty of  
13 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the  
14 - * GNU General Public License for more details. 16 + * All copies of the Covered Code must include on each user interface screen:
  17 + * (i) the "Powered by KnowledgeTree" logo and
  18 + * (ii) the KnowledgeTree copyright notice
  19 + * in the same form as they appear in the distribution. See the License for
  20 + * requirements.
15 * 21 *
16 - * You should have received a copy of the GNU General Public License  
17 - * along with this program; if not, write to the Free Software  
18 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 + * The Original Code is: KnowledgeTree Open Source
19 * 23 *
20 - * -------------------------------------------------------------------------  
21 - *  
22 - * You can contact the copyright owner regarding licensing via the contact  
23 - * details that can be found on the KnowledgeTree web site:  
24 - *  
25 - * http://www.knowledgetree.com/ 24 + * The Initial Developer of the Original Code is The Jam Warehouse Software
  25 + * (Pty) Ltd, trading as KnowledgeTree.
  26 + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright
  27 + * (C) 2007 The Jam Warehouse Software (Pty) Ltd;
  28 + * All Rights Reserved.
  29 + * Contributor( s): ______________________________________
26 */ 30 */
27 31
28 class FolderUsageDashlet extends KTBaseDashlet 32 class FolderUsageDashlet extends KTBaseDashlet
@@ -31,7 +35,7 @@ class FolderUsageDashlet extends KTBaseDashlet @@ -31,7 +35,7 @@ class FolderUsageDashlet extends KTBaseDashlet
31 35
32 function FolderUsageDashlet() 36 function FolderUsageDashlet()
33 { 37 {
34 - $this->sTitle = _kt('System Folder Usage'); 38 + $this->sTitle = _kt('System Folder Utilization');
35 $this->sClass = "ktInfo"; 39 $this->sClass = "ktInfo";
36 } 40 }
37 41
plugins/housekeeper/HouseKeeperDispatcher.php
1 <?php 1 <?php
2 2
  3 +/**
  4 + * $Id
  5 + *
  6 + * The contents of this file are subject to the KnowledgeTree Public
  7 + * License Version 1.1.2 ("License"); You may not use this file except in
  8 + * compliance with the License. You may obtain a copy of the License at
  9 + * http://www.knowledgetree.com/KPL
  10 + *
  11 + * Software distributed under the License is distributed on an "AS IS"
  12 + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing rights and
  14 + * limitations under the License.
  15 + *
  16 + * All copies of the Covered Code must include on each user interface screen:
  17 + * (i) the "Powered by KnowledgeTree" logo and
  18 + * (ii) the KnowledgeTree copyright notice
  19 + * in the same form as they appear in the distribution. See the License for
  20 + * requirements.
  21 + *
  22 + * The Original Code is: KnowledgeTree Open Source
  23 + *
  24 + * The Initial Developer of the Original Code is The Jam Warehouse Software
  25 + * (Pty) Ltd, trading as KnowledgeTree.
  26 + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright
  27 + * (C) 2007 The Jam Warehouse Software (Pty) Ltd;
  28 + * All Rights Reserved.
  29 + * Contributor( s): ______________________________________
  30 + */
  31 +
3 session_start(); 32 session_start();
4 33
5 require_once("../../config/dmsDefaults.php"); 34 require_once("../../config/dmsDefaults.php");
@@ -88,4 +117,4 @@ class HouseKeeperDispatcher extends KTStandardDispatcher @@ -88,4 +117,4 @@ class HouseKeeperDispatcher extends KTStandardDispatcher
88 $oDispatcher = new HouseKeeperDispatcher(); 117 $oDispatcher = new HouseKeeperDispatcher();
89 $oDispatcher->dispatch(); 118 $oDispatcher->dispatch();
90 119
91 -?>  
92 \ No newline at end of file 120 \ No newline at end of file
  121 +?>
plugins/housekeeper/HouseKeeperPlugin.php
1 <?php 1 <?php
2 2
  3 +/**
  4 + * $Id
  5 + *
  6 + * The contents of this file are subject to the KnowledgeTree Public
  7 + * License Version 1.1.2 ("License"); You may not use this file except in
  8 + * compliance with the License. You may obtain a copy of the License at
  9 + * http://www.knowledgetree.com/KPL
  10 + *
  11 + * Software distributed under the License is distributed on an "AS IS"
  12 + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.
  13 + * See the License for the specific language governing rights and
  14 + * limitations under the License.
  15 + *
  16 + * All copies of the Covered Code must include on each user interface screen:
  17 + * (i) the "Powered by KnowledgeTree" logo and
  18 + * (ii) the KnowledgeTree copyright notice
  19 + * in the same form as they appear in the distribution. See the License for
  20 + * requirements.
  21 + *
  22 + * The Original Code is: KnowledgeTree Open Source
  23 + *
  24 + * The Initial Developer of the Original Code is The Jam Warehouse Software
  25 + * (Pty) Ltd, trading as KnowledgeTree.
  26 + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright
  27 + * (C) 2007 The Jam Warehouse Software (Pty) Ltd;
  28 + * All Rights Reserved.
  29 + * Contributor( s): ______________________________________
  30 + */
  31 +
3 class HouseKeeperPlugin extends KTPlugin 32 class HouseKeeperPlugin extends KTPlugin
4 { 33 {
5 var $autoRegister = true; 34 var $autoRegister = true;
@@ -93,4 +122,4 @@ class HouseKeeperPlugin extends KTPlugin @@ -93,4 +122,4 @@ class HouseKeeperPlugin extends KTPlugin
93 $oPluginRegistry =& KTPluginRegistry::getSingleton(); 122 $oPluginRegistry =& KTPluginRegistry::getSingleton();
94 $oPluginRegistry->registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__); 123 $oPluginRegistry->registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__);
95 124
96 -?>  
97 \ No newline at end of file 125 \ No newline at end of file
  126 +?>
search2/indexing/extractorCore.inc.php
@@ -124,10 +124,22 @@ abstract class DocumentExtractor @@ -124,10 +124,22 @@ abstract class DocumentExtractor
124 } 124 }
125 $classname=get_class($this); 125 $classname=get_class($this);
126 126
  127 + $sql = "select id as extractor_id from mime_extractors WHERE name='$classname'";
  128 + $rs = DBUtil::getResultArray($sql);
  129 + if (count($rs) == 0)
  130 + {
  131 + $extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1));
  132 + }
  133 + else
  134 + {
  135 + $extractor_id = $rs[0]['extractor_id'];
  136 + }
  137 +
  138 +
127 foreach($types as $type) 139 foreach($types as $type)
128 { 140 {
129 - $sql = "update mime_types set extractor='$classname' where mimetypes='$type' and extractor is null";  
130 - DBUtil::runQuery($sql); 141 + $sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null";
  142 + $rs = DBUtil::runQuery($sql);
131 } 143 }
132 } 144 }
133 145
@@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor @@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor
510 public function extractTextContent() 522 public function extractTextContent()
511 { 523 {
512 $intermediateFile = $this->targetfile . '.' . $this->targetExtension; 524 $intermediateFile = $this->targetfile . '.' . $this->targetExtension;
  525 + touch($intermediateFile);
513 526
514 $this->sourceExtractor->setSourceFile($this->sourcefile); 527 $this->sourceExtractor->setSourceFile($this->sourcefile);
515 $this->sourceExtractor->setTargetFile($intermediateFile); 528 $this->sourceExtractor->setTargetFile($intermediateFile);
516 $this->sourceExtractor->setMimeType($this->mimetype); 529 $this->sourceExtractor->setMimeType($this->mimetype);
517 $this->sourceExtractor->setExtension($this->extension); 530 $this->sourceExtractor->setExtension($this->extension);
518 - if ($this->sourceExtractor->extractTextContent()) 531 + if (!$this->sourceExtractor->extractTextContent())
519 { 532 {
520 return false; 533 return false;
521 } 534 }
  535 + $intermediateFile = $this->sourceExtractor->getTargetFile();
522 536
523 $this->targetExtractor->setSourceFile($intermediateFile); 537 $this->targetExtractor->setSourceFile($intermediateFile);
524 $this->targetExtractor->setTargetFile($this->targetfile); 538 $this->targetExtractor->setTargetFile($this->targetfile);
search2/indexing/extractors/OOPresentationExtractor.inc.php
1 <?php 1 <?php
2 2
3 -require_once('OOPDFTextExtractor.inc.php'); 3 +require_once('OOTextExtractor.inc.php');
  4 +require_once('PDFExtractor.inc.php');
4 5
5 -class OOPresentationExtractor extends OOPDFTextExtractor 6 +class OOPresentationExtractor extends CompositeExtractor
6 { 7 {
  8 + public function __construct()
  9 + {
  10 + $sourceExtractor = new OOPresentationToPDF();
  11 + $targetExtractor = new PDFExtractor();
  12 + parent::__construct($sourceExtractor, 'pdf', 'application/pdf', $targetExtractor, true);
  13 + }
  14 +
7 public function getDisplayName() 15 public function getDisplayName()
8 { 16 {
9 return _kt('OpenOffice Presentation Extractor'); 17 return _kt('OpenOffice Presentation Extractor');
@@ -18,4 +26,18 @@ class OOPresentationExtractor extends OOPDFTextExtractor @@ -18,4 +26,18 @@ class OOPresentationExtractor extends OOPDFTextExtractor
18 } 26 }
19 } 27 }
20 28
  29 +class OOPresentationToPDF extends OOTextExtractor
  30 +{
  31 + public function __construct()
  32 + {
  33 + parent::__construct('pdf');
  34 + $this->documentConverter = KT_DIR . '/bin/openoffice/pdfgen.py';
  35 + if (!is_file($this->documentConverter))
  36 + {
  37 + $this->documentConverter = false;
  38 + }
  39 + }
  40 +}
  41 +
  42 +
21 ?> 43 ?>
22 \ No newline at end of file 44 \ No newline at end of file
search2/indexing/extractors/OOSpreadsheetExtractor.inc.php
1 <?php 1 <?php
2 2
3 -require_once('OOPDFTextExtractor.inc.php'); 3 +require_once('OOTextExtractor.inc.php');
4 4
5 -class OOSpreadsheetExtractor extends OOPDFTextExtractor 5 +class OOSpreadsheetExtractor extends OOTextExtractor
6 { 6 {
7 public function getDisplayName() 7 public function getDisplayName()
8 { 8 {
search2/indexing/extractors/OOTextExtractor.inc.php
@@ -2,22 +2,27 @@ @@ -2,22 +2,27 @@
2 2
3 class OOTextExtractor extends ExternalDocumentExtractor 3 class OOTextExtractor extends ExternalDocumentExtractor
4 { 4 {
5 - private $converter;  
6 - private $javaPath;  
7 - private $ooHost;  
8 - private $ooPort;  
9 - private $targetMimeType; 5 + protected $python;
  6 + protected $documentConverter;
  7 + protected $ooHost;
  8 + protected $ooPort;
  9 + protected $targetExtension;
10 10
11 - public function __construct($targetMimeType='plain/text') 11 + public function __construct($targetExtension='html')
12 { 12 {
13 parent::__construct(); 13 parent::__construct();
  14 + $this->targetExtension = $targetExtension;
14 $config =& KTConfig::getSingleton(); 15 $config =& KTConfig::getSingleton();
15 16
16 - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter');  
17 - $this->javaPath = KTUtil::findCommand('extractors/java', 'java');  
18 - $this->ooHost = $config->get('openoffice/host', 'localhost');  
19 - $this->ooPort = $config->get('openoffice/port', 8100);  
20 - $this->targetMimeType = $targetMimeType; 17 + $this->python = KTUtil::findCommand('externalBinary/python');
  18 + $this->ooHost = $config->get('openoffice/host');
  19 + $this->ooPort = $config->get('openoffice/port');
  20 +
  21 + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py';
  22 + if (!is_file($this->documentConverter))
  23 + {
  24 + $this->documentConverter = false;
  25 + }
21 } 26 }
22 27
23 public function getDisplayName() 28 public function getDisplayName()
@@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor @@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor
28 public function getSupportedMimeTypes() 33 public function getSupportedMimeTypes()
29 { 34 {
30 return array( 35 return array(
31 - 'text/rtf',  
32 - 'application/vnd.oasis.opendocument.text',  
33 - 'application/vnd.oasis.opendocument.text-template',  
34 - 'application/vnd.oasis.opendocument.text-web',  
35 - 'application/vnd.oasis.opendocument.text-master',  
36 - 'application/vnd.sun.xml.writer',  
37 - 'application/vnd.sun.xml.writer.template',  
38 - 'application/vnd.sun.xml.writer.global', 36 +
39 ); 37 );
40 } 38 }
41 39
42 public function needsIntermediateSourceFile() 40 public function needsIntermediateSourceFile()
43 { 41 {
44 // we need the intermediate file because it 42 // we need the intermediate file because it
45 - // has the correct extension. jodconverter uses the extension to determine mimetype 43 + // has the correct extension. documentConverter uses the extension to determine mimetype
46 return true; 44 return true;
47 } 45 }
48 46
49 protected function getCommandLine() 47 protected function getCommandLine()
50 { 48 {
51 - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; 49 + $sourcefile = escapeshellcmd($this->sourcefile);
  50 + unlink($this->targetfile);
  51 + $this->targetfile .= '.' . $this->targetExtension;
  52 + $targetfile = escapeshellcmd($this->targetfile);
  53 +
  54 + $escape = OS_WINDOWS?'"':'\'';
  55 +
  56 + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$sourcefile}{$escape} {$escape}{$targetfile}{$escape} {$this->ooHost} {$this->ooPort}";
52 return $cmdline; 57 return $cmdline;
53 } 58 }
54 59
55 - public function diagnose() 60 + protected function filter($text)
56 { 61 {
57 - if (false === $this->converter) 62 + $text = preg_replace ("@(</?[^>]*>)+@", '', $text);
  63 +
  64 + do
  65 + {
  66 + $old = $text;
  67 +
  68 + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text);
  69 +
  70 + $text = preg_replace('@\ \ @',' ', $text);
  71 + $text = preg_replace("@\n\n@","\n", $text);
  72 + }
  73 + while ($old != $text);
  74 +
  75 + return $text;
  76 + }
  77 +
  78 + public function extractTextContent()
  79 + {
  80 + if (false === parent::extractTextContent())
58 { 81 {
59 - return _kt('Cannot locate jodconverter'); 82 + return false;
60 } 83 }
61 84
62 - if (false === $this->javaPath) 85 + if ($this->targetExtension != 'html')
63 { 86 {
64 - return _kt('Cannot locate java'); 87 + return true;
65 } 88 }
  89 + $content = file_get_contents($this->targetfile);
  90 + return file_put_contents($this->targetfile, $this->filter($content));
66 91
  92 + }
67 93
68 94
69 - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 );  
70 - if (false === $connection) 95 + public function diagnose()
  96 + {
  97 + if (false === $this->python)
71 { 98 {
72 - return _kt('Cannot connect to openoffice host'); 99 + return _kt('Cannot locate python');
73 } 100 }
74 - fclose($connection);  
75 101
  102 + if (false === $this->documentConverter)
  103 + {
  104 + return _kt('Cannot locate DocumentConverter.py');
  105 + }
76 106
77 - return null; 107 + return SearchHelper::checkOpenOfficeAvailablity();
78 } 108 }
79 } 109 }
80 110
search2/indexing/extractors/PDFExtractor.inc.php
@@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor @@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor
4 { 4 {
5 public function __construct() 5 public function __construct()
6 { 6 {
7 - parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 {source} {target}'); 7 + parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 \'{source}\' \'{target}\'');
8 } 8 }
9 9
10 public function getSupportedMimeTypes() 10 public function getSupportedMimeTypes()
search2/indexing/extractors/MailMimeExtractor.inc.php renamed to search2/indexing/extractors/RTFExtractor.inc.php
1 <?php 1 <?php
2 2
3 -class MailMimeExtractor extends TextExtractor 3 +require_once('OOTextExtractor.inc.php');
  4 +
  5 +class RTFExtractor extends OOTextExtractor
4 { 6 {
5 public function getDisplayName() 7 public function getDisplayName()
6 { 8 {
7 - return _kt('Mail Mime Extractor'); 9 + return _kt('RTF Extractor');
8 } 10 }
9 11
10 public function getSupportedMimeTypes() 12 public function getSupportedMimeTypes()
11 { 13 {
12 - return array('text/msg'); 14 + return array(
  15 + 'text/rtf'
  16 + );
13 } 17 }
14 -  
15 } 18 }
16 19
  20 +
17 ?> 21 ?>
18 \ No newline at end of file 22 \ No newline at end of file
search2/indexing/extractors/XMLExtractor.inc.php
@@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor @@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor
9 9
10 public function getSupportedMimeTypes() 10 public function getSupportedMimeTypes()
11 { 11 {
12 - return array('text/xml','application/xml','text/html'); 12 + return array('text/xml','application/xml','text/html','text/enriched');
13 } 13 }
14 14
15 protected function filter($text) 15 protected function filter($text)
search2/indexing/indexerCore.inc.php
@@ -330,7 +330,11 @@ abstract class Indexer @@ -330,7 +330,11 @@ abstract class Indexer
330 public function clearExtractors() 330 public function clearExtractors()
331 { 331 {
332 global $default; 332 global $default;
333 - $sql = "update mime_types set extractor=null"; 333 +
  334 + $sql = "update mime_types set extractor_id=null";
  335 + DBUtil::runQuery($sql);
  336 +
  337 + $sql = "delete from mime_extractors";
334 DBUtil::runQuery($sql); 338 DBUtil::runQuery($sql);
335 339
336 $default->log->debug('clearExtractors'); 340 $default->log->debug('clearExtractors');
@@ -616,6 +620,43 @@ abstract class Indexer @@ -616,6 +620,43 @@ abstract class Indexer
616 } 620 }
617 621
618 /** 622 /**
  623 + * This does the initial mime type association between mime types and text extractors
  624 + *
  625 + */
  626 + public function checkForRegisteredTypes()
  627 + {
  628 + global $default;
  629 +
  630 + // we are only doing this once!
  631 + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
  632 + if ($initRegistered)
  633 + {
  634 + return;
  635 + }
  636 + $default->log->info('checkForRegisteredTypes: start');
  637 +
  638 + $this->registerTypes(true);
  639 +
  640 +
  641 + $disable = array(
  642 + OS_WINDOWS=>array('PSExtractor'),
  643 + OS_UNIX => array()
  644 +
  645 + );
  646 +
  647 + foreach($disable[OS_WINDOWS] as $extractor)
  648 + {
  649 + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'";
  650 + DBUtil::runQuery($sql);
  651 + $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
  652 + }
  653 +
  654 + $default->log->info('checkForRegisteredTypes: done');
  655 + KTUtil::setSystemSetting('mimeTypesRegistered', true);
  656 + }
  657 +
  658 +
  659 + /**
619 * The main function that may be called repeatedly to index documents. 660 * The main function that may be called repeatedly to index documents.
620 * 661 *
621 * @param int $max Default 20 662 * @param int $max Default 20
@@ -624,6 +665,8 @@ abstract class Indexer @@ -624,6 +665,8 @@ abstract class Indexer
624 { 665 {
625 global $default; 666 global $default;
626 667
  668 + $this->checkForRegisteredTypes();
  669 +
627 $default->log->info('indexDocuments: start'); 670 $default->log->info('indexDocuments: start');
628 if (!$this->doesDiagnosticsPass()) 671 if (!$this->doesDiagnosticsPass())
629 { 672 {
@@ -798,6 +841,9 @@ abstract class Indexer @@ -798,6 +841,9 @@ abstract class Indexer
798 841
799 if ($extractor->extractTextContent()) 842 if ($extractor->extractTextContent())
800 { 843 {
  844 + // the extractor may need to create another target file
  845 + $targetFile = $extractor->getTargetFile();
  846 +
801 $extractor->setExtractionStatus(true); 847 $extractor->setExtractionStatus(true);
802 $this->executeHook($extractor, 'pre_index'); 848 $this->executeHook($extractor, 'pre_index');
803 $this->executeHook($extractor, 'pre_index', $mimeType); 849 $this->executeHook($extractor, 'pre_index', $mimeType);