Commit 81865212adce4d390cca58695d3874c703c1a509
1 parent
319ec2a8
Merged in from DEV trunk...
KTS-673 "The search algorithm needs some work" Updated. Integrated with DocumentConverter.py. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-2547 "Test RTF Indexer" Implemented. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-673 "The search algorithm needs some work" Updated. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-673 "The search algorithm needs some work" Updated. MailMime is not tested... will implement again when there is more time. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-673 "The search algorithm needs some work" Updated. Sometimes the extractor may change the target filename. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-1753 "Implement Disk Usage Plugin" Updated. Licensing. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie KTS-673 "The search algorithm needs some work" Updated. Search requires php, python and java. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/STABLE/trunk@7477 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
13 changed files
with
269 additions
and
84 deletions
config/config.ini
| ... | ... | @@ -242,7 +242,10 @@ pdftotext = pdftotext |
| 242 | 242 | catppt = catppt |
| 243 | 243 | pstotext = pstotext |
| 244 | 244 | catdoc = catdoc |
| 245 | -antiword = antiword.exe | |
| 245 | +antiword = antiword | |
| 246 | +python = python | |
| 247 | +java = java | |
| 248 | +php = php | |
| 246 | 249 | |
| 247 | 250 | [search] |
| 248 | 251 | ; The number of results per page | ... | ... |
plugins/housekeeper/DiskUsageDashlet.inc.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | 3 | /** |
| 4 | + * $Id$ | |
| 4 | 5 | * |
| 5 | - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com | |
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | |
| 8 | + * compliance with the License. You may obtain a copy of the License at | |
| 9 | + * http://www.knowledgetree.com/KPL | |
| 6 | 10 | * |
| 7 | - * This program is free software; you can redistribute it and/or modify | |
| 8 | - * it under the terms of the GNU General Public License as published by | |
| 9 | - * the Free Software Foundation; using version 2 of the License. | |
| 11 | + * Software distributed under the License is distributed on an "AS IS" | |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | |
| 13 | + * See the License for the specific language governing rights and | |
| 14 | + * limitations under the License. | |
| 10 | 15 | * |
| 11 | - * This program is distributed in the hope that it will be useful, | |
| 12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - * GNU General Public License for more details. | |
| 16 | + * All copies of the Covered Code must include on each user interface screen: | |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | |
| 18 | + * (ii) the KnowledgeTree copyright notice | |
| 19 | + * in the same form as they appear in the distribution. See the License for | |
| 20 | + * requirements. | |
| 15 | 21 | * |
| 16 | - * You should have received a copy of the GNU General Public License | |
| 17 | - * along with this program; if not, write to the Free Software | |
| 18 | - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 22 | + * The Original Code is: KnowledgeTree Open Source | |
| 19 | 23 | * |
| 20 | - * ------------------------------------------------------------------------- | |
| 21 | - * | |
| 22 | - * You can contact the copyright owner regarding licensing via the contact | |
| 23 | - * details that can be found on the KnowledgeTree web site: | |
| 24 | - * | |
| 25 | - * http://www.knowledgetree.com/ | |
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | |
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | |
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | |
| 28 | + * All Rights Reserved. | |
| 29 | + * Contributor( s): ______________________________________ | |
| 26 | 30 | */ |
| 27 | 31 | |
| 28 | 32 | class DiskUsageDashlet extends KTBaseDashlet |
| ... | ... | @@ -34,7 +38,7 @@ class DiskUsageDashlet extends KTBaseDashlet |
| 34 | 38 | |
| 35 | 39 | function DiskUsageDashlet() |
| 36 | 40 | { |
| 37 | - $this->sTitle = _kt('Disk Usage'); | |
| 41 | + $this->sTitle = _kt('Storage Utilization'); | |
| 38 | 42 | $this->sClass = "ktInfo"; |
| 39 | 43 | } |
| 40 | 44 | ... | ... |
plugins/housekeeper/FolderUsageDashlet.inc.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | 3 | /** |
| 4 | + * $Id$ | |
| 4 | 5 | * |
| 5 | - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com | |
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | |
| 8 | + * compliance with the License. You may obtain a copy of the License at | |
| 9 | + * http://www.knowledgetree.com/KPL | |
| 6 | 10 | * |
| 7 | - * This program is free software; you can redistribute it and/or modify | |
| 8 | - * it under the terms of the GNU General Public License as published by | |
| 9 | - * the Free Software Foundation; using version 2 of the License. | |
| 11 | + * Software distributed under the License is distributed on an "AS IS" | |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | |
| 13 | + * See the License for the specific language governing rights and | |
| 14 | + * limitations under the License. | |
| 10 | 15 | * |
| 11 | - * This program is distributed in the hope that it will be useful, | |
| 12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - * GNU General Public License for more details. | |
| 16 | + * All copies of the Covered Code must include on each user interface screen: | |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | |
| 18 | + * (ii) the KnowledgeTree copyright notice | |
| 19 | + * in the same form as they appear in the distribution. See the License for | |
| 20 | + * requirements. | |
| 15 | 21 | * |
| 16 | - * You should have received a copy of the GNU General Public License | |
| 17 | - * along with this program; if not, write to the Free Software | |
| 18 | - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 22 | + * The Original Code is: KnowledgeTree Open Source | |
| 19 | 23 | * |
| 20 | - * ------------------------------------------------------------------------- | |
| 21 | - * | |
| 22 | - * You can contact the copyright owner regarding licensing via the contact | |
| 23 | - * details that can be found on the KnowledgeTree web site: | |
| 24 | - * | |
| 25 | - * http://www.knowledgetree.com/ | |
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | |
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | |
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | |
| 28 | + * All Rights Reserved. | |
| 29 | + * Contributor( s): ______________________________________ | |
| 26 | 30 | */ |
| 27 | 31 | |
| 28 | 32 | class FolderUsageDashlet extends KTBaseDashlet |
| ... | ... | @@ -31,7 +35,7 @@ class FolderUsageDashlet extends KTBaseDashlet |
| 31 | 35 | |
| 32 | 36 | function FolderUsageDashlet() |
| 33 | 37 | { |
| 34 | - $this->sTitle = _kt('System Folder Usage'); | |
| 38 | + $this->sTitle = _kt('System Folder Utilization'); | |
| 35 | 39 | $this->sClass = "ktInfo"; |
| 36 | 40 | } |
| 37 | 41 | ... | ... |
plugins/housekeeper/HouseKeeperDispatcher.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | +/** | |
| 4 | + * $Id$ | |
| 5 | + * | |
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | |
| 8 | + * compliance with the License. You may obtain a copy of the License at | |
| 9 | + * http://www.knowledgetree.com/KPL | |
| 10 | + * | |
| 11 | + * Software distributed under the License is distributed on an "AS IS" | |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | |
| 13 | + * See the License for the specific language governing rights and | |
| 14 | + * limitations under the License. | |
| 15 | + * | |
| 16 | + * All copies of the Covered Code must include on each user interface screen: | |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | |
| 18 | + * (ii) the KnowledgeTree copyright notice | |
| 19 | + * in the same form as they appear in the distribution. See the License for | |
| 20 | + * requirements. | |
| 21 | + * | |
| 22 | + * The Original Code is: KnowledgeTree Open Source | |
| 23 | + * | |
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | |
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | |
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | |
| 28 | + * All Rights Reserved. | |
| 29 | + * Contributor( s): ______________________________________ | |
| 30 | + */ | |
| 31 | + | |
| 3 | 32 | session_start(); |
| 4 | 33 | |
| 5 | 34 | require_once("../../config/dmsDefaults.php"); |
| ... | ... | @@ -88,4 +117,4 @@ class HouseKeeperDispatcher extends KTStandardDispatcher |
| 88 | 117 | $oDispatcher = new HouseKeeperDispatcher(); |
| 89 | 118 | $oDispatcher->dispatch(); |
| 90 | 119 | |
| 91 | -?> | |
| 92 | 120 | \ No newline at end of file |
| 121 | +?> | ... | ... |
plugins/housekeeper/HouseKeeperPlugin.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | +/** | |
| 4 | + * $Id$ | |
| 5 | + * | |
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | |
| 8 | + * compliance with the License. You may obtain a copy of the License at | |
| 9 | + * http://www.knowledgetree.com/KPL | |
| 10 | + * | |
| 11 | + * Software distributed under the License is distributed on an "AS IS" | |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | |
| 13 | + * See the License for the specific language governing rights and | |
| 14 | + * limitations under the License. | |
| 15 | + * | |
| 16 | + * All copies of the Covered Code must include on each user interface screen: | |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | |
| 18 | + * (ii) the KnowledgeTree copyright notice | |
| 19 | + * in the same form as they appear in the distribution. See the License for | |
| 20 | + * requirements. | |
| 21 | + * | |
| 22 | + * The Original Code is: KnowledgeTree Open Source | |
| 23 | + * | |
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | |
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | |
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | |
| 28 | + * All Rights Reserved. | |
| 29 | + * Contributor( s): ______________________________________ | |
| 30 | + */ | |
| 31 | + | |
| 3 | 32 | class HouseKeeperPlugin extends KTPlugin |
| 4 | 33 | { |
| 5 | 34 | var $autoRegister = true; |
| ... | ... | @@ -93,4 +122,4 @@ class HouseKeeperPlugin extends KTPlugin |
| 93 | 122 | $oPluginRegistry =& KTPluginRegistry::getSingleton(); |
| 94 | 123 | $oPluginRegistry->registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__); |
| 95 | 124 | |
| 96 | -?> | |
| 97 | 125 | \ No newline at end of file |
| 126 | +?> | ... | ... |
search2/indexing/extractorCore.inc.php
| ... | ... | @@ -124,10 +124,22 @@ abstract class DocumentExtractor |
| 124 | 124 | } |
| 125 | 125 | $classname=get_class($this); |
| 126 | 126 | |
| 127 | + $sql = "select id as extractor_id from mime_extractors WHERE name='$classname'"; | |
| 128 | + $rs = DBUtil::getResultArray($sql); | |
| 129 | + if (count($rs) == 0) | |
| 130 | + { | |
| 131 | + $extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1)); | |
| 132 | + } | |
| 133 | + else | |
| 134 | + { | |
| 135 | + $extractor_id = $rs[0]['extractor_id']; | |
| 136 | + } | |
| 137 | + | |
| 138 | + | |
| 127 | 139 | foreach($types as $type) |
| 128 | 140 | { |
| 129 | - $sql = "update mime_types set extractor='$classname' where mimetypes='$type' and extractor is null"; | |
| 130 | - DBUtil::runQuery($sql); | |
| 141 | + $sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null"; | |
| 142 | + $rs = DBUtil::runQuery($sql); | |
| 131 | 143 | } |
| 132 | 144 | } |
| 133 | 145 | |
| ... | ... | @@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor |
| 510 | 522 | public function extractTextContent() |
| 511 | 523 | { |
| 512 | 524 | $intermediateFile = $this->targetfile . '.' . $this->targetExtension; |
| 525 | + touch($intermediateFile); | |
| 513 | 526 | |
| 514 | 527 | $this->sourceExtractor->setSourceFile($this->sourcefile); |
| 515 | 528 | $this->sourceExtractor->setTargetFile($intermediateFile); |
| 516 | 529 | $this->sourceExtractor->setMimeType($this->mimetype); |
| 517 | 530 | $this->sourceExtractor->setExtension($this->extension); |
| 518 | - if ($this->sourceExtractor->extractTextContent()) | |
| 531 | + if (!$this->sourceExtractor->extractTextContent()) | |
| 519 | 532 | { |
| 520 | 533 | return false; |
| 521 | 534 | } |
| 535 | + $intermediateFile = $this->sourceExtractor->getTargetFile(); | |
| 522 | 536 | |
| 523 | 537 | $this->targetExtractor->setSourceFile($intermediateFile); |
| 524 | 538 | $this->targetExtractor->setTargetFile($this->targetfile); | ... | ... |
search2/indexing/extractors/OOPresentationExtractor.inc.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | -require_once('OOPDFTextExtractor.inc.php'); | |
| 3 | +require_once('OOTextExtractor.inc.php'); | |
| 4 | +require_once('PDFExtractor.inc.php'); | |
| 4 | 5 | |
| 5 | -class OOPresentationExtractor extends OOPDFTextExtractor | |
| 6 | +class OOPresentationExtractor extends CompositeExtractor | |
| 6 | 7 | { |
| 8 | + public function __construct() | |
| 9 | + { | |
| 10 | + $sourceExtractor = new OOPresentationToPDF(); | |
| 11 | + $targetExtractor = new PDFExtractor(); | |
| 12 | + parent::__construct($sourceExtractor, 'pdf', 'application/pdf', $targetExtractor, true); | |
| 13 | + } | |
| 14 | + | |
| 7 | 15 | public function getDisplayName() |
| 8 | 16 | { |
| 9 | 17 | return _kt('OpenOffice Presentation Extractor'); |
| ... | ... | @@ -18,4 +26,18 @@ class OOPresentationExtractor extends OOPDFTextExtractor |
| 18 | 26 | } |
| 19 | 27 | } |
| 20 | 28 | |
| 29 | +class OOPresentationToPDF extends OOTextExtractor | |
| 30 | +{ | |
| 31 | + public function __construct() | |
| 32 | + { | |
| 33 | + parent::__construct('pdf'); | |
| 34 | + $this->documentConverter = KT_DIR . '/bin/openoffice/pdfgen.py'; | |
| 35 | + if (!is_file($this->documentConverter)) | |
| 36 | + { | |
| 37 | + $this->documentConverter = false; | |
| 38 | + } | |
| 39 | + } | |
| 40 | +} | |
| 41 | + | |
| 42 | + | |
| 21 | 43 | ?> |
| 22 | 44 | \ No newline at end of file | ... | ... |
search2/indexing/extractors/OOSpreadsheetExtractor.inc.php
search2/indexing/extractors/OOTextExtractor.inc.php
| ... | ... | @@ -2,22 +2,27 @@ |
| 2 | 2 | |
| 3 | 3 | class OOTextExtractor extends ExternalDocumentExtractor |
| 4 | 4 | { |
| 5 | - private $converter; | |
| 6 | - private $javaPath; | |
| 7 | - private $ooHost; | |
| 8 | - private $ooPort; | |
| 9 | - private $targetMimeType; | |
| 5 | + protected $python; | |
| 6 | + protected $documentConverter; | |
| 7 | + protected $ooHost; | |
| 8 | + protected $ooPort; | |
| 9 | + protected $targetExtension; | |
| 10 | 10 | |
| 11 | - public function __construct($targetMimeType='plain/text') | |
| 11 | + public function __construct($targetExtension='html') | |
| 12 | 12 | { |
| 13 | 13 | parent::__construct(); |
| 14 | + $this->targetExtension = $targetExtension; | |
| 14 | 15 | $config =& KTConfig::getSingleton(); |
| 15 | 16 | |
| 16 | - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter'); | |
| 17 | - $this->javaPath = KTUtil::findCommand('extractors/java', 'java'); | |
| 18 | - $this->ooHost = $config->get('openoffice/host', 'localhost'); | |
| 19 | - $this->ooPort = $config->get('openoffice/port', 8100); | |
| 20 | - $this->targetMimeType = $targetMimeType; | |
| 17 | + $this->python = KTUtil::findCommand('externalBinary/python'); | |
| 18 | + $this->ooHost = $config->get('openoffice/host'); | |
| 19 | + $this->ooPort = $config->get('openoffice/port'); | |
| 20 | + | |
| 21 | + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py'; | |
| 22 | + if (!is_file($this->documentConverter)) | |
| 23 | + { | |
| 24 | + $this->documentConverter = false; | |
| 25 | + } | |
| 21 | 26 | } |
| 22 | 27 | |
| 23 | 28 | public function getDisplayName() |
| ... | ... | @@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor |
| 28 | 33 | public function getSupportedMimeTypes() |
| 29 | 34 | { |
| 30 | 35 | return array( |
| 31 | - 'text/rtf', | |
| 32 | - 'application/vnd.oasis.opendocument.text', | |
| 33 | - 'application/vnd.oasis.opendocument.text-template', | |
| 34 | - 'application/vnd.oasis.opendocument.text-web', | |
| 35 | - 'application/vnd.oasis.opendocument.text-master', | |
| 36 | - 'application/vnd.sun.xml.writer', | |
| 37 | - 'application/vnd.sun.xml.writer.template', | |
| 38 | - 'application/vnd.sun.xml.writer.global', | |
| 36 | + | |
| 39 | 37 | ); |
| 40 | 38 | } |
| 41 | 39 | |
| 42 | 40 | public function needsIntermediateSourceFile() |
| 43 | 41 | { |
| 44 | 42 | // we need the intermediate file because it |
| 45 | - // has the correct extension. jodconverter uses the extension to determine mimetype | |
| 43 | + // has the correct extension. documentConverter uses the extension to determine mimetype | |
| 46 | 44 | return true; |
| 47 | 45 | } |
| 48 | 46 | |
| 49 | 47 | protected function getCommandLine() |
| 50 | 48 | { |
| 51 | - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; | |
| 49 | + $sourcefile = escapeshellcmd($this->sourcefile); | |
| 50 | + unlink($this->targetfile); | |
| 51 | + $this->targetfile .= '.' . $this->targetExtension; | |
| 52 | + $targetfile = escapeshellcmd($this->targetfile); | |
| 53 | + | |
| 54 | + $escape = OS_WINDOWS?'"':'\''; | |
| 55 | + | |
| 56 | + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$sourcefile}{$escape} {$escape}{$targetfile}{$escape} {$this->ooHost} {$this->ooPort}"; | |
| 52 | 57 | return $cmdline; |
| 53 | 58 | } |
| 54 | 59 | |
| 55 | - public function diagnose() | |
| 60 | + protected function filter($text) | |
| 56 | 61 | { |
| 57 | - if (false === $this->converter) | |
| 62 | + $text = preg_replace ("@(</?[^>]*>)+@", '', $text); | |
| 63 | + | |
| 64 | + do | |
| 65 | + { | |
| 66 | + $old = $text; | |
| 67 | + | |
| 68 | + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text); | |
| 69 | + | |
| 70 | + $text = preg_replace('@\ \ @',' ', $text); | |
| 71 | + $text = preg_replace("@\n\n@","\n", $text); | |
| 72 | + } | |
| 73 | + while ($old != $text); | |
| 74 | + | |
| 75 | + return $text; | |
| 76 | + } | |
| 77 | + | |
| 78 | + public function extractTextContent() | |
| 79 | + { | |
| 80 | + if (false === parent::extractTextContent()) | |
| 58 | 81 | { |
| 59 | - return _kt('Cannot locate jodconverter'); | |
| 82 | + return false; | |
| 60 | 83 | } |
| 61 | 84 | |
| 62 | - if (false === $this->javaPath) | |
| 85 | + if ($this->targetExtension != 'html') | |
| 63 | 86 | { |
| 64 | - return _kt('Cannot locate java'); | |
| 87 | + return true; | |
| 65 | 88 | } |
| 89 | + $content = file_get_contents($this->targetfile); | |
| 90 | + return file_put_contents($this->targetfile, $this->filter($content)); | |
| 66 | 91 | |
| 92 | + } | |
| 67 | 93 | |
| 68 | 94 | |
| 69 | - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 ); | |
| 70 | - if (false === $connection) | |
| 95 | + public function diagnose() | |
| 96 | + { | |
| 97 | + if (false === $this->python) | |
| 71 | 98 | { |
| 72 | - return _kt('Cannot connect to openoffice host'); | |
| 99 | + return _kt('Cannot locate python'); | |
| 73 | 100 | } |
| 74 | - fclose($connection); | |
| 75 | 101 | |
| 102 | + if (false === $this->documentConverter) | |
| 103 | + { | |
| 104 | + return _kt('Cannot locate DocumentConverter.py'); | |
| 105 | + } | |
| 76 | 106 | |
| 77 | - return null; | |
| 107 | + return SearchHelper::checkOpenOfficeAvailablity(); | |
| 78 | 108 | } |
| 79 | 109 | } |
| 80 | 110 | ... | ... |
search2/indexing/extractors/PDFExtractor.inc.php
| ... | ... | @@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor |
| 4 | 4 | { |
| 5 | 5 | public function __construct() |
| 6 | 6 | { |
| 7 | - parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 {source} {target}'); | |
| 7 | + parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 \'{source}\' \'{target}\''); | |
| 8 | 8 | } |
| 9 | 9 | |
| 10 | 10 | public function getSupportedMimeTypes() | ... | ... |
search2/indexing/extractors/MailMimeExtractor.inc.php renamed to search2/indexing/extractors/RTFExtractor.inc.php
| 1 | 1 | <?php |
| 2 | 2 | |
| 3 | -class MailMimeExtractor extends TextExtractor | |
| 3 | +require_once('OOTextExtractor.inc.php'); | |
| 4 | + | |
| 5 | +class RTFExtractor extends OOTextExtractor | |
| 4 | 6 | { |
| 5 | 7 | public function getDisplayName() |
| 6 | 8 | { |
| 7 | - return _kt('Mail Mime Extractor'); | |
| 9 | + return _kt('RTF Extractor'); | |
| 8 | 10 | } |
| 9 | 11 | |
| 10 | 12 | public function getSupportedMimeTypes() |
| 11 | 13 | { |
| 12 | - return array('text/msg'); | |
| 14 | + return array( | |
| 15 | + 'text/rtf' | |
| 16 | + ); | |
| 13 | 17 | } |
| 14 | - | |
| 15 | 18 | } |
| 16 | 19 | |
| 20 | + | |
| 17 | 21 | ?> |
| 18 | 22 | \ No newline at end of file | ... | ... |
search2/indexing/extractors/XMLExtractor.inc.php
| ... | ... | @@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor |
| 9 | 9 | |
| 10 | 10 | public function getSupportedMimeTypes() |
| 11 | 11 | { |
| 12 | - return array('text/xml','application/xml','text/html'); | |
| 12 | + return array('text/xml','application/xml','text/html','text/enriched'); | |
| 13 | 13 | } |
| 14 | 14 | |
| 15 | 15 | protected function filter($text) | ... | ... |
search2/indexing/indexerCore.inc.php
| ... | ... | @@ -330,7 +330,11 @@ abstract class Indexer |
| 330 | 330 | public function clearExtractors() |
| 331 | 331 | { |
| 332 | 332 | global $default; |
| 333 | - $sql = "update mime_types set extractor=null"; | |
| 333 | + | |
| 334 | + $sql = "update mime_types set extractor_id=null"; | |
| 335 | + DBUtil::runQuery($sql); | |
| 336 | + | |
| 337 | + $sql = "delete from mime_extractors"; | |
| 334 | 338 | DBUtil::runQuery($sql); |
| 335 | 339 | |
| 336 | 340 | $default->log->debug('clearExtractors'); |
| ... | ... | @@ -616,6 +620,43 @@ abstract class Indexer |
| 616 | 620 | } |
| 617 | 621 | |
| 618 | 622 | /** |
| 623 | + * This does the initial mime type association between mime types and text extractors | |
| 624 | + * | |
| 625 | + */ | |
| 626 | + public function checkForRegisteredTypes() | |
| 627 | + { | |
| 628 | + global $default; | |
| 629 | + | |
| 630 | + // we are only doing this once! | |
| 631 | + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false); | |
| 632 | + if ($initRegistered) | |
| 633 | + { | |
| 634 | + return; | |
| 635 | + } | |
| 636 | + $default->log->info('checkForRegisteredTypes: start'); | |
| 637 | + | |
| 638 | + $this->registerTypes(true); | |
| 639 | + | |
| 640 | + | |
| 641 | + $disable = array( | |
| 642 | + OS_WINDOWS=>array('PSExtractor'), | |
| 643 | + OS_UNIX => array() | |
| 644 | + | |
| 645 | + ); | |
| 646 | + | |
| 647 | + foreach($disable[OS_WINDOWS] as $extractor) | |
| 648 | + { | |
| 649 | + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'"; | |
| 650 | + DBUtil::runQuery($sql); | |
| 651 | + $default->log->info("checkForRegisteredTypes: disabled '$extractor'"); | |
| 652 | + } | |
| 653 | + | |
| 654 | + $default->log->info('checkForRegisteredTypes: done'); | |
| 655 | + KTUtil::setSystemSetting('mimeTypesRegistered', true); | |
| 656 | + } | |
| 657 | + | |
| 658 | + | |
| 659 | + /** | |
| 619 | 660 | * The main function that may be called repeatedly to index documents. |
| 620 | 661 | * |
| 621 | 662 | * @param int $max Default 20 |
| ... | ... | @@ -624,6 +665,8 @@ abstract class Indexer |
| 624 | 665 | { |
| 625 | 666 | global $default; |
| 626 | 667 | |
| 668 | + $this->checkForRegisteredTypes(); | |
| 669 | + | |
| 627 | 670 | $default->log->info('indexDocuments: start'); |
| 628 | 671 | if (!$this->doesDiagnosticsPass()) |
| 629 | 672 | { |
| ... | ... | @@ -798,6 +841,9 @@ abstract class Indexer |
| 798 | 841 | |
| 799 | 842 | if ($extractor->extractTextContent()) |
| 800 | 843 | { |
| 844 | + // the extractor may need to create another target file | |
| 845 | + $targetFile = $extractor->getTargetFile(); | |
| 846 | + | |
| 801 | 847 | $extractor->setExtractionStatus(true); |
| 802 | 848 | $this->executeHook($extractor, 'pre_index'); |
| 803 | 849 | $this->executeHook($extractor, 'pre_index', $mimeType); | ... | ... |