Commit 81865212adce4d390cca58695d3874c703c1a509
1 parent
319ec2a8
Merged in from DEV trunk...
KTS-673 "The search algorithm needs some work" Updated. Integrated with DocumentConverter.py Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-2547 "Test RTF Indexer" Implemented. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-673 "The search algorithm needs some work" Updated. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-673 "The search algorithm needs some work" Updated. MailMime is not tested... will implement again when have more time. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-673 "The search algorithm needs some work" Updated. Sometimes the extractor may change the target filename. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-1753 " Implement Disk Usage Plugin" Updated. Licensing. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
KTS-673 "The search algorithm needs some work" Updated. Search requires php, python and java. Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie
git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/STABLE/trunk@7477 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
13 changed files
with
269 additions
and
84 deletions
config/config.ini
| @@ -242,7 +242,10 @@ pdftotext = pdftotext | @@ -242,7 +242,10 @@ pdftotext = pdftotext | ||
| 242 | catppt = catppt | 242 | catppt = catppt |
| 243 | pstotext = pstotext | 243 | pstotext = pstotext |
| 244 | catdoc = catdoc | 244 | catdoc = catdoc |
| 245 | -antiword = antiword.exe | 245 | +antiword = antiword |
| 246 | +python = python | ||
| 247 | +java = java | ||
| 248 | +php = php | ||
| 246 | 249 | ||
| 247 | [search] | 250 | [search] |
| 248 | ; The number of results per page | 251 | ; The number of results per page |
plugins/housekeeper/DiskUsageDashlet.inc.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | /** | 3 | /** |
| 4 | + * $Id | ||
| 4 | * | 5 | * |
| 5 | - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com | 6 | + * The contents of this file are subject to the KnowledgeTree Public |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | ||
| 8 | + * compliance with the License. You may obtain a copy of the License at | ||
| 9 | + * http://www.knowledgetree.com/KPL | ||
| 6 | * | 10 | * |
| 7 | - * This program is free software; you can redistribute it and/or modify | ||
| 8 | - * it under the terms of the GNU General Public License as published by | ||
| 9 | - * the Free Software Foundation; using version 2 of the License. | 11 | + * Software distributed under the License is distributed on an "AS IS" |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | ||
| 13 | + * See the License for the specific language governing rights and | ||
| 14 | + * limitations under the License. | ||
| 10 | * | 15 | * |
| 11 | - * This program is distributed in the hope that it will be useful, | ||
| 12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | - * GNU General Public License for more details. | 16 | + * All copies of the Covered Code must include on each user interface screen: |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | ||
| 18 | + * (ii) the KnowledgeTree copyright notice | ||
| 19 | + * in the same form as they appear in the distribution. See the License for | ||
| 20 | + * requirements. | ||
| 15 | * | 21 | * |
| 16 | - * You should have received a copy of the GNU General Public License | ||
| 17 | - * along with this program; if not, write to the Free Software | ||
| 18 | - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 22 | + * The Original Code is: KnowledgeTree Open Source |
| 19 | * | 23 | * |
| 20 | - * ------------------------------------------------------------------------- | ||
| 21 | - * | ||
| 22 | - * You can contact the copyright owner regarding licensing via the contact | ||
| 23 | - * details that can be found on the KnowledgeTree web site: | ||
| 24 | - * | ||
| 25 | - * http://www.knowledgetree.com/ | 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | ||
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | ||
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | ||
| 28 | + * All Rights Reserved. | ||
| 29 | + * Contributor( s): ______________________________________ | ||
| 26 | */ | 30 | */ |
| 27 | 31 | ||
| 28 | class DiskUsageDashlet extends KTBaseDashlet | 32 | class DiskUsageDashlet extends KTBaseDashlet |
| @@ -34,7 +38,7 @@ class DiskUsageDashlet extends KTBaseDashlet | @@ -34,7 +38,7 @@ class DiskUsageDashlet extends KTBaseDashlet | ||
| 34 | 38 | ||
| 35 | function DiskUsageDashlet() | 39 | function DiskUsageDashlet() |
| 36 | { | 40 | { |
| 37 | - $this->sTitle = _kt('Disk Usage'); | 41 | + $this->sTitle = _kt('Storage Utilization'); |
| 38 | $this->sClass = "ktInfo"; | 42 | $this->sClass = "ktInfo"; |
| 39 | } | 43 | } |
| 40 | 44 |
plugins/housekeeper/FolderUsageDashlet.inc.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | /** | 3 | /** |
| 4 | + * $Id | ||
| 4 | * | 5 | * |
| 5 | - * Copyright (c) 2007 Jam Warehouse http://www.jamwarehouse.com | 6 | + * The contents of this file are subject to the KnowledgeTree Public |
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | ||
| 8 | + * compliance with the License. You may obtain a copy of the License at | ||
| 9 | + * http://www.knowledgetree.com/KPL | ||
| 6 | * | 10 | * |
| 7 | - * This program is free software; you can redistribute it and/or modify | ||
| 8 | - * it under the terms of the GNU General Public License as published by | ||
| 9 | - * the Free Software Foundation; using version 2 of the License. | 11 | + * Software distributed under the License is distributed on an "AS IS" |
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | ||
| 13 | + * See the License for the specific language governing rights and | ||
| 14 | + * limitations under the License. | ||
| 10 | * | 15 | * |
| 11 | - * This program is distributed in the hope that it will be useful, | ||
| 12 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | - * GNU General Public License for more details. | 16 | + * All copies of the Covered Code must include on each user interface screen: |
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | ||
| 18 | + * (ii) the KnowledgeTree copyright notice | ||
| 19 | + * in the same form as they appear in the distribution. See the License for | ||
| 20 | + * requirements. | ||
| 15 | * | 21 | * |
| 16 | - * You should have received a copy of the GNU General Public License | ||
| 17 | - * along with this program; if not, write to the Free Software | ||
| 18 | - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 22 | + * The Original Code is: KnowledgeTree Open Source |
| 19 | * | 23 | * |
| 20 | - * ------------------------------------------------------------------------- | ||
| 21 | - * | ||
| 22 | - * You can contact the copyright owner regarding licensing via the contact | ||
| 23 | - * details that can be found on the KnowledgeTree web site: | ||
| 24 | - * | ||
| 25 | - * http://www.knowledgetree.com/ | 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software |
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | ||
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | ||
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | ||
| 28 | + * All Rights Reserved. | ||
| 29 | + * Contributor( s): ______________________________________ | ||
| 26 | */ | 30 | */ |
| 27 | 31 | ||
| 28 | class FolderUsageDashlet extends KTBaseDashlet | 32 | class FolderUsageDashlet extends KTBaseDashlet |
| @@ -31,7 +35,7 @@ class FolderUsageDashlet extends KTBaseDashlet | @@ -31,7 +35,7 @@ class FolderUsageDashlet extends KTBaseDashlet | ||
| 31 | 35 | ||
| 32 | function FolderUsageDashlet() | 36 | function FolderUsageDashlet() |
| 33 | { | 37 | { |
| 34 | - $this->sTitle = _kt('System Folder Usage'); | 38 | + $this->sTitle = _kt('System Folder Utilization'); |
| 35 | $this->sClass = "ktInfo"; | 39 | $this->sClass = "ktInfo"; |
| 36 | } | 40 | } |
| 37 | 41 |
plugins/housekeeper/HouseKeeperDispatcher.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | +/** | ||
| 4 | + * $Id | ||
| 5 | + * | ||
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | ||
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | ||
| 8 | + * compliance with the License. You may obtain a copy of the License at | ||
| 9 | + * http://www.knowledgetree.com/KPL | ||
| 10 | + * | ||
| 11 | + * Software distributed under the License is distributed on an "AS IS" | ||
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | ||
| 13 | + * See the License for the specific language governing rights and | ||
| 14 | + * limitations under the License. | ||
| 15 | + * | ||
| 16 | + * All copies of the Covered Code must include on each user interface screen: | ||
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | ||
| 18 | + * (ii) the KnowledgeTree copyright notice | ||
| 19 | + * in the same form as they appear in the distribution. See the License for | ||
| 20 | + * requirements. | ||
| 21 | + * | ||
| 22 | + * The Original Code is: KnowledgeTree Open Source | ||
| 23 | + * | ||
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | ||
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | ||
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | ||
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | ||
| 28 | + * All Rights Reserved. | ||
| 29 | + * Contributor( s): ______________________________________ | ||
| 30 | + */ | ||
| 31 | + | ||
| 3 | session_start(); | 32 | session_start(); |
| 4 | 33 | ||
| 5 | require_once("../../config/dmsDefaults.php"); | 34 | require_once("../../config/dmsDefaults.php"); |
| @@ -88,4 +117,4 @@ class HouseKeeperDispatcher extends KTStandardDispatcher | @@ -88,4 +117,4 @@ class HouseKeeperDispatcher extends KTStandardDispatcher | ||
| 88 | $oDispatcher = new HouseKeeperDispatcher(); | 117 | $oDispatcher = new HouseKeeperDispatcher(); |
| 89 | $oDispatcher->dispatch(); | 118 | $oDispatcher->dispatch(); |
| 90 | 119 | ||
| 91 | -?> | ||
| 92 | \ No newline at end of file | 120 | \ No newline at end of file |
| 121 | +?> |
plugins/housekeeper/HouseKeeperPlugin.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | +/** | ||
| 4 | + * $Id | ||
| 5 | + * | ||
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | ||
| 7 | + * License Version 1.1.2 ("License"); You may not use this file except in | ||
| 8 | + * compliance with the License. You may obtain a copy of the License at | ||
| 9 | + * http://www.knowledgetree.com/KPL | ||
| 10 | + * | ||
| 11 | + * Software distributed under the License is distributed on an "AS IS" | ||
| 12 | + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. | ||
| 13 | + * See the License for the specific language governing rights and | ||
| 14 | + * limitations under the License. | ||
| 15 | + * | ||
| 16 | + * All copies of the Covered Code must include on each user interface screen: | ||
| 17 | + * (i) the "Powered by KnowledgeTree" logo and | ||
| 18 | + * (ii) the KnowledgeTree copyright notice | ||
| 19 | + * in the same form as they appear in the distribution. See the License for | ||
| 20 | + * requirements. | ||
| 21 | + * | ||
| 22 | + * The Original Code is: KnowledgeTree Open Source | ||
| 23 | + * | ||
| 24 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | ||
| 25 | + * (Pty) Ltd, trading as KnowledgeTree. | ||
| 26 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | ||
| 27 | + * (C) 2007 The Jam Warehouse Software (Pty) Ltd; | ||
| 28 | + * All Rights Reserved. | ||
| 29 | + * Contributor( s): ______________________________________ | ||
| 30 | + */ | ||
| 31 | + | ||
| 3 | class HouseKeeperPlugin extends KTPlugin | 32 | class HouseKeeperPlugin extends KTPlugin |
| 4 | { | 33 | { |
| 5 | var $autoRegister = true; | 34 | var $autoRegister = true; |
| @@ -93,4 +122,4 @@ class HouseKeeperPlugin extends KTPlugin | @@ -93,4 +122,4 @@ class HouseKeeperPlugin extends KTPlugin | ||
| 93 | $oPluginRegistry =& KTPluginRegistry::getSingleton(); | 122 | $oPluginRegistry =& KTPluginRegistry::getSingleton(); |
| 94 | $oPluginRegistry->registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__); | 123 | $oPluginRegistry->registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__); |
| 95 | 124 | ||
| 96 | -?> | ||
| 97 | \ No newline at end of file | 125 | \ No newline at end of file |
| 126 | +?> |
search2/indexing/extractorCore.inc.php
| @@ -124,10 +124,22 @@ abstract class DocumentExtractor | @@ -124,10 +124,22 @@ abstract class DocumentExtractor | ||
| 124 | } | 124 | } |
| 125 | $classname=get_class($this); | 125 | $classname=get_class($this); |
| 126 | 126 | ||
| 127 | + $sql = "select id as extractor_id from mime_extractors WHERE name='$classname'"; | ||
| 128 | + $rs = DBUtil::getResultArray($sql); | ||
| 129 | + if (count($rs) == 0) | ||
| 130 | + { | ||
| 131 | + $extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1)); | ||
| 132 | + } | ||
| 133 | + else | ||
| 134 | + { | ||
| 135 | + $extractor_id = $rs[0]['extractor_id']; | ||
| 136 | + } | ||
| 137 | + | ||
| 138 | + | ||
| 127 | foreach($types as $type) | 139 | foreach($types as $type) |
| 128 | { | 140 | { |
| 129 | - $sql = "update mime_types set extractor='$classname' where mimetypes='$type' and extractor is null"; | ||
| 130 | - DBUtil::runQuery($sql); | 141 | + $sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null"; |
| 142 | + $rs = DBUtil::runQuery($sql); | ||
| 131 | } | 143 | } |
| 132 | } | 144 | } |
| 133 | 145 | ||
| @@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor | @@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor | ||
| 510 | public function extractTextContent() | 522 | public function extractTextContent() |
| 511 | { | 523 | { |
| 512 | $intermediateFile = $this->targetfile . '.' . $this->targetExtension; | 524 | $intermediateFile = $this->targetfile . '.' . $this->targetExtension; |
| 525 | + touch($intermediateFile); | ||
| 513 | 526 | ||
| 514 | $this->sourceExtractor->setSourceFile($this->sourcefile); | 527 | $this->sourceExtractor->setSourceFile($this->sourcefile); |
| 515 | $this->sourceExtractor->setTargetFile($intermediateFile); | 528 | $this->sourceExtractor->setTargetFile($intermediateFile); |
| 516 | $this->sourceExtractor->setMimeType($this->mimetype); | 529 | $this->sourceExtractor->setMimeType($this->mimetype); |
| 517 | $this->sourceExtractor->setExtension($this->extension); | 530 | $this->sourceExtractor->setExtension($this->extension); |
| 518 | - if ($this->sourceExtractor->extractTextContent()) | 531 | + if (!$this->sourceExtractor->extractTextContent()) |
| 519 | { | 532 | { |
| 520 | return false; | 533 | return false; |
| 521 | } | 534 | } |
| 535 | + $intermediateFile = $this->sourceExtractor->getTargetFile(); | ||
| 522 | 536 | ||
| 523 | $this->targetExtractor->setSourceFile($intermediateFile); | 537 | $this->targetExtractor->setSourceFile($intermediateFile); |
| 524 | $this->targetExtractor->setTargetFile($this->targetfile); | 538 | $this->targetExtractor->setTargetFile($this->targetfile); |
search2/indexing/extractors/OOPresentationExtractor.inc.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | -require_once('OOPDFTextExtractor.inc.php'); | 3 | +require_once('OOTextExtractor.inc.php'); |
| 4 | +require_once('PDFExtractor.inc.php'); | ||
| 4 | 5 | ||
| 5 | -class OOPresentationExtractor extends OOPDFTextExtractor | 6 | +class OOPresentationExtractor extends CompositeExtractor |
| 6 | { | 7 | { |
| 8 | + public function __construct() | ||
| 9 | + { | ||
| 10 | + $sourceExtractor = new OOPresentationToPDF(); | ||
| 11 | + $targetExtractor = new PDFExtractor(); | ||
| 12 | + parent::__construct($sourceExtractor, 'pdf', 'application/pdf', $targetExtractor, true); | ||
| 13 | + } | ||
| 14 | + | ||
| 7 | public function getDisplayName() | 15 | public function getDisplayName() |
| 8 | { | 16 | { |
| 9 | return _kt('OpenOffice Presentation Extractor'); | 17 | return _kt('OpenOffice Presentation Extractor'); |
| @@ -18,4 +26,18 @@ class OOPresentationExtractor extends OOPDFTextExtractor | @@ -18,4 +26,18 @@ class OOPresentationExtractor extends OOPDFTextExtractor | ||
| 18 | } | 26 | } |
| 19 | } | 27 | } |
| 20 | 28 | ||
| 29 | +class OOPresentationToPDF extends OOTextExtractor | ||
| 30 | +{ | ||
| 31 | + public function __construct() | ||
| 32 | + { | ||
| 33 | + parent::__construct('pdf'); | ||
| 34 | + $this->documentConverter = KT_DIR . '/bin/openoffice/pdfgen.py'; | ||
| 35 | + if (!is_file($this->documentConverter)) | ||
| 36 | + { | ||
| 37 | + $this->documentConverter = false; | ||
| 38 | + } | ||
| 39 | + } | ||
| 40 | +} | ||
| 41 | + | ||
| 42 | + | ||
| 21 | ?> | 43 | ?> |
| 22 | \ No newline at end of file | 44 | \ No newline at end of file |
search2/indexing/extractors/OOSpreadsheetExtractor.inc.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | -require_once('OOPDFTextExtractor.inc.php'); | 3 | +require_once('OOTextExtractor.inc.php'); |
| 4 | 4 | ||
| 5 | -class OOSpreadsheetExtractor extends OOPDFTextExtractor | 5 | +class OOSpreadsheetExtractor extends OOTextExtractor |
| 6 | { | 6 | { |
| 7 | public function getDisplayName() | 7 | public function getDisplayName() |
| 8 | { | 8 | { |
search2/indexing/extractors/OOTextExtractor.inc.php
| @@ -2,22 +2,27 @@ | @@ -2,22 +2,27 @@ | ||
| 2 | 2 | ||
| 3 | class OOTextExtractor extends ExternalDocumentExtractor | 3 | class OOTextExtractor extends ExternalDocumentExtractor |
| 4 | { | 4 | { |
| 5 | - private $converter; | ||
| 6 | - private $javaPath; | ||
| 7 | - private $ooHost; | ||
| 8 | - private $ooPort; | ||
| 9 | - private $targetMimeType; | 5 | + protected $python; |
| 6 | + protected $documentConverter; | ||
| 7 | + protected $ooHost; | ||
| 8 | + protected $ooPort; | ||
| 9 | + protected $targetExtension; | ||
| 10 | 10 | ||
| 11 | - public function __construct($targetMimeType='plain/text') | 11 | + public function __construct($targetExtension='html') |
| 12 | { | 12 | { |
| 13 | parent::__construct(); | 13 | parent::__construct(); |
| 14 | + $this->targetExtension = $targetExtension; | ||
| 14 | $config =& KTConfig::getSingleton(); | 15 | $config =& KTConfig::getSingleton(); |
| 15 | 16 | ||
| 16 | - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter'); | ||
| 17 | - $this->javaPath = KTUtil::findCommand('extractors/java', 'java'); | ||
| 18 | - $this->ooHost = $config->get('openoffice/host', 'localhost'); | ||
| 19 | - $this->ooPort = $config->get('openoffice/port', 8100); | ||
| 20 | - $this->targetMimeType = $targetMimeType; | 17 | + $this->python = KTUtil::findCommand('externalBinary/python'); |
| 18 | + $this->ooHost = $config->get('openoffice/host'); | ||
| 19 | + $this->ooPort = $config->get('openoffice/port'); | ||
| 20 | + | ||
| 21 | + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py'; | ||
| 22 | + if (!is_file($this->documentConverter)) | ||
| 23 | + { | ||
| 24 | + $this->documentConverter = false; | ||
| 25 | + } | ||
| 21 | } | 26 | } |
| 22 | 27 | ||
| 23 | public function getDisplayName() | 28 | public function getDisplayName() |
| @@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor | @@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor | ||
| 28 | public function getSupportedMimeTypes() | 33 | public function getSupportedMimeTypes() |
| 29 | { | 34 | { |
| 30 | return array( | 35 | return array( |
| 31 | - 'text/rtf', | ||
| 32 | - 'application/vnd.oasis.opendocument.text', | ||
| 33 | - 'application/vnd.oasis.opendocument.text-template', | ||
| 34 | - 'application/vnd.oasis.opendocument.text-web', | ||
| 35 | - 'application/vnd.oasis.opendocument.text-master', | ||
| 36 | - 'application/vnd.sun.xml.writer', | ||
| 37 | - 'application/vnd.sun.xml.writer.template', | ||
| 38 | - 'application/vnd.sun.xml.writer.global', | 36 | + |
| 39 | ); | 37 | ); |
| 40 | } | 38 | } |
| 41 | 39 | ||
| 42 | public function needsIntermediateSourceFile() | 40 | public function needsIntermediateSourceFile() |
| 43 | { | 41 | { |
| 44 | // we need the intermediate file because it | 42 | // we need the intermediate file because it |
| 45 | - // has the correct extension. jodconverter uses the extension to determine mimetype | 43 | + // has the correct extension. documentConverter uses the extension to determine mimetype |
| 46 | return true; | 44 | return true; |
| 47 | } | 45 | } |
| 48 | 46 | ||
| 49 | protected function getCommandLine() | 47 | protected function getCommandLine() |
| 50 | { | 48 | { |
| 51 | - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; | 49 | + $sourcefile = escapeshellcmd($this->sourcefile); |
| 50 | + unlink($this->targetfile); | ||
| 51 | + $this->targetfile .= '.' . $this->targetExtension; | ||
| 52 | + $targetfile = escapeshellcmd($this->targetfile); | ||
| 53 | + | ||
| 54 | + $escape = OS_WINDOWS?'"':'\''; | ||
| 55 | + | ||
| 56 | + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$sourcefile}{$escape} {$escape}{$targetfile}{$escape} {$this->ooHost} {$this->ooPort}"; | ||
| 52 | return $cmdline; | 57 | return $cmdline; |
| 53 | } | 58 | } |
| 54 | 59 | ||
| 55 | - public function diagnose() | 60 | + protected function filter($text) |
| 56 | { | 61 | { |
| 57 | - if (false === $this->converter) | 62 | + $text = preg_replace ("@(</?[^>]*>)+@", '', $text); |
| 63 | + | ||
| 64 | + do | ||
| 65 | + { | ||
| 66 | + $old = $text; | ||
| 67 | + | ||
| 68 | + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text); | ||
| 69 | + | ||
| 70 | + $text = preg_replace('@\ \ @',' ', $text); | ||
| 71 | + $text = preg_replace("@\n\n@","\n", $text); | ||
| 72 | + } | ||
| 73 | + while ($old != $text); | ||
| 74 | + | ||
| 75 | + return $text; | ||
| 76 | + } | ||
| 77 | + | ||
| 78 | + public function extractTextContent() | ||
| 79 | + { | ||
| 80 | + if (false === parent::extractTextContent()) | ||
| 58 | { | 81 | { |
| 59 | - return _kt('Cannot locate jodconverter'); | 82 | + return false; |
| 60 | } | 83 | } |
| 61 | 84 | ||
| 62 | - if (false === $this->javaPath) | 85 | + if ($this->targetExtension != 'html') |
| 63 | { | 86 | { |
| 64 | - return _kt('Cannot locate java'); | 87 | + return true; |
| 65 | } | 88 | } |
| 89 | + $content = file_get_contents($this->targetfile); | ||
| 90 | + return file_put_contents($this->targetfile, $this->filter($content)); | ||
| 66 | 91 | ||
| 92 | + } | ||
| 67 | 93 | ||
| 68 | 94 | ||
| 69 | - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 ); | ||
| 70 | - if (false === $connection) | 95 | + public function diagnose() |
| 96 | + { | ||
| 97 | + if (false === $this->python) | ||
| 71 | { | 98 | { |
| 72 | - return _kt('Cannot connect to openoffice host'); | 99 | + return _kt('Cannot locate python'); |
| 73 | } | 100 | } |
| 74 | - fclose($connection); | ||
| 75 | 101 | ||
| 102 | + if (false === $this->documentConverter) | ||
| 103 | + { | ||
| 104 | + return _kt('Cannot locate DocumentConverter.py'); | ||
| 105 | + } | ||
| 76 | 106 | ||
| 77 | - return null; | 107 | + return SearchHelper::checkOpenOfficeAvailablity(); |
| 78 | } | 108 | } |
| 79 | } | 109 | } |
| 80 | 110 |
search2/indexing/extractors/PDFExtractor.inc.php
| @@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor | @@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor | ||
| 4 | { | 4 | { |
| 5 | public function __construct() | 5 | public function __construct() |
| 6 | { | 6 | { |
| 7 | - parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 {source} {target}'); | 7 | + parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 \'{source}\' \'{target}\''); |
| 8 | } | 8 | } |
| 9 | 9 | ||
| 10 | public function getSupportedMimeTypes() | 10 | public function getSupportedMimeTypes() |
search2/indexing/extractors/MailMimeExtractor.inc.php renamed to search2/indexing/extractors/RTFExtractor.inc.php
| 1 | <?php | 1 | <?php |
| 2 | 2 | ||
| 3 | -class MailMimeExtractor extends TextExtractor | 3 | +require_once('OOTextExtractor.inc.php'); |
| 4 | + | ||
| 5 | +class RTFExtractor extends OOTextExtractor | ||
| 4 | { | 6 | { |
| 5 | public function getDisplayName() | 7 | public function getDisplayName() |
| 6 | { | 8 | { |
| 7 | - return _kt('Mail Mime Extractor'); | 9 | + return _kt('RTF Extractor'); |
| 8 | } | 10 | } |
| 9 | 11 | ||
| 10 | public function getSupportedMimeTypes() | 12 | public function getSupportedMimeTypes() |
| 11 | { | 13 | { |
| 12 | - return array('text/msg'); | 14 | + return array( |
| 15 | + 'text/rtf' | ||
| 16 | + ); | ||
| 13 | } | 17 | } |
| 14 | - | ||
| 15 | } | 18 | } |
| 16 | 19 | ||
| 20 | + | ||
| 17 | ?> | 21 | ?> |
| 18 | \ No newline at end of file | 22 | \ No newline at end of file |
search2/indexing/extractors/XMLExtractor.inc.php
| @@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor | @@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor | ||
| 9 | 9 | ||
| 10 | public function getSupportedMimeTypes() | 10 | public function getSupportedMimeTypes() |
| 11 | { | 11 | { |
| 12 | - return array('text/xml','application/xml','text/html'); | 12 | + return array('text/xml','application/xml','text/html','text/enriched'); |
| 13 | } | 13 | } |
| 14 | 14 | ||
| 15 | protected function filter($text) | 15 | protected function filter($text) |
search2/indexing/indexerCore.inc.php
| @@ -330,7 +330,11 @@ abstract class Indexer | @@ -330,7 +330,11 @@ abstract class Indexer | ||
| 330 | public function clearExtractors() | 330 | public function clearExtractors() |
| 331 | { | 331 | { |
| 332 | global $default; | 332 | global $default; |
| 333 | - $sql = "update mime_types set extractor=null"; | 333 | + |
| 334 | + $sql = "update mime_types set extractor_id=null"; | ||
| 335 | + DBUtil::runQuery($sql); | ||
| 336 | + | ||
| 337 | + $sql = "delete from mime_extractors"; | ||
| 334 | DBUtil::runQuery($sql); | 338 | DBUtil::runQuery($sql); |
| 335 | 339 | ||
| 336 | $default->log->debug('clearExtractors'); | 340 | $default->log->debug('clearExtractors'); |
| @@ -616,6 +620,43 @@ abstract class Indexer | @@ -616,6 +620,43 @@ abstract class Indexer | ||
| 616 | } | 620 | } |
| 617 | 621 | ||
| 618 | /** | 622 | /** |
| 623 | + * This does the initial mime type association between mime types and text extractors | ||
| 624 | + * | ||
| 625 | + */ | ||
| 626 | + public function checkForRegisteredTypes() | ||
| 627 | + { | ||
| 628 | + global $default; | ||
| 629 | + | ||
| 630 | + // we are only doing this once! | ||
| 631 | + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false); | ||
| 632 | + if ($initRegistered) | ||
| 633 | + { | ||
| 634 | + return; | ||
| 635 | + } | ||
| 636 | + $default->log->info('checkForRegisteredTypes: start'); | ||
| 637 | + | ||
| 638 | + $this->registerTypes(true); | ||
| 639 | + | ||
| 640 | + | ||
| 641 | + $disable = array( | ||
| 642 | + OS_WINDOWS=>array('PSExtractor'), | ||
| 643 | + OS_UNIX => array() | ||
| 644 | + | ||
| 645 | + ); | ||
| 646 | + | ||
| 647 | + foreach($disable[OS_WINDOWS] as $extractor) | ||
| 648 | + { | ||
| 649 | + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'"; | ||
| 650 | + DBUtil::runQuery($sql); | ||
| 651 | + $default->log->info("checkForRegisteredTypes: disabled '$extractor'"); | ||
| 652 | + } | ||
| 653 | + | ||
| 654 | + $default->log->info('checkForRegisteredTypes: done'); | ||
| 655 | + KTUtil::setSystemSetting('mimeTypesRegistered', true); | ||
| 656 | + } | ||
| 657 | + | ||
| 658 | + | ||
| 659 | + /** | ||
| 619 | * The main function that may be called repeatedly to index documents. | 660 | * The main function that may be called repeatedly to index documents. |
| 620 | * | 661 | * |
| 621 | * @param int $max Default 20 | 662 | * @param int $max Default 20 |
| @@ -624,6 +665,8 @@ abstract class Indexer | @@ -624,6 +665,8 @@ abstract class Indexer | ||
| 624 | { | 665 | { |
| 625 | global $default; | 666 | global $default; |
| 626 | 667 | ||
| 668 | + $this->checkForRegisteredTypes(); | ||
| 669 | + | ||
| 627 | $default->log->info('indexDocuments: start'); | 670 | $default->log->info('indexDocuments: start'); |
| 628 | if (!$this->doesDiagnosticsPass()) | 671 | if (!$this->doesDiagnosticsPass()) |
| 629 | { | 672 | { |
| @@ -798,6 +841,9 @@ abstract class Indexer | @@ -798,6 +841,9 @@ abstract class Indexer | ||
| 798 | 841 | ||
| 799 | if ($extractor->extractTextContent()) | 842 | if ($extractor->extractTextContent()) |
| 800 | { | 843 | { |
| 844 | + // the extractor may need to create another target file | ||
| 845 | + $targetFile = $extractor->getTargetFile(); | ||
| 846 | + | ||
| 801 | $extractor->setExtractionStatus(true); | 847 | $extractor->setExtractionStatus(true); |
| 802 | $this->executeHook($extractor, 'pre_index'); | 848 | $this->executeHook($extractor, 'pre_index'); |
| 803 | $this->executeHook($extractor, 'pre_index', $mimeType); | 849 | $this->executeHook($extractor, 'pre_index', $mimeType); |