diff --git a/config/config.ini b/config/config.ini index e071fd0..fb424ca 100644 --- a/config/config.ini +++ b/config/config.ini @@ -242,7 +242,10 @@ pdftotext = pdftotext catppt = catppt pstotext = pstotext catdoc = catdoc -antiword = antiword.exe +antiword = antiword +python = python +java = java +php = php [search] ; The number of results per page diff --git a/plugins/housekeeper/DiskUsageDashlet.inc.php b/plugins/housekeeper/DiskUsageDashlet.inc.php index 7f22ba7..30abcc7 100644 --- a/plugins/housekeeper/DiskUsageDashlet.inc.php +++ b/plugins/housekeeper/DiskUsageDashlet.inc.php @@ -1,28 +1,32 @@ sTitle = _kt('Disk Usage'); + $this->sTitle = _kt('Storage Utilization'); $this->sClass = "ktInfo"; } diff --git a/plugins/housekeeper/FolderUsageDashlet.inc.php b/plugins/housekeeper/FolderUsageDashlet.inc.php index 359cf67..e94154d 100644 --- a/plugins/housekeeper/FolderUsageDashlet.inc.php +++ b/plugins/housekeeper/FolderUsageDashlet.inc.php @@ -1,28 +1,32 @@ sTitle = _kt('System Folder Usage'); + $this->sTitle = _kt('System Folder Utilization'); $this->sClass = "ktInfo"; } diff --git a/plugins/housekeeper/HouseKeeperDispatcher.php b/plugins/housekeeper/HouseKeeperDispatcher.php index 5138544..458165e 100644 --- a/plugins/housekeeper/HouseKeeperDispatcher.php +++ b/plugins/housekeeper/HouseKeeperDispatcher.php @@ -1,5 +1,34 @@ dispatch(); -?> \ No newline at end of file +?> diff --git a/plugins/housekeeper/HouseKeeperPlugin.php b/plugins/housekeeper/HouseKeeperPlugin.php index 5391514..688337d 100644 --- a/plugins/housekeeper/HouseKeeperPlugin.php +++ b/plugins/housekeeper/HouseKeeperPlugin.php @@ -1,5 +1,34 @@ registerPlugin('HouseKeeperPlugin', 'ktcore.housekeeper.plugin', __FILE__); -?> \ No newline at end of file +?> diff --git a/search2/indexing/extractorCore.inc.php b/search2/indexing/extractorCore.inc.php index b75c9fc..ecf908a 100644 --- a/search2/indexing/extractorCore.inc.php +++ b/search2/indexing/extractorCore.inc.php @@ -124,10 +124,22 @@ abstract class DocumentExtractor } $classname=get_class($this); + $sql = "select id as extractor_id from mime_extractors WHERE name='$classname'"; + $rs = DBUtil::getResultArray($sql); + if (count($rs) == 0) + { + $extractor_id = DBUtil::autoInsert('mime_extractors', array('name'=>$classname, 'active'=>1)); + } + else + { + $extractor_id = $rs[0]['extractor_id']; + } + + foreach($types as $type) { - $sql = "update mime_types set extractor='$classname' where mimetypes='$type' and extractor is null"; - DBUtil::runQuery($sql); + $sql = "update mime_types set extractor_id=$extractor_id where mimetypes='$type' and extractor_id is null"; + $rs = DBUtil::runQuery($sql); } } @@ -510,15 +522,17 @@ abstract class CompositeExtractor extends DocumentExtractor public function extractTextContent() { $intermediateFile = $this->targetfile . '.' . $this->targetExtension; + touch($intermediateFile); $this->sourceExtractor->setSourceFile($this->sourcefile); $this->sourceExtractor->setTargetFile($intermediateFile); $this->sourceExtractor->setMimeType($this->mimetype); $this->sourceExtractor->setExtension($this->extension); - if ($this->sourceExtractor->extractTextContent()) + if (!$this->sourceExtractor->extractTextContent()) { return false; } + $intermediateFile = $this->sourceExtractor->getTargetFile(); $this->targetExtractor->setSourceFile($intermediateFile); $this->targetExtractor->setTargetFile($this->targetfile); diff --git a/search2/indexing/extractors/OOPresentationExtractor.inc.php b/search2/indexing/extractors/OOPresentationExtractor.inc.php index e832cc9..c6c6c22 100644 --- a/search2/indexing/extractors/OOPresentationExtractor.inc.php +++ b/search2/indexing/extractors/OOPresentationExtractor.inc.php @@ -1,9 +1,17 @@ documentConverter = KT_DIR . '/bin/openoffice/pdfgen.py'; + if (!is_file($this->documentConverter)) + { + $this->documentConverter = false; + } + } +} + + ?> \ No newline at end of file diff --git a/search2/indexing/extractors/OOSpreadsheetExtractor.inc.php b/search2/indexing/extractors/OOSpreadsheetExtractor.inc.php index 67d6039..b05dbd6 100644 --- a/search2/indexing/extractors/OOSpreadsheetExtractor.inc.php +++ b/search2/indexing/extractors/OOSpreadsheetExtractor.inc.php @@ -1,8 +1,8 @@ targetExtension = $targetExtension; $config =& KTConfig::getSingleton(); - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter'); - $this->javaPath = KTUtil::findCommand('extractors/java', 'java'); - $this->ooHost = $config->get('openoffice/host', 'localhost'); - $this->ooPort = $config->get('openoffice/port', 8100); - $this->targetMimeType = $targetMimeType; + $this->python = KTUtil::findCommand('externalBinary/python'); + $this->ooHost = $config->get('openoffice/host'); + $this->ooPort = $config->get('openoffice/port'); + + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py'; + if (!is_file($this->documentConverter)) + { + $this->documentConverter = false; + } } public function getDisplayName() @@ -28,53 +33,78 @@ class OOTextExtractor extends ExternalDocumentExtractor public function getSupportedMimeTypes() { return array( - 'text/rtf', - 'application/vnd.oasis.opendocument.text', - 'application/vnd.oasis.opendocument.text-template', - 'application/vnd.oasis.opendocument.text-web', - 'application/vnd.oasis.opendocument.text-master', - 'application/vnd.sun.xml.writer', - 'application/vnd.sun.xml.writer.template', - 'application/vnd.sun.xml.writer.global', + ); } public function needsIntermediateSourceFile() { // we need the intermediate file because it - // has the correct extension. jodconverter uses the extension to determine mimetype + // has the correct extension. documentConverter uses the extension to determine mimetype return true; } protected function getCommandLine() { - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; + $sourcefile = escapeshellcmd($this->sourcefile); + unlink($this->targetfile); + $this->targetfile .= '.' . $this->targetExtension; + $targetfile = escapeshellcmd($this->targetfile); + + $escape = OS_WINDOWS?'"':'\''; + + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$sourcefile}{$escape} {$escape}{$targetfile}{$escape} {$this->ooHost} {$this->ooPort}"; return $cmdline; } - public function diagnose() + protected function filter($text) { - if (false === $this->converter) + $text = preg_replace ("@(]*>)+@", '', $text); + + do + { + $old = $text; + + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text); + + $text = preg_replace('@\ \ @',' ', $text); + $text = preg_replace("@\n\n@","\n", $text); + } + while ($old != $text); + + return $text; + } + + public function extractTextContent() + { + if (false === parent::extractTextContent()) { - return _kt('Cannot locate jodconverter'); + return false; } - if (false === $this->javaPath) + if ($this->targetExtension != 'html') { - return _kt('Cannot locate java'); + return true; } + $content = file_get_contents($this->targetfile); + return file_put_contents($this->targetfile, $this->filter($content)); + } - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 ); - if (false === $connection) + public function diagnose() + { + if (false === $this->python) { - return _kt('Cannot connect to openoffice host'); + return _kt('Cannot locate python'); } - fclose($connection); + if (false === $this->documentConverter) + { + return _kt('Cannot locate DocumentConverter.py'); + } - return null; + return SearchHelper::checkOpenOfficeAvailablity(); } } diff --git a/search2/indexing/extractors/PDFExtractor.inc.php b/search2/indexing/extractors/PDFExtractor.inc.php index b05e9e7..e2510f9 100644 --- a/search2/indexing/extractors/PDFExtractor.inc.php +++ b/search2/indexing/extractors/PDFExtractor.inc.php @@ -4,7 +4,7 @@ class PDFExtractor extends ApplicationExtractor { public function __construct() { - parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 {source} {target}'); + parent::__construct('externalBinary','pdftotext','pdftotext',_kt('PDF Text Extractor'),'-nopgbrk -enc UTF-8 \'{source}\' \'{target}\''); } public function getSupportedMimeTypes() diff --git a/search2/indexing/extractors/MailMimeExtractor.inc.php b/search2/indexing/extractors/RTFExtractor.inc.php index 9dee5df..9afc7ab 100644 --- a/search2/indexing/extractors/MailMimeExtractor.inc.php +++ b/search2/indexing/extractors/RTFExtractor.inc.php @@ -1,17 +1,21 @@ \ No newline at end of file diff --git a/search2/indexing/extractors/XMLExtractor.inc.php b/search2/indexing/extractors/XMLExtractor.inc.php index 2d7a2fd..573f585 100644 --- a/search2/indexing/extractors/XMLExtractor.inc.php +++ b/search2/indexing/extractors/XMLExtractor.inc.php @@ -9,7 +9,7 @@ class XMLExtractor extends TextExtractor public function getSupportedMimeTypes() { - return array('text/xml','application/xml','text/html'); + return array('text/xml','application/xml','text/html','text/enriched'); } protected function filter($text) diff --git a/search2/indexing/indexerCore.inc.php b/search2/indexing/indexerCore.inc.php index 6d88b4a..02cffa9 100644 --- a/search2/indexing/indexerCore.inc.php +++ b/search2/indexing/indexerCore.inc.php @@ -330,7 +330,11 @@ abstract class Indexer public function clearExtractors() { global $default; - $sql = "update mime_types set extractor=null"; + + $sql = "update mime_types set extractor_id=null"; + DBUtil::runQuery($sql); + + $sql = "delete from mime_extractors"; DBUtil::runQuery($sql); $default->log->debug('clearExtractors'); @@ -616,6 +620,43 @@ abstract class Indexer } /** + * This does the initial mime type association between mime types and text extractors + * + */ + public function checkForRegisteredTypes() + { + global $default; + + // we are only doing this once! + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false); + if ($initRegistered) + { + return; + } + $default->log->info('checkForRegisteredTypes: start'); + + $this->registerTypes(true); + + + $disable = array( + OS_WINDOWS=>array('PSExtractor'), + OS_UNIX => array() + + ); + + foreach($disable[OS_WINDOWS] as $extractor) + { + $sql = "UPDATE mime_extractors SET active=0 WHERE name='$extractor'"; + DBUtil::runQuery($sql); + $default->log->info("checkForRegisteredTypes: disabled '$extractor'"); + } + + $default->log->info('checkForRegisteredTypes: done'); + KTUtil::setSystemSetting('mimeTypesRegistered', true); + } + + + /** * The main function that may be called repeatedly to index documents. * * @param int $max Default 20 @@ -624,6 +665,8 @@ abstract class Indexer { global $default; + $this->checkForRegisteredTypes(); + $default->log->info('indexDocuments: start'); if (!$this->doesDiagnosticsPass()) { @@ -798,6 +841,9 @@ abstract class Indexer if ($extractor->extractTextContent()) { + // the extractor may need to create another target file + $targetFile = $extractor->getTargetFile(); + $extractor->setExtractionStatus(true); $this->executeHook($extractor, 'pre_index'); $this->executeHook($extractor, 'pre_index', $mimeType);