Commit f07a0f3ef2a254e7467398e8796b25d13e3b65ef
1 parent
6a77e202
KTS-673
"The search algorithm needs some work" Updated. Integrated with DocumentConverter.py Committed By: Conrad Vermeulen Reviewed By: Kevin Fourie git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7463 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 1 changed file with 54 additions and 30 deletions
search2/indexing/extractors/OOTextExtractor.inc.php
| ... | ... | @@ -2,22 +2,25 @@ |
| 2 | 2 | |
| 3 | 3 | class OOTextExtractor extends ExternalDocumentExtractor |
| 4 | 4 | { |
| 5 | - private $converter; | |
| 6 | - private $javaPath; | |
| 5 | + private $python; | |
| 6 | + private $documentConverter; | |
| 7 | 7 | private $ooHost; |
| 8 | 8 | private $ooPort; |
| 9 | - private $targetMimeType; | |
| 10 | 9 | |
| 11 | 10 | public function __construct($targetMimeType='plain/text') |
| 12 | 11 | { |
| 13 | 12 | parent::__construct(); |
| 14 | 13 | $config =& KTConfig::getSingleton(); |
| 15 | 14 | |
| 16 | - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter'); | |
| 17 | - $this->javaPath = KTUtil::findCommand('extractors/java', 'java'); | |
| 18 | - $this->ooHost = $config->get('openoffice/host', 'localhost'); | |
| 19 | - $this->ooPort = $config->get('openoffice/port', 8100); | |
| 20 | - $this->targetMimeType = $targetMimeType; | |
| 15 | + $this->python = KTUtil::findCommand('externalBinary/python'); | |
| 16 | + $this->ooHost = $config->get('openoffice/host'); | |
| 17 | + $this->ooPort = $config->get('openoffice/port'); | |
| 18 | + | |
| 19 | + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py'; | |
| 20 | + if (!is_file($this->documentConverter)) | |
| 21 | + { | |
| 22 | + $this->documentConverter = false; | |
| 23 | + } | |
| 21 | 24 | } |
| 22 | 25 | |
| 23 | 26 | public function getDisplayName() |
| ... | ... | @@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor |
| 28 | 31 | public function getSupportedMimeTypes() |
| 29 | 32 | { |
| 30 | 33 | return array( |
| 31 | - 'text/rtf', | |
| 32 | - 'application/vnd.oasis.opendocument.text', | |
| 33 | - 'application/vnd.oasis.opendocument.text-template', | |
| 34 | - 'application/vnd.oasis.opendocument.text-web', | |
| 35 | - 'application/vnd.oasis.opendocument.text-master', | |
| 36 | - 'application/vnd.sun.xml.writer', | |
| 37 | - 'application/vnd.sun.xml.writer.template', | |
| 38 | - 'application/vnd.sun.xml.writer.global', | |
| 34 | + | |
| 39 | 35 | ); |
| 40 | 36 | } |
| 41 | 37 | |
| 42 | 38 | public function needsIntermediateSourceFile() |
| 43 | 39 | { |
| 44 | 40 | // we need the intermediate file because it |
| 45 | - // has the correct extension. jodconverter uses the extension to determine mimetype | |
| 41 | + // has the correct extension. documentConverter uses the extension to determine mimetype | |
| 46 | 42 | return true; |
| 47 | 43 | } |
| 48 | 44 | |
| 49 | 45 | protected function getCommandLine() |
| 50 | 46 | { |
| 51 | - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; | |
| 47 | + $sourcefile = escapeshellcmd($this->sourcefile); | |
| 48 | + unlink($this->targetfile); | |
| 49 | + $this->targetfile .= '.html'; | |
| 50 | + $targetfile = escapeshellcmd($this->targetfile); | |
| 51 | + | |
| 52 | + $escape = OS_WINDOWS?'"':'\''; | |
| 53 | + | |
| 54 | + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$this->sourcefile}{$escape} {$escape}{$this->targetfile}{$escape} {$this->ooHost} {$this->ooPort}"; | |
| 52 | 55 | return $cmdline; |
| 53 | 56 | } |
| 54 | 57 | |
| 55 | - public function diagnose() | |
| 58 | + protected function filter($text) | |
| 56 | 59 | { |
| 57 | - if (false === $this->converter) | |
| 58 | - { | |
| 59 | - return _kt('Cannot locate jodconverter'); | |
| 60 | - } | |
| 60 | + $text = preg_replace ("@(</?[^>]*>)+@", '', $text); | |
| 61 | + | |
| 62 | + do | |
| 63 | + { | |
| 64 | + $old = $text; | |
| 65 | + | |
| 66 | + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text); | |
| 61 | 67 | |
| 62 | - if (false === $this->javaPath) | |
| 68 | + $text = preg_replace('@\ \ @',' ', $text); | |
| 69 | + $text = preg_replace("@\n\n@","\n", $text); | |
| 70 | + } | |
| 71 | + while ($old != $text); | |
| 72 | + | |
| 73 | + return $text; | |
| 74 | + } | |
| 75 | + | |
| 76 | + public function extractTextContent() | |
| 77 | + { | |
| 78 | + if (false === parent::extractTextContent()) | |
| 63 | 79 | { |
| 64 | - return _kt('Cannot locate java'); | |
| 80 | + return false; | |
| 65 | 81 | } |
| 66 | 82 | |
| 83 | + $content = file_get_contents($this->targetfile); | |
| 84 | + return file_put_contents($this->targetfile, $this->filter($content)); | |
| 85 | + | |
| 86 | + } | |
| 67 | 87 | |
| 68 | 88 | |
| 69 | - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 ); | |
| 70 | - if (false === $connection) | |
| 89 | + public function diagnose() | |
| 90 | + { | |
| 91 | + if (false === $this->python) | |
| 71 | 92 | { |
| 72 | - return _kt('Cannot connect to openoffice host'); | |
| 93 | + return _kt('Cannot locate python'); | |
| 73 | 94 | } |
| 74 | - fclose($connection); | |
| 75 | 95 | |
| 96 | + if (false === $this->documentConverter) | |
| 97 | + { | |
| 98 | + return _kt('Cannot locate DocumentConverter.py'); | |
| 99 | + } | |
| 76 | 100 | |
| 77 | - return null; | |
| 101 | + return SearchHelper::checkOpenOfficeAvailablity(); | |
| 78 | 102 | } |
| 79 | 103 | } |
| 80 | 104 | ... | ... |