Commit b2d72fcf2b3efab99ad93a10b679dfd0db2cbd57
1 parent: fe91877f
KTS-673
"The search algorithm needs some work"
Updated. Integrated with DocumentConverter.py
Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie
git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7463 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 1 changed file with 54 additions and 30 deletions
search2/indexing/extractors/OOTextExtractor.inc.php
| @@ -2,22 +2,25 @@ | @@ -2,22 +2,25 @@ | ||
| 2 | 2 | ||
| 3 | class OOTextExtractor extends ExternalDocumentExtractor | 3 | class OOTextExtractor extends ExternalDocumentExtractor |
| 4 | { | 4 | { |
| 5 | - private $converter; | ||
| 6 | - private $javaPath; | 5 | + private $python; |
| 6 | + private $documentConverter; | ||
| 7 | private $ooHost; | 7 | private $ooHost; |
| 8 | private $ooPort; | 8 | private $ooPort; |
| 9 | - private $targetMimeType; | ||
| 10 | 9 | ||
| 11 | public function __construct($targetMimeType='plain/text') | 10 | public function __construct($targetMimeType='plain/text') |
| 12 | { | 11 | { |
| 13 | parent::__construct(); | 12 | parent::__construct(); |
| 14 | $config =& KTConfig::getSingleton(); | 13 | $config =& KTConfig::getSingleton(); |
| 15 | 14 | ||
| 16 | - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter'); | ||
| 17 | - $this->javaPath = KTUtil::findCommand('extractors/java', 'java'); | ||
| 18 | - $this->ooHost = $config->get('openoffice/host', 'localhost'); | ||
| 19 | - $this->ooPort = $config->get('openoffice/port', 8100); | ||
| 20 | - $this->targetMimeType = $targetMimeType; | 15 | + $this->python = KTUtil::findCommand('externalBinary/python'); |
| 16 | + $this->ooHost = $config->get('openoffice/host'); | ||
| 17 | + $this->ooPort = $config->get('openoffice/port'); | ||
| 18 | + | ||
| 19 | + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py'; | ||
| 20 | + if (!is_file($this->documentConverter)) | ||
| 21 | + { | ||
| 22 | + $this->documentConverter = false; | ||
| 23 | + } | ||
| 21 | } | 24 | } |
| 22 | 25 | ||
| 23 | public function getDisplayName() | 26 | public function getDisplayName() |
| @@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor | @@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor | ||
| 28 | public function getSupportedMimeTypes() | 31 | public function getSupportedMimeTypes() |
| 29 | { | 32 | { |
| 30 | return array( | 33 | return array( |
| 31 | - 'text/rtf', | ||
| 32 | - 'application/vnd.oasis.opendocument.text', | ||
| 33 | - 'application/vnd.oasis.opendocument.text-template', | ||
| 34 | - 'application/vnd.oasis.opendocument.text-web', | ||
| 35 | - 'application/vnd.oasis.opendocument.text-master', | ||
| 36 | - 'application/vnd.sun.xml.writer', | ||
| 37 | - 'application/vnd.sun.xml.writer.template', | ||
| 38 | - 'application/vnd.sun.xml.writer.global', | 34 | + |
| 39 | ); | 35 | ); |
| 40 | } | 36 | } |
| 41 | 37 | ||
| 42 | public function needsIntermediateSourceFile() | 38 | public function needsIntermediateSourceFile() |
| 43 | { | 39 | { |
| 44 | // we need the intermediate file because it | 40 | // we need the intermediate file because it |
| 45 | - // has the correct extension. jodconverter uses the extension to determine mimetype | 41 | + // has the correct extension. documentConverter uses the extension to determine mimetype |
| 46 | return true; | 42 | return true; |
| 47 | } | 43 | } |
| 48 | 44 | ||
| 49 | protected function getCommandLine() | 45 | protected function getCommandLine() |
| 50 | { | 46 | { |
| 51 | - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; | 47 | + $sourcefile = escapeshellcmd($this->sourcefile); |
| 48 | + unlink($this->targetfile); | ||
| 49 | + $this->targetfile .= '.html'; | ||
| 50 | + $targetfile = escapeshellcmd($this->targetfile); | ||
| 51 | + | ||
| 52 | + $escape = OS_WINDOWS?'"':'\''; | ||
| 53 | + | ||
| 54 | + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$this->sourcefile}{$escape} {$escape}{$this->targetfile}{$escape} {$this->ooHost} {$this->ooPort}"; | ||
| 52 | return $cmdline; | 55 | return $cmdline; |
| 53 | } | 56 | } |
| 54 | 57 | ||
| 55 | - public function diagnose() | 58 | + protected function filter($text) |
| 56 | { | 59 | { |
| 57 | - if (false === $this->converter) | ||
| 58 | - { | ||
| 59 | - return _kt('Cannot locate jodconverter'); | ||
| 60 | - } | 60 | + $text = preg_replace ("@(</?[^>]*>)+@", '', $text); |
| 61 | + | ||
| 62 | + do | ||
| 63 | + { | ||
| 64 | + $old = $text; | ||
| 65 | + | ||
| 66 | + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text); | ||
| 61 | 67 | ||
| 62 | - if (false === $this->javaPath) | 68 | + $text = preg_replace('@\ \ @',' ', $text); |
| 69 | + $text = preg_replace("@\n\n@","\n", $text); | ||
| 70 | + } | ||
| 71 | + while ($old != $text); | ||
| 72 | + | ||
| 73 | + return $text; | ||
| 74 | + } | ||
| 75 | + | ||
| 76 | + public function extractTextContent() | ||
| 77 | + { | ||
| 78 | + if (false === parent::extractTextContent()) | ||
| 63 | { | 79 | { |
| 64 | - return _kt('Cannot locate java'); | 80 | + return false; |
| 65 | } | 81 | } |
| 66 | 82 | ||
| 83 | + $content = file_get_contents($this->targetfile); | ||
| 84 | + return file_put_contents($this->targetfile, $this->filter($content)); | ||
| 85 | + | ||
| 86 | + } | ||
| 67 | 87 | ||
| 68 | 88 | ||
| 69 | - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 ); | ||
| 70 | - if (false === $connection) | 89 | + public function diagnose() |
| 90 | + { | ||
| 91 | + if (false === $this->python) | ||
| 71 | { | 92 | { |
| 72 | - return _kt('Cannot connect to openoffice host'); | 93 | + return _kt('Cannot locate python'); |
| 73 | } | 94 | } |
| 74 | - fclose($connection); | ||
| 75 | 95 | ||
| 96 | + if (false === $this->documentConverter) | ||
| 97 | + { | ||
| 98 | + return _kt('Cannot locate DocumentConverter.py'); | ||
| 99 | + } | ||
| 76 | 100 | ||
| 77 | - return null; | 101 | + return SearchHelper::checkOpenOfficeAvailablity(); |
| 78 | } | 102 | } |
| 79 | } | 103 | } |
| 80 | 104 |