Commit b2d72fcf2b3efab99ad93a10b679dfd0db2cbd57

Authored by Conrad Vermeulen
1 parent fe91877f

KTS-673

"The search algorithm needs some work"
Updated OOTextExtractor: it now integrates with DocumentConverter.py (run via python) instead of jodconverter.

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7463 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/extractors/OOTextExtractor.inc.php
@@ -2,22 +2,25 @@ @@ -2,22 +2,25 @@
2 2
3 class OOTextExtractor extends ExternalDocumentExtractor 3 class OOTextExtractor extends ExternalDocumentExtractor
4 { 4 {
5 - private $converter;  
6 - private $javaPath; 5 + private $python;
  6 + private $documentConverter;
7 private $ooHost; 7 private $ooHost;
8 private $ooPort; 8 private $ooPort;
9 - private $targetMimeType;  
10 9
11 public function __construct($targetMimeType='plain/text') 10 public function __construct($targetMimeType='plain/text')
12 { 11 {
13 parent::__construct(); 12 parent::__construct();
14 $config =& KTConfig::getSingleton(); 13 $config =& KTConfig::getSingleton();
15 14
16 - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter');  
17 - $this->javaPath = KTUtil::findCommand('extractors/java', 'java');  
18 - $this->ooHost = $config->get('openoffice/host', 'localhost');  
19 - $this->ooPort = $config->get('openoffice/port', 8100);  
20 - $this->targetMimeType = $targetMimeType; 15 + $this->python = KTUtil::findCommand('externalBinary/python');
  16 + $this->ooHost = $config->get('openoffice/host');
  17 + $this->ooPort = $config->get('openoffice/port');
  18 +
  19 + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py';
  20 + if (!is_file($this->documentConverter))
  21 + {
  22 + $this->documentConverter = false;
  23 + }
21 } 24 }
22 25
23 public function getDisplayName() 26 public function getDisplayName()
@@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor @@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor
28 public function getSupportedMimeTypes() 31 public function getSupportedMimeTypes()
29 { 32 {
30 return array( 33 return array(
31 - 'text/rtf',  
32 - 'application/vnd.oasis.opendocument.text',  
33 - 'application/vnd.oasis.opendocument.text-template',  
34 - 'application/vnd.oasis.opendocument.text-web',  
35 - 'application/vnd.oasis.opendocument.text-master',  
36 - 'application/vnd.sun.xml.writer',  
37 - 'application/vnd.sun.xml.writer.template',  
38 - 'application/vnd.sun.xml.writer.global', 34 +
39 ); 35 );
40 } 36 }
41 37
42 public function needsIntermediateSourceFile() 38 public function needsIntermediateSourceFile()
43 { 39 {
44 // we need the intermediate file because it 40 // we need the intermediate file because it
45 - // has the correct extension. jodconverter uses the extension to determine mimetype 41 + // has the correct extension. documentConverter uses the extension to determine mimetype
46 return true; 42 return true;
47 } 43 }
48 44
49 protected function getCommandLine() 45 protected function getCommandLine()
50 { 46 {
51 - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort"; 47 + $sourcefile = escapeshellcmd($this->sourcefile);
  48 + unlink($this->targetfile);
  49 + $this->targetfile .= '.html';
  50 + $targetfile = escapeshellcmd($this->targetfile);
  51 +
  52 + $escape = OS_WINDOWS?'"':'\'';
  53 +
  54 + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$this->sourcefile}{$escape} {$escape}{$this->targetfile}{$escape} {$this->ooHost} {$this->ooPort}";
52 return $cmdline; 55 return $cmdline;
53 } 56 }
54 57
55 - public function diagnose() 58 + protected function filter($text)
56 { 59 {
57 - if (false === $this->converter)  
58 - {  
59 - return _kt('Cannot locate jodconverter');  
60 - } 60 + $text = preg_replace ("@(</?[^>]*>)+@", '', $text);
  61 +
  62 + do
  63 + {
  64 + $old = $text;
  65 +
  66 + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text);
61 67
62 - if (false === $this->javaPath) 68 + $text = preg_replace('@\ \ @',' ', $text);
  69 + $text = preg_replace("@\n\n@","\n", $text);
  70 + }
  71 + while ($old != $text);
  72 +
  73 + return $text;
  74 + }
  75 +
  76 + public function extractTextContent()
  77 + {
  78 + if (false === parent::extractTextContent())
63 { 79 {
64 - return _kt('Cannot locate java'); 80 + return false;
65 } 81 }
66 82
  83 + $content = file_get_contents($this->targetfile);
  84 + return file_put_contents($this->targetfile, $this->filter($content));
  85 +
  86 + }
67 87
68 88
69 - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 );  
70 - if (false === $connection) 89 + public function diagnose()
  90 + {
  91 + if (false === $this->python)
71 { 92 {
72 - return _kt('Cannot connect to openoffice host'); 93 + return _kt('Cannot locate python');
73 } 94 }
74 - fclose($connection);  
75 95
  96 + if (false === $this->documentConverter)
  97 + {
  98 + return _kt('Cannot locate DocumentConverter.py');
  99 + }
76 100
77 - return null; 101 + return SearchHelper::checkOpenOfficeAvailablity();
78 } 102 }
79 } 103 }
80 104