Commit b2d72fcf2b3efab99ad93a10b679dfd0db2cbd57

Authored by Conrad Vermeulen
1 parent fe91877f

KTS-673

"The search algorithm needs some work"
Updated the OpenOffice text extractor: it now integrates with DocumentConverter.py (run via Python) instead of jodconverter (run via Java).

Committed By: Conrad Vermeulen
Reviewed By: Kevin Fourie

git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7463 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/extractors/OOTextExtractor.inc.php
... ... @@ -2,22 +2,25 @@
2 2  
3 3 class OOTextExtractor extends ExternalDocumentExtractor
4 4 {
5   - private $converter;
6   - private $javaPath;
  5 + private $python;
  6 + private $documentConverter;
7 7 private $ooHost;
8 8 private $ooPort;
9   - private $targetMimeType;
10 9  
11 10 public function __construct($targetMimeType='plain/text')
12 11 {
13 12 parent::__construct();
14 13 $config =& KTConfig::getSingleton();
15 14  
16   - $this->converter = KTUtil::findCommand('extractors/jodconverter', 'jodconverter');
17   - $this->javaPath = KTUtil::findCommand('extractors/java', 'java');
18   - $this->ooHost = $config->get('openoffice/host', 'localhost');
19   - $this->ooPort = $config->get('openoffice/port', 8100);
20   - $this->targetMimeType = $targetMimeType;
  15 + $this->python = KTUtil::findCommand('externalBinary/python');
  16 + $this->ooHost = $config->get('openoffice/host');
  17 + $this->ooPort = $config->get('openoffice/port');
  18 +
  19 + $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py';
  20 + if (!is_file($this->documentConverter))
  21 + {
  22 + $this->documentConverter = false;
  23 + }
21 24 }
22 25  
23 26 public function getDisplayName()
... ... @@ -28,53 +31,74 @@ class OOTextExtractor extends ExternalDocumentExtractor
28 31 public function getSupportedMimeTypes()
29 32 {
30 33 return array(
31   - 'text/rtf',
32   - 'application/vnd.oasis.opendocument.text',
33   - 'application/vnd.oasis.opendocument.text-template',
34   - 'application/vnd.oasis.opendocument.text-web',
35   - 'application/vnd.oasis.opendocument.text-master',
36   - 'application/vnd.sun.xml.writer',
37   - 'application/vnd.sun.xml.writer.template',
38   - 'application/vnd.sun.xml.writer.global',
  34 +
39 35 );
40 36 }
41 37  
42 38 public function needsIntermediateSourceFile()
43 39 {
44 40 // we need the intermediate file because it
45   - // has the correct extension. jodconverter uses the extension to determine mimetype
  41 + // has the correct extension. documentConverter uses the extension to determine mimetype
46 42 return true;
47 43 }
48 44  
49 45 protected function getCommandLine()
50 46 {
51   - $cmdline = "$this->javaPath -jar $this->converter $this->sourcefile $this->mimetype $this->targetfile $this->targetMimeType $this->ooHost $this->ooPort";
  47 + $sourcefile = escapeshellcmd($this->sourcefile);
  48 + unlink($this->targetfile);
  49 + $this->targetfile .= '.html';
  50 + $targetfile = escapeshellcmd($this->targetfile);
  51 +
  52 + $escape = OS_WINDOWS?'"':'\'';
  53 +
  54 + $cmdline = "{$this->python} {$escape}{$this->documentConverter}{$escape} {$escape}{$this->sourcefile}{$escape} {$escape}{$this->targetfile}{$escape} {$this->ooHost} {$this->ooPort}";
52 55 return $cmdline;
53 56 }
54 57  
55   - public function diagnose()
  58 + protected function filter($text)
56 59 {
57   - if (false === $this->converter)
58   - {
59   - return _kt('Cannot locate jodconverter');
60   - }
  60 + $text = preg_replace ("@(</?[^>]*>)+@", '', $text);
  61 +
  62 + do
  63 + {
  64 + $old = $text;
  65 +
  66 + $text= preg_replace("@([\r\n])[\s]+@",'\1', $text);
61 67  
62   - if (false === $this->javaPath)
  68 + $text = preg_replace('@\ \ @',' ', $text);
  69 + $text = preg_replace("@\n\n@","\n", $text);
  70 + }
  71 + while ($old != $text);
  72 +
  73 + return $text;
  74 + }
  75 +
  76 + public function extractTextContent()
  77 + {
  78 + if (false === parent::extractTextContent())
63 79 {
64   - return _kt('Cannot locate java');
  80 + return false;
65 81 }
66 82  
  83 + $content = file_get_contents($this->targetfile);
  84 + return file_put_contents($this->targetfile, $this->filter($content));
  85 +
  86 + }
67 87  
68 88  
69   - $connection = @fsockopen($this->ooHost, $this->ooPort,$errno, $errstr,5 );
70   - if (false === $connection)
  89 + public function diagnose()
  90 + {
  91 + if (false === $this->python)
71 92 {
72   - return _kt('Cannot connect to openoffice host');
  93 + return _kt('Cannot locate python');
73 94 }
74   - fclose($connection);
75 95  
  96 + if (false === $this->documentConverter)
  97 + {
  98 + return _kt('Cannot locate DocumentConverter.py');
  99 + }
76 100  
77   - return null;
  101 + return SearchHelper::checkOpenOfficeAvailablity();
78 102 }
79 103 }
80 104  
... ...