OOTextExtractor.inc.php
2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
<?php
/**
 * Text extractor that shells out to the bundled DocumentConverter.py script
 * (via python, talking to an OpenOffice instance at the configured host/port)
 * to turn a document into HTML, and then strips the markup to leave text.
 */
class OOTextExtractor extends ExternalDocumentExtractor
{
    /** Python command as returned by KTUtil::findCommand(); diagnose() treats false as "not found". */
    private $python;

    /** Path to DocumentConverter.py, or false when the script is not present on disk. */
    private $documentConverter;

    /** Value of the 'openoffice/host' configuration setting. */
    private $ooHost;

    /** Value of the 'openoffice/port' configuration setting. */
    private $ooPort;

    /**
     * Reads the OpenOffice settings and locates python and the converter script.
     *
     * @param string $targetMimeType Not used by this constructor; kept so existing
     *                               callers that pass it continue to work.
     */
    public function __construct($targetMimeType='plain/text')
    {
        parent::__construct();

        $config =& KTConfig::getSingleton();

        $this->python = KTUtil::findCommand('externalBinary/python');
        $this->ooHost = $config->get('openoffice/host');
        $this->ooPort = $config->get('openoffice/port');

        $this->documentConverter = KT_DIR . '/bin/openoffice/DocumentConverter.py';
        if (!is_file($this->documentConverter))
        {
            // Remember the script is missing so diagnose() can report it.
            $this->documentConverter = false;
        }
    }

    /**
     * @return string Translated, human-readable name of this extractor.
     */
    public function getDisplayName()
    {
        return _kt('OpenOffice Text Extractor');
    }

    /**
     * @return array List of supported mime types; currently empty.
     */
    public function getSupportedMimeTypes()
    {
        return array(
        );
    }

    /**
     * The intermediate file is required because it carries the correct
     * extension, and DocumentConverter uses the extension to determine the
     * mime type.
     *
     * @return bool Always true.
     */
    public function needsIntermediateSourceFile()
    {
        return true;
    }

    /**
     * Builds the shell command that runs DocumentConverter.py.
     *
     * Side effect: the current target file is removed (if present) and
     * '.html' is appended to $this->targetfile, so the converter writes HTML
     * to the renamed target.
     *
     * @return string Command line: python, converter script, source file,
     *                target file, OpenOffice host, OpenOffice port.
     */
    protected function getCommandLine()
    {
        // Only unlink when the file is actually there, to avoid a warning.
        if (is_file($this->targetfile))
        {
            unlink($this->targetfile);
        }
        $this->targetfile .= '.html';

        // escapeshellarg() both quotes and escapes each path for the current
        // platform, so embedded quotes or shell metacharacters in a path
        // cannot break out of the argument.
        $converter = escapeshellarg($this->documentConverter);
        $sourcefile = escapeshellarg($this->sourcefile);
        $targetfile = escapeshellarg($this->targetfile);

        // Host and port come from configuration and are passed through as before.
        $cmdline = "{$this->python} {$converter} {$sourcefile} {$targetfile} {$this->ooHost} {$this->ooPort}";

        return $cmdline;
    }

    /**
     * Removes markup tags from the converter output and collapses whitespace.
     *
     * @param string $text Raw HTML produced by the converter.
     * @return string Text with tags removed, runs of whitespace after a line
     *                break dropped, double spaces reduced to one and blank
     *                lines removed.
     */
    protected function filter($text)
    {
        // Strip every run of consecutive tags.
        $text = preg_replace ("@(</?[^>]*>)+@", '', $text);

        // Repeat the whitespace clean-up until a pass changes nothing.
        do
        {
            $old = $text;

            $text= preg_replace("@([\r\n])[\s]+@",'\1', $text);
            $text = preg_replace('@\ \ @',' ', $text);
            $text = preg_replace("@\n\n@","\n", $text);
        }
        // Strict comparison: with a loose != PHP compares numeric-looking
        // strings (e.g. "1    " and "1  ") as numbers, which would end the
        // loop before the whitespace is fully collapsed.
        while ($old !== $text);

        return $text;
    }

    /**
     * Runs the parent extraction and then rewrites the target file with the
     * filtered (tag-free) text.
     *
     * @return int|false False when the parent extraction or reading the
     *                   target file fails; otherwise the result of
     *                   file_put_contents() (bytes written, or false).
     */
    public function extractTextContent()
    {
        if (false === parent::extractTextContent())
        {
            return false;
        }

        $content = file_get_contents($this->targetfile);
        if (false === $content)
        {
            // Do not overwrite the target with an empty string on a failed read.
            return false;
        }

        return file_put_contents($this->targetfile, $this->filter($content));
    }

    /**
     * Reports why this extractor cannot run, if anything is missing.
     *
     * @return mixed A translated message when python or DocumentConverter.py
     *               could not be located; otherwise whatever
     *               SearchHelper::checkOpenOfficeAvailablity() returns.
     */
    public function diagnose()
    {
        if (false === $this->python)
        {
            return _kt('Cannot locate python');
        }

        if (false === $this->documentConverter)
        {
            return _kt('Cannot locate DocumentConverter.py');
        }

        return SearchHelper::checkOpenOfficeAvailablity();
    }
}
?>