Commit 74a1cff4b7dddb811b48341b312f4e0ffed9f338
1 parent
58031816
KTS-1594
"Support for Office 2007 Documents" Implemented. Committed By: Conrad Vermeulen Reviewed By: Megan Watson git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7992 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
2 changed files
with
336 additions
and
0 deletions
search2/indexing/extractors/OpenXmlTextExtractor.inc.php
0 → 100644
| 1 | +<?php | |
| 2 | + | |
| 3 | +/** | |
| 4 | + * $Id:$ | |
| 5 | + * | |
| 6 | + * KnowledgeTree Open Source Edition | |
| 7 | + * Document Management Made Simple | |
| 8 | + * Copyright (C) 2004 - 2008 The Jam Warehouse Software (Pty) Limited | |
| 9 | + * | |
| 10 | + * This program is free software; you can redistribute it and/or modify it under | |
| 11 | + * the terms of the GNU General Public License version 3 as published by the | |
| 12 | + * Free Software Foundation. | |
| 13 | + * | |
| 14 | + * This program is distributed in the hope that it will be useful, but WITHOUT | |
| 15 | + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 16 | + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | |
| 17 | + * details. | |
| 18 | + * | |
| 19 | + * You should have received a copy of the GNU General Public License | |
| 20 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 21 | + * | |
| 22 | + * You can contact The Jam Warehouse Software (Pty) Limited, Unit 1, Tramber Place, | |
| 23 | + * Blake Street, Observatory, 7925 South Africa. or email info@knowledgetree.com. | |
| 24 | + * | |
| 25 | + * The interactive user interfaces in modified source and object code versions | |
| 26 | + * of this program must display Appropriate Legal Notices, as required under | |
| 27 | + * Section 5 of the GNU General Public License version 3. | |
| 28 | + * | |
| 29 | + * In accordance with Section 7(b) of the GNU General Public License version 3, | |
| 30 | + * these Appropriate Legal Notices must retain the display of the "Powered by | |
| 31 | + * KnowledgeTree" logo and retain the original copyright notice. If the display of the | |
| 32 | + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices | |
| 33 | + * must display the words "Powered by KnowledgeTree" and retain the original | |
| 34 | + * copyright notice. | |
| 35 | + * Contributor( s): ______________________________________ | |
| 36 | + * | |
| 37 | + */ | |
| 38 | + | |
/**
 * Text extractor for Office 2007 (Open XML) documents: docx, pptx, xlsx and
 * their template / slideshow variants.
 *
 * An Open XML document is a zip archive. The extractor unpacks
 * [Content_Types].xml with the external unzip command, uses it to find the
 * parts that carry indexable text, unpacks those parts one at a time, strips
 * the XML markup and writes the remaining text to the target file.
 *
 * NOTE(review): $this->sourcefile, $this->targetfile, $this->mimetype,
 * $this->output, exec() and filter() are not defined in this class; they are
 * presumably provided by ExternalDocumentExtractor - confirm against the parent.
 */
class OpenXmlTextExtractor extends ExternalDocumentExtractor
{
    /**
     * Location of the unzip command as returned by KTUtil::findCommand().
     * diagnose() treats a value of false as "not found".
     *
     * @var mixed
     */
    public $unzip;

    /**
     * Argument template for unzip. The placeholders {source}, {part} and
     * {target_dir} are substituted before the command is run.
     *
     * @var string
     */
    public $unzip_params;

    /**
     * Temporary working folder created by the current extraction run, or null
     * when no folder has been created.
     *
     * @var string
     */
    public $openxml_dir;

    public function __construct()
    {
        $config = KTConfig::getSingleton();

        $this->unzip = KTUtil::findCommand("import/unzip", 'unzip');
        $this->unzip_params = $config->get('extractorParameters/unzip', '\'{source}\' \'{part}\' -d \'{target_dir}\'');
        parent::__construct();
    }


    /**
     * Basic function setting the display name
     *
     * @return string
     */
    public function getDisplayName()
    {
        return _kt('Open Xml Text Extractor');
    }

    /**
     * Return a list of all Office 2007 document types that are supported
     *
     * @return array
     */
    public function getSupportedMimeTypes()
    {
        return array(
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
            'application/vnd.openxmlformats-officedocument.presentationml.template',
            'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
            'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.template'
        );
    }

    /**
     * Resolve whether the current document ($this->mimetype) is a word,
     * powerpoint or excel document.
     *
     * @return string 'docx', 'pptx' or 'xlsx'; null when the mime type is not recognised
     */
    private function detectDocumentType()
    {
        $families = array(
            'docx' => array(
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.template'
            ),
            'pptx' => array(
                'application/vnd.openxmlformats-officedocument.presentationml.template',
                'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation'
            ),
            'xlsx' => array(
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.template'
            )
        );

        // The loop variable must not reuse the name of the array being iterated.
        foreach ($families as $family => $mimetypes)
        {
            if (in_array($this->mimetype, $mimetypes, true))
            {
                return $family;
            }
        }

        return null;
    }

    /**
     * The open xml file comprises various files with different content. This function identifies
     * which of those content types are worth indexing.
     *
     * @param string $openxml_type 'docx', 'pptx' or 'xlsx' as returned by detectDocumentType()
     * @param string $mime_type content type of a part, taken from [Content_Types].xml
     * @return boolean false for an unknown $openxml_type or an uninteresting part
     */
    private function interestingParts($openxml_type, $mime_type)
    {
        $interest = array(
            'docx' => array(
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml'
            ),
            'pptx' => array(
                'application/vnd.openxmlformats-officedocument.presentationml.slide+xml'
            ),
            'xlsx' => array(
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml',
                'application/vnd.openxmlformats-package.core-properties+xml'
            )
        );

        // detectDocumentType() yields null for an unrecognised mime type.
        if (!is_string($openxml_type) || !isset($interest[$openxml_type]))
        {
            return false;
        }

        return in_array($mime_type, $interest[$openxml_type], true);
    }

    /**
     * Decide whether a part name read from [Content_Types].xml may be used in
     * the unzip command line and as a path below the working folder.
     *
     * The name comes from the document being indexed, so it is untrusted:
     * quotes or shell metacharacters could break out of the quoted command
     * argument, and '..' segments could point outside the working folder.
     *
     * @param string $part part name without the leading slash
     * @return boolean
     */
    private function isSafePartName($part)
    {
        if (!is_string($part) || $part === '')
        {
            return false;
        }

        if (in_array('..', explode('/', $part), true))
        {
            return false;
        }

        return preg_match('@^[A-Za-z0-9_./ %()+,=~-]+$@', $part) === 1;
    }

    /**
     * Recursively delete a working folder created by this extractor.
     *
     * unzip recreates the folder structure of the archive (word/, ppt/slides/, ...),
     * so a plain rmdir() on the working folder fails once a part has been extracted.
     *
     * @param string $dir
     * @return void
     */
    private function removeWorkingDir($dir)
    {
        if (!is_string($dir) || $dir === '' || !is_dir($dir))
        {
            return;
        }

        $entries = @scandir($dir);
        if (is_array($entries))
        {
            foreach ($entries as $entry)
            {
                if ($entry === '.' || $entry === '..')
                {
                    continue;
                }

                $path = $dir . '/' . $entry;
                if (is_dir($path) && !is_link($path))
                {
                    $this->removeWorkingDir($path);
                }
                else
                {
                    @unlink($path);
                }
            }
        }

        @rmdir($dir);
    }

    /**
     * Returns a list of tokens that were identified by the [Content_Types].xml file. This file lists links to all parts of the document.
     * We use interestingParts() above to identify which of these parts are interesting from a content perspective.
     *
     * On success $this->openxml_dir holds the working folder that was created;
     * the caller is responsible for removing it. On failure $this->output
     * holds the error message.
     *
     * @return array the structure produced by xml_parse_into_struct(), or false on failure
     */
    private function getOpenXmlContentTypes()
    {
        $config = KTConfig::getSingleton();
        $temp_dir = $config->get('urls/tmpDirectory');

        // time() on its own collides when two documents are processed within
        // the same second; uniqid() keeps the folder name unique.
        $dir = $temp_dir . '/' . uniqid('openxml_' . time() . '_');

        if (!mkdir($dir))
        {
            $this->output = _kt('Could not create folder: ') . $dir;
            return false;
        }

        // Only record the folder once we know we created it, so that cleanup
        // never removes a folder belonging to somebody else.
        $this->openxml_dir = $dir;

        // [ and ] are wildcard characters for unzip, hence the escaping.
        $cmd = $this->unzip . ' ' . str_replace(
            array('{source}', '{part}', '{target_dir}'),
            array($this->sourcefile, '\[Content_Types\].xml', $this->openxml_dir),
            $this->unzip_params);

        if (!$this->exec($cmd))
        {
            $this->output = _kt('Failed to execute command: ') . $cmd;
            return false;
        }

        $filename = $this->openxml_dir . '/[Content_Types].xml';
        if (!file_exists($filename))
        {
            $this->output = _kt('Failed to find file: ') . $filename;
            return false;
        }

        $xml_content = file_get_contents($filename);

        // once we have the content, we can cleanup!
        @unlink($filename);

        if ($xml_content === false)
        {
            $this->output = _kt('Failed to open file: ') . $filename;
            return false;
        }

        // parse the file
        $vals = array();
        $index = array();
        $parser = xml_parser_create();
        $parsed = xml_parse_into_struct($parser, $xml_content, $vals, $index);
        xml_parser_free($parser);

        // xml_parse_into_struct() returns 0 when the document is not well formed.
        if (!$parsed || !is_array($vals))
        {
            $this->output = _kt('Failed to parse file: ') . $filename;
            return false;
        }

        return $vals;
    }

    /**
     * Extract the text from a single part of the archive.
     *
     * The part is unpacked into the working folder, read, deleted again and
     * returned with every run of XML tags replaced by a single space.
     *
     * @param string $filename part name as listed in [Content_Types].xml
     * @return string the text, or false on failure (see $this->output)
     */
    private function getContent($filename)
    {
        // Part names are absolute within the package; zip entries are not.
        $filename = ltrim((string) $filename, '/');

        if (!$this->isSafePartName($filename))
        {
            $this->output = _kt('Invalid part name: ') . $filename;
            return false;
        }

        $cmd = $this->unzip . ' ' . str_replace(
            array('{source}', '{part}', '{target_dir}'),
            array($this->sourcefile, $filename, $this->openxml_dir),
            $this->unzip_params);

        if (!$this->exec($cmd))
        {
            $this->output = _kt('Failed to execute command: ') . $cmd;
            return false;
        }

        $filename = $this->openxml_dir . "/$filename";
        if (!file_exists($filename))
        {
            $this->output = _kt('Failed to open file: ') . $filename;
            return false;
        }

        $content = file_get_contents($filename);

        // cleanup
        @unlink($filename);

        if ($content === false)
        {
            $this->output = _kt('Failed to open file: ') . $filename;
            return false;
        }

        $content = preg_replace("@(</?[^>]*>)+@", " ", $content);

        return $content;
    }


    /**
     * Given the tokens in the [Content_Types].xml, extract the content
     *
     * Only complete OVERRIDE elements whose content type is accepted by
     * interestingParts() are extracted; their text is concatenated in
     * document order.
     *
     * @param array $vals structure returned by getOpenXmlContentTypes()
     * @return string the concatenated text, or false as soon as one part fails
     */
    public function getOpenXmlText($vals)
    {
        $openxml_type = $this->detectDocumentType();

        $content = '';

        foreach ($vals as $val)
        {
            if (!isset($val['tag'], $val['type']) || $val['tag'] != 'OVERRIDE' || $val['type'] != 'complete')
            {
                continue;
            }

            // An Override element without both attributes cannot be resolved to a part.
            if (!isset($val['attributes']['CONTENTTYPE'], $val['attributes']['PARTNAME']))
            {
                continue;
            }

            if (!$this->interestingParts($openxml_type, $val['attributes']['CONTENTTYPE']))
            {
                continue;
            }

            $result = $this->getContent($val['attributes']['PARTNAME']);

            if ($result === false)
            {
                return false;
            }

            $content .= $result;
        }

        return $content;
    }

    /**
     * The main content extraction function
     *
     * Writes the filtered text of the document to $this->targetfile. The
     * working folder is removed on every path, including failures.
     *
     * @return bool true only when the text was extracted and saved
     */
    public function extractTextContent()
    {
        $this->openxml_dir = null;
        $success = false;

        $xml_content = $this->getOpenXmlContentTypes();

        if ($xml_content !== false)
        {
            $content = $this->getOpenXmlText($xml_content);

            // A failed part extraction must not be reported as success.
            if ($content !== false)
            {
                $result = file_put_contents($this->targetfile, $this->filter($content));

                if ($result === false)
                {
                    $this->output = _kt('Could not save content to file: ') . $this->targetfile;
                }
                else
                {
                    $success = true;
                }
            }
        }

        if ($this->openxml_dir !== null)
        {
            $this->removeWorkingDir($this->openxml_dir);
            $this->openxml_dir = null;
        }

        return $success;
    }

    /**
     * Check that unzip is available
     *
     * @return string a message when unzip could not be located, null otherwise
     */
    public function diagnose()
    {
        if (false === $this->unzip)
        {
            return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip);
        }
        return null;
    }

}
| 312 | + | |
| 313 | +?> | |
| 0 | 314 | \ No newline at end of file | ... | ... |
sql/mysql/upgrade/3.5.2/openxml_mime_types.sql
0 → 100644
-- Register the Office 2007 (Open XML) mime types.
-- Each insert takes the next free id (max(id)+1) from mime_types, and the
-- sequence table zseq_mime_types is set to the last id used at the end.

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'word', 'Word Document');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'dotx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', 'word', 'Word Document');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'potx', 'application/vnd.openxmlformats-officedocument.presentationml.template', 'office', 'Powerpoint Presentation');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'ppsx', 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', 'office', 'Powerpoint Presentation');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'office', 'Powerpoint Presentation');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'excel', 'Excel Spreadsheet');

SELECT @id := MAX(id) + 1 FROM mime_types;
INSERT INTO mime_types (id, filetypes, mimetypes, icon_path, friendly_name)
VALUES (@id, 'xltx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', 'excel', 'Excel Spreadsheet');

UPDATE zseq_mime_types SET id = @id
| 0 | 24 | \ No newline at end of file | ... | ... |