Commit 74a1cff4b7dddb811b48341b312f4e0ffed9f338
1 parent
58031816
KTS-1594
"Support for Office 2007 Documents" Implemented. Committed By: Conrad Vermeulen Reviewed By: Megan Watson git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7992 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
2 changed files
with
336 additions
and
0 deletions
search2/indexing/extractors/OpenXmlTextExtractor.inc.php
0 → 100644
| 1 | +<?php | ||
| 2 | + | ||
| 3 | +/** | ||
| 4 | + * $Id:$ | ||
| 5 | + * | ||
| 6 | + * KnowledgeTree Open Source Edition | ||
| 7 | + * Document Management Made Simple | ||
| 8 | + * Copyright (C) 2004 - 2008 The Jam Warehouse Software (Pty) Limited | ||
| 9 | + * | ||
| 10 | + * This program is free software; you can redistribute it and/or modify it under | ||
| 11 | + * the terms of the GNU General Public License version 3 as published by the | ||
| 12 | + * Free Software Foundation. | ||
| 13 | + * | ||
| 14 | + * This program is distributed in the hope that it will be useful, but WITHOUT | ||
| 15 | + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||
| 16 | + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more | ||
| 17 | + * details. | ||
| 18 | + * | ||
| 19 | + * You should have received a copy of the GNU General Public License | ||
| 20 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
| 21 | + * | ||
| 22 | + * You can contact The Jam Warehouse Software (Pty) Limited, Unit 1, Tramber Place, | ||
| 23 | + * Blake Street, Observatory, 7925 South Africa. or email info@knowledgetree.com. | ||
| 24 | + * | ||
| 25 | + * The interactive user interfaces in modified source and object code versions | ||
| 26 | + * of this program must display Appropriate Legal Notices, as required under | ||
| 27 | + * Section 5 of the GNU General Public License version 3. | ||
| 28 | + * | ||
| 29 | + * In accordance with Section 7(b) of the GNU General Public License version 3, | ||
| 30 | + * these Appropriate Legal Notices must retain the display of the "Powered by | ||
| 31 | + * KnowledgeTree" logo and retain the original copyright notice. If the display of the | ||
| 32 | + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices | ||
| 33 | + * must display the words "Powered by KnowledgeTree" and retain the original | ||
| 34 | + * copyright notice. | ||
| 35 | + * Contributor( s): ______________________________________ | ||
| 36 | + * | ||
| 37 | + */ | ||
| 38 | + | ||
/**
 * Extracts the indexable text of Office 2007 (Open XML) documents.
 *
 * The document is a zip archive: [Content_Types].xml is unpacked first to learn which parts
 * it holds, then every interesting part is unpacked with the external unzip command and has
 * its markup replaced by spaces.
 */
class OpenXmlTextExtractor extends ExternalDocumentExtractor
{
    /** @var mixed Result of KTUtil::findCommand() for unzip; diagnose() treats false as "not found". */
    public $unzip;

    /** @var string Argument template holding the {source}, {part} and {target_dir} placeholders. */
    public $unzip_params;

    /** @var string|null Scratch directory the parts of the current document are unpacked into. */
    public $openxml_dir = null;

    /**
     * Locate the unzip binary and read its argument template from the configuration.
     */
    public function __construct()
    {
        $config = KTConfig::getSingleton();

        $this->unzip = KTUtil::findCommand("import/unzip", 'unzip');
        $this->unzip_params = $config->get('extractorParameters/unzip', '\'{source}\' \'{part}\' -d \'{target_dir}\'');
        parent::__construct();
    }

    /**
     * Basic function setting the display name
     *
     * @return string
     */
    public function getDisplayName()
    {
        return _kt('Open Xml Text Extractor');
    }

    /**
     * Map each Office 2007 family (docx, pptx, xlsx) to the mime types that belong to it.
     *
     * Single source of truth for getSupportedMimeTypes() and detectDocumentType().
     *
     * @return array
     */
    private static function documentTypes()
    {
        return array(
            'docx' => array(
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.template'),
            'pptx' => array(
                'application/vnd.openxmlformats-officedocument.presentationml.template',
                'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
                'application/vnd.openxmlformats-officedocument.presentationml.presentation'),
            'xlsx' => array(
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.template'),
        );
    }

    /**
     * Return a list of all Office 2007 document types that are supported
     *
     * @return array
     */
    public function getSupportedMimeTypes()
    {
        $supported = array();
        foreach (self::documentTypes() as $mimeTypes)
        {
            $supported = array_merge($supported, $mimeTypes);
        }
        return $supported;
    }

    /**
     * Trivial function to resolve if the document is word, excel, or power point
     *
     * @return string|null 'docx', 'pptx' or 'xlsx'; null when the mime type is not a supported one
     */
    private function detectDocumentType()
    {
        foreach (self::documentTypes() as $type => $mimeTypes)
        {
            if (in_array($this->mimetype, $mimeTypes))
            {
                return $type;
            }
        }
        return null;
    }

    /**
     * The open xml file comprises various files with different content. This function identifies
     * which of those content types are worth indexing.
     *
     * @param string $openxml_type
     * @param string $mime_type
     * @return boolean
     */
    private function interestingParts($openxml_type, $mime_type)
    {
        $interest = array(
            'docx' => array(
                'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',
                'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml'),
            'pptx' => array(
                'application/vnd.openxmlformats-officedocument.presentationml.slide+xml'),
            'xlsx' => array(
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml',
                'application/vnd.openxmlformats-package.core-properties+xml'),
        );

        // An unknown document type has nothing of interest and must not reach in_array().
        if (!isset($interest[$openxml_type]))
        {
            return false;
        }
        return in_array($mime_type, $interest[$openxml_type]);
    }

    /**
     * Build the unzip command line that unpacks one archive member into the scratch directory.
     *
     * @param string $part Archive member; the caller is responsible for making it shell safe
     * @return string
     */
    private function unzipCommand($part)
    {
        return $this->unzip . ' ' . str_replace(
            array('{source}', '{part}', '{target_dir}'),
            array($this->sourcefile, $part, $this->openxml_dir),
            $this->unzip_params);
    }

    /**
     * Returns a list of tokens that were identified by the [Content_Types].xml file. This file lists links to all parts of the document.
     * We use interestingParts() above to identify which of these parts are interesting from a content perspective.
     *
     * @return array|false Tokens from xml_parse_into_struct(), or false with the reason in $this->output
     */
    private function getOpenXmlContentTypes()
    {
        $config = KTConfig::getSingleton();
        $temp_dir = $config->get('urls/tmpDirectory');

        // A timestamp alone is not unique when two documents are handled in the same second.
        $this->openxml_dir = $temp_dir . '/' . uniqid('openxml_', true);

        if (!mkdir($this->openxml_dir))
        {
            $this->output = _kt('Could not create folder: ') . $this->openxml_dir;
            // The folder was not created by us, so the cleanup must never touch it.
            $this->openxml_dir = null;
            return false;
        }

        // The brackets are escaped because unzip would otherwise read them as a wildcard.
        $cmd = $this->unzipCommand('\[Content_Types\].xml');
        if (!$this->exec($cmd))
        {
            $this->output = _kt('Failed to execute command: ') . $cmd;
            return false;
        }

        $filename = $this->openxml_dir . '/[Content_Types].xml';
        if (!file_exists($filename))
        {
            $this->output = _kt('Failed to find file: ') . $filename;
            return false;
        }

        $xml_content = file_get_contents($filename);

        // once we have the content, we can cleanup!
        @unlink($filename);

        if ($xml_content === false)
        {
            $this->output = _kt('Failed to open file: ') . $filename;
            return false;
        }

        // parse the file; xml_parse_into_struct() returns 0 when the XML is not well formed
        $vals = array();
        $index = array();
        $parser = xml_parser_create();
        $parsed = xml_parse_into_struct($parser, $xml_content, $vals, $index);
        xml_parser_free($parser);

        if (!$parsed)
        {
            $this->output = _kt('Failed to parse file: ') . $filename;
            return false;
        }

        return $vals;
    }

    /**
     * Extract the text from a file within the archive for a specific file.
     *
     * @param string $filename Part name as listed in [Content_Types].xml
     * @return string|false The part with its markup replaced by spaces, or false on failure
     */
    private function getContent($filename)
    {
        $filename = ltrim($filename, '/');

        // The part name is read from the document itself, so it is untrusted. It is placed in a
        // shell command and in a path, therefore only plain relative names are accepted.
        $plain = preg_match('@^[A-Za-z0-9_.-]+(/[A-Za-z0-9_.-]+)*\z@', $filename);
        if (!$plain || in_array('..', explode('/', $filename), true))
        {
            $this->output = _kt('Failed to open file: ') . $filename;
            return false;
        }

        $cmd = $this->unzipCommand($filename);
        if (!$this->exec($cmd))
        {
            $this->output = _kt('Failed to execute command: ') . $cmd;
            return false;
        }

        // A symbolic link stored in the archive must not be followed to some other file.
        $path = $this->openxml_dir . '/' . $filename;
        if (is_link($path) || !is_file($path))
        {
            $this->output = _kt('Failed to open file: ') . $path;
            return false;
        }

        $content = file_get_contents($path);

        // cleanup
        @unlink($path);

        if ($content === false)
        {
            $this->output = _kt('Failed to open file: ') . $path;
            return false;
        }

        // Every run of tags becomes one space; preg_replace() yields null when PCRE gives up.
        $text = preg_replace("@(</?[^>]*>)+@", " ", $content);
        if ($text === null)
        {
            $this->output = _kt('Failed to parse file: ') . $path;
            return false;
        }

        return $text;
    }

    /**
     * Given the tokens in the [Content_Types].xml, extract the content
     *
     * @param array $vals
     * @return string|false The text of all interesting parts, or false on failure
     */
    function getOpenXmlText($vals)
    {
        $openxml_type = $this->detectDocumentType();
        if ($openxml_type === null)
        {
            $this->output = _kt('Unsupported mime type: ') . $this->mimetype;
            return false;
        }

        $content = '';

        foreach ($vals as $val)
        {
            if ($val['tag'] != 'OVERRIDE' || $val['type'] != 'complete')
            {
                continue;
            }

            // Skip entries that lack either attribute instead of raising undefined index notices.
            if (!isset($val['attributes']['CONTENTTYPE'], $val['attributes']['PARTNAME']))
            {
                continue;
            }

            if (!$this->interestingParts($openxml_type, $val['attributes']['CONTENTTYPE']))
            {
                continue;
            }

            $result = $this->getContent($val['attributes']['PARTNAME']);
            if ($result === false)
            {
                return false;
            }

            $content .= $result;
        }

        return $content;
    }

    /**
     * Remove a directory together with everything below it. Symbolic links are deleted, never followed.
     *
     * @param string|null $dir Nothing happens when this is empty or not a directory
     */
    private function removeDirectory($dir)
    {
        if (empty($dir) || is_link($dir) || !is_dir($dir))
        {
            return;
        }

        $entries = @scandir($dir);
        if (is_array($entries))
        {
            foreach ($entries as $entry)
            {
                if ($entry === '.' || $entry === '..')
                {
                    continue;
                }

                $path = $dir . '/' . $entry;
                if (is_dir($path) && !is_link($path))
                {
                    $this->removeDirectory($path);
                }
                else
                {
                    @unlink($path);
                }
            }
        }

        @rmdir($dir);
    }

    /**
     * The main context extraction function
     *
     * @return bool True when the text was written to the target file, false otherwise
     */
    public function extractTextContent()
    {
        $content = false;

        $xml_content = $this->getOpenXmlContentTypes();
        if ($xml_content !== false)
        {
            $content = $this->getOpenXmlText($xml_content);
        }

        // unzip normally recreates the folders of the archive inside the scratch directory, so a
        // plain rmdir() is not enough. The cleanup runs on every path, including the failures.
        $this->removeDirectory($this->openxml_dir);

        if ($content === false)
        {
            return false;
        }

        if (file_put_contents($this->targetfile, $this->filter($content)) === false)
        {
            $this->output = _kt('Could not save content to file: ') . $this->targetfile;
            return false;
        }

        return true;
    }

    /**
     * Check that unzip is available
     *
     * @return string|null A message when unzip cannot be located, null otherwise
     */
    public function diagnose()
    {
        if (false === $this->unzip)
        {
            return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip);
        }
        return null;
    }

}
| 312 | + | ||
| 313 | +?> | ||
| 0 | \ No newline at end of file | 314 | \ No newline at end of file |
sql/mysql/upgrade/3.5.2/openxml_mime_types.sql
0 → 100644
-- Register the Office 2007 (Open XML) file types in mime_types.
-- Each insert takes the id that follows the current maximum of the table.
-- ifnull() keeps that id valid even when mime_types holds no rows yet.
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'word', 'Word Document');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'dotx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', 'word', 'Word Document');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'potx', 'application/vnd.openxmlformats-officedocument.presentationml.template', 'office', 'Powerpoint Presentation');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'ppsx', 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', 'office', 'Powerpoint Presentation');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'office', 'Powerpoint Presentation');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'excel', 'Excel Spreadsheet');
select @id:=ifnull(max(id),0)+1 from mime_types;
insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
(@id, 'xltx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', 'excel', 'Excel Spreadsheet');

-- Store the last id used above in zseq_mime_types (presumably the id sequence for mime_types - confirm).
update zseq_mime_types set id=@id;
| 0 | \ No newline at end of file | 24 | \ No newline at end of file |