Commit 74a1cff4b7dddb811b48341b312f4e0ffed9f338

Authored by Conrad Vermeulen
1 parent 58031816

KTS-1594

"Support for Office 2007 Documents"
Implemented.

Committed By: Conrad Vermeulen
Reviewed By: Megan Watson

git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@7992 c91229c3-7414-0410-bfa2-8a42b809f60b
search2/indexing/extractors/OpenXmlTextExtractor.inc.php 0 → 100644
  1 +<?php
  2 +
  3 +/**
  4 + * $Id:$
  5 + *
  6 + * KnowledgeTree Open Source Edition
  7 + * Document Management Made Simple
  8 + * Copyright (C) 2004 - 2008 The Jam Warehouse Software (Pty) Limited
  9 + *
  10 + * This program is free software; you can redistribute it and/or modify it under
  11 + * the terms of the GNU General Public License version 3 as published by the
  12 + * Free Software Foundation.
  13 + *
  14 + * This program is distributed in the hope that it will be useful, but WITHOUT
  15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  16 + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  17 + * details.
  18 + *
  19 + * You should have received a copy of the GNU General Public License
  20 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  21 + *
  22 + * You can contact The Jam Warehouse Software (Pty) Limited, Unit 1, Tramber Place,
  23 + * Blake Street, Observatory, 7925 South Africa. or email info@knowledgetree.com.
  24 + *
  25 + * The interactive user interfaces in modified source and object code versions
  26 + * of this program must display Appropriate Legal Notices, as required under
  27 + * Section 5 of the GNU General Public License version 3.
  28 + *
  29 + * In accordance with Section 7(b) of the GNU General Public License version 3,
  30 + * these Appropriate Legal Notices must retain the display of the "Powered by
  31 + * KnowledgeTree" logo and retain the original copyright notice. If the display of the
  32 + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
  33 + * must display the words "Powered by KnowledgeTree" and retain the original
  34 + * copyright notice.
  35 + * Contributor( s): ______________________________________
  36 + *
  37 + */
  38 +
  39 +class OpenXmlTextExtractor extends ExternalDocumentExtractor
  40 +{
  41 + public function __construct()
  42 + {
  43 + $config = KTConfig::getSingleton();
  44 +
  45 + $this->unzip = KTUtil::findCommand("import/unzip", 'unzip');
  46 + $this->unzip_params = $config->get('extractorParameters/unzip', '\'{source}\' \'{part}\' -d \'{target_dir}\'');
  47 + parent::__construct();
  48 + }
  49 +
  50 +
  51 + /**
  52 + * Basic function setting the display name
  53 + *
  54 + * @return string
  55 + */
  56 + public function getDisplayName()
  57 + {
  58 + return _kt('Open Xml Text Extractor');
  59 + }
  60 +
  61 + /**
  62 + * Return a list of all Office 2007 document types that are supported
  63 + *
  64 + * @return array
  65 + */
  66 + public function getSupportedMimeTypes()
  67 + {
  68 + return array(
  69 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  70 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
  71 + 'application/vnd.openxmlformats-officedocument.presentationml.template',
  72 + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
  73 + 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  74 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  75 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template'
  76 + );
  77 + }
  78 +
  79 + /**
  80 + * Trivial function to resolve if the document is word, excel, or power point
  81 + *
  82 + * @return array
  83 + */
  84 +
  85 + private function detectDocumentType()
  86 + {
  87 + $types = array(
  88 + 'docx' => array(
  89 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  90 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template'
  91 + ),
  92 + 'pptx' => array(
  93 + 'application/vnd.openxmlformats-officedocument.presentationml.template',
  94 + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
  95 + 'application/vnd.openxmlformats-officedocument.presentationml.presentation'),
  96 + 'xlsx' => array(
  97 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  98 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template'),
  99 +
  100 + );
  101 + foreach($types as $key=>$types)
  102 + {
  103 + if (in_array($this->mimetype, $types))
  104 + {
  105 + return $key;
  106 + }
  107 + }
  108 + }
  109 +
  110 + /**
  111 + * The open xml file comprises various file with different content. This function identifies
  112 + * which of those content types are worth indexing.
  113 + *
  114 + * @param string $openxml_type
  115 + * @param string $mime_type
  116 + * @return boolean
  117 + */
  118 + private function interestingParts($openxml_type, $mime_type)
  119 + {
  120 + $interest = array(
  121 + 'docx'=> array(
  122 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml',
  123 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml',
  124 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml',
  125 + 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml'),
  126 +
  127 + 'pptx' => array('application/vnd.openxmlformats-officedocument.presentationml.slide+xml'),
  128 + 'xlsx' => array(
  129 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml',
  130 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml',
  131 + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml',
  132 + 'application/vnd.openxmlformats-package.core-properties+xml'));
  133 + return in_array($mime_type, $interest[$openxml_type]);
  134 + }
  135 +
  136 + /**
  137 + * Returns a list of tokens that were identified by the [Content_Types].xml file. This file lists links to all parts of the document.
  138 + * We use interestingParts() above to identify which of these parts are interesting from a content perspective.
  139 + *
  140 + * @return array
  141 + */
  142 + private function getOpenXmlContentTypes()
  143 + {
  144 + $config = KTConfig::getSingleton();
  145 + $temp_dir = $config->get('urls/tmpDirectory');
  146 +
  147 + $time = 'openxml_'. time();
  148 + $this->openxml_dir = $temp_dir . '/' . $time;
  149 +
  150 + if (!mkdir($this->openxml_dir))
  151 + {
  152 + $this->output = _kt('Could not create folder: ') . $this->openxml_dir;
  153 + return false;
  154 + }
  155 +
  156 +
  157 + $cmd = $this->unzip . ' ' . str_replace(
  158 + array('{source}','{part}', '{target_dir}'),
  159 + array($this->sourcefile, '\[Content_Types\].xml',$this->openxml_dir), $this->unzip_params);
  160 +
  161 + if (!$this->exec($cmd))
  162 + {
  163 + $this->output = _kt('Failed to execute command: ') . $cmd;
  164 + return false;
  165 + }
  166 +
  167 + $filename = $this->openxml_dir . '/[Content_Types].xml';
  168 + if (!file_exists($filename))
  169 + {
  170 + $this->output = _kt('Failed to find file: ') . $filename;
  171 + return false;
  172 + }
  173 +
  174 + $xml_content = file_get_contents($filename);
  175 +
  176 + // once we have the content, we can cleanup!
  177 + @unlink($filename);
  178 +
  179 + // parse the file
  180 + $parser = xml_parser_create();
  181 + xml_parse_into_struct($parser, $xml_content, $vals, $index);
  182 + xml_parser_free($parser);
  183 +
  184 + return $vals;
  185 + }
  186 +
  187 + /**
  188 + * Extract the text from a file within the archive for a specific file.
  189 + *
  190 + * @param string $filename
  191 + * @return string
  192 + */
  193 + private function getContent($filename)
  194 + {
  195 + $config = KTConfig::getSingleton();
  196 +
  197 + if (substr($filename,0,1) == '/')
  198 + {
  199 + $filename = substr($filename,1);
  200 + }
  201 +
  202 + $cmd = $this->unzip . ' ' . str_replace(
  203 + array('{source}','{part}', '{target_dir}'),
  204 + array($this->sourcefile, $filename,$this->openxml_dir), $this->unzip_params);
  205 +
  206 + if (!$this->exec($cmd))
  207 + {
  208 + $this->output = _kt('Failed to execute command: ') . $cmd;
  209 + return false;
  210 + }
  211 +
  212 + $filename = $this->openxml_dir . "/$filename";
  213 + if (!file_exists($filename))
  214 + {
  215 + $this->output = _kt('Failed to open file: ') . $filename;
  216 + return false;
  217 + }
  218 +
  219 + $content = file_get_contents($filename);
  220 +
  221 + // cleanup
  222 + @unlink($filename);
  223 +
  224 + $content = preg_replace ("@(</?[^>]*>)+@", " ", $content);
  225 +
  226 + return $content;
  227 + }
  228 +
  229 +
  230 + /**
  231 + * Given the tokens in the [Content_Types].xml, extract the content
  232 + *
  233 + * @param array $vals
  234 + * @return string
  235 + */
  236 + function getOpenXmlText($vals)
  237 + {
  238 + $openxml_type = $this->detectDocumentType();
  239 +
  240 + $content = '';
  241 +
  242 + foreach($vals as $val)
  243 + {
  244 + if ($val['tag'] == 'OVERRIDE' && $val['type'] == 'complete')
  245 + {
  246 + if ($this->interestingParts($openxml_type, $val['attributes']['CONTENTTYPE']))
  247 + {
  248 + $filename = $val['attributes']['PARTNAME'];
  249 + $result = $this->getContent($filename);
  250 +
  251 + if ($result === false)
  252 + {
  253 + return false;
  254 + }
  255 +
  256 + $content .= $result;
  257 + }
  258 + }
  259 + }
  260 +
  261 + return $content;
  262 + }
  263 +
  264 + /**
  265 + * The main context extraction function
  266 + *
  267 + * @return bool
  268 + */
  269 +
  270 + public function extractTextContent()
  271 + {
  272 + $xml_content = $this->getOpenXmlContentTypes();
  273 +
  274 + if ($xml_content !== false)
  275 + {
  276 + $content = $this->getOpenXmlText($xml_content);
  277 +
  278 + if ($content !== false)
  279 + {
  280 + $result = file_put_contents($this->targetfile, $this->filter($content));
  281 +
  282 + if ($result === false)
  283 + {
  284 + $this->output = _kt('Could not save content to file: ') . $this->targetfile;
  285 + @rmdir($this->openxml_dir);
  286 + return false;
  287 + }
  288 + }
  289 + @rmdir($this->openxml_dir);
  290 + return true;
  291 + }
  292 +
  293 + return false;
  294 +
  295 + }
  296 +
  297 + /**
  298 + * Check that unzip is available
  299 + *
  300 + * @return boolean
  301 + */
  302 + public function diagnose()
  303 + {
  304 + if (false === $this->unzip)
  305 + {
  306 + return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip);
  307 + }
  308 + return null;
  309 + }
  310 +
  311 +}
  312 +
  313 +?>
0 314 \ No newline at end of file
... ...
sql/mysql/upgrade/3.5.2/openxml_mime_types.sql 0 → 100644
  1 +select @id:=max(id)+1 from mime_types;
  2 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  3 +(@id, 'docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'word', 'Word Document');
  4 +select @id:=max(id)+1 from mime_types;
  5 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  6 +(@id, 'dotx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', 'word', 'Word Document');
  7 +select @id:=max(id)+1 from mime_types;
  8 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  9 +(@id, 'potx', 'application/vnd.openxmlformats-officedocument.presentationml.template', 'office', 'Powerpoint Presentation');
  10 +select @id:=max(id)+1 from mime_types;
  11 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  12 +(@id, 'ppsx', 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', 'office', 'Powerpoint Presentation');
  13 +select @id:=max(id)+1 from mime_types;
  14 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  15 +(@id, 'pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'office', 'Powerpoint Presentation');
  16 +select @id:=max(id)+1 from mime_types;
  17 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  18 +(@id, 'xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'excel', 'Excel Spreadsheet');
  19 +select @id:=max(id)+1 from mime_types;
  20 +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values
  21 +(@id, 'xltx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', 'excel', 'Excel Spreadsheet');
  22 +
  23 +update zseq_mime_types set id=@id
0 24 \ No newline at end of file
... ...