diff --git a/search2/indexing/extractors/OpenXmlTextExtractor.inc.php b/search2/indexing/extractors/OpenXmlTextExtractor.inc.php new file mode 100644 index 0000000..6176d75 --- /dev/null +++ b/search2/indexing/extractors/OpenXmlTextExtractor.inc.php @@ -0,0 +1,313 @@ +. + * + * You can contact The Jam Warehouse Software (Pty) Limited, Unit 1, Tramber Place, + * Blake Street, Observatory, 7925 South Africa. or email info@knowledgetree.com. + * + * The interactive user interfaces in modified source and object code versions + * of this program must display Appropriate Legal Notices, as required under + * Section 5 of the GNU General Public License version 3. + * + * In accordance with Section 7(b) of the GNU General Public License version 3, + * these Appropriate Legal Notices must retain the display of the "Powered by + * KnowledgeTree" logo and retain the original copyright notice. If the display of the + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices + * must display the words "Powered by KnowledgeTree" and retain the original + * copyright notice. + * Contributor( s): ______________________________________ + * + */ + +class OpenXmlTextExtractor extends ExternalDocumentExtractor +{ + public function __construct() + { + $config = KTConfig::getSingleton(); + + $this->unzip = KTUtil::findCommand("import/unzip", 'unzip'); + $this->unzip_params = $config->get('extractorParameters/unzip', '\'{source}\' \'{part}\' -d \'{target_dir}\''); + parent::__construct(); + } + + + /** + * Basic function setting the display name + * + * @return string + */ + public function getDisplayName() + { + return _kt('Open Xml Text Extractor'); + } + + /** + * Return a list of all Office 2007 document types that are supported + * + * @return array + */ + public function getSupportedMimeTypes() + { + return array( + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', + 'application/vnd.openxmlformats-officedocument.presentationml.template', + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template' + ); + } + + /** + * Trivial function to resolve if the document is word, excel, or power point + * + * @return array + */ + + private function detectDocumentType() + { + $types = array( + 'docx' => array( + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template' + ), + 'pptx' => array( + 'application/vnd.openxmlformats-officedocument.presentationml.template', + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation'), + 'xlsx' => array( + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template'), + + ); + foreach($types as $key=>$types) + { + if (in_array($this->mimetype, $types)) + { + return $key; + } + } + } + + /** + * The open xml file comprises various file with different content. This function identifies + * which of those content types are worth indexing. + * + * @param string $openxml_type + * @param string $mime_type + * @return boolean + */ + private function interestingParts($openxml_type, $mime_type) + { + $interest = array( + 'docx'=> array( + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml'), + + 'pptx' => array('application/vnd.openxmlformats-officedocument.presentationml.slide+xml'), + 'xlsx' => array( + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml', + 'application/vnd.openxmlformats-package.core-properties+xml')); + return in_array($mime_type, $interest[$openxml_type]); + } + + /** + * Returns a list of tokens that were identified by the [Content_Types].xml file. This file lists links to all parts of the document. + * We use interestingParts() above to identify which of these parts are interesting from a content perspective. + * + * @return array + */ + private function getOpenXmlContentTypes() + { + $config = KTConfig::getSingleton(); + $temp_dir = $config->get('urls/tmpDirectory'); + + $time = 'openxml_'. time(); + $this->openxml_dir = $temp_dir . '/' . $time; + + if (!mkdir($this->openxml_dir)) + { + $this->output = _kt('Could not create folder: ') . $this->openxml_dir; + return false; + } + + + $cmd = $this->unzip . ' ' . str_replace( + array('{source}','{part}', '{target_dir}'), + array($this->sourcefile, '\[Content_Types\].xml',$this->openxml_dir), $this->unzip_params); + + if (!$this->exec($cmd)) + { + $this->output = _kt('Failed to execute command: ') . $cmd; + return false; + } + + $filename = $this->openxml_dir . '/[Content_Types].xml'; + if (!file_exists($filename)) + { + $this->output = _kt('Failed to find file: ') . $filename; + return false; + } + + $xml_content = file_get_contents($filename); + + // once we have the content, we can cleanup! + @unlink($filename); + + // parse the file + $parser = xml_parser_create(); + xml_parse_into_struct($parser, $xml_content, $vals, $index); + xml_parser_free($parser); + + return $vals; + } + + /** + * Extract the text from a file within the archive for a specific file. + * + * @param string $filename + * @return string + */ + private function getContent($filename) + { + $config = KTConfig::getSingleton(); + + if (substr($filename,0,1) == '/') + { + $filename = substr($filename,1); + } + + $cmd = $this->unzip . ' ' . str_replace( + array('{source}','{part}', '{target_dir}'), + array($this->sourcefile, $filename,$this->openxml_dir), $this->unzip_params); + + if (!$this->exec($cmd)) + { + $this->output = _kt('Failed to execute command: ') . $cmd; + return false; + } + + $filename = $this->openxml_dir . "/$filename"; + if (!file_exists($filename)) + { + $this->output = _kt('Failed to open file: ') . $filename; + return false; + } + + $content = file_get_contents($filename); + + // cleanup + @unlink($filename); + + $content = preg_replace ("@(]*>)+@", " ", $content); + + return $content; + } + + + /** + * Given the tokens in the [Content_Types].xml, extract the content + * + * @param array $vals + * @return string + */ + function getOpenXmlText($vals) + { + $openxml_type = $this->detectDocumentType(); + + $content = ''; + + foreach($vals as $val) + { + if ($val['tag'] == 'OVERRIDE' && $val['type'] == 'complete') + { + if ($this->interestingParts($openxml_type, $val['attributes']['CONTENTTYPE'])) + { + $filename = $val['attributes']['PARTNAME']; + $result = $this->getContent($filename); + + if ($result === false) + { + return false; + } + + $content .= $result; + } + } + } + + return $content; + } + + /** + * The main context extraction function + * + * @return bool + */ + + public function extractTextContent() + { + $xml_content = $this->getOpenXmlContentTypes(); + + if ($xml_content !== false) + { + $content = $this->getOpenXmlText($xml_content); + + if ($content !== false) + { + $result = file_put_contents($this->targetfile, $this->filter($content)); + + if ($result === false) + { + $this->output = _kt('Could not save content to file: ') . $this->targetfile; + @rmdir($this->openxml_dir); + return false; + } + } + @rmdir($this->openxml_dir); + return true; + } + + return false; + + } + + /** + * Check that unzip is available + * + * @return boolean + */ + public function diagnose() + { + if (false === $this->unzip) + { + return sprintf(_kt("Cannot locate unzip: %s."), $this->unzip); + } + return null; + } + +} + +?> \ No newline at end of file diff --git a/sql/mysql/upgrade/3.5.2/openxml_mime_types.sql b/sql/mysql/upgrade/3.5.2/openxml_mime_types.sql new file mode 100644 index 0000000..9516766 --- /dev/null +++ b/sql/mysql/upgrade/3.5.2/openxml_mime_types.sql @@ -0,0 +1,23 @@ +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'word', 'Word Document'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'dotx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.template', 'word', 'Word Document'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'potx', 'application/vnd.openxmlformats-officedocument.presentationml.template', 'office', 'Powerpoint Presentation'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'ppsx', 'application/vnd.openxmlformats-officedocument.presentationml.slideshow', 'office', 'Powerpoint Presentation'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation', 'office', 'Powerpoint Presentation'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'excel', 'Excel Spreadsheet'); +select @id:=max(id)+1 from mime_types; +insert into mime_types(id, filetypes, mimetypes, icon_path, friendly_name) values +(@id, 'xltx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.template', 'excel', 'Excel Spreadsheet'); + +update zseq_mime_types set id=@id \ No newline at end of file