Commit 319ec2a8d18e3603ae908a49005d516b317eb431
1 parent
5df9da16
Merged in from DEV trunk...
KTS-673 "The search algorithm needs some work" Updated. Removed internal metadata references. Committed By: Conrad Vermeulen. Reviewed By: Kevin Fourie.
KTS-2529 "Test open office document extractor" Fixed. Extractor script was not taking host and port correctly. Committed By: Conrad Vermeulen. Reviewed By: Kevin Fourie.
KTS-673 "The search algorithm needs some work" Updated. Removed experimental indexer. Committed By: Conrad Vermeulen. Reviewed By: Kevin Fourie.
KTS-2525 "Create windows service to wrap around scheduler" Updated. Changed bat file to dos filetype. Committed By: Kevin Fourie. Reviewed By: Conrad Vermeulen.
git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/STABLE/trunk@7462 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 4 changed files with 10 additions and 106 deletions.
bin/openoffice/DocumentConverter.py
| @@ -80,7 +80,7 @@ def _unoProps(**args): | @@ -80,7 +80,7 @@ def _unoProps(**args): | ||
| 80 | 80 | ||
| 81 | class DocumentConverter: | 81 | class DocumentConverter: |
| 82 | 82 | ||
| 83 | - def __init__(self, host=argv[3], port=argv[4]): | 83 | + def __init__(self, host, port): |
| 84 | localContext = uno.getComponentContext() | 84 | localContext = uno.getComponentContext() |
| 85 | resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext) | 85 | resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext) |
| 86 | try: | 86 | try: |
| @@ -133,7 +133,7 @@ if __name__ == "__main__": | @@ -133,7 +133,7 @@ if __name__ == "__main__": | ||
| 133 | exit(255) | 133 | exit(255) |
| 134 | 134 | ||
| 135 | try: | 135 | try: |
| 136 | - converter = DocumentConverter() | 136 | + converter = DocumentConverter(argv[3],argv[4]) |
| 137 | converter.convert(argv[1], argv[2]) | 137 | converter.convert(argv[1], argv[2]) |
| 138 | except DocumentConversionException, exception: | 138 | except DocumentConversionException, exception: |
| 139 | print "ERROR! " + str(exception) | 139 | print "ERROR! " + str(exception) |
bin/win32/taskrunner.bat
search2/indexing/extractors/ExifExtractor.inc.php
| @@ -27,6 +27,11 @@ class ExifExtractor extends DocumentExtractor | @@ -27,6 +27,11 @@ class ExifExtractor extends DocumentExtractor | ||
| 27 | // no point indexing numeric content. it will be ignored anyways! | 27 | // no point indexing numeric content. it will be ignored anyways! |
| 28 | continue; | 28 | continue; |
| 29 | } | 29 | } |
| 30 | + if ($key =='FILE' && in_array($name, array('MimeType', 'SectionsFound'))) | ||
| 31 | + { | ||
| 32 | + continue; | ||
| 33 | + } | ||
| 34 | + | ||
| 30 | $content .= "$val\n"; | 35 | $content .= "$val\n"; |
| 31 | } | 36 | } |
| 32 | } | 37 | } |
search2/indexing/extractors/OOPDFTextExtractor.inc.php deleted
| 1 | -<?php | ||
| 2 | - | ||
| 3 | -require_once('PDFExtractor.inc.php'); | ||
| 4 | -require_once('OOTextExtractor.inc.php'); | ||
| 5 | - | ||
| 6 | -class OOPDFTextExtractor extends CompositeExtractor | ||
| 7 | -{ | ||
| 8 | - public function __construct() | ||
| 9 | - { | ||
| 10 | - parent::__construct(new OOTextExtractor('application/pdf'),'pdf','application/pdf',new PDFExtractor(), true); | ||
| 11 | - } | ||
| 12 | - | ||
| 13 | - public function getSupportedMimeTypes() | ||
| 14 | - { | ||
| 15 | - // we provide this so diagnose doesn't fail | ||
| 16 | - return array(); | ||
| 17 | - } | ||
| 18 | - | ||
| 19 | - public function getDisplayName() | ||
| 20 | - { | ||
| 21 | - // we provide this so diagnose doesn't fail | ||
| 22 | - throw new Exception(_kt('This should be overriden')); | ||
| 23 | - } | ||
| 24 | - | ||
| 25 | -} | ||
| 26 | - | ||
| 27 | -/* | ||
| 28 | -class OOPDFTextExtractor extends DocumentExtractor | ||
| 29 | -{ | ||
| 30 | - | ||
| 31 | - private $pdf2txt; | ||
| 32 | - | ||
| 33 | - | ||
| 34 | - private $text2pdf; | ||
| 35 | - | ||
| 36 | - public function __construct() | ||
| 37 | - { | ||
| 38 | - $this->pdf2txt = new PDFExtractor(); | ||
| 39 | - $this->text2pdf = new OOTextExtractor(); | ||
| 40 | - } | ||
| 41 | - | ||
| 42 | - public function needsIntermediateSourceFile() | ||
| 43 | - { | ||
| 44 | - // we need the intermediate file because it | ||
| 45 | - // has the correct extension. jodconverter uses the extension to determine mimetype | ||
| 46 | - return true; | ||
| 47 | - } | ||
| 48 | - | ||
| 49 | - public function getDisplayName() | ||
| 50 | - { | ||
| 51 | - throw new Exception('This should be overriden'); | ||
| 52 | - } | ||
| 53 | - | ||
| 54 | - public function getSupportedMimeTypes() | ||
| 55 | - { | ||
| 56 | - return array(); | ||
| 57 | - } | ||
| 58 | - | ||
| 59 | - public function extractTextContent() | ||
| 60 | - { | ||
| 61 | - $pdffile = $this->targetfile . '.pdf'; | ||
| 62 | - | ||
| 63 | - $this->text2pdf->setSourceFile($this->sourcefile); | ||
| 64 | - $this->text2pdf->setTargetFile($pdffile); | ||
| 65 | - $this->text2pdf->setMimeType($this->mimetype); | ||
| 66 | - $this->text2pdf->setExtension($this->extension); | ||
| 67 | - if ($this->extractTextContent()) | ||
| 68 | - { | ||
| 69 | - return false; | ||
| 70 | - } | ||
| 71 | - | ||
| 72 | - $this->pdf2txt->setSourceFile($pdffile); | ||
| 73 | - $this->pdf2txt->setTargetFile($this->targetfile); | ||
| 74 | - $this->pdf2txt->setMimeType('application/pdf'); | ||
| 75 | - $this->pdf2txt->setExtension('pdf'); | ||
| 76 | - $result = $this->pdf2txt->extractTextContent(); | ||
| 77 | - | ||
| 78 | - unlink(@$pdffile); | ||
| 79 | - | ||
| 80 | - return $result; | ||
| 81 | - } | ||
| 82 | - | ||
| 83 | - public function diagnose() | ||
| 84 | - { | ||
| 85 | - $diagnosis = $this->pdf2txt->diagnose(); | ||
| 86 | - if (!empty($diagnosis)) | ||
| 87 | - { | ||
| 88 | - return $diagnosis; | ||
| 89 | - } | ||
| 90 | - | ||
| 91 | - $diagnosis = $this->text2pdf->diagnose(); | ||
| 92 | - if (!empty($diagnosis)) | ||
| 93 | - { | ||
| 94 | - return $diagnosis; | ||
| 95 | - } | ||
| 96 | - | ||
| 97 | - return null; | ||
| 98 | - } | ||
| 99 | -} */ | ||
| 100 | - | ||
| 101 | -?> | ||
| 102 | \ No newline at end of file | 0 | \ No newline at end of file |