From 4f809981a4726dd0053e7670ca28066005f21e94 Mon Sep 17 00:00:00 2001 From: Megan Watson Date: Thu, 9 Oct 2008 07:39:53 +0000 Subject: [PATCH] KTS-3806 "The number of documents in the indexing queue is incorrect on the Document Indexer Statistics dashlet" Fixed. Adjusted sql to ignore problem documents. --- search2/indexing/indexerCore.inc.php | 3724 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------ 1 file changed, 1862 insertions(+), 1862 deletions(-) diff --git a/search2/indexing/indexerCore.inc.php b/search2/indexing/indexerCore.inc.php index 48d9a19..7f63090 100755 --- a/search2/indexing/indexerCore.inc.php +++ b/search2/indexing/indexerCore.inc.php @@ -1,1862 +1,1862 @@ -. - * - * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco, - * California 94120-7775, or email info@knowledgetree.com. - * - * The interactive user interfaces in modified source and object code versions - * of this program must display Appropriate Legal Notices, as required under - * Section 5 of the GNU General Public License version 3. - * - * In accordance with Section 7(b) of the GNU General Public License version 3, - * these Appropriate Legal Notices must retain the display of the "Powered by - * KnowledgeTree" logo and retain the original copyright notice. If the display of the - * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices - * must display the words "Powered by KnowledgeTree" and retain the original - * copyright notice. - * Contributor( s): ______________________________________ - * - */ - -define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/'); -require_once('indexing/extractorCore.inc.php'); -require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php'); -require_once(KT_DIR . '/ktapi/ktapi.inc.php'); - -class IndexerInconsistencyException extends Exception {}; - -// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing... 
- -class QueryResultItem -{ - protected $id; - protected $title; - protected $rank; - protected $text; - protected $fullpath; - - public function __construct($id, $title, $rank, $text, $fullpath) - { - $this->id = $id; - $this->title = $title; - $this->rank = $rank; - $this->text = $text; - $this->fullpath = $fullpath; - } - - public function getId() { return $this->id; } - public function getRealId() { return $this->id; } - - public function getIsProxy() { return $this instanceof ProxyResultItem; } - public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder' ; } - public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document' ; } - - public function setRank($value) - { - $this->rank = number_format($value,2,'.',','); - } - - public function getIsLive() - { - return true; - } - - public function setTitle($value) - { - $this->title = $value; - } - - public function setText($value) - { - $this->text = $value; - } - - public function getRelevance() { return (float) $this->rank; } - public function getRank() { return $this->getRelevance(); } - public function getText() { return (string) $this->text; } - public function getTitle() { return (string) $this->title; } - public function getFullPath() { return (string) $this->fullpath; } - - protected function __get($property) - { - if (empty($property)) - { - return ''; - } - - $method = 'get' . $property; - if (method_exists($this, $method)) - { - return $this->$method(); - } - return $this->getUnknown(); - } - - protected function getUnknown() - { - return _kt('n/a'); - } - - protected function __set($property, $value) - { - if (empty($property)) - { - return ''; - } - - $method = 'set' . 
$property; - if (method_exists($this, $method)) - { - return $this->$method($value); - } - throw new Exception("Unknown property '$property' to set on QueryResultItem"); - } -} - -class ProxyResultItem extends QueryResultItem -{ - protected $proxy; - protected $proxyId; - - public function __construct($proxyId, $proxy) - { - parent::__construct($proxyId, $proxy->getTitle, $proxy->getRank(), $proxy->getText(), $proxy->getFullPath()); - $this->proxyId = $proxyId; - $this->proxy = $proxy; - } - - public function getId() { return $this->proxyId; } - public function getTitle() { return $this->proxy->getTitle(); } - public function getRealId() { return $this->proxy->getId(); } - - protected function __get($property) - { - $method = 'get' . $property; - - if (method_exists($this, $method)) - { - return $this->$method(); - } - else - { - return $this->proxy->$method(); - } - } - - protected function __set($property, $value) - { - $method = 'set' . $property; - if (method_exists($this, $method)) - { - return $this->$method($value); - } - else - { - return $this->proxy->$method($value); - } - } -} - -class DocumentResultItem extends QueryResultItem -{ - protected $filesize; - protected $live; - protected $version; - protected $mimeType; - protected $filename; - protected $thumbnail; // TODO: if not null, gui can display a thumbnail - protected $viewer; // TODO: if not null, a viewer can be used to view the document - protected $document; - protected $checkedOutUser; - protected $dateCheckedout; - protected $workflowState; - protected $workflow; - protected $modifiedBy; - protected $dateModified; - protected $createdBy; - protected $dateCreated; - protected $owner; - protected $immutable; - protected $deleted; - protected $status; - protected $folderId; - protected $storagePath; - protected $documentType; - protected $mimeIconPath; - protected $mimeDisplay; - protected $oemDocumentNo; - - public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath 
= null) - { - parent::__construct($document_id, $title, $rank, $text, $fullpath); - $this->live = true; - $this->loadDocumentInfo(); - } - - // TODO: this is bad. must refactor to do the query on the group of documents. - public function loadDocumentInfo() - { - global $default; - $sql = "SELECT - d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version, - dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate, - mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified, - cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type, - mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title - FROM - documents d - INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id - INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id - INNER JOIN mime_types mt ON dcv.mime_id=mt.id - LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id - LEFT JOIN folders f ON f.id=d.folder_id - LEFT JOIN users cou ON d.checked_out_user_id=cou.id - LEFT JOIN workflows w ON dmv.workflow_id=w.id - LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id - LEFT JOIN mime_documents md ON mt.mime_document_id = md.id - LEFT JOIN users mbu ON d.modified_user_id=mbu.id - LEFT JOIN users cbu ON d.creator_id=cbu.id - LEFT JOIN users ou ON d.owner_id=ou.id - WHERE - d.id=$this->id"; - - $result = DBUtil::getOneResult($sql); - - if (PEAR::isError($result) || empty($result)) - { - $this->live = false; - if (PEAR::isError($result)) - { - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage()); - } - - $default->log->error('QueryResultItem: $result is null'); - $msg = 'The database did not have a record matching the result from the document indexer. 
This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.'; - $default->log->error('QueryResultItem: ' . $msg); - // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository - throw new IndexerInconsistencyException(_kt($msg)); - } - - // document_id, relevance, text, title - - $this->documentType = $result['document_type']; - $this->filename=$result['filename']; - $this->filesize = KTUtil::filesizeToString($result['filesize']); - $this->folderId = $result['folder_id']; - $this->title = $result['title']; - - $this->createdBy = $result['createdbyuser']; - $this->dateCreated = $result['created']; - - $this->modifiedBy = $result['modifiedbyuser']; - $this->dateModified = $result['modified']; - - $this->checkedOutUser = $result['checkoutuser']; - $this->dateCheckedout = $result['checkedout']; - - $this->owner = $result['owneruser']; - - $this->version = $result['major_version'] . '.' . 
$result['minor_version']; - - $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):''; - - $this->workflow = $result['workflow']; - $this->workflowState = $result['workflowstate']; - - $this->oemDocumentNo = $result['oem_no']; - if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a'; - - if (is_null($result['name'])) - { - $this->fullpath = '(orphaned)'; - } - else - { - $this->fullpath = $result['full_path']; - } - - $this->mimeType = $result['mimetype']; - $this->mimeIconPath = $result['mime_icon_path']; - if (empty($this->mimeIconPath)) - { - $this->mimeIconPath = 'unspecified_type'; - } - $this->mimeDisplay = $result['mime_display']; - - $this->storagePath = $result['storage_path']; - $this->status = Document::getStatusString($result['status_id']); - } - - public function getDocumentID() { return $this->getId(); } - public function getIsLive() { return (bool) $this->live; } - public function getFilesize() { return $this->filesize; } - public function getVersion() { return (string) $this->version; } - public function getFilename() { return (string)$this->filename; } - public function getFolderId() { return (int)$this->folderId; } - public function getOemDocumentNo() { return (string) $this->oemDocumentNo; } - public function getDocument() { return Document::get($this->id); } - public function getIsAvailable() { return $this->Document->isLive(); } - public function getCheckedOutUser() { return (string) $this->checkedOutUser; } - public function getCheckedOutByr() { return $this->getCheckedOutUser(); } - public function getWorkflowOnly() { return (string)$this->workflow; } - public function getWorkflow() { return $this->getWorkflow(); } - public function getWorkflowStateOnly() { return (string)$this->workflowState; } - public function getWorkflowState() { return $this->getWorkflowStateOnly(); } - public function getWorkflowAndState() { - if (is_null($this->workflow)) - { - return ''; - } - return "$this->workflow - $this->workflowState"; - } - 
public function getMimeType() { return (string) $this->mimeType; } - public function getMimeIconPath() { return (string) $this->mimeIconPath; } - public function getMimeDisplay() { return (string) $this->mimeDisplay; } - public function getDateCheckedOut() { return (string) $this->dateCheckedout; } - public function getModifiedBy() { return (string) $this->modifiedBy; } - public function getDateModified() { return (string) $this->dateModified; } - public function getCreatedBy() { return (string) $this->createdBy; } - public function getDateCreated() { return (string) $this->dateCreated; } - public function getOwner() { return (string) $this->owner; } - public function getOwnedBy() { return $this->getOwner(); } - public function getIsImmutable() { return (bool) $this->immutable; } - public function getImmutable() { return $this->getIsImmutable(); } - public function getStatus() { return $this->status; } - public function getStoragePath() { return $this->storagePath; } - public function getDocumentType() { return $this->documentType; } - public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); } - public function getCanBeReadByUser() { - if (!$this->live) - return false; - if (Permission::userHasDocumentReadPermission($this->Document)) - return true; - if (Permission::adminIsInAdminMode()) - return true; - return false; - } -} - -class FolderResultItem extends QueryResultItem -{ - protected $folder; - protected $createdBy; - protected $parentId; - - public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null) - { - parent::__construct($folder_id, $title, $rank, $text, $fullpath); - $this->loadFolderInfo(); - } - - public function getFolderID() { return $this->getId(); } - public function getParentID() { return $this->parentId; } - public function getCreatedBy() { return $this->createdBy; } - public function getMimeIconPath() { return 'folder'; } - public function getFolder() { return 
Folder::get($this->getFolderID()); } - public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); } - - public function loadFolderInfo() - { - global $default; - $folder = $this->getFolder(); - if (PEAR::isError($folder)) - { - throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage()); - } - $this->title = $folder->getName(); - $this->fullpath = '/' . $folder->getFullPath(); - $this->parentId = $folder->getParentId(); - - $user = User::get($folder->getCreatorID()); - $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName(); - } - -} - -class DocumentShortcutResultItem extends ProxyResultItem -{ - public function getDocumentID() { return $this->getId(); } - public function getMimeIconPath() { return $this->proxy->getMimeIconPath() . '_shortcut'; } - -} - -class FolderShortcutResultItem extends ProxyResultItem -{ - public function getFolderID() { return $this->getId(); } - public function getMimeIconPath() { return 'folder_shortcut'; } - -} - -function MatchResultCompare($a, $b) -{ - if ($a->Rank == $b->Rank) { - return 0; - } - return ($a->Rank < $b->Rank) ? -1 : 1; -} - -abstract class Indexer -{ - /** - * Cache of extractors - * - * @var array - */ - private $extractorCache; - - /** - * Indicates if the indexer will do logging. - * - * @var boolean - */ - private $debug; - /** - * Cache on mime related hooks - * - * @var unknown_type - */ - private $mimeHookCache; - /** - * Cache on general hooks. - * - * @var array - */ - private $generalHookCache; - - /** - * This is a path to the extractors. - * - * @var string - */ - private $extractorPath; - /** - * This is a path to the hooks. 
- * - * @var string - */ - private $hookPath; - - private $enabledExtractors; - - /** - * Initialise the indexer - * - */ - protected function __construct() - { - $config = KTConfig::getSingleton(); - - $this->extractorCache = array(); - $this->debug = $config->get('indexer/debug', true); - $this->hookCache = array(); - $this->generalHookCache = array(); - $this->extractorPath = $config->get('indexer/extractorPath', 'extractors'); - $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks'); - - $this->loadExtractorStatus(); - } - - /** - * Get the list if enabled extractors - * - */ - private function loadExtractorStatus() - { - $sql = "SELECT id, name FROM mime_extractors WHERE active=1"; - $rs = DBUtil::getResultArray($sql); - $this->enabledExtractors = array(); - foreach($rs as $item) - { - $this->enabledExtractors[] = $item['name']; - } - } - - private function isExtractorEnabled($extractor) - { - return in_array($extractor, $this->enabledExtractors); - } - - /** - * Returns a reference to the main class - * - * @return Indexer - */ - public static function get() - { - static $singleton = null; - - if (is_null($singleton)) - { - $config = KTConfig::getSingleton(); - $classname = $config->get('indexer/coreClass'); - - require_once('indexing/indexers/' . $classname . '.inc.php'); - - if (!class_exists($classname)) - { - throw new Exception("Class '$classname' does not exist."); - } - - $singleton = new $classname; - } - - return $singleton; - } - - public abstract function deleteDocument($docid); - - /** - * Remove the association of all extractors to mime types on the database. - * - */ - public function clearExtractors() - { - global $default; - - $sql = "update mime_types set extractor_id=null"; - DBUtil::runQuery($sql); - - $sql = "delete from mime_extractors"; - DBUtil::runQuery($sql); - - if ($this->debug) $default->log->debug('clearExtractors'); - } - - /** - * lookup the name of the extractor class based on the mime type. 
- * - * @param string $type - * @return string - */ - public static function resolveExtractor($type) - { - global $default; - $sql = "select extractor from mime_types where filetypes='$type'"; - $class = DBUtil::getOneResultKey($sql,'extractor'); - if (PEAR::isError($class)) - { - $default->log->error("resolveExtractor: cannot resolve $type"); - return $class; - } - if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type)); - return $class; - } - - /** - * Return all the discussion text. - * - * @param int $docid - * @return string - */ - public static function getDiscussionText($docid) - { - $sql = "SELECT - dc.subject, dc.body - FROM - discussion_threads dt - INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id - WHERE - dt.document_id=$docid"; - $result = DBUtil::getResultArray($sql); - $text = ''; - - foreach($result as $record) - { - $text .= $record['subject'] . "\n" . $record['body'] . "\n"; - } - - return $text; - } - - /** - * Schedule the indexing of a document. 
- * - * @param string $document - * @param string $what - */ - public static function index($document, $what='A') - { - global $default; - - if (is_numeric($document)) - { - $document = Document::get($document+0); - } - - if (PEAR::isError($document)) - { - $default->log->error("index: Could not index document: " .$document->getMessage()); - return; - } - - $document_id = $document->getId(); - $userid=$_SESSION['userID']; - if (empty($userid)) $userid=1; - - // we dequeue the document so that there are no issues when enqueuing - Indexer::unqueueDocument($document_id); - - // enqueue item - $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')"; - DBUtil::runQuery($sql); - - $default->log->debug("index: Queuing indexing of $document_id"); - - } - - private static function incrementCount() - { - // Get count from system settings - $count = Indexer::getIndexedDocumentCount(); - $count = (int)$count + 1; - Indexer::updateIndexedDocumentCount($count); - } - - public static function getIndexedDocumentCount() - { - $count = KTUtil::getSystemSetting('indexedDocumentCount', 0); - return (int) $count; - } - - public static function updateIndexedDocumentCount($cnt = 0) - { - KTUtil::setSystemSetting('indexedDocumentCount', $cnt); - } - - public static function reindexQueue() - { - $sql = "UPDATE index_files SET processdate = null"; - DBUtil::runQuery($sql); - } - - public static function reindexDocument($documentId) - { - $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId"; - DBUtil::runQuery($sql); - } - - - - public static function indexAll() - { - $userid=$_SESSION['userID']; - if (empty($userid)) $userid=1; - - $sql = "DELETE FROM index_files"; - DBUtil::runQuery($sql); - - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)"; - DBUtil::runQuery($sql); - } - - public 
static function indexFolder($folder) - { - $userid=$_SESSION['userID']; - if (empty($userid)) $userid=1; - - if (!$folder instanceof Folder && !$folder instanceof FolderProxy) - { - throw new Exception('Folder expected'); - } - - $full_path = $folder->getFullPath(); - - $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)"; - DBUtil::runQuery($sql); - } - - /** - * Clearout the scheduling of documents that no longer exist. - * - */ - public static function clearoutDeleted() - { - global $default; - - $sql = 'DELETE FROM - index_files - WHERE - document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR - NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)'; - DBUtil::runQuery($sql); - - $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted"); - } - - - /** - * Check if a document is scheduled to be indexed - * - * @param mixed $document This may be a document or document id - * @return boolean - */ - public static function isDocumentScheduled($document) - { - if (is_numeric($document)) - { - $docid = $document; - } - else if ($document instanceof Document) - { - $docid = $document->getId(); - } - else - { - return false; - } - $sql = "SELECT 1 FROM index_files WHERE document_id=$docid"; - $result = DBUtil::getResultArray($sql); - return count($result) > 0; - } - - /** - * Filters text removing redundant characters such as continuous newlines and spaces. - * - * @param string $filename - */ - private function filterText($filename) - { - $content = file_get_contents($filename); - - $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])'); - $tgt = array("\n","\n",' ',' ',' '); - - // shrink what is being stored. 
- do - { - $orig = $content; - $content = preg_replace($src, $tgt, $content); - } while ($content != $orig); - - return file_put_contents($filename, $content) !== false; - } - - /** - * Load hooks for text extraction process. - * - */ - private function loadExtractorHooks() - { - $this->generalHookCache = array(); - $this->mimeHookCache = array(); - - - $dir = opendir(SearchHelper::correctPath($this->hookPath)); - while (($file = readdir($dir)) !== false) - { - if (substr($file,-12) == 'Hook.inc.php') - { - require_once($this->hookPath . '/' . $file); - $class = substr($file, 0, -8); - - if (!class_exists($class)) - { - continue; - } - - $hook = new $class; - if (!($class instanceof ExtractorHook)) - { - continue; - } - - $mimeTypes = $hook->registerMimeTypes(); - if (is_null($mimeTypes)) - { - $this->generalHookCache[] = & $hook; - } - else - { - foreach($mimeTypes as $type) - { - $this->mimeHookCache[$type][] = & $hook; - } - } - - } - } - closedir($dir); - } - - /** - * This is a refactored function to execute the hooks. - * - * @param DocumentExtractor $extractor - * @param string $phase - * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general. - */ - private function executeHook($extractor, $phase, $mimeType = null) - { - $hooks = array(); - if (is_null($mimeType)) - { - $hooks = $this->generalHookCache; - } - else - { - if (array_key_exists($mimeType, $this->mimeHookCache)) - { - $hooks = $this->mimeHookCache[$mimeType]; - } - } - if (empty($hooks)) - { - return; - } - - foreach($hooks as $hook) - { - $hook->$phase($extractor); - } - } - - private function doesDiagnosticsPass($simple=false) - { - global $default; - - $config =& KTConfig::getSingleton(); - // create a index log lock file in case there are errors, and we don't need to log them forever! - // this function will create the lockfile if an error is detected. It will be removed as soon - // as the problems with the indexer are removed. 
- $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock'; - - $diagnosis = $this->diagnose(); - if (!is_null($diagnosis)) - { - if (!is_file($lockFile)) - { - $default->log->error(_kt('Indexer problem: ') . $diagnosis); - } - touch($lockFile); - return false; - } - - if ($simple) - { - return true; - } - - $diagnosis = $this->diagnoseExtractors(); - if (!empty($diagnosis)) - { - if (!is_file($lockFile)) - { - foreach($diagnosis as $diag) - { - $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis'])); - } - } - touch($lockFile); - return false; - } - - if (is_file($lockFile)) - { - $default->log->info(_kt('Issues with the indexer have been resolved!')); - unlink($lockFile); - } - - return true; - } - - /** - * This does the initial mime type association between mime types and text extractors - * - */ - public function checkForRegisteredTypes() - { - global $default; - - // we are only doing this once! - $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false); - if ($initRegistered) - { - return; - } - if ($this->debug) $default->log->debug('checkForRegisteredTypes: start'); - - $date = date('Y-m-d H:i'); - $sql = "UPDATE scheduler_tasks SET run_time='$date'"; - DBUtil::runQuery($sql); - - $this->registerTypes(true); - - $disable = array( - 'windows'=>array('PSExtractor'), - 'unix' => array() - ); - - $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix']; - - if (!empty($disableForOS)) - { - $disableForOS = '\'' . 
implode("','", $disableForOS) .'\''; - - $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)"; - DBUtil::runQuery($sql); - $default->log->info("checkForRegisteredTypes: disabled '$extractor'"); - } - $this->loadExtractorStatus(); - - if ($this->debug) $default->log->debug('checkForRegisteredTypes: done'); - KTUtil::setSystemSetting('mimeTypesRegistered', true); - } - - private function updatePendingDocumentStatus($documentId, $message, $level) - { - $this->indexingHistory .= "\n" . $level . ': ' . $message; - $message = sanitizeForSQL($this->indexingHistory); - $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId"; - DBUtil::runQuery($sql); - } - - private $restartCurrentBatch = false; - - public function restartBatch() - { - $this->restartCurrentBatch = true; - } - - /** - * - * @param int $documentId - * @param string $message - * @param string $level This may be info, error, debug - */ - private function logPendingDocumentInfoStatus($documentId, $message, $level) - { - $this->updatePendingDocumentStatus($documentId, $message, $level); - global $default; - - switch ($level) - { - case 'debug': - if ($this->debug) - { - $default->log->debug($message); - } - break; - default: - $default->log->$level($message); - } - } - - - - public function getExtractor($extractorClass) - { - if (empty($extractorClass)) - { - return null; - } - - $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . 
'.inc.php'; - if (!file_exists($includeFile)) - { - throw new Exception("Extractor file does not exist: $includeFile"); - } - - require_once($includeFile); - - if (!class_exists($extractorClass)) - { - throw new Exception("Extractor '$classname' not defined in file: $includeFile"); - } - - $extractor = new $extractorClass(); - - if (!($extractor instanceof DocumentExtractor)) - { - throw new Exception("Class $classname was expected to be of type DocumentExtractor"); - } - - return $extractor; - } - - public static function getIndexingQueue($problemItemsOnly=true) - { - - if ($problemItemsOnly) - { - $sql = "SELECT - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename - FROM - index_files iff - INNER JOIN documents d ON iff.document_id=d.id - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id - INNER JOIN mime_types mt ON dcv.mime_id=mt.id - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id - WHERE - (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1 - ORDER BY indexdate "; - } - else - { - $sql = "SELECT - iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename - FROM - index_files iff - INNER JOIN documents d ON iff.document_id=d.id - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id - INNER JOIN mime_types mt ON dcv.mime_id=mt.id - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id - WHERE - (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1 - ORDER BY indexdate "; - } - $aResult = DBUtil::getResultArray($sql); - - return $aResult; - } - - public static function getPendingIndexingQueue() - { - return Indexer::getIndexingQueue(false); - } - - public function updateIndexStats() - { - 
$optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', ''); - - $noOptimisation = false; - if ($optimisationDate == '') - { - $optimisationDate = _kt('N/A'); - $optimisationPeriod = $optimisationDate; - } - else - { - $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true); - $noOptimisation = $optimisationPeriod['days'] > 2; - $optimisationPeriod = $optimisationPeriod['str']; - $optimisationDate = date('Y-m-d H:i:s', $optimisationDate); - } - - $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', ''); - if ($indexingDate == '') - { - $indexingDate = _kt('N/A'); - $indexingPeriod = $indexingDate; - } - else - { - $indexingPeriod = KTUtil::computePeriodToDate($indexingDate); - $indexingDate = date('Y-m-d H:i:s', $indexingDate); - } - - $index = Indexer::get(); - $docsInIndex = $index->getDocumentsInIndex(); - - // we are only interested in documents that are active - $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where d.status_id=1"; - $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue'); - - $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null or i.status_msg <> '') and d.status_id=1"; - $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue'); - - $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1"; - $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository'); - - if ($docsInRepository == 0) - { - $indexingCoverage = '0.00%'; - $queueCoverage = $indexingCoverage; - } - else - { - // compute indexing coverage - $indexingCoverage = _kt('Not Available'); - if (is_numeric($docsInIndex)) - { - $indexingCoverage = ($docsInIndex * 100) / $docsInRepository; - $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . 
'%'; - } - - // compute queue coverage - $queueCoverage = _kt('Not Available'); - if (is_numeric($docsInQueue)) - { - $queueCoverage = ($docsInQueue * 100) / $docsInRepository; - $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%'; - } - } - - - $stats = array( - 'optimisationDate'=>$optimisationDate, - 'optimisationPeriod'=>$optimisationPeriod, - 'indexingDate'=>$indexingDate, - 'indexingPeriod'=>$indexingPeriod, - 'docsInIndex'=>$docsInIndex, - 'docsInQueue'=>$docsInQueue, - 'errorsInQueue'=>$errorsInQueue, - 'docsInRepository'=>$docsInRepository, - 'indexingCoverage'=>$indexingCoverage, - 'queueCoverage'=>$queueCoverage, - 'noOptimisation'=>$noOptimisation - ); - - KTUtil::setSystemSetting('indexerStats', serialize($stats)); - - $indexer = Indexer::get(); - - $diagnosis = $indexer->diagnose(); - KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis)); - - $extractorDiagnosis = $indexer->diagnoseExtractors(); - - KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis)); - } - - /** - * The main function that may be called repeatedly to index documents. - * - * @param int $max Default 20 - */ - public function indexDocuments($max=null) - { - global $default; - $config =& KTConfig::getSingleton(); - - /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock'; - if (is_file($indexLockFile)) - { - $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.'); - $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!'); - return; - } - touch($indexLockFile);*/ - - - $this->checkForRegisteredTypes(); - - if ($this->debug) $default->log->debug('indexDocuments: start'); - if (!$this->doesDiagnosticsPass()) - { - //unlink($indexLockFile); - if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. 
The dashboard will provide more information.'); - return; - } - - if (is_null($max)) - { - $max = $config->get('indexer/batchDocuments',20); - } - - $this->loadExtractorHooks(); - - Indexer::clearoutDeleted(); - - $date = date('Y-m-d H:i:s'); - // identify the indexers that must run - // mysql specific limit! - $sql = "SELECT - iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what - FROM - index_files iff - INNER JOIN documents d ON iff.document_id=d.id - INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id - INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id - INNER JOIN mime_types mt ON dcv.mime_id=mt.id - LEFT JOIN mime_extractors me ON mt.extractor_id=me.id - WHERE - (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1 - ORDER BY indexdate - LIMIT $max"; - $result = DBUtil::getResultArray($sql); - if (PEAR::isError($result)) - { - //unlink($indexLockFile); - if ($this->debug) $default->log->debug('indexDocuments: stopping - db error'); - return; - } - KTUtil::setSystemSetting('luceneIndexingDate', time()); - - // bail if no work to do - if (count($result) == 0) - { - //unlink($indexLockFile); - if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done'); - return; - } - - // identify any documents that need indexing and mark them - // so they are not taken in a followup run - $ids = array(); - foreach($result as $docinfo) - { - $ids[] = $docinfo['document_id']; - } - - // mark the documents as being processed - - $ids=implode(',',$ids); - $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)"; - DBUtil::runQuery($sql); - - $extractorCache = array(); - $storageManager = KTStorageManagerUtil::getSingleton(); - - $tempPath = $config->get("urls/tmpDirectory"); - - foreach($result as $docinfo) - { - // increment indexed documents count - Indexer::incrementCount(); - - $docId=$docinfo['document_id']; - 
$extension=$docinfo['filetypes']; - $mimeType=$docinfo['mimetypes']; - $extractorClass=$docinfo['extractor']; - $indexDocument = in_array($docinfo['what'], array('A','C')); - $indexDiscussion = in_array($docinfo['what'], array('A','D')); - $this->indexingHistory = ''; - - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug'); - - if (empty($extractorClass)) - { - /* - - if no extractor is found and we don't need to index discussions, then we can remove the item from the queue. - - */ - if ($indexDiscussion) - { - $indexDocument = false; - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info'); - } - else - { - Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId)); - continue; - } - } - else - { - /* - - If an extractor is available, we must ensure it is enabled. 
- - */ - - if (!$this->isExtractorEnabled($extractorClass)) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info'); - continue; - } - } - - if ($this->debug) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info'); - } - - $document = Document::get($docId); - if (PEAR::isError($document)) - { - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error'); - continue; - } - - if ($this->restartCurrentBatch) - { - Indexer::unqueueDocument($docId); - Indexer::index($docId, 'A'); - continue; - } - - - $filename = $document->getFileName(); - if (substr($filename,0,1) == '~' || substr($filename,-1) == '~') - { - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error'); - continue; - } - - $removeFromQueue = true; - if ($indexDocument) - { - if (array_key_exists($extractorClass, $extractorCache)) - { - $extractor = $extractorCache[$extractorClass]; - } - else - { - $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass); - } - - if (!($extractor instanceof DocumentExtractor)) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error'); - continue; - } - - - - $version = $document->getMajorVersionNumber() . '.' . 
$document->getMinorVersionNumber(); - $sourceFile = $storageManager->temporaryFile($document); - - if (empty($sourceFile) || !is_file($sourceFile)) - { - Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error'); - continue; - } - - if ($extractor->needsIntermediateSourceFile()) - { - //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION); - - $intermediate = $tempPath . '/'. $docId . '.' . $extension; - $result = @copy($sourceFile, $intermediate); - if ($result === false) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error'); - // problem. lets try again later. probably permission related. log the issue. - continue; - } - $sourceFile = $intermediate; - } - - $targetFile = tempnam($tempPath, 'ktindexer'); - - $extractor->setSourceFile($sourceFile); - $extractor->setMimeType($mimeType); - $extractor->setExtension($extension); - $extractor->setTargetFile($targetFile); - $extractor->setDocument($document); - $extractor->setIndexingStatus(null); - $extractor->setExtractionStatus(null); - - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug'); - - $this->executeHook($extractor, 'pre_extract'); - $this->executeHook($extractor, 'pre_extract', $mimeType); - $removeFromQueue = false; - - if ($extractor->extractTextContent()) - { - // the extractor may need to create another target file - $targetFile = $extractor->getTargetFile(); - - $extractor->setExtractionStatus(true); - $this->executeHook($extractor, 'pre_index'); - $this->executeHook($extractor, 'pre_index', $mimeType); - - $title = $document->getName(); - if ($indexDiscussion) - { - if (!$this->filterText($targetFile)) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); - } - 
else - { - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); - $removeFromQueue = $indexStatus; - if (!$indexStatus) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error'); - } - - $extractor->setIndexingStatus($indexStatus); - } - } - else - { - if (!$this->filterText($targetFile)) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); - } - else - { - $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); - $removeFromQueue = $indexStatus; - - if (!$indexStatus) - { - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error'); - $this->logPendingDocumentInfoStatus($docId, '' . $extractor->output . '', 'error'); - } - - $extractor->setIndexingStatus($indexStatus); - } - } - - $this->executeHook($extractor, 'post_index', $mimeType); - $this->executeHook($extractor, 'post_index'); - } - else - { - $extractor->setExtractionStatus(false); - $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error'); - $this->logPendingDocumentInfoStatus($docId, '' . $extractor->output . 
'', 'error'); - } - - $this->executeHook($extractor, 'post_extract', $mimeType); - $this->executeHook($extractor, 'post_extract'); - - if ($extractor->needsIntermediateSourceFile()) - { - @unlink($sourceFile); - } - - @unlink($targetFile); - - } - else - { - $indexStatus = $this->indexDiscussion($docId); - $removeFromQueue = $indexStatus; - } - - if ($removeFromQueue) - { - Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId)); - } - else - { - if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId)); - } - } - if ($this->debug) $default->log->debug('indexDocuments: done'); - //unlink($indexLockFile); - } - - public function migrateDocuments($max=null) - { - global $default; - - $default->log->info(_kt('migrateDocuments: starting')); - - if (!$this->doesDiagnosticsPass(true)) - { - $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.')); - return; - } - - if (KTUtil::getSystemSetting('migrationComplete') == 'true') - { - $default->log->info(_kt('migrateDocuments: stopping - migration is complete.')); - return; - } - - $config =& KTConfig::getSingleton(); - if (is_null($max)) - { - $max = $config->get('indexer/batchMigrateDocument',500); - } - - $lockFile = $config->get('cache/cacheDirectory') . 
'/migration.lock'; - if (is_file($lockFile)) - { - $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.')); - return; - } - touch($lockFile); - - $startTime = KTUtil::getSystemSetting('migrationStarted'); - if (is_null($startTime)) - { - KTUtil::setSystemSetting('migrationStarted', time()); - } - - $maxLoops = 5; - - $max = ceil($max / $maxLoops); - - $start =KTUtil::getBenchmarkTime(); - $noDocs = false; - $numDocs = 0; - - for($loop=0;$loop<$maxLoops;$loop++) - { - - $sql = "SELECT - document_id, document_text - FROM - document_text - ORDER BY document_id - LIMIT $max"; - $result = DBUtil::getResultArray($sql); - if (PEAR::isError($result)) - { - $default->log->info(_kt('migrateDocuments: db error')); - break; - } - - $docs = count($result); - if ($docs == 0) - { - $noDocs = true; - break; - } - $numDocs += $docs; - - foreach($result as $docinfo) - { - $docId = $docinfo['document_id']; - - $document = Document::get($docId); - if (PEAR::isError($document) || is_null($document)) - { - $sql = "DELETE FROM document_text WHERE document_id=$docId"; - DBUtil::runQuery($sql); - $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId)); - continue; - } - - $version = $document->getMajorVersionNumber() . '.' . 
$document->getMinorVersionNumber(); - - $targetFile = tempnam($tempPath, 'ktindexer'); - - if (file_put_contents($targetFile, $docinfo['document_text']) === false) - { - $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId)); - continue; - } - // free memory asap ;) - unset($docinfo['document_text']); - - $title = $document->getName(); - - $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); - - if ($indexStatus) - { - $sql = "DELETE FROM document_text WHERE document_id=$docId"; - DBUtil::runQuery($sql); - } - else - { - $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId)); - } - - @unlink($targetFile); - } - } - - @unlink($lockFile); - - $time = KTUtil::getBenchmarkTime() - $start; - - KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time); - KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs); - - $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time)); - if ($noDocs) - { - $default->log->info(_kt('migrateDocuments: Completed!')); - KTUtil::setSystemSetting('migrationComplete', 'true'); - schedulerUtil::deleteByName('Index Migration'); - $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.')); - } - } - - /** - * Index a document. The base class must override this function. 
- * - * @param int $docId - * @param string $textFile - */ - protected abstract function indexDocument($docId, $textFile, $title, $version); - - - public function updateDocumentIndex($docId, $text) - { - $config = KTConfig::getSingleton(); - $tempPath = $config->get("urls/tmpDirectory"); - $tempFile = tempnam($tempPath,'ud_'); - - file_put_contents($tempFile, $text); - - $document = Document::get($docId); - $title = $document->getDescription(); - $version = $document->getVersion(); - - $result = $this->indexDocument($docId, $tempFile, $title, $version); - - if (file_exists($tempFile)) - { - unlink($tempFile); - } - - return $result; - } - - /** - * Index a discussion. The base class must override this function. - * - * @param int $docId - */ - protected abstract function indexDiscussion($docId); - - /** - * Diagnose the indexer. e.g. Check that the indexing server is running. - * - */ - public abstract function diagnose(); - - /** - * Diagnose the extractors. - * - * @return array - */ - public function diagnoseExtractors() - { - $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php'); - $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php')); - - return $diagnosis; - } - - /** - * This is a refactored diagnose function. - * - * @param string $path - * @param string $class - * @param string $extension - * @return array - */ - private function _diagnose($path, $baseclass, $extension) - { - global $default; - - $diagnoses = array(); - - $dir = opendir(SearchHelper::correctPath($path)); - $extlen = - strlen($extension); - - while (($file = readdir($dir)) !== false) - { - if (substr($file,0,1) == '.') - { - continue; - } - if (substr($file,$extlen) != $extension) - { - $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension)); - continue; - } - - require_once($path . '/' . 
$file); - - $class = substr($file, 0, -8); - if (!class_exists($class)) - { - $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class)); - continue; - } - - if (!$this->isExtractorEnabled($class)) - { - $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class)); - continue; - } - - $extractor = new $class(); - if (!is_a($extractor, $baseclass)) - { - $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class)); - continue; - } - - $types = $extractor->getSupportedMimeTypes(); - if (empty($types)) - { - if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class)); - continue; - } - - $diagnosis=$extractor->diagnose(); - if (empty($diagnosis)) - { - continue; - } - $diagnoses[$class] = array( - 'name'=>$extractor->getDisplayName(), - 'diagnosis'=>$diagnosis - ); - - } - closedir($dir); - - return $diagnoses; - } - - - /** - * Register the extractor types. - * - * @param boolean $clear. Optional. Defaults to false. - */ - public function registerTypes($clear=false) - { - if ($clear) - { - $this->clearExtractors(); - } - $dir = opendir(SearchHelper::correctPath($this->extractorPath)); - while (($file = readdir($dir)) !== false) - { - if (substr($file,-17) == 'Extractor.inc.php') - { - require_once($this->extractorPath . '/' . $file); - $class = substr($file, 0, -8); - - if (!class_exists($class)) - { - // if the class does not exist, we can't do anything. - continue; - } - - $extractor = new $class; - if ($extractor instanceof DocumentExtractor) - { - $extractor->registerMimeTypes(); - } - } - } - closedir($dir); - } - - /** - * This is used as a possible obtimisation effort. It may be overridden in that case. 
- * - * @param int $docId - * @param string $textFile - */ - protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version) - { - $this->indexDocument($docId, $textFile, $title, $version); - $this->indexDiscussion($docId); - } - - /** - * Remove the document from the queue. This is normally called when it has been processed. - * - * @param int $docid - */ - public static function unqueueDocument($docid, $reason=false, $level='debug') - { - $sql = "DELETE FROM index_files WHERE document_id=$docid"; - DBUtil::runQuery($sql); - if ($reason !== false) - { - global $default; - $default->log->$level("Indexer: removing document $docid from the queue - $reason"); - } - } - - /** - * Run a query on the index. - * - * @param string $query - * @return array - */ - public abstract function query($query); - - /** - * Converts an integer to a string that can be easily compared and reversed. - * - * @param int $int - * @return string - */ - public static function longToString($int) - { - $maxlen = 14; - - $a2z = array('a','b','c','d','e','f','g','h','i','j'); - $o29 = array('0','1','2','3','4','5','6','7','8','9'); - $l = str_pad('',$maxlen - strlen("$int"),'0') . $int; - - return str_replace($o29, $a2z, $l); - } - - /** - * Converts a string to an integer. - * - * @param string $str - * @return int - */ - public static function stringToLong($str) - { - $a2z = array('a','b','c','d','e','f','g','h','i','j'); - $o29 = array('0','1','2','3','4','5','6','7','8','9'); - - $int = str_replace($a2z, $o29, $str) + 0; - - return $int; - } - - /** - * Possibly we can optimise indexes. This method must be overriden. - * The new function must call the parent! - * - */ - public function optimise() - { - KTUtil::setSystemSetting('luceneOptimisationDate', time()); - } - - /** - * Shuts down the indexer - * - */ - public function shutdown() - { - // do nothing generally - } - - /** - * Returns the name of the indexer. 
- * - * @return string - */ - public abstract function getDisplayName(); - - - /** - * Returns the number of non-deleted documents in the index. - * - * @return int - */ - public abstract function getDocumentsInIndex(); - - public abstract function isDocumentIndexed($documentId); - - /** - * Returns the path to the index directory - * - * @return string - */ - public function getIndexDirectory() - { - $config = KTConfig::getSingleton(); - $directory = $config->get('indexer/luceneDirectory'); - return $directory; - } -} - -?> +. + * + * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco, + * California 94120-7775, or email info@knowledgetree.com. + * + * The interactive user interfaces in modified source and object code versions + * of this program must display Appropriate Legal Notices, as required under + * Section 5 of the GNU General Public License version 3. + * + * In accordance with Section 7(b) of the GNU General Public License version 3, + * these Appropriate Legal Notices must retain the display of the "Powered by + * KnowledgeTree" logo and retain the original copyright notice. If the display of the + * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices + * must display the words "Powered by KnowledgeTree" and retain the original + * copyright notice. + * Contributor( s): ______________________________________ + * + */ + +define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/'); +require_once('indexing/extractorCore.inc.php'); +require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php'); +require_once(KT_DIR . '/ktapi/ktapi.inc.php'); + +class IndexerInconsistencyException extends Exception {}; + +// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing... 
+ +class QueryResultItem +{ + protected $id; + protected $title; + protected $rank; + protected $text; + protected $fullpath; + + public function __construct($id, $title, $rank, $text, $fullpath) + { + $this->id = $id; + $this->title = $title; + $this->rank = $rank; + $this->text = $text; + $this->fullpath = $fullpath; + } + + public function getId() { return $this->id; } + public function getRealId() { return $this->id; } + + public function getIsProxy() { return $this instanceof ProxyResultItem; } + public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder' ; } + public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document' ; } + + public function setRank($value) + { + $this->rank = number_format($value,2,'.',','); + } + + public function getIsLive() + { + return true; + } + + public function setTitle($value) + { + $this->title = $value; + } + + public function setText($value) + { + $this->text = $value; + } + + public function getRelevance() { return (float) $this->rank; } + public function getRank() { return $this->getRelevance(); } + public function getText() { return (string) $this->text; } + public function getTitle() { return (string) $this->title; } + public function getFullPath() { return (string) $this->fullpath; } + + protected function __get($property) + { + if (empty($property)) + { + return ''; + } + + $method = 'get' . $property; + if (method_exists($this, $method)) + { + return $this->$method(); + } + return $this->getUnknown(); + } + + protected function getUnknown() + { + return _kt('n/a'); + } + + protected function __set($property, $value) + { + if (empty($property)) + { + return ''; + } + + $method = 'set' . 
$property; + if (method_exists($this, $method)) + { + return $this->$method($value); + } + throw new Exception("Unknown property '$property' to set on QueryResultItem"); + } +} + +class ProxyResultItem extends QueryResultItem +{ + protected $proxy; + protected $proxyId; + + public function __construct($proxyId, $proxy) + { + parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath()); // seed the base fields from the proxied item's getters + $this->proxyId = $proxyId; + $this->proxy = $proxy; + } + + public function getId() { return $this->proxyId; } + public function getTitle() { return $this->proxy->getTitle(); } + public function getRealId() { return $this->proxy->getId(); } + + protected function __get($property) + { + $method = 'get' . $property; + + if (method_exists($this, $method)) + { + return $this->$method(); + } + else + { + return $this->proxy->$method(); + } + } + + protected function __set($property, $value) + { + $method = 'set' . $property; + if (method_exists($this, $method)) + { + return $this->$method($value); + } + else + { + return $this->proxy->$method($value); + } + } +} + +class DocumentResultItem extends QueryResultItem +{ + protected $filesize; + protected $live; + protected $version; + protected $mimeType; + protected $filename; + protected $thumbnail; // TODO: if not null, gui can display a thumbnail + protected $viewer; // TODO: if not null, a viewer can be used to view the document + protected $document; + protected $checkedOutUser; + protected $dateCheckedout; + protected $workflowState; + protected $workflow; + protected $modifiedBy; + protected $dateModified; + protected $createdBy; + protected $dateCreated; + protected $owner; + protected $immutable; + protected $deleted; + protected $status; + protected $folderId; + protected $storagePath; + protected $documentType; + protected $mimeIconPath; + protected $mimeDisplay; + protected $oemDocumentNo; + + public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath 
= null) + { + parent::__construct($document_id, $title, $rank, $text, $fullpath); + $this->live = true; + $this->loadDocumentInfo(); + } + + // TODO: this is bad. must refactor to do the query on the group of documents. + public function loadDocumentInfo() + { + global $default; + $sql = "SELECT + d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version, + dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate, + mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified, + cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type, + mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title + FROM + documents d + INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id + INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id + INNER JOIN mime_types mt ON dcv.mime_id=mt.id + LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id + LEFT JOIN folders f ON f.id=d.folder_id + LEFT JOIN users cou ON d.checked_out_user_id=cou.id + LEFT JOIN workflows w ON dmv.workflow_id=w.id + LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id + LEFT JOIN mime_documents md ON mt.mime_document_id = md.id + LEFT JOIN users mbu ON d.modified_user_id=mbu.id + LEFT JOIN users cbu ON d.creator_id=cbu.id + LEFT JOIN users ou ON d.owner_id=ou.id + WHERE + d.id=$this->id"; + + $result = DBUtil::getOneResult($sql); + + if (PEAR::isError($result) || empty($result)) + { + $this->live = false; + if (PEAR::isError($result)) + { + throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage()); + } + + $default->log->error('QueryResultItem: $result is null'); + $msg = 'The database did not have a record matching the result from the document indexer. 
This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.'; + $default->log->error('QueryResultItem: ' . $msg); + // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository + throw new IndexerInconsistencyException(_kt($msg)); + } + + // document_id, relevance, text, title + + $this->documentType = $result['document_type']; + $this->filename=$result['filename']; + $this->filesize = KTUtil::filesizeToString($result['filesize']); + $this->folderId = $result['folder_id']; + $this->title = $result['title']; + + $this->createdBy = $result['createdbyuser']; + $this->dateCreated = $result['created']; + + $this->modifiedBy = $result['modifiedbyuser']; + $this->dateModified = $result['modified']; + + $this->checkedOutUser = $result['checkoutuser']; + $this->dateCheckedout = $result['checkedout']; + + $this->owner = $result['owneruser']; + + $this->version = $result['major_version'] . '.' . 
$result['minor_version']; + + $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):''; + + $this->workflow = $result['workflow']; + $this->workflowState = $result['workflowstate']; + + $this->oemDocumentNo = $result['oem_no']; + if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a'; + + if (is_null($result['name'])) + { + $this->fullpath = '(orphaned)'; + } + else + { + $this->fullpath = $result['full_path']; + } + + $this->mimeType = $result['mimetype']; + $this->mimeIconPath = $result['mime_icon_path']; + if (empty($this->mimeIconPath)) + { + $this->mimeIconPath = 'unspecified_type'; + } + $this->mimeDisplay = $result['mime_display']; + + $this->storagePath = $result['storage_path']; + $this->status = Document::getStatusString($result['status_id']); + } + + public function getDocumentID() { return $this->getId(); } + public function getIsLive() { return (bool) $this->live; } + public function getFilesize() { return $this->filesize; } + public function getVersion() { return (string) $this->version; } + public function getFilename() { return (string)$this->filename; } + public function getFolderId() { return (int)$this->folderId; } + public function getOemDocumentNo() { return (string) $this->oemDocumentNo; } + public function getDocument() { return Document::get($this->id); } + public function getIsAvailable() { return $this->Document->isLive(); } + public function getCheckedOutUser() { return (string) $this->checkedOutUser; } + public function getCheckedOutByr() { return $this->getCheckedOutUser(); } + public function getWorkflowOnly() { return (string)$this->workflow; } + public function getWorkflow() { return $this->getWorkflowOnly(); } // returns the same value as getWorkflowOnly() + public function getWorkflowStateOnly() { return (string)$this->workflowState; } + public function getWorkflowState() { return $this->getWorkflowStateOnly(); } + public function getWorkflowAndState() { + if (is_null($this->workflow)) + { + return ''; + } + return "$this->workflow - $this->workflowState"; + } + 
public function getMimeType() { return (string) $this->mimeType; } + public function getMimeIconPath() { return (string) $this->mimeIconPath; } + public function getMimeDisplay() { return (string) $this->mimeDisplay; } + public function getDateCheckedOut() { return (string) $this->dateCheckedout; } + public function getModifiedBy() { return (string) $this->modifiedBy; } + public function getDateModified() { return (string) $this->dateModified; } + public function getCreatedBy() { return (string) $this->createdBy; } + public function getDateCreated() { return (string) $this->dateCreated; } + public function getOwner() { return (string) $this->owner; } + public function getOwnedBy() { return $this->getOwner(); } + public function getIsImmutable() { return (bool) $this->immutable; } + public function getImmutable() { return $this->getIsImmutable(); } + public function getStatus() { return $this->status; } + public function getStoragePath() { return $this->storagePath; } + public function getDocumentType() { return $this->documentType; } + public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); } + public function getCanBeReadByUser() { + if (!$this->live) + return false; + if (Permission::userHasDocumentReadPermission($this->Document)) + return true; + if (Permission::adminIsInAdminMode()) + return true; + return false; + } +} + +class FolderResultItem extends QueryResultItem +{ + protected $folder; + protected $createdBy; + protected $parentId; + + public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null) + { + parent::__construct($folder_id, $title, $rank, $text, $fullpath); + $this->loadFolderInfo(); + } + + public function getFolderID() { return $this->getId(); } + public function getParentID() { return $this->parentId; } + public function getCreatedBy() { return $this->createdBy; } + public function getMimeIconPath() { return 'folder'; } + public function getFolder() { return 
Folder::get($this->getFolderID()); }
    public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }

    /**
     * Populate title, full path, parent id and creator name from the folder record.
     *
     * @throws Exception when the folder lookup returns a PEAR error.
     */
    public function loadFolderInfo()
    {
        $folder = $this->getFolder();
        if (PEAR::isError($folder))
        {
            // Report the error carried by the failed lookup itself.
            throw new Exception('Database exception! There appears to be an error in the system: ' . $folder->getMessage());
        }
        $this->title = $folder->getName();
        $this->fullpath = '/' . $folder->getFullPath();
        $this->parentId = $folder->getParentId();

        // Fall back to a translated placeholder when the creator cannot be resolved.
        $user = User::get($folder->getCreatorID());
        $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
    }

}

class DocumentShortcutResultItem extends ProxyResultItem
{
    public function getDocumentID() { return $this->getId(); }
    public function getMimeIconPath() { return $this->proxy->getMimeIconPath() . '_shortcut'; }

}

class FolderShortcutResultItem extends ProxyResultItem
{
    public function getFolderID() { return $this->getId(); }
    public function getMimeIconPath() { return 'folder_shortcut'; }

}

/**
 * Three-way comparison of two result items by their Rank.
 *
 * @param object $a
 * @param object $b
 * @return int 0 when the ranks are equal, -1 when $a ranks lower, 1 otherwise.
 */
function MatchResultCompare($a, $b)
{
    if ($a->Rank == $b->Rank) {
        return 0;
    }
    return ($a->Rank < $b->Rank) ? -1 : 1;
}

abstract class Indexer
{
    /**
     * Cache of extractors
     *
     * @var array
     */
    private $extractorCache;

    /**
     * Indicates if the indexer will do logging.
     *
     * @var boolean
     */
    private $debug;
    /**
     * Cache on mime related hooks
     *
     * @var array
     */
    private $mimeHookCache;
    /**
     * Cache on general hooks.
     *
     * @var array
     */
    private $generalHookCache;

    /**
     * This is a path to the extractors.
     *
     * @var string
     */
    private $extractorPath;
    /**
     * This is a path to the hooks.
*
     * @var string
     */
    private $hookPath;

    /**
     * Names of the extractors flagged active in mime_extractors.
     *
     * @var array
     */
    private $enabledExtractors;

    /**
     * Initialise the indexer
     *
     * Reads the debug flag and the extractor/hook paths from the
     * configuration and loads the list of enabled extractors.
     */
    protected function __construct()
    {
        $config = KTConfig::getSingleton();

        $this->extractorCache = array();
        $this->debug = $config->get('indexer/debug', true);
        // Initialise the declared property so the mime hook cache is an
        // array even before the hooks are loaded.
        $this->mimeHookCache = array();
        $this->generalHookCache = array();
        $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
        $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');

        $this->loadExtractorStatus();
    }

    /**
     * Get the list of enabled extractors
     *
     * Leaves the list empty when the query fails.
     */
    private function loadExtractorStatus()
    {
        $this->enabledExtractors = array();

        $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
        $rs = DBUtil::getResultArray($sql);
        if (PEAR::isError($rs) || empty($rs))
        {
            // Iterating a PEAR error would walk its properties, not rows.
            return;
        }

        foreach($rs as $item)
        {
            $this->enabledExtractors[] = $item['name'];
        }
    }

    /**
     * @param string $extractor Extractor name
     * @return boolean True when the name is in the enabled list.
     */
    private function isExtractorEnabled($extractor)
    {
        return in_array($extractor, $this->enabledExtractors);
    }

    /**
     * Returns a reference to the main class
     *
     * The concrete class name comes from the 'indexer/coreClass' setting
     * and is loaded from indexing/indexers/<class>.inc.php on first use.
     *
     * @return Indexer
     * @throws Exception when no class is configured or it is not defined.
     */
    public static function get()
    {
        static $singleton = null;

        if (is_null($singleton))
        {
            $config = KTConfig::getSingleton();
            $classname = $config->get('indexer/coreClass');

            if (empty($classname))
            {
                // Without this check require_once would fail on '.inc.php'.
                throw new Exception("Class '$classname' does not exist.");
            }

            require_once('indexing/indexers/' . $classname . '.inc.php');

            if (!class_exists($classname))
            {
                throw new Exception("Class '$classname' does not exist.");
            }

            $singleton = new $classname;
        }

        return $singleton;
    }

    public abstract function deleteDocument($docid);

    /**
     * Remove the association of all extractors to mime types on the database.
     *
     */
    public function clearExtractors()
    {
        global $default;

        $sql = "update mime_types set extractor_id=null";
        DBUtil::runQuery($sql);

        $sql = "delete from mime_extractors";
        DBUtil::runQuery($sql);

        if ($this->debug) $default->log->debug('clearExtractors');
    }

    /**
     * lookup the name of the extractor class based on the mime type.
*
     * @param string $type Value matched against mime_types.filetypes
     * @return string|PEAR_Error Extractor class name, or the database error
     */
    public static function resolveExtractor($type)
    {
        global $default;

        // NOTE(review): sanitizeForSQL() is the escaping helper this file
        // already uses for status messages; confirm it is suitable here.
        $safeType = sanitizeForSQL($type);
        $sql = "select extractor from mime_types where filetypes='$safeType'";
        $class = DBUtil::getOneResultKey($sql,'extractor');
        if (PEAR::isError($class))
        {
            $default->log->error("resolveExtractor: cannot resolve $type");
            return $class;
        }

        // This method is static, so there is no $this; read the debug flag
        // from the configuration the same way the constructor does.
        $config = KTConfig::getSingleton();
        if ($config->get('indexer/debug', true))
        {
            $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
        }
        return $class;
    }

    /**
     * Return all the discussion text.
     *
     * Concatenates subject and body of each comment returned for the
     * document, one per line.
     *
     * @param int $docid
     * @return string Empty string when there are no comments or the query fails.
     */
    public static function getDiscussionText($docid)
    {
        // Force an integer so the id cannot alter the query.
        $docid = (int) $docid;

        $sql = "SELECT
                    dc.subject, dc.body
                FROM
                    discussion_threads dt
                    INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
                WHERE
                    dt.document_id=$docid";
        $result = DBUtil::getResultArray($sql);
        $text = '';

        if (PEAR::isError($result) || empty($result))
        {
            return $text;
        }

        foreach($result as $record)
        {
            $text .= $record['subject'] . "\n" . $record['body'] . "\n";
        }

        return $text;
    }

    /**
     * Schedule the indexing of a document.
+ * + * @param string $document + * @param string $what + */ + public static function index($document, $what='A') + { + global $default; + + if (is_numeric($document)) + { + $document = Document::get($document+0); + } + + if (PEAR::isError($document)) + { + $default->log->error("index: Could not index document: " .$document->getMessage()); + return; + } + + $document_id = $document->getId(); + $userid=$_SESSION['userID']; + if (empty($userid)) $userid=1; + + // we dequeue the document so that there are no issues when enqueuing + Indexer::unqueueDocument($document_id); + + // enqueue item + $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')"; + DBUtil::runQuery($sql); + + $default->log->debug("index: Queuing indexing of $document_id"); + + } + + private static function incrementCount() + { + // Get count from system settings + $count = Indexer::getIndexedDocumentCount(); + $count = (int)$count + 1; + Indexer::updateIndexedDocumentCount($count); + } + + public static function getIndexedDocumentCount() + { + $count = KTUtil::getSystemSetting('indexedDocumentCount', 0); + return (int) $count; + } + + public static function updateIndexedDocumentCount($cnt = 0) + { + KTUtil::setSystemSetting('indexedDocumentCount', $cnt); + } + + public static function reindexQueue() + { + $sql = "UPDATE index_files SET processdate = null"; + DBUtil::runQuery($sql); + } + + public static function reindexDocument($documentId) + { + $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId"; + DBUtil::runQuery($sql); + } + + + + public static function indexAll() + { + $userid=$_SESSION['userID']; + if (empty($userid)) $userid=1; + + $sql = "DELETE FROM index_files"; + DBUtil::runQuery($sql); + + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)"; + DBUtil::runQuery($sql); + } + + public 
static function indexFolder($folder) + { + $userid=$_SESSION['userID']; + if (empty($userid)) $userid=1; + + if (!$folder instanceof Folder && !$folder instanceof FolderProxy) + { + throw new Exception('Folder expected'); + } + + $full_path = $folder->getFullPath(); + + $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)"; + DBUtil::runQuery($sql); + } + + /** + * Clearout the scheduling of documents that no longer exist. + * + */ + public static function clearoutDeleted() + { + global $default; + + $sql = 'DELETE FROM + index_files + WHERE + document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR + NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)'; + DBUtil::runQuery($sql); + + $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted"); + } + + + /** + * Check if a document is scheduled to be indexed + * + * @param mixed $document This may be a document or document id + * @return boolean + */ + public static function isDocumentScheduled($document) + { + if (is_numeric($document)) + { + $docid = $document; + } + else if ($document instanceof Document) + { + $docid = $document->getId(); + } + else + { + return false; + } + $sql = "SELECT 1 FROM index_files WHERE document_id=$docid"; + $result = DBUtil::getResultArray($sql); + return count($result) > 0; + } + + /** + * Filters text removing redundant characters such as continuous newlines and spaces. + * + * @param string $filename + */ + private function filterText($filename) + { + $content = file_get_contents($filename); + + $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])'); + $tgt = array("\n","\n",' ',' ',' '); + + // shrink what is being stored. 
do
        {
            // Repeat until a pass makes no further change.
            $orig = $content;
            $content = preg_replace($src, $tgt, $content);
        } while ($content != $orig);

        return file_put_contents($filename, $content) !== false;
    }

    /**
     * Load hooks for text extraction process.
     *
     * Scans the hook directory for files ending in 'Hook.inc.php', creates
     * one instance of each class that is an ExtractorHook and files it
     * under the general cache (registerMimeTypes() returned null) or under
     * each mime type it registered for.
     */
    private function loadExtractorHooks()
    {
        $this->generalHookCache = array();
        $this->mimeHookCache = array();

        $dir = opendir(SearchHelper::correctPath($this->hookPath));
        if ($dir === false)
        {
            // Nothing to load when the hook directory cannot be opened.
            return;
        }

        while (($file = readdir($dir)) !== false)
        {
            if (substr($file,-12) != 'Hook.inc.php')
            {
                continue;
            }

            require_once($this->hookPath . '/' . $file);
            // Strip the 8-character '.inc.php' suffix to get the class name.
            $class = substr($file, 0, -8);

            if (!class_exists($class))
            {
                continue;
            }

            $hook = new $class;
            // Test the instance; the class-name string is never an ExtractorHook.
            if (!($hook instanceof ExtractorHook))
            {
                continue;
            }

            // Store by value: binding array slots to the loop variable by
            // reference would make every cached entry point at the last hook.
            $mimeTypes = $hook->registerMimeTypes();
            if (is_null($mimeTypes))
            {
                $this->generalHookCache[] = $hook;
            }
            else
            {
                foreach($mimeTypes as $type)
                {
                    $this->mimeHookCache[$type][] = $hook;
                }
            }
        }
        closedir($dir);
    }

    /**
     * This is a refactored function to execute the hooks.
     *
     * @param DocumentExtractor $extractor
     * @param string $phase Name of the hook method to call
     * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
     */
    private function executeHook($extractor, $phase, $mimeType = null)
    {
        $hooks = array();
        if (is_null($mimeType))
        {
            $hooks = $this->generalHookCache;
        }
        else if (isset($this->mimeHookCache[$mimeType]))
        {
            $hooks = $this->mimeHookCache[$mimeType];
        }

        if (empty($hooks))
        {
            return;
        }

        foreach($hooks as $hook)
        {
            $hook->$phase($extractor);
        }
    }

    private function doesDiagnosticsPass($simple=false)
    {
        global $default;

        $config =& KTConfig::getSingleton();
        // create a index log lock file in case there are errors, and we don't need to log them forever!
        // this function will create the lockfile if an error is detected. It will be removed as soon
        // as the problems with the indexer are removed.
+ $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock'; + + $diagnosis = $this->diagnose(); + if (!is_null($diagnosis)) + { + if (!is_file($lockFile)) + { + $default->log->error(_kt('Indexer problem: ') . $diagnosis); + } + touch($lockFile); + return false; + } + + if ($simple) + { + return true; + } + + $diagnosis = $this->diagnoseExtractors(); + if (!empty($diagnosis)) + { + if (!is_file($lockFile)) + { + foreach($diagnosis as $diag) + { + $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis'])); + } + } + touch($lockFile); + return false; + } + + if (is_file($lockFile)) + { + $default->log->info(_kt('Issues with the indexer have been resolved!')); + unlink($lockFile); + } + + return true; + } + + /** + * This does the initial mime type association between mime types and text extractors + * + */ + public function checkForRegisteredTypes() + { + global $default; + + // we are only doing this once! + $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false); + if ($initRegistered) + { + return; + } + if ($this->debug) $default->log->debug('checkForRegisteredTypes: start'); + + $date = date('Y-m-d H:i'); + $sql = "UPDATE scheduler_tasks SET run_time='$date'"; + DBUtil::runQuery($sql); + + $this->registerTypes(true); + + $disable = array( + 'windows'=>array('PSExtractor'), + 'unix' => array() + ); + + $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix']; + + if (!empty($disableForOS)) + { + $disableForOS = '\'' . 
implode("','", $disableForOS) .'\'';

            $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)";
            DBUtil::runQuery($sql);
            // $disableForOS now holds the quoted, comma-separated extractor names.
            $default->log->info("checkForRegisteredTypes: disabled $disableForOS");
        }
        $this->loadExtractorStatus();

        if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
        KTUtil::setSystemSetting('mimeTypesRegistered', true);
    }

    /**
     * Append a line to the running indexing history and store the whole
     * history as the status message of the queued document.
     *
     * @param int $documentId
     * @param string $message
     * @param string $level
     */
    private function updatePendingDocumentStatus($documentId, $message, $level)
    {
        $documentId = (int) $documentId;
        $this->indexingHistory .= "\n" . $level . ': ' . $message;
        $message = sanitizeForSQL($this->indexingHistory);
        $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
        DBUtil::runQuery($sql);
    }

    private $restartCurrentBatch = false;

    public function restartBatch()
    {
        $this->restartCurrentBatch = true;
    }

    /**
     * Record a status line against the queued document and write it to the log.
     *
     * @param int $documentId
     * @param string $message
     * @param string $level This may be info, error, debug
     */
    private function logPendingDocumentInfoStatus($documentId, $message, $level)
    {
        $this->updatePendingDocumentStatus($documentId, $message, $level);
        global $default;

        switch ($level)
        {
            case 'debug':
                // Debug lines reach the log only when indexer debugging is on.
                if ($this->debug)
                {
                    $default->log->debug($message);
                }
                break;
            default:
                $default->log->$level($message);
        }
    }

    /**
     * Load and instantiate an extractor class from the extractors directory.
     *
     * @param string $extractorClass Class name; the file is <class>.inc.php
     * @return DocumentExtractor|null Null when no class name is given.
     * @throws Exception when the file is missing, the class is not defined
     *                   in it, or the class is not a DocumentExtractor.
     */
    public function getExtractor($extractorClass)
    {
        if (empty($extractorClass))
        {
            return null;
        }

        $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
        if (!file_exists($includeFile))
        {
            throw new Exception("Extractor file does not exist: $includeFile");
        }

        require_once($includeFile);

        if (!class_exists($extractorClass))
        {
            throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
        }

        $extractor = new $extractorClass();

        if (!($extractor instanceof DocumentExtractor))
        {
            throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
        }

        return $extractor;
    }

    /**
     * List queued documents with their mime and extractor details.
     *
     * @param boolean $problemItemsOnly True: only entries with a non-empty
     *        status message. False: only entries with no status message.
     * @return array Rows ordered by indexdate, as returned by DBUtil::getResultArray().
     */
    public static function getIndexingQueue($problemItemsOnly=true)
    {
        // The two variants differ only in the status_msg filter.
        if ($problemItemsOnly)
        {
            $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
        }
        else
        {
            $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
        }

        $sql = "SELECT
                    iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
                FROM
                    index_files iff
                    INNER JOIN documents d ON iff.document_id=d.id
                    INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
                    INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
                    INNER JOIN mime_types mt ON dcv.mime_id=mt.id
                    LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
                WHERE
                    $statusFilter AND d.status_id=1
                ORDER BY indexdate ";
        $aResult = DBUtil::getResultArray($sql);

        return $aResult;
    }

    public static function getPendingIndexingQueue()
    {
        return Indexer::getIndexingQueue(false);
    }

    public function updateIndexStats()
    {
$optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');

        // Flag the index as needing optimisation when the last run is more than two days old.
        $noOptimisation = false;
        if ($optimisationDate == '')
        {
            $optimisationDate = _kt('N/A');
            $optimisationPeriod = $optimisationDate;
        }
        else
        {
            $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
            $noOptimisation = $optimisationPeriod['days'] > 2;
            $optimisationPeriod = $optimisationPeriod['str'];
            $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
        }

        $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
        if ($indexingDate == '')
        {
            $indexingDate = _kt('N/A');
            $indexingPeriod = $indexingDate;
        }
        else
        {
            $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
            $indexingDate = date('Y-m-d H:i:s', $indexingDate);
        }

        $index = Indexer::get();
        $docsInIndex = $index->getDocumentsInIndex();

        // we are only interested in documents that are active
        // Pending items: no status message recorded yet.
        $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
        $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');

        // Problem items: a non-empty status message. Both conditions must
        // hold (AND); with OR a row whose status_msg is '' would count as
        // an error as well as being counted as pending above.
        $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
        $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');

        $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
        $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');

        if ($docsInRepository == 0)
        {
            // Avoid dividing by zero when the repository is empty.
            $indexingCoverage = '0.00%';
            $queueCoverage = $indexingCoverage;
        }
        else
        {
            // compute indexing coverage
            $indexingCoverage = _kt('Not Available');
            if (is_numeric($docsInIndex))
            {
                $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
                $indexingCoverage = number_format($indexingCoverage, 2, '.',',') .
'%'; + } + + // compute queue coverage + $queueCoverage = _kt('Not Available'); + if (is_numeric($docsInQueue)) + { + $queueCoverage = ($docsInQueue * 100) / $docsInRepository; + $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%'; + } + } + + + $stats = array( + 'optimisationDate'=>$optimisationDate, + 'optimisationPeriod'=>$optimisationPeriod, + 'indexingDate'=>$indexingDate, + 'indexingPeriod'=>$indexingPeriod, + 'docsInIndex'=>$docsInIndex, + 'docsInQueue'=>$docsInQueue, + 'errorsInQueue'=>$errorsInQueue, + 'docsInRepository'=>$docsInRepository, + 'indexingCoverage'=>$indexingCoverage, + 'queueCoverage'=>$queueCoverage, + 'noOptimisation'=>$noOptimisation + ); + + KTUtil::setSystemSetting('indexerStats', serialize($stats)); + + $indexer = Indexer::get(); + + $diagnosis = $indexer->diagnose(); + KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis)); + + $extractorDiagnosis = $indexer->diagnoseExtractors(); + + KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis)); + } + + /** + * The main function that may be called repeatedly to index documents. + * + * @param int $max Default 20 + */ + public function indexDocuments($max=null) + { + global $default; + $config =& KTConfig::getSingleton(); + + /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock'; + if (is_file($indexLockFile)) + { + $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.'); + $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!'); + return; + } + touch($indexLockFile);*/ + + + $this->checkForRegisteredTypes(); + + if ($this->debug) $default->log->debug('indexDocuments: start'); + if (!$this->doesDiagnosticsPass()) + { + //unlink($indexLockFile); + if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. 
The dashboard will provide more information.'); + return; + } + + if (is_null($max)) + { + $max = $config->get('indexer/batchDocuments',20); + } + + $this->loadExtractorHooks(); + + Indexer::clearoutDeleted(); + + $date = date('Y-m-d H:i:s'); + // identify the indexers that must run + // mysql specific limit! + $sql = "SELECT + iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what + FROM + index_files iff + INNER JOIN documents d ON iff.document_id=d.id + INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id + INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id + INNER JOIN mime_types mt ON dcv.mime_id=mt.id + LEFT JOIN mime_extractors me ON mt.extractor_id=me.id + WHERE + (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1 + ORDER BY indexdate + LIMIT $max"; + $result = DBUtil::getResultArray($sql); + if (PEAR::isError($result)) + { + //unlink($indexLockFile); + if ($this->debug) $default->log->debug('indexDocuments: stopping - db error'); + return; + } + KTUtil::setSystemSetting('luceneIndexingDate', time()); + + // bail if no work to do + if (count($result) == 0) + { + //unlink($indexLockFile); + if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done'); + return; + } + + // identify any documents that need indexing and mark them + // so they are not taken in a followup run + $ids = array(); + foreach($result as $docinfo) + { + $ids[] = $docinfo['document_id']; + } + + // mark the documents as being processed + + $ids=implode(',',$ids); + $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)"; + DBUtil::runQuery($sql); + + $extractorCache = array(); + $storageManager = KTStorageManagerUtil::getSingleton(); + + $tempPath = $config->get("urls/tmpDirectory"); + + foreach($result as $docinfo) + { + // increment indexed documents count + Indexer::incrementCount(); + + $docId=$docinfo['document_id']; + 
$extension=$docinfo['filetypes']; + $mimeType=$docinfo['mimetypes']; + $extractorClass=$docinfo['extractor']; + $indexDocument = in_array($docinfo['what'], array('A','C')); + $indexDiscussion = in_array($docinfo['what'], array('A','D')); + $this->indexingHistory = ''; + + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug'); + + if (empty($extractorClass)) + { + /* + + if no extractor is found and we don't need to index discussions, then we can remove the item from the queue. + + */ + if ($indexDiscussion) + { + $indexDocument = false; + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info'); + } + else + { + Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId)); + continue; + } + } + else + { + /* + + If an extractor is available, we must ensure it is enabled. 
+ + */ + + if (!$this->isExtractorEnabled($extractorClass)) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info'); + continue; + } + } + + if ($this->debug) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info'); + } + + $document = Document::get($docId); + if (PEAR::isError($document)) + { + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error'); + continue; + } + + if ($this->restartCurrentBatch) + { + Indexer::unqueueDocument($docId); + Indexer::index($docId, 'A'); + continue; + } + + + $filename = $document->getFileName(); + if (substr($filename,0,1) == '~' || substr($filename,-1) == '~') + { + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error'); + continue; + } + + $removeFromQueue = true; + if ($indexDocument) + { + if (array_key_exists($extractorClass, $extractorCache)) + { + $extractor = $extractorCache[$extractorClass]; + } + else + { + $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass); + } + + if (!($extractor instanceof DocumentExtractor)) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error'); + continue; + } + + + + $version = $document->getMajorVersionNumber() . '.' . 
$document->getMinorVersionNumber(); + $sourceFile = $storageManager->temporaryFile($document); + + if (empty($sourceFile) || !is_file($sourceFile)) + { + Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error'); + continue; + } + + if ($extractor->needsIntermediateSourceFile()) + { + //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION); + + $intermediate = $tempPath . '/'. $docId . '.' . $extension; + $result = @copy($sourceFile, $intermediate); + if ($result === false) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error'); + // problem. lets try again later. probably permission related. log the issue. + continue; + } + $sourceFile = $intermediate; + } + + $targetFile = tempnam($tempPath, 'ktindexer'); + + $extractor->setSourceFile($sourceFile); + $extractor->setMimeType($mimeType); + $extractor->setExtension($extension); + $extractor->setTargetFile($targetFile); + $extractor->setDocument($document); + $extractor->setIndexingStatus(null); + $extractor->setExtractionStatus(null); + + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug'); + + $this->executeHook($extractor, 'pre_extract'); + $this->executeHook($extractor, 'pre_extract', $mimeType); + $removeFromQueue = false; + + if ($extractor->extractTextContent()) + { + // the extractor may need to create another target file + $targetFile = $extractor->getTargetFile(); + + $extractor->setExtractionStatus(true); + $this->executeHook($extractor, 'pre_index'); + $this->executeHook($extractor, 'pre_index', $mimeType); + + $title = $document->getName(); + if ($indexDiscussion) + { + if (!$this->filterText($targetFile)) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); + } + 
else + { + $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version); + $removeFromQueue = $indexStatus; + if (!$indexStatus) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error'); + } + + $extractor->setIndexingStatus($indexStatus); + } + } + else + { + if (!$this->filterText($targetFile)) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error'); + } + else + { + $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version); + $removeFromQueue = $indexStatus; + + if (!$indexStatus) + { + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error'); + $this->logPendingDocumentInfoStatus($docId, '' . $extractor->output . '', 'error'); + } + + $extractor->setIndexingStatus($indexStatus); + } + } + + $this->executeHook($extractor, 'post_index', $mimeType); + $this->executeHook($extractor, 'post_index'); + } + else + { + $extractor->setExtractionStatus(false); + $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error'); + $this->logPendingDocumentInfoStatus($docId, '' . $extractor->output . 
'', 'error');
                }

                $this->executeHook($extractor, 'post_extract', $mimeType);
                $this->executeHook($extractor, 'post_extract');

                if ($extractor->needsIntermediateSourceFile())
                {
                    @unlink($sourceFile);
                }

                @unlink($targetFile);

            }
            else
            {
                $indexStatus = $this->indexDiscussion($docId);
                $removeFromQueue = $indexStatus;
            }

            if ($removeFromQueue)
            {
                Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
            }
            else
            {
                if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
            }
        }
        if ($this->debug) $default->log->debug('indexDocuments: done');
        //unlink($indexLockFile);
    }

    /**
     * Migrates text stored in the document_text table into the index.
     *
     * The work is split into at most five passes per call. Each pass selects
     * the lowest-numbered rows still present in document_text, writes the text
     * of each row to a temporary file and hands that file to
     * indexDocumentAndDiscussion(). A row is deleted once it has been indexed,
     * or when its document can no longer be loaded. When a pass finds the table
     * empty, the migration is flagged as complete and the 'Index Migration'
     * scheduler entry is removed.
     *
     * The call returns early when diagnostics fail, when the migration is
     * already flagged complete, or when the migration lock file exists.
     *
     * @param int $max Optional. Maximum number of rows to process in this call.
     *                 Defaults to the indexer/batchMigrateDocument setting (500).
     */
    public function migrateDocuments($max=null)
    {
        global $default;

        $default->log->info(_kt('migrateDocuments: starting'));

        if (!$this->doesDiagnosticsPass(true))
        {
            $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
            return;
        }

        if (KTUtil::getSystemSetting('migrationComplete') == 'true')
        {
            $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
            return;
        }

        $config =& KTConfig::getSingleton();
        if (is_null($max))
        {
            $max = $config->get('indexer/batchMigrateDocument',500);
        }

        // Directory for the temporary text files; the same setting that
        // updateDocumentIndex() uses. This was previously left undefined.
        $tempPath = $config->get('urls/tmpDirectory');

        $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
        if (is_file($lockFile))
        {
            $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
            return;
        }
        touch($lockFile);

        $startTime = KTUtil::getSystemSetting('migrationStarted');
        if (is_null($startTime))
        {
            KTUtil::setSystemSetting('migrationStarted', time());
        }

        $maxLoops = 5;

        // Rows per pass. Never let this drop below 1: a LIMIT of 0 returns no
        // rows, which the loop below would read as "nothing left to migrate".
        $max = max(1, (int) ceil($max / $maxLoops));

        $start =KTUtil::getBenchmarkTime();
        $noDocs = false;
        $numDocs = 0;

        for($loop=0;$loop<$maxLoops;$loop++)
        {

            $sql = "SELECT
                        document_id, document_text
                    FROM
                        document_text
                    ORDER BY document_id
                    LIMIT $max";
            $result = DBUtil::getResultArray($sql);
            if (PEAR::isError($result))
            {
                $default->log->info(_kt('migrateDocuments: db error'));
                break;
            }

            $docs = count($result);
            if ($docs == 0)
            {
                $noDocs = true;
                break;
            }
            $numDocs += $docs;

            foreach($result as $docinfo)
            {
                // The id is interpolated into SQL below, so force it to an integer.
                $docId = (int) $docinfo['document_id'];

                $document = Document::get($docId);
                if (PEAR::isError($document) || is_null($document))
                {
                    $sql = "DELETE FROM document_text WHERE document_id=$docId";
                    DBUtil::runQuery($sql);
                    $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
                    continue;
                }

                $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();

                $targetFile = tempnam($tempPath, 'ktindexer');
                if ($targetFile === false)
                {
                    $default->log->error(sprintf(_kt('migrateDocuments: Cannot create a temporary file for document id %d'), $docId));
                    continue;
                }

                if (file_put_contents($targetFile, $docinfo['document_text']) === false)
                {
                    $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
                    // tempnam() already created the file; do not leave it behind.
                    @unlink($targetFile);
                    continue;
                }
                // free memory asap ;)
                unset($docinfo['document_text']);

                $title = $document->getName();

                $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);

                if ($indexStatus)
                {
                    $sql = "DELETE FROM document_text WHERE document_id=$docId";
                    DBUtil::runQuery($sql);
                }
                else
                {
                    $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
                }

                @unlink($targetFile);
            }
        }

        @unlink($lockFile);

        $time = KTUtil::getBenchmarkTime() - $start;

        KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
        KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);

        $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
        if ($noDocs)
        {
            $default->log->info(_kt('migrateDocuments: Completed!'));
            KTUtil::setSystemSetting('migrationComplete', 'true');
            schedulerUtil::deleteByName('Index Migration');
            $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
        }
    }

    /**
     * Index a document. The base class must override this function.
+ * + * @param int $docId + * @param string $textFile + */ + protected abstract function indexDocument($docId, $textFile, $title, $version); + + + public function updateDocumentIndex($docId, $text) + { + $config = KTConfig::getSingleton(); + $tempPath = $config->get("urls/tmpDirectory"); + $tempFile = tempnam($tempPath,'ud_'); + + file_put_contents($tempFile, $text); + + $document = Document::get($docId); + $title = $document->getDescription(); + $version = $document->getVersion(); + + $result = $this->indexDocument($docId, $tempFile, $title, $version); + + if (file_exists($tempFile)) + { + unlink($tempFile); + } + + return $result; + } + + /** + * Index a discussion. The base class must override this function. + * + * @param int $docId + */ + protected abstract function indexDiscussion($docId); + + /** + * Diagnose the indexer. e.g. Check that the indexing server is running. + * + */ + public abstract function diagnose(); + + /** + * Diagnose the extractors. + * + * @return array + */ + public function diagnoseExtractors() + { + $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php'); + $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php')); + + return $diagnosis; + } + + /** + * This is a refactored diagnose function. + * + * @param string $path + * @param string $class + * @param string $extension + * @return array + */ + private function _diagnose($path, $baseclass, $extension) + { + global $default; + + $diagnoses = array(); + + $dir = opendir(SearchHelper::correctPath($path)); + $extlen = - strlen($extension); + + while (($file = readdir($dir)) !== false) + { + if (substr($file,0,1) == '.') + { + continue; + } + if (substr($file,$extlen) != $extension) + { + $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension)); + continue; + } + + require_once($path . '/' . 
$file); + + $class = substr($file, 0, -8); + if (!class_exists($class)) + { + $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class)); + continue; + } + + if (!$this->isExtractorEnabled($class)) + { + $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class)); + continue; + } + + $extractor = new $class(); + if (!is_a($extractor, $baseclass)) + { + $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class)); + continue; + } + + $types = $extractor->getSupportedMimeTypes(); + if (empty($types)) + { + if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class)); + continue; + } + + $diagnosis=$extractor->diagnose(); + if (empty($diagnosis)) + { + continue; + } + $diagnoses[$class] = array( + 'name'=>$extractor->getDisplayName(), + 'diagnosis'=>$diagnosis + ); + + } + closedir($dir); + + return $diagnoses; + } + + + /** + * Register the extractor types. + * + * @param boolean $clear. Optional. Defaults to false. + */ + public function registerTypes($clear=false) + { + if ($clear) + { + $this->clearExtractors(); + } + $dir = opendir(SearchHelper::correctPath($this->extractorPath)); + while (($file = readdir($dir)) !== false) + { + if (substr($file,-17) == 'Extractor.inc.php') + { + require_once($this->extractorPath . '/' . $file); + $class = substr($file, 0, -8); + + if (!class_exists($class)) + { + // if the class does not exist, we can't do anything. + continue; + } + + $extractor = new $class; + if ($extractor instanceof DocumentExtractor) + { + $extractor->registerMimeTypes(); + } + } + } + closedir($dir); + } + + /** + * This is used as a possible obtimisation effort. It may be overridden in that case. 
+ * + * @param int $docId + * @param string $textFile + */ + protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version) + { + $this->indexDocument($docId, $textFile, $title, $version); + $this->indexDiscussion($docId); + } + + /** + * Remove the document from the queue. This is normally called when it has been processed. + * + * @param int $docid + */ + public static function unqueueDocument($docid, $reason=false, $level='debug') + { + $sql = "DELETE FROM index_files WHERE document_id=$docid"; + DBUtil::runQuery($sql); + if ($reason !== false) + { + global $default; + $default->log->$level("Indexer: removing document $docid from the queue - $reason"); + } + } + + /** + * Run a query on the index. + * + * @param string $query + * @return array + */ + public abstract function query($query); + + /** + * Converts an integer to a string that can be easily compared and reversed. + * + * @param int $int + * @return string + */ + public static function longToString($int) + { + $maxlen = 14; + + $a2z = array('a','b','c','d','e','f','g','h','i','j'); + $o29 = array('0','1','2','3','4','5','6','7','8','9'); + $l = str_pad('',$maxlen - strlen("$int"),'0') . $int; + + return str_replace($o29, $a2z, $l); + } + + /** + * Converts a string to an integer. + * + * @param string $str + * @return int + */ + public static function stringToLong($str) + { + $a2z = array('a','b','c','d','e','f','g','h','i','j'); + $o29 = array('0','1','2','3','4','5','6','7','8','9'); + + $int = str_replace($a2z, $o29, $str) + 0; + + return $int; + } + + /** + * Possibly we can optimise indexes. This method must be overriden. + * The new function must call the parent! + * + */ + public function optimise() + { + KTUtil::setSystemSetting('luceneOptimisationDate', time()); + } + + /** + * Shuts down the indexer + * + */ + public function shutdown() + { + // do nothing generally + } + + /** + * Returns the name of the indexer. 
+ * + * @return string + */ + public abstract function getDisplayName(); + + + /** + * Returns the number of non-deleted documents in the index. + * + * @return int + */ + public abstract function getDocumentsInIndex(); + + public abstract function isDocumentIndexed($documentId); + + /** + * Returns the path to the index directory + * + * @return string + */ + public function getIndexDirectory() + { + $config = KTConfig::getSingleton(); + $directory = $config->get('indexer/luceneDirectory'); + return $directory; + } +} + +?> -- libgit2 0.21.4