diff --git a/search2/indexing/indexerCore.inc.php b/search2/indexing/indexerCore.inc.php
index 48d9a19..7f63090 100755
--- a/search2/indexing/indexerCore.inc.php
+++ b/search2/indexing/indexerCore.inc.php
@@ -1,1862 +1,1862 @@
-.
- *
- * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
- * California 94120-7775, or email info@knowledgetree.com.
- *
- * The interactive user interfaces in modified source and object code versions
- * of this program must display Appropriate Legal Notices, as required under
- * Section 5 of the GNU General Public License version 3.
- *
- * In accordance with Section 7(b) of the GNU General Public License version 3,
- * these Appropriate Legal Notices must retain the display of the "Powered by
- * KnowledgeTree" logo and retain the original copyright notice. If the display of the
- * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
- * must display the words "Powered by KnowledgeTree" and retain the original
- * copyright notice.
- * Contributor( s): ______________________________________
- *
- */
-
-define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
-require_once('indexing/extractorCore.inc.php');
-require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
-require_once(KT_DIR . '/ktapi/ktapi.inc.php');
-
-class IndexerInconsistencyException extends Exception {};
-
-// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
-
-class QueryResultItem
-{
- protected $id;
- protected $title;
- protected $rank;
- protected $text;
- protected $fullpath;
-
- public function __construct($id, $title, $rank, $text, $fullpath)
- {
- $this->id = $id;
- $this->title = $title;
- $this->rank = $rank;
- $this->text = $text;
- $this->fullpath = $fullpath;
- }
-
- public function getId() { return $this->id; }
- public function getRealId() { return $this->id; }
-
- public function getIsProxy() { return $this instanceof ProxyResultItem; }
- public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder' ; }
- public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document' ; }
-
- public function setRank($value)
- {
- $this->rank = number_format($value,2,'.',',');
- }
-
- public function getIsLive()
- {
- return true;
- }
-
- public function setTitle($value)
- {
- $this->title = $value;
- }
-
- public function setText($value)
- {
- $this->text = $value;
- }
-
- public function getRelevance() { return (float) $this->rank; }
- public function getRank() { return $this->getRelevance(); }
- public function getText() { return (string) $this->text; }
- public function getTitle() { return (string) $this->title; }
- public function getFullPath() { return (string) $this->fullpath; }
-
- protected function __get($property)
- {
- if (empty($property))
- {
- return '';
- }
-
- $method = 'get' . $property;
- if (method_exists($this, $method))
- {
- return $this->$method();
- }
- return $this->getUnknown();
- }
-
- protected function getUnknown()
- {
- return _kt('n/a');
- }
-
- protected function __set($property, $value)
- {
- if (empty($property))
- {
- return '';
- }
-
- $method = 'set' . $property;
- if (method_exists($this, $method))
- {
- return $this->$method($value);
- }
- throw new Exception("Unknown property '$property' to set on QueryResultItem");
- }
-}
-
-class ProxyResultItem extends QueryResultItem
-{
- protected $proxy;
- protected $proxyId;
-
- public function __construct($proxyId, $proxy)
- {
- parent::__construct($proxyId, $proxy->getTitle, $proxy->getRank(), $proxy->getText(), $proxy->getFullPath());
- $this->proxyId = $proxyId;
- $this->proxy = $proxy;
- }
-
- public function getId() { return $this->proxyId; }
- public function getTitle() { return $this->proxy->getTitle(); }
- public function getRealId() { return $this->proxy->getId(); }
-
- protected function __get($property)
- {
- $method = 'get' . $property;
-
- if (method_exists($this, $method))
- {
- return $this->$method();
- }
- else
- {
- return $this->proxy->$method();
- }
- }
-
- protected function __set($property, $value)
- {
- $method = 'set' . $property;
- if (method_exists($this, $method))
- {
- return $this->$method($value);
- }
- else
- {
- return $this->proxy->$method($value);
- }
- }
-}
-
-class DocumentResultItem extends QueryResultItem
-{
- protected $filesize;
- protected $live;
- protected $version;
- protected $mimeType;
- protected $filename;
- protected $thumbnail; // TODO: if not null, gui can display a thumbnail
- protected $viewer; // TODO: if not null, a viewer can be used to view the document
- protected $document;
- protected $checkedOutUser;
- protected $dateCheckedout;
- protected $workflowState;
- protected $workflow;
- protected $modifiedBy;
- protected $dateModified;
- protected $createdBy;
- protected $dateCreated;
- protected $owner;
- protected $immutable;
- protected $deleted;
- protected $status;
- protected $folderId;
- protected $storagePath;
- protected $documentType;
- protected $mimeIconPath;
- protected $mimeDisplay;
- protected $oemDocumentNo;
-
- public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
- {
- parent::__construct($document_id, $title, $rank, $text, $fullpath);
- $this->live = true;
- $this->loadDocumentInfo();
- }
-
- // TODO: this is bad. must refactor to do the query on the group of documents.
- public function loadDocumentInfo()
- {
- global $default;
- $sql = "SELECT
- d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
- dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
- mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
- cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
- mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
- FROM
- documents d
- INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
- INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
- INNER JOIN mime_types mt ON dcv.mime_id=mt.id
- LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
- LEFT JOIN folders f ON f.id=d.folder_id
- LEFT JOIN users cou ON d.checked_out_user_id=cou.id
- LEFT JOIN workflows w ON dmv.workflow_id=w.id
- LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
- LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
- LEFT JOIN users mbu ON d.modified_user_id=mbu.id
- LEFT JOIN users cbu ON d.creator_id=cbu.id
- LEFT JOIN users ou ON d.owner_id=ou.id
- WHERE
- d.id=$this->id";
-
- $result = DBUtil::getOneResult($sql);
-
- if (PEAR::isError($result) || empty($result))
- {
- $this->live = false;
- if (PEAR::isError($result))
- {
- throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
- }
-
- $default->log->error('QueryResultItem: $result is null');
- $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
- $default->log->error('QueryResultItem: ' . $msg);
- // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
- throw new IndexerInconsistencyException(_kt($msg));
- }
-
- // document_id, relevance, text, title
-
- $this->documentType = $result['document_type'];
- $this->filename=$result['filename'];
- $this->filesize = KTUtil::filesizeToString($result['filesize']);
- $this->folderId = $result['folder_id'];
- $this->title = $result['title'];
-
- $this->createdBy = $result['createdbyuser'];
- $this->dateCreated = $result['created'];
-
- $this->modifiedBy = $result['modifiedbyuser'];
- $this->dateModified = $result['modified'];
-
- $this->checkedOutUser = $result['checkoutuser'];
- $this->dateCheckedout = $result['checkedout'];
-
- $this->owner = $result['owneruser'];
-
- $this->version = $result['major_version'] . '.' . $result['minor_version'];
-
- $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
-
- $this->workflow = $result['workflow'];
- $this->workflowState = $result['workflowstate'];
-
- $this->oemDocumentNo = $result['oem_no'];
- if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
-
- if (is_null($result['name']))
- {
- $this->fullpath = '(orphaned)';
- }
- else
- {
- $this->fullpath = $result['full_path'];
- }
-
- $this->mimeType = $result['mimetype'];
- $this->mimeIconPath = $result['mime_icon_path'];
- if (empty($this->mimeIconPath))
- {
- $this->mimeIconPath = 'unspecified_type';
- }
- $this->mimeDisplay = $result['mime_display'];
-
- $this->storagePath = $result['storage_path'];
- $this->status = Document::getStatusString($result['status_id']);
- }
-
- public function getDocumentID() { return $this->getId(); }
- public function getIsLive() { return (bool) $this->live; }
- public function getFilesize() { return $this->filesize; }
- public function getVersion() { return (string) $this->version; }
- public function getFilename() { return (string)$this->filename; }
- public function getFolderId() { return (int)$this->folderId; }
- public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
- public function getDocument() { return Document::get($this->id); }
- public function getIsAvailable() { return $this->Document->isLive(); }
- public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
- public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
- public function getWorkflowOnly() { return (string)$this->workflow; }
- public function getWorkflow() { return $this->getWorkflow(); }
- public function getWorkflowStateOnly() { return (string)$this->workflowState; }
- public function getWorkflowState() { return $this->getWorkflowStateOnly(); }
- public function getWorkflowAndState() {
- if (is_null($this->workflow))
- {
- return '';
- }
- return "$this->workflow - $this->workflowState";
- }
- public function getMimeType() { return (string) $this->mimeType; }
- public function getMimeIconPath() { return (string) $this->mimeIconPath; }
- public function getMimeDisplay() { return (string) $this->mimeDisplay; }
- public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
- public function getModifiedBy() { return (string) $this->modifiedBy; }
- public function getDateModified() { return (string) $this->dateModified; }
- public function getCreatedBy() { return (string) $this->createdBy; }
- public function getDateCreated() { return (string) $this->dateCreated; }
- public function getOwner() { return (string) $this->owner; }
- public function getOwnedBy() { return $this->getOwner(); }
- public function getIsImmutable() { return (bool) $this->immutable; }
- public function getImmutable() { return $this->getIsImmutable(); }
- public function getStatus() { return $this->status; }
- public function getStoragePath() { return $this->storagePath; }
- public function getDocumentType() { return $this->documentType; }
- public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }
- public function getCanBeReadByUser() {
- if (!$this->live)
- return false;
- if (Permission::userHasDocumentReadPermission($this->Document))
- return true;
- if (Permission::adminIsInAdminMode())
- return true;
- return false;
- }
-}
-
-class FolderResultItem extends QueryResultItem
-{
- protected $folder;
- protected $createdBy;
- protected $parentId;
-
- public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
- {
- parent::__construct($folder_id, $title, $rank, $text, $fullpath);
- $this->loadFolderInfo();
- }
-
- public function getFolderID() { return $this->getId(); }
- public function getParentID() { return $this->parentId; }
- public function getCreatedBy() { return $this->createdBy; }
- public function getMimeIconPath() { return 'folder'; }
- public function getFolder() { return Folder::get($this->getFolderID()); }
- public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }
-
- public function loadFolderInfo()
- {
- global $default;
- $folder = $this->getFolder();
- if (PEAR::isError($folder))
- {
- throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
- }
- $this->title = $folder->getName();
- $this->fullpath = '/' . $folder->getFullPath();
- $this->parentId = $folder->getParentId();
-
- $user = User::get($folder->getCreatorID());
- $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName();
- }
-
-}
-
-class DocumentShortcutResultItem extends ProxyResultItem
-{
- public function getDocumentID() { return $this->getId(); }
- public function getMimeIconPath() { return $this->proxy->getMimeIconPath() . '_shortcut'; }
-
-}
-
-class FolderShortcutResultItem extends ProxyResultItem
-{
- public function getFolderID() { return $this->getId(); }
- public function getMimeIconPath() { return 'folder_shortcut'; }
-
-}
-
-function MatchResultCompare($a, $b)
-{
- if ($a->Rank == $b->Rank) {
- return 0;
- }
- return ($a->Rank < $b->Rank) ? -1 : 1;
-}
-
-abstract class Indexer
-{
- /**
- * Cache of extractors
- *
- * @var array
- */
- private $extractorCache;
-
- /**
- * Indicates if the indexer will do logging.
- *
- * @var boolean
- */
- private $debug;
- /**
- * Cache on mime related hooks
- *
- * @var unknown_type
- */
- private $mimeHookCache;
- /**
- * Cache on general hooks.
- *
- * @var array
- */
- private $generalHookCache;
-
- /**
- * This is a path to the extractors.
- *
- * @var string
- */
- private $extractorPath;
- /**
- * This is a path to the hooks.
- *
- * @var string
- */
- private $hookPath;
-
- private $enabledExtractors;
-
- /**
- * Initialise the indexer
- *
- */
- protected function __construct()
- {
- $config = KTConfig::getSingleton();
-
- $this->extractorCache = array();
- $this->debug = $config->get('indexer/debug', true);
- $this->hookCache = array();
- $this->generalHookCache = array();
- $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
- $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
-
- $this->loadExtractorStatus();
- }
-
- /**
- * Get the list if enabled extractors
- *
- */
- private function loadExtractorStatus()
- {
- $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
- $rs = DBUtil::getResultArray($sql);
- $this->enabledExtractors = array();
- foreach($rs as $item)
- {
- $this->enabledExtractors[] = $item['name'];
- }
- }
-
- private function isExtractorEnabled($extractor)
- {
- return in_array($extractor, $this->enabledExtractors);
- }
-
- /**
- * Returns a reference to the main class
- *
- * @return Indexer
- */
- public static function get()
- {
- static $singleton = null;
-
- if (is_null($singleton))
- {
- $config = KTConfig::getSingleton();
- $classname = $config->get('indexer/coreClass');
-
- require_once('indexing/indexers/' . $classname . '.inc.php');
-
- if (!class_exists($classname))
- {
- throw new Exception("Class '$classname' does not exist.");
- }
-
- $singleton = new $classname;
- }
-
- return $singleton;
- }
-
- public abstract function deleteDocument($docid);
-
- /**
- * Remove the association of all extractors to mime types on the database.
- *
- */
- public function clearExtractors()
- {
- global $default;
-
- $sql = "update mime_types set extractor_id=null";
- DBUtil::runQuery($sql);
-
- $sql = "delete from mime_extractors";
- DBUtil::runQuery($sql);
-
- if ($this->debug) $default->log->debug('clearExtractors');
- }
-
- /**
- * lookup the name of the extractor class based on the mime type.
- *
- * @param string $type
- * @return string
- */
- public static function resolveExtractor($type)
- {
- global $default;
- $sql = "select extractor from mime_types where filetypes='$type'";
- $class = DBUtil::getOneResultKey($sql,'extractor');
- if (PEAR::isError($class))
- {
- $default->log->error("resolveExtractor: cannot resolve $type");
- return $class;
- }
- if ($this->debug) $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
- return $class;
- }
-
- /**
- * Return all the discussion text.
- *
- * @param int $docid
- * @return string
- */
- public static function getDiscussionText($docid)
- {
- $sql = "SELECT
- dc.subject, dc.body
- FROM
- discussion_threads dt
- INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
- WHERE
- dt.document_id=$docid";
- $result = DBUtil::getResultArray($sql);
- $text = '';
-
- foreach($result as $record)
- {
- $text .= $record['subject'] . "\n" . $record['body'] . "\n";
- }
-
- return $text;
- }
-
- /**
- * Schedule the indexing of a document.
- *
- * @param string $document
- * @param string $what
- */
- public static function index($document, $what='A')
- {
- global $default;
-
- if (is_numeric($document))
- {
- $document = Document::get($document+0);
- }
-
- if (PEAR::isError($document))
- {
- $default->log->error("index: Could not index document: " .$document->getMessage());
- return;
- }
-
- $document_id = $document->getId();
- $userid=$_SESSION['userID'];
- if (empty($userid)) $userid=1;
-
- // we dequeue the document so that there are no issues when enqueuing
- Indexer::unqueueDocument($document_id);
-
- // enqueue item
- $sql = "INSERT INTO index_files(document_id, user_id, what) VALUES($document_id, $userid, '$what')";
- DBUtil::runQuery($sql);
-
- $default->log->debug("index: Queuing indexing of $document_id");
-
- }
-
- private static function incrementCount()
- {
- // Get count from system settings
- $count = Indexer::getIndexedDocumentCount();
- $count = (int)$count + 1;
- Indexer::updateIndexedDocumentCount($count);
- }
-
- public static function getIndexedDocumentCount()
- {
- $count = KTUtil::getSystemSetting('indexedDocumentCount', 0);
- return (int) $count;
- }
-
- public static function updateIndexedDocumentCount($cnt = 0)
- {
- KTUtil::setSystemSetting('indexedDocumentCount', $cnt);
- }
-
- public static function reindexQueue()
- {
- $sql = "UPDATE index_files SET processdate = null";
- DBUtil::runQuery($sql);
- }
-
- public static function reindexDocument($documentId)
- {
- $sql = "UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId";
- DBUtil::runQuery($sql);
- }
-
-
-
- public static function indexAll()
- {
- $userid=$_SESSION['userID'];
- if (empty($userid)) $userid=1;
-
- $sql = "DELETE FROM index_files";
- DBUtil::runQuery($sql);
-
- $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)";
- DBUtil::runQuery($sql);
- }
-
- public static function indexFolder($folder)
- {
- $userid=$_SESSION['userID'];
- if (empty($userid)) $userid=1;
-
- if (!$folder instanceof Folder && !$folder instanceof FolderProxy)
- {
- throw new Exception('Folder expected');
- }
-
- $full_path = $folder->getFullPath();
-
- $sql = "INSERT INTO index_files(document_id, user_id, what) SELECT id, $userid, 'A' FROM documents WHERE full_path like '{$full_path}/%' AND status_id=1 and id not in (select document_id from index_files)";
- DBUtil::runQuery($sql);
- }
-
- /**
- * Clearout the scheduling of documents that no longer exist.
- *
- */
- public static function clearoutDeleted()
- {
- global $default;
-
- $sql = 'DELETE FROM
- index_files
- WHERE
- document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
- NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
- DBUtil::runQuery($sql);
-
- $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
- }
-
-
- /**
- * Check if a document is scheduled to be indexed
- *
- * @param mixed $document This may be a document or document id
- * @return boolean
- */
- public static function isDocumentScheduled($document)
- {
- if (is_numeric($document))
- {
- $docid = $document;
- }
- else if ($document instanceof Document)
- {
- $docid = $document->getId();
- }
- else
- {
- return false;
- }
- $sql = "SELECT 1 FROM index_files WHERE document_id=$docid";
- $result = DBUtil::getResultArray($sql);
- return count($result) > 0;
- }
-
- /**
- * Filters text removing redundant characters such as continuous newlines and spaces.
- *
- * @param string $filename
- */
- private function filterText($filename)
- {
- $content = file_get_contents($filename);
-
- $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
- $tgt = array("\n","\n",' ',' ',' ');
-
- // shrink what is being stored.
- do
- {
- $orig = $content;
- $content = preg_replace($src, $tgt, $content);
- } while ($content != $orig);
-
- return file_put_contents($filename, $content) !== false;
- }
-
- /**
- * Load hooks for text extraction process.
- *
- */
- private function loadExtractorHooks()
- {
- $this->generalHookCache = array();
- $this->mimeHookCache = array();
-
-
- $dir = opendir(SearchHelper::correctPath($this->hookPath));
- while (($file = readdir($dir)) !== false)
- {
- if (substr($file,-12) == 'Hook.inc.php')
- {
- require_once($this->hookPath . '/' . $file);
- $class = substr($file, 0, -8);
-
- if (!class_exists($class))
- {
- continue;
- }
-
- $hook = new $class;
- if (!($class instanceof ExtractorHook))
- {
- continue;
- }
-
- $mimeTypes = $hook->registerMimeTypes();
- if (is_null($mimeTypes))
- {
- $this->generalHookCache[] = & $hook;
- }
- else
- {
- foreach($mimeTypes as $type)
- {
- $this->mimeHookCache[$type][] = & $hook;
- }
- }
-
- }
- }
- closedir($dir);
- }
-
- /**
- * This is a refactored function to execute the hooks.
- *
- * @param DocumentExtractor $extractor
- * @param string $phase
- * @param string $mimeType Optional. If set, indicates which hooks must be used, else assume general.
- */
- private function executeHook($extractor, $phase, $mimeType = null)
- {
- $hooks = array();
- if (is_null($mimeType))
- {
- $hooks = $this->generalHookCache;
- }
- else
- {
- if (array_key_exists($mimeType, $this->mimeHookCache))
- {
- $hooks = $this->mimeHookCache[$mimeType];
- }
- }
- if (empty($hooks))
- {
- return;
- }
-
- foreach($hooks as $hook)
- {
- $hook->$phase($extractor);
- }
- }
-
- private function doesDiagnosticsPass($simple=false)
- {
- global $default;
-
- $config =& KTConfig::getSingleton();
- // create a index log lock file in case there are errors, and we don't need to log them forever!
- // this function will create the lockfile if an error is detected. It will be removed as soon
- // as the problems with the indexer are removed.
- $lockFile = $config->get('cache/cacheDirectory') . '/index.log.lock';
-
- $diagnosis = $this->diagnose();
- if (!is_null($diagnosis))
- {
- if (!is_file($lockFile))
- {
- $default->log->error(_kt('Indexer problem: ') . $diagnosis);
- }
- touch($lockFile);
- return false;
- }
-
- if ($simple)
- {
- return true;
- }
-
- $diagnosis = $this->diagnoseExtractors();
- if (!empty($diagnosis))
- {
- if (!is_file($lockFile))
- {
- foreach($diagnosis as $diag)
- {
- $default->log->error(sprintf(_kt('%s problem: %s'), $diag['name'],$diag['diagnosis']));
- }
- }
- touch($lockFile);
- return false;
- }
-
- if (is_file($lockFile))
- {
- $default->log->info(_kt('Issues with the indexer have been resolved!'));
- unlink($lockFile);
- }
-
- return true;
- }
-
- /**
- * This does the initial mime type association between mime types and text extractors
- *
- */
- public function checkForRegisteredTypes()
- {
- global $default;
-
- // we are only doing this once!
- $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
- if ($initRegistered)
- {
- return;
- }
- if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
-
- $date = date('Y-m-d H:i');
- $sql = "UPDATE scheduler_tasks SET run_time='$date'";
- DBUtil::runQuery($sql);
-
- $this->registerTypes(true);
-
- $disable = array(
- 'windows'=>array('PSExtractor'),
- 'unix' => array()
- );
-
- $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];
-
- if (!empty($disableForOS))
- {
- $disableForOS = '\'' . implode("','", $disableForOS) .'\'';
-
- $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($disableForOS)";
- DBUtil::runQuery($sql);
- $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
- }
- $this->loadExtractorStatus();
-
- if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
- KTUtil::setSystemSetting('mimeTypesRegistered', true);
- }
-
- private function updatePendingDocumentStatus($documentId, $message, $level)
- {
- $this->indexingHistory .= "\n" . $level . ': ' . $message;
- $message = sanitizeForSQL($this->indexingHistory);
- $sql = "UPDATE index_files SET status_msg='$message' WHERE document_id=$documentId";
- DBUtil::runQuery($sql);
- }
-
- private $restartCurrentBatch = false;
-
- public function restartBatch()
- {
- $this->restartCurrentBatch = true;
- }
-
- /**
- *
- * @param int $documentId
- * @param string $message
- * @param string $level This may be info, error, debug
- */
- private function logPendingDocumentInfoStatus($documentId, $message, $level)
- {
- $this->updatePendingDocumentStatus($documentId, $message, $level);
- global $default;
-
- switch ($level)
- {
- case 'debug':
- if ($this->debug)
- {
- $default->log->debug($message);
- }
- break;
- default:
- $default->log->$level($message);
- }
- }
-
-
-
- public function getExtractor($extractorClass)
- {
- if (empty($extractorClass))
- {
- return null;
- }
-
- $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
- if (!file_exists($includeFile))
- {
- throw new Exception("Extractor file does not exist: $includeFile");
- }
-
- require_once($includeFile);
-
- if (!class_exists($extractorClass))
- {
- throw new Exception("Extractor '$classname' not defined in file: $includeFile");
- }
-
- $extractor = new $extractorClass();
-
- if (!($extractor instanceof DocumentExtractor))
- {
- throw new Exception("Class $classname was expected to be of type DocumentExtractor");
- }
-
- return $extractor;
- }
-
- public static function getIndexingQueue($problemItemsOnly=true)
- {
-
- if ($problemItemsOnly)
- {
- $sql = "SELECT
- iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
- FROM
- index_files iff
- INNER JOIN documents d ON iff.document_id=d.id
- INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
- INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
- INNER JOIN mime_types mt ON dcv.mime_id=mt.id
- LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
- WHERE
- (iff.status_msg IS NOT NULL AND iff.status_msg <> '') AND d.status_id=1
- ORDER BY indexdate ";
- }
- else
- {
- $sql = "SELECT
- iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
- FROM
- index_files iff
- INNER JOIN documents d ON iff.document_id=d.id
- INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
- INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
- INNER JOIN mime_types mt ON dcv.mime_id=mt.id
- LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
- WHERE
- (iff.status_msg IS NULL or iff.status_msg = '') AND d.status_id=1
- ORDER BY indexdate ";
- }
- $aResult = DBUtil::getResultArray($sql);
-
- return $aResult;
- }
-
- public static function getPendingIndexingQueue()
- {
- return Indexer::getIndexingQueue(false);
- }
-
- public function updateIndexStats()
- {
- $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
-
- $noOptimisation = false;
- if ($optimisationDate == '')
- {
- $optimisationDate = _kt('N/A');
- $optimisationPeriod = $optimisationDate;
- }
- else
- {
- $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
- $noOptimisation = $optimisationPeriod['days'] > 2;
- $optimisationPeriod = $optimisationPeriod['str'];
- $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
- }
-
- $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
- if ($indexingDate == '')
- {
- $indexingDate = _kt('N/A');
- $indexingPeriod = $indexingDate;
- }
- else
- {
- $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
- $indexingDate = date('Y-m-d H:i:s', $indexingDate);
- }
-
- $index = Indexer::get();
- $docsInIndex = $index->getDocumentsInIndex();
-
- // we are only interested in documents that are active
- $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where d.status_id=1";
- $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');
-
- $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null or i.status_msg <> '') and d.status_id=1";
- $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');
-
- $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
- $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');
-
- if ($docsInRepository == 0)
- {
- $indexingCoverage = '0.00%';
- $queueCoverage = $indexingCoverage;
- }
- else
- {
- // compute indexing coverage
- $indexingCoverage = _kt('Not Available');
- if (is_numeric($docsInIndex))
- {
- $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
- $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
- }
-
- // compute queue coverage
- $queueCoverage = _kt('Not Available');
- if (is_numeric($docsInQueue))
- {
- $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
- $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
- }
- }
-
-
- $stats = array(
- 'optimisationDate'=>$optimisationDate,
- 'optimisationPeriod'=>$optimisationPeriod,
- 'indexingDate'=>$indexingDate,
- 'indexingPeriod'=>$indexingPeriod,
- 'docsInIndex'=>$docsInIndex,
- 'docsInQueue'=>$docsInQueue,
- 'errorsInQueue'=>$errorsInQueue,
- 'docsInRepository'=>$docsInRepository,
- 'indexingCoverage'=>$indexingCoverage,
- 'queueCoverage'=>$queueCoverage,
- 'noOptimisation'=>$noOptimisation
- );
-
- KTUtil::setSystemSetting('indexerStats', serialize($stats));
-
- $indexer = Indexer::get();
-
- $diagnosis = $indexer->diagnose();
- KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));
-
- $extractorDiagnosis = $indexer->diagnoseExtractors();
-
- KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
- }
-
- /**
- * The main function that may be called repeatedly to index documents.
- *
- * @param int $max Default 20
- */
- public function indexDocuments($max=null)
- {
- global $default;
- $config =& KTConfig::getSingleton();
-
- /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
- if (is_file($indexLockFile))
- {
- $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
- $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
- return;
- }
- touch($indexLockFile);*/
-
-
- $this->checkForRegisteredTypes();
-
- if ($this->debug) $default->log->debug('indexDocuments: start');
- if (!$this->doesDiagnosticsPass())
- {
- //unlink($indexLockFile);
- if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
- return;
- }
-
- if (is_null($max))
- {
- $max = $config->get('indexer/batchDocuments',20);
- }
-
- $this->loadExtractorHooks();
-
- Indexer::clearoutDeleted();
-
- $date = date('Y-m-d H:i:s');
- // identify the indexers that must run
- // mysql specific limit!
- $sql = "SELECT
- iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
- FROM
- index_files iff
- INNER JOIN documents d ON iff.document_id=d.id
- INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
- INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
- INNER JOIN mime_types mt ON dcv.mime_id=mt.id
- LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
- WHERE
- (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
- ORDER BY indexdate
- LIMIT $max";
- $result = DBUtil::getResultArray($sql);
- if (PEAR::isError($result))
- {
- //unlink($indexLockFile);
- if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
- return;
- }
- KTUtil::setSystemSetting('luceneIndexingDate', time());
-
- // bail if no work to do
- if (count($result) == 0)
- {
- //unlink($indexLockFile);
- if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
- return;
- }
-
- // identify any documents that need indexing and mark them
- // so they are not taken in a followup run
- $ids = array();
- foreach($result as $docinfo)
- {
- $ids[] = $docinfo['document_id'];
- }
-
- // mark the documents as being processed
-
- $ids=implode(',',$ids);
- $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
- DBUtil::runQuery($sql);
-
- $extractorCache = array();
- $storageManager = KTStorageManagerUtil::getSingleton();
-
- $tempPath = $config->get("urls/tmpDirectory");
-
- foreach($result as $docinfo)
- {
- // increment indexed documents count
- Indexer::incrementCount();
-
- $docId=$docinfo['document_id'];
- $extension=$docinfo['filetypes'];
- $mimeType=$docinfo['mimetypes'];
- $extractorClass=$docinfo['extractor'];
- $indexDocument = in_array($docinfo['what'], array('A','C'));
- $indexDiscussion = in_array($docinfo['what'], array('A','D'));
- $this->indexingHistory = '';
-
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
-
- if (empty($extractorClass))
- {
- /*
-
- if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
-
- */
- if ($indexDiscussion)
- {
- $indexDocument = false;
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
- }
- else
- {
- Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
- continue;
- }
- }
- else
- {
- /*
-
- If an extractor is available, we must ensure it is enabled.
-
- */
-
- if (!$this->isExtractorEnabled($extractorClass))
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
- continue;
- }
- }
-
- if ($this->debug)
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
- }
-
- $document = Document::get($docId);
- if (PEAR::isError($document))
- {
- Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
- continue;
- }
-
- if ($this->restartCurrentBatch)
- {
- Indexer::unqueueDocument($docId);
- Indexer::index($docId, 'A');
- continue;
- }
-
-
- $filename = $document->getFileName();
- if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
- {
- Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
- continue;
- }
-
- $removeFromQueue = true;
- if ($indexDocument)
- {
- if (array_key_exists($extractorClass, $extractorCache))
- {
- $extractor = $extractorCache[$extractorClass];
- }
- else
- {
- $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
- }
-
- if (!($extractor instanceof DocumentExtractor))
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
- continue;
- }
-
-
-
- $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
- $sourceFile = $storageManager->temporaryFile($document);
-
- if (empty($sourceFile) || !is_file($sourceFile))
- {
- Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
- continue;
- }
-
- if ($extractor->needsIntermediateSourceFile())
- {
- //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
-
- $intermediate = $tempPath . '/'. $docId . '.' . $extension;
- $result = @copy($sourceFile, $intermediate);
- if ($result === false)
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
- // problem. lets try again later. probably permission related. log the issue.
- continue;
- }
- $sourceFile = $intermediate;
- }
-
- $targetFile = tempnam($tempPath, 'ktindexer');
-
- $extractor->setSourceFile($sourceFile);
- $extractor->setMimeType($mimeType);
- $extractor->setExtension($extension);
- $extractor->setTargetFile($targetFile);
- $extractor->setDocument($document);
- $extractor->setIndexingStatus(null);
- $extractor->setExtractionStatus(null);
-
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
-
- $this->executeHook($extractor, 'pre_extract');
- $this->executeHook($extractor, 'pre_extract', $mimeType);
- $removeFromQueue = false;
-
- if ($extractor->extractTextContent())
- {
- // the extractor may need to create another target file
- $targetFile = $extractor->getTargetFile();
-
- $extractor->setExtractionStatus(true);
- $this->executeHook($extractor, 'pre_index');
- $this->executeHook($extractor, 'pre_index', $mimeType);
-
- $title = $document->getName();
- if ($indexDiscussion)
- {
- if (!$this->filterText($targetFile))
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
- }
- else
- {
- $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
- $removeFromQueue = $indexStatus;
- if (!$indexStatus)
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
- }
-
- $extractor->setIndexingStatus($indexStatus);
- }
- }
- else
- {
- if (!$this->filterText($targetFile))
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
- }
- else
- {
- $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
- $removeFromQueue = $indexStatus;
-
- if (!$indexStatus)
- {
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
- $this->logPendingDocumentInfoStatus($docId, '', 'error');
- }
-
- $extractor->setIndexingStatus($indexStatus);
- }
- }
-
- $this->executeHook($extractor, 'post_index', $mimeType);
- $this->executeHook($extractor, 'post_index');
- }
- else
- {
- $extractor->setExtractionStatus(false);
- $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
- $this->logPendingDocumentInfoStatus($docId, '', 'error');
- }
-
- $this->executeHook($extractor, 'post_extract', $mimeType);
- $this->executeHook($extractor, 'post_extract');
-
- if ($extractor->needsIntermediateSourceFile())
- {
- @unlink($sourceFile);
- }
-
- @unlink($targetFile);
-
- }
- else
- {
- $indexStatus = $this->indexDiscussion($docId);
- $removeFromQueue = $indexStatus;
- }
-
- if ($removeFromQueue)
- {
- Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
- }
- else
- {
- if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
- }
- }
- if ($this->debug) $default->log->debug('indexDocuments: done');
- //unlink($indexLockFile);
- }
-
- public function migrateDocuments($max=null)
- {
- global $default;
-
- $default->log->info(_kt('migrateDocuments: starting'));
-
- if (!$this->doesDiagnosticsPass(true))
- {
- $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
- return;
- }
-
- if (KTUtil::getSystemSetting('migrationComplete') == 'true')
- {
- $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
- return;
- }
-
- $config =& KTConfig::getSingleton();
- if (is_null($max))
- {
- $max = $config->get('indexer/batchMigrateDocument',500);
- }
-
- $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
- if (is_file($lockFile))
- {
- $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
- return;
- }
- touch($lockFile);
-
- $startTime = KTUtil::getSystemSetting('migrationStarted');
- if (is_null($startTime))
- {
- KTUtil::setSystemSetting('migrationStarted', time());
- }
-
- $maxLoops = 5;
-
- $max = ceil($max / $maxLoops);
-
- $start =KTUtil::getBenchmarkTime();
- $noDocs = false;
- $numDocs = 0;
-
- for($loop=0;$loop<$maxLoops;$loop++)
- {
-
- $sql = "SELECT
- document_id, document_text
- FROM
- document_text
- ORDER BY document_id
- LIMIT $max";
- $result = DBUtil::getResultArray($sql);
- if (PEAR::isError($result))
- {
- $default->log->info(_kt('migrateDocuments: db error'));
- break;
- }
-
- $docs = count($result);
- if ($docs == 0)
- {
- $noDocs = true;
- break;
- }
- $numDocs += $docs;
-
- foreach($result as $docinfo)
- {
- $docId = $docinfo['document_id'];
-
- $document = Document::get($docId);
- if (PEAR::isError($document) || is_null($document))
- {
- $sql = "DELETE FROM document_text WHERE document_id=$docId";
- DBUtil::runQuery($sql);
- $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
- continue;
- }
-
- $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
-
- $targetFile = tempnam($tempPath, 'ktindexer');
-
- if (file_put_contents($targetFile, $docinfo['document_text']) === false)
- {
- $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
- continue;
- }
- // free memory asap ;)
- unset($docinfo['document_text']);
-
- $title = $document->getName();
-
- $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
-
- if ($indexStatus)
- {
- $sql = "DELETE FROM document_text WHERE document_id=$docId";
- DBUtil::runQuery($sql);
- }
- else
- {
- $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
- }
-
- @unlink($targetFile);
- }
- }
-
- @unlink($lockFile);
-
- $time = KTUtil::getBenchmarkTime() - $start;
-
- KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
- KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
-
- $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
- if ($noDocs)
- {
- $default->log->info(_kt('migrateDocuments: Completed!'));
- KTUtil::setSystemSetting('migrationComplete', 'true');
- schedulerUtil::deleteByName('Index Migration');
- $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
- }
- }
-
- /**
- * Index a document. The base class must override this function.
- *
- * @param int $docId
- * @param string $textFile
- */
- protected abstract function indexDocument($docId, $textFile, $title, $version);
-
-
- public function updateDocumentIndex($docId, $text)
- {
- $config = KTConfig::getSingleton();
- $tempPath = $config->get("urls/tmpDirectory");
- $tempFile = tempnam($tempPath,'ud_');
-
- file_put_contents($tempFile, $text);
-
- $document = Document::get($docId);
- $title = $document->getDescription();
- $version = $document->getVersion();
-
- $result = $this->indexDocument($docId, $tempFile, $title, $version);
-
- if (file_exists($tempFile))
- {
- unlink($tempFile);
- }
-
- return $result;
- }
-
- /**
- * Index a discussion. The base class must override this function.
- *
- * @param int $docId
- */
- protected abstract function indexDiscussion($docId);
-
- /**
- * Diagnose the indexer. e.g. Check that the indexing server is running.
- *
- */
- public abstract function diagnose();
-
- /**
- * Diagnose the extractors.
- *
- * @return array
- */
- public function diagnoseExtractors()
- {
- $diagnosis = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
- $diagnosis = array_merge($diagnosis, $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php'));
-
- return $diagnosis;
- }
-
- /**
- * This is a refactored diagnose function.
- *
- * @param string $path
- * @param string $class
- * @param string $extension
- * @return array
- */
- private function _diagnose($path, $baseclass, $extension)
- {
- global $default;
-
- $diagnoses = array();
-
- $dir = opendir(SearchHelper::correctPath($path));
- $extlen = - strlen($extension);
-
- while (($file = readdir($dir)) !== false)
- {
- if (substr($file,0,1) == '.')
- {
- continue;
- }
- if (substr($file,$extlen) != $extension)
- {
- $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
- continue;
- }
-
- require_once($path . '/' . $file);
-
- $class = substr($file, 0, -8);
- if (!class_exists($class))
- {
- $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
- continue;
- }
-
- if (!$this->isExtractorEnabled($class))
- {
- $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
- continue;
- }
-
- $extractor = new $class();
- if (!is_a($extractor, $baseclass))
- {
- $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type DocumentExtractor"), $class));
- continue;
- }
-
- $types = $extractor->getSupportedMimeTypes();
- if (empty($types))
- {
- if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
- continue;
- }
-
- $diagnosis=$extractor->diagnose();
- if (empty($diagnosis))
- {
- continue;
- }
- $diagnoses[$class] = array(
- 'name'=>$extractor->getDisplayName(),
- 'diagnosis'=>$diagnosis
- );
-
- }
- closedir($dir);
-
- return $diagnoses;
- }
-
-
- /**
- * Register the extractor types.
- *
- * @param boolean $clear. Optional. Defaults to false.
- */
- public function registerTypes($clear=false)
- {
- if ($clear)
- {
- $this->clearExtractors();
- }
- $dir = opendir(SearchHelper::correctPath($this->extractorPath));
- while (($file = readdir($dir)) !== false)
- {
- if (substr($file,-17) == 'Extractor.inc.php')
- {
- require_once($this->extractorPath . '/' . $file);
- $class = substr($file, 0, -8);
-
- if (!class_exists($class))
- {
- // if the class does not exist, we can't do anything.
- continue;
- }
-
- $extractor = new $class;
- if ($extractor instanceof DocumentExtractor)
- {
- $extractor->registerMimeTypes();
- }
- }
- }
- closedir($dir);
- }
-
- /**
- * This is used as a possible obtimisation effort. It may be overridden in that case.
- *
- * @param int $docId
- * @param string $textFile
- */
- protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
- {
- $this->indexDocument($docId, $textFile, $title, $version);
- $this->indexDiscussion($docId);
- }
-
- /**
- * Remove the document from the queue. This is normally called when it has been processed.
- *
- * @param int $docid
- */
- public static function unqueueDocument($docid, $reason=false, $level='debug')
- {
- $sql = "DELETE FROM index_files WHERE document_id=$docid";
- DBUtil::runQuery($sql);
- if ($reason !== false)
- {
- global $default;
- $default->log->$level("Indexer: removing document $docid from the queue - $reason");
- }
- }
-
- /**
- * Run a query on the index.
- *
- * @param string $query
- * @return array
- */
- public abstract function query($query);
-
- /**
- * Converts an integer to a string that can be easily compared and reversed.
- *
- * @param int $int
- * @return string
- */
- public static function longToString($int)
- {
- $maxlen = 14;
-
- $a2z = array('a','b','c','d','e','f','g','h','i','j');
- $o29 = array('0','1','2','3','4','5','6','7','8','9');
- $l = str_pad('',$maxlen - strlen("$int"),'0') . $int;
-
- return str_replace($o29, $a2z, $l);
- }
-
- /**
- * Converts a string to an integer.
- *
- * @param string $str
- * @return int
- */
- public static function stringToLong($str)
- {
- $a2z = array('a','b','c','d','e','f','g','h','i','j');
- $o29 = array('0','1','2','3','4','5','6','7','8','9');
-
- $int = str_replace($a2z, $o29, $str) + 0;
-
- return $int;
- }
-
- /**
- * Possibly we can optimise indexes. This method must be overriden.
- * The new function must call the parent!
- *
- */
- public function optimise()
- {
- KTUtil::setSystemSetting('luceneOptimisationDate', time());
- }
-
- /**
- * Shuts down the indexer
- *
- */
- public function shutdown()
- {
- // do nothing generally
- }
-
- /**
- * Returns the name of the indexer.
- *
- * @return string
- */
- public abstract function getDisplayName();
-
-
- /**
- * Returns the number of non-deleted documents in the index.
- *
- * @return int
- */
- public abstract function getDocumentsInIndex();
-
- public abstract function isDocumentIndexed($documentId);
-
- /**
- * Returns the path to the index directory
- *
- * @return string
- */
- public function getIndexDirectory()
- {
- $config = KTConfig::getSingleton();
- $directory = $config->get('indexer/luceneDirectory');
- return $directory;
- }
-}
-
-?>
+.
+ *
+ * You can contact KnowledgeTree Inc., PO Box 7775 #87847, San Francisco,
+ * California 94120-7775, or email info@knowledgetree.com.
+ *
+ * The interactive user interfaces in modified source and object code versions
+ * of this program must display Appropriate Legal Notices, as required under
+ * Section 5 of the GNU General Public License version 3.
+ *
+ * In accordance with Section 7(b) of the GNU General Public License version 3,
+ * these Appropriate Legal Notices must retain the display of the "Powered by
+ * KnowledgeTree" logo and retain the original copyright notice. If the display of the
+ * logo is not reasonably feasible for technical reasons, the Appropriate Legal Notices
+ * must display the words "Powered by KnowledgeTree" and retain the original
+ * copyright notice.
+ * Contributor( s): ______________________________________
+ *
+ */
+
+define('SEARCH2_INDEXER_DIR',realpath(dirname(__FILE__)) . '/');
+require_once('indexing/extractorCore.inc.php');
+require_once(KT_DIR . '/plugins/ktcore/scheduler/schedulerUtil.php');
+require_once(KT_DIR . '/ktapi/ktapi.inc.php');
+
+class IndexerInconsistencyException extends Exception {};
+
+// TODO: Query Result Items code should be moved into the Search section. It has less to do with indexing...
+
+class QueryResultItem
+{
+ protected $id;
+ protected $title;
+ protected $rank;
+ protected $text;
+ protected $fullpath;
+
+ public function __construct($id, $title, $rank, $text, $fullpath)
+ {
+ $this->id = $id;
+ $this->title = $title;
+ $this->rank = $rank;
+ $this->text = $text;
+ $this->fullpath = $fullpath;
+ }
+
+ public function getId() { return $this->id; }
+ public function getRealId() { return $this->id; }
+
+ public function getIsProxy() { return $this instanceof ProxyResultItem; }
+ public function getIsFolder() { return substr(get_class($this), 0, 6) == 'Folder' ; }
+ public function getIsDocument() { return substr(get_class($this), 0, 8) == 'Document' ; }
+
+ public function setRank($value) // stores the rank formatted to two decimal places
+ {
+ $this->rank = number_format($value,2,'.',''); // no thousands separator: getRelevance() casts this string to float, and "1,234.50" would cast to 1.0
+ }
+
+ public function getIsLive()
+ {
+ return true;
+ }
+
+ public function setTitle($value)
+ {
+ $this->title = $value;
+ }
+
+ public function setText($value)
+ {
+ $this->text = $value;
+ }
+
+ public function getRelevance() { return (float) $this->rank; }
+ public function getRank() { return $this->getRelevance(); }
+ public function getText() { return (string) $this->text; }
+ public function getTitle() { return (string) $this->title; }
+ public function getFullPath() { return (string) $this->fullpath; }
+
+ protected function __get($property)
+ {
+ if (empty($property))
+ {
+ return '';
+ }
+
+ $method = 'get' . $property;
+ if (method_exists($this, $method))
+ {
+ return $this->$method();
+ }
+ return $this->getUnknown();
+ }
+
+ protected function getUnknown()
+ {
+ return _kt('n/a');
+ }
+
+ protected function __set($property, $value)
+ {
+ if (empty($property))
+ {
+ return '';
+ }
+
+ $method = 'set' . $property;
+ if (method_exists($this, $method))
+ {
+ return $this->$method($value);
+ }
+ throw new Exception("Unknown property '$property' to set on QueryResultItem");
+ }
+}
+
+class ProxyResultItem extends QueryResultItem
+{
+ protected $proxy;
+ protected $proxyId;
+
+ public function __construct($proxyId, $proxy) // $proxy is the result item this object wraps
+ {
+ parent::__construct($proxyId, $proxy->getTitle(), $proxy->getRank(), $proxy->getText(), $proxy->getFullPath()); // getTitle() must be invoked as a method; it was previously read as a property
+ $this->proxyId = $proxyId;
+ $this->proxy = $proxy;
+ }
+
+ public function getId() { return $this->proxyId; }
+ public function getTitle() { return $this->proxy->getTitle(); }
+ public function getRealId() { return $this->proxy->getId(); }
+
+ protected function __get($property)
+ {
+ $method = 'get' . $property;
+
+ if (method_exists($this, $method))
+ {
+ return $this->$method();
+ }
+ else
+ {
+ return $this->proxy->$method();
+ }
+ }
+
+ protected function __set($property, $value)
+ {
+ $method = 'set' . $property;
+ if (method_exists($this, $method))
+ {
+ return $this->$method($value);
+ }
+ else
+ {
+ return $this->proxy->$method($value);
+ }
+ }
+}
+
+class DocumentResultItem extends QueryResultItem
+{
+ protected $filesize;
+ protected $live;
+ protected $version;
+ protected $mimeType;
+ protected $filename;
+ protected $thumbnail; // TODO: if not null, gui can display a thumbnail
+ protected $viewer; // TODO: if not null, a viewer can be used to view the document
+ protected $document;
+ protected $checkedOutUser;
+ protected $dateCheckedout;
+ protected $workflowState;
+ protected $workflow;
+ protected $modifiedBy;
+ protected $dateModified;
+ protected $createdBy;
+ protected $dateCreated;
+ protected $owner;
+ protected $immutable;
+ protected $deleted;
+ protected $status;
+ protected $folderId;
+ protected $storagePath;
+ protected $documentType;
+ protected $mimeIconPath;
+ protected $mimeDisplay;
+ protected $oemDocumentNo;
+
+ public function __construct($document_id, $rank=null, $title=null, $text=null, $fullpath = null)
+ {
+ parent::__construct($document_id, $title, $rank, $text, $fullpath);
+ $this->live = true;
+ $this->loadDocumentInfo();
+ }
+
+ // TODO: this is bad. must refactor to do the query on the group of documents.
+ public function loadDocumentInfo()
+ {
+ global $default;
+ $sql = "SELECT
+ d.folder_id, f.full_path, f.name, dcv.size as filesize, dcv.major_version,
+ dcv.minor_version, dcv.filename, cou.name as checkoutuser, w.human_name as workflow, ws.human_name as workflowstate,
+ mt.mimetypes as mimetype, md.mime_doc as mimedoc, d.checkedout, mbu.name as modifiedbyuser, d.modified,
+ cbu.name as createdbyuser, ou.name as owneruser, d.immutable, d.status_id, d.created,dcv.storage_path, dtl.name as document_type,
+ mt.icon_path as mime_icon_path, mt.friendly_name as mime_display, d.oem_no, dmv.name as title
+ FROM
+ documents d
+ INNER JOIN document_metadata_version dmv ON d.metadata_version_id = dmv.id
+ INNER JOIN document_content_version dcv ON dmv.content_version_id = dcv.id
+ INNER JOIN mime_types mt ON dcv.mime_id=mt.id
+ LEFT JOIN document_types_lookup dtl ON dtl.id=dmv.document_type_id
+ LEFT JOIN folders f ON f.id=d.folder_id
+ LEFT JOIN users cou ON d.checked_out_user_id=cou.id
+ LEFT JOIN workflows w ON dmv.workflow_id=w.id
+ LEFT JOIN workflow_states ws ON dmv.workflow_state_id = ws.id
+ LEFT JOIN mime_documents md ON mt.mime_document_id = md.id
+ LEFT JOIN users mbu ON d.modified_user_id=mbu.id
+ LEFT JOIN users cbu ON d.creator_id=cbu.id
+ LEFT JOIN users ou ON d.owner_id=ou.id
+ WHERE
+ d.id=$this->id";
+
+ $result = DBUtil::getOneResult($sql);
+
+ if (PEAR::isError($result) || empty($result))
+ {
+ $this->live = false;
+ if (PEAR::isError($result))
+ {
+ throw new Exception('Database exception! There appears to be an error in the system: ' .$result->getMessage());
+ }
+
+ $default->log->error('QueryResultItem: $result is null');
+ $msg = 'The database did not have a record matching the result from the document indexer. This may occur if there is an inconsistency between the document indexer and the repository. The indexer needs to be repaired.';
+ $default->log->error('QueryResultItem: ' . $msg);
+ // TODO: repair process where we scan documents in index, and delete those for which there is nothing in the repository
+ throw new IndexerInconsistencyException(_kt($msg));
+ }
+
+ // document_id, relevance, text, title
+
+ $this->documentType = $result['document_type'];
+ $this->filename=$result['filename'];
+ $this->filesize = KTUtil::filesizeToString($result['filesize']);
+ $this->folderId = $result['folder_id'];
+ $this->title = $result['title'];
+
+ $this->createdBy = $result['createdbyuser'];
+ $this->dateCreated = $result['created'];
+
+ $this->modifiedBy = $result['modifiedbyuser'];
+ $this->dateModified = $result['modified'];
+
+ $this->checkedOutUser = $result['checkoutuser'];
+ $this->dateCheckedout = $result['checkedout'];
+
+ $this->owner = $result['owneruser'];
+
+ $this->version = $result['major_version'] . '.' . $result['minor_version'];
+
+ $this->immutable = ($result['immutable'] + 0)?_kt('Immutable'):'';
+
+ $this->workflow = $result['workflow'];
+ $this->workflowState = $result['workflowstate'];
+
+ $this->oemDocumentNo = $result['oem_no'];
+ if (empty($this->oemDocumentNo)) $this->oemDocumentNo = 'n/a';
+
+ if (is_null($result['name']))
+ {
+ $this->fullpath = '(orphaned)';
+ }
+ else
+ {
+ $this->fullpath = $result['full_path'];
+ }
+
+ $this->mimeType = $result['mimetype'];
+ $this->mimeIconPath = $result['mime_icon_path'];
+ if (empty($this->mimeIconPath))
+ {
+ $this->mimeIconPath = 'unspecified_type';
+ }
+ $this->mimeDisplay = $result['mime_display'];
+
+ $this->storagePath = $result['storage_path'];
+ $this->status = Document::getStatusString($result['status_id']);
+ }
+
+ public function getDocumentID() { return $this->getId(); }
+ public function getIsLive() { return (bool) $this->live; }
+ public function getFilesize() { return $this->filesize; }
+ public function getVersion() { return (string) $this->version; }
+ public function getFilename() { return (string)$this->filename; }
+ public function getFolderId() { return (int)$this->folderId; }
+ public function getOemDocumentNo() { return (string) $this->oemDocumentNo; }
+ public function getDocument() { return Document::get($this->id); }
+ public function getIsAvailable() { return $this->Document->isLive(); }
+ public function getCheckedOutUser() { return (string) $this->checkedOutUser; }
+ public function getCheckedOutByr() { return $this->getCheckedOutUser(); }
+ public function getWorkflowOnly() { return (string)$this->workflow; }
+ public function getWorkflow() { return $this->getWorkflowOnly(); } // alias of getWorkflowOnly(); previously called itself and recursed without end
+ public function getWorkflowStateOnly() { return (string)$this->workflowState; }
+ public function getWorkflowState() { return $this->getWorkflowStateOnly(); }
+ public function getWorkflowAndState() {
+ if (is_null($this->workflow))
+ {
+ return '';
+ }
+ return "$this->workflow - $this->workflowState";
+ }
+ public function getMimeType() { return (string) $this->mimeType; }
+ public function getMimeIconPath() { return (string) $this->mimeIconPath; }
+ public function getMimeDisplay() { return (string) $this->mimeDisplay; }
+ public function getDateCheckedOut() { return (string) $this->dateCheckedout; }
+ public function getModifiedBy() { return (string) $this->modifiedBy; }
+ public function getDateModified() { return (string) $this->dateModified; }
+ public function getCreatedBy() { return (string) $this->createdBy; }
+ public function getDateCreated() { return (string) $this->dateCreated; }
+ public function getOwner() { return (string) $this->owner; }
+ public function getOwnedBy() { return $this->getOwner(); }
+ public function getIsImmutable() { return (bool) $this->immutable; }
+ public function getImmutable() { return $this->getIsImmutable(); }
+ public function getStatus() { return $this->status; }
+ public function getStoragePath() { return $this->storagePath; }
+ public function getDocumentType() { return $this->documentType; }
+ public function getPermissions() { return KTAPI_Document::get_permission_string($this->Document); }
+ public function getCanBeReadByUser() {
+ if (!$this->live)
+ return false;
+ if (Permission::userHasDocumentReadPermission($this->Document))
+ return true;
+ if (Permission::adminIsInAdminMode())
+ return true;
+ return false;
+ }
+}
+
+class FolderResultItem extends QueryResultItem
+{
+ protected $folder;
+ protected $createdBy;
+ protected $parentId;
+
+ public function __construct($folder_id, $rank=null, $title=null, $text=null, $fullpath = null)
+ {
+ parent::__construct($folder_id, $title, $rank, $text, $fullpath);
+ $this->loadFolderInfo();
+ }
+
+ public function getFolderID() { return $this->getId(); }
+ public function getParentID() { return $this->parentId; }
+ public function getCreatedBy() { return $this->createdBy; }
+ public function getMimeIconPath() { return 'folder'; }
+ public function getFolder() { return Folder::get($this->getFolderID()); }
+ public function getPermissions() { return KTAPI_Folder::get_permission_string($this->Folder); }
+
+ public function loadFolderInfo() // fills title, fullpath, parentId and createdBy from the folder returned by getFolder()
+ {
+ global $default;
+ $folder = $this->getFolder();
+ if (PEAR::isError($folder))
+ {
+ throw new Exception('Database exception! There appears to be an error in the system: ' .$folder->getMessage()); // report the lookup error itself; $result was never defined in this method
+ }
+ $this->title = $folder->getName();
+ $this->fullpath = '/' . $folder->getFullPath();
+ $this->parentId = $folder->getParentId();
+
+ $user = User::get($folder->getCreatorID());
+ $this->createdBy = (PEAR::isError($user))?_kt('Unknown'):$user->getName(); // fall back to a placeholder when the creator cannot be loaded
+ }
+
+}
+
+class DocumentShortcutResultItem extends ProxyResultItem
+{
+ public function getDocumentID() { return $this->getId(); }
+ public function getMimeIconPath() { return $this->proxy->getMimeIconPath() . '_shortcut'; }
+
+}
+
+class FolderShortcutResultItem extends ProxyResultItem
+{
+ public function getFolderID() { return $this->getId(); }
+ public function getMimeIconPath() { return 'folder_shortcut'; }
+
+}
+
+function MatchResultCompare($a, $b)
+{
+ if ($a->Rank == $b->Rank) {
+ return 0;
+ }
+ return ($a->Rank < $b->Rank) ? -1 : 1;
+}
+
+abstract class Indexer
+{
+ /**
+ * Cache of extractors
+ *
+ * @var array
+ */
+ private $extractorCache;
+
+ /**
+ * Indicates if the indexer will do logging.
+ *
+ * @var boolean
+ */
+ private $debug;
+ /**
+ * Cache on mime related hooks
+ *
+ * @var unknown_type
+ */
+ private $mimeHookCache;
+ /**
+ * Cache on general hooks.
+ *
+ * @var array
+ */
+ private $generalHookCache;
+
+ /**
+ * This is a path to the extractors.
+ *
+ * @var string
+ */
+ private $extractorPath;
+ /**
+ * This is a path to the hooks.
+ *
+ * @var string
+ */
+ private $hookPath;
+
+ private $enabledExtractors;
+
+ /**
+  * Initialise the indexer from the 'indexer/*' configuration settings and
+  * load the list of enabled extractors.
+  */
+ protected function __construct()
+ {
+     $config = KTConfig::getSingleton();
+
+     $this->extractorCache = array();
+     $this->debug = $config->get('indexer/debug', true);
+     // executeHook() passes $mimeHookCache to array_key_exists(), so it must be an array
+     // even before loadExtractorHooks() has run.
+     $this->mimeHookCache = array();
+     $this->generalHookCache = array();
+     $this->extractorPath = $config->get('indexer/extractorPath', 'extractors');
+     $this->hookPath = $config->get('indexer/extractorHookPath','extractorHooks');
+
+     $this->loadExtractorStatus();
+ }
+
+ /**
+  * Get the list of enabled extractors.
+  *
+  * Stores the name of every mime_extractors row with active=1 in $this->enabledExtractors.
+  * The list is left empty when the query fails.
+  */
+ private function loadExtractorStatus()
+ {
+     $this->enabledExtractors = array();
+
+     $sql = "SELECT id, name FROM mime_extractors WHERE active=1";
+     $rs = DBUtil::getResultArray($sql);
+     if (PEAR::isError($rs) || !is_array($rs))
+     {
+         // A failed query gives no rows to iterate over.
+         return;
+     }
+
+     foreach ($rs as $item)
+     {
+         $this->enabledExtractors[] = $item['name'];
+     }
+ }
+
+ /**
+  * Check whether an extractor name appears in the enabled list.
+  *
+  * @param string $extractor
+  * @return boolean
+  */
+ private function isExtractorEnabled($extractor)
+ {
+     $enabledNames = $this->enabledExtractors;
+
+     return in_array($extractor, $enabledNames);
+ }
+
+ /**
+  * Returns the shared indexer instance, creating it on first use.
+  *
+  * The concrete class is named by the 'indexer/coreClass' setting and is loaded
+  * from 'indexing/indexers/<class>.inc.php'.
+  *
+  * @return Indexer
+  * @throws Exception When the configured class is not defined after the include.
+  */
+ public static function get()
+ {
+     static $singleton = null;
+
+     if (!is_null($singleton))
+     {
+         return $singleton;
+     }
+
+     $classname = KTConfig::getSingleton()->get('indexer/coreClass');
+
+     require_once('indexing/indexers/' . $classname . '.inc.php');
+
+     if (!class_exists($classname))
+     {
+         throw new Exception("Class '$classname' does not exist.");
+     }
+
+     $singleton = new $classname;
+
+     return $singleton;
+ }
+
+ /**
+  * Implemented by the concrete indexer class.
+  * Presumably removes the given document from the index - confirm against the implementations.
+  *
+  * @param int $docid
+  */
+ public abstract function deleteDocument($docid);
+
+ /**
+  * Remove the association of all extractors to mime types on the database,
+  * then delete every row from mime_extractors.
+  */
+ public function clearExtractors()
+ {
+     global $default;
+
+     // Detach the mime types first, then drop the extractor rows themselves.
+     $statements = array(
+         "update mime_types set extractor_id=null",
+         "delete from mime_extractors",
+     );
+     foreach ($statements as $statement)
+     {
+         DBUtil::runQuery($statement);
+     }
+
+     if ($this->debug)
+     {
+         $default->log->debug('clearExtractors');
+     }
+ }
+
+ /**
+  * Lookup the name of the extractor class based on the mime type.
+  *
+  * @param string $type Matched against the mime_types.filetypes column.
+  * @return mixed The extractor value from the database, or the PEAR error returned by the lookup.
+  */
+ public static function resolveExtractor($type)
+ {
+     global $default;
+
+     // NOTE(review): $type is interpolated into the SQL unescaped; callers must pass a trusted value.
+     $sql = "select extractor from mime_types where filetypes='$type'";
+     $class = DBUtil::getOneResultKey($sql,'extractor');
+     if (PEAR::isError($class))
+     {
+         $default->log->error("resolveExtractor: cannot resolve $type");
+         return $class;
+     }
+
+     // This method is static, so there is no $this; read the debug flag from the
+     // same configuration setting the constructor uses.
+     $debug = KTConfig::getSingleton()->get('indexer/debug', true);
+     if ($debug)
+     {
+         $default->log->debug(sprintf(_kt("resolveExtractor: Resolved '%s' from mime type '%s'."), $class, $type));
+     }
+
+     return $class;
+ }
+
+ /**
+  * Return all the discussion text for a document.
+  *
+  * Each comment contributes its subject and its body, each followed by a newline.
+  *
+  * @param int $docid
+  * @return string
+  */
+ public static function getDiscussionText($docid)
+ {
+     $sql = "SELECT
+ dc.subject, dc.body
+ FROM
+ discussion_threads dt
+ INNER JOIN discussion_comments dc ON dc.thread_id=dt.id AND dc.id BETWEEN dt.first_comment_id AND dt.last_comment_id
+ WHERE
+ dt.document_id=$docid";
+
+     $discussion = '';
+     foreach (DBUtil::getResultArray($sql) as $comment)
+     {
+         $discussion .= "{$comment['subject']}\n{$comment['body']}\n";
+     }
+
+     return $discussion;
+ }
+
+ /**
+  * Schedule the indexing of a document.
+  *
+  * Any existing queue entry for the document is removed before the new one is inserted.
+  * The entry is attributed to the session user, or to user id 1 when the session has none.
+  *
+  * @param mixed $document A document object, or a numeric document id.
+  * @param string $what Stored in index_files.what; defaults to 'A'.
+  */
+ public static function index($document, $what='A')
+ {
+     global $default;
+
+     if (is_numeric($document))
+     {
+         // "+0" turns a numeric string into a number before the lookup.
+         $document = Document::get($document+0);
+     }
+
+     if (PEAR::isError($document))
+     {
+         $default->log->error("index: Could not index document: " .$document->getMessage());
+         return;
+     }
+
+     $queuedId = $document->getId();
+
+     $queuedBy = $_SESSION['userID'];
+     if (empty($queuedBy))
+     {
+         $queuedBy = 1;
+     }
+
+     // we dequeue the document so that there are no issues when enqueuing
+     Indexer::unqueueDocument($queuedId);
+
+     // enqueue item
+     DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) VALUES($queuedId, $queuedBy, '$what')");
+
+     $default->log->debug("index: Queuing indexing of $queuedId");
+ }
+
+ /**
+  * Add one to the indexed-document counter kept in the system settings.
+  */
+ private static function incrementCount()
+ {
+     $current = (int) Indexer::getIndexedDocumentCount();
+
+     Indexer::updateIndexedDocumentCount($current + 1);
+ }
+
+ /**
+  * Read the 'indexedDocumentCount' system setting.
+  *
+  * @return int The stored count, 0 when the setting is absent.
+  */
+ public static function getIndexedDocumentCount()
+ {
+     return (int) KTUtil::getSystemSetting('indexedDocumentCount', 0);
+ }
+
+ /**
+  * Store a new value in the 'indexedDocumentCount' system setting.
+  *
+  * @param int $cnt Optional. Defaults to 0.
+  */
+ public static function updateIndexedDocumentCount($cnt = 0)
+ {
+     $settingName = 'indexedDocumentCount';
+
+     KTUtil::setSystemSetting($settingName, $cnt);
+ }
+
+ /**
+  * Clear processdate on every index_files row, so that indexDocuments() selects them again.
+  */
+ public static function reindexQueue()
+ {
+     DBUtil::runQuery("UPDATE index_files SET processdate = null");
+ }
+
+ /**
+  * Clear processdate and status_msg on the queue entry of a single document.
+  *
+  * @param int $documentId
+  */
+ public static function reindexDocument($documentId)
+ {
+     DBUtil::runQuery("UPDATE index_files SET processdate=null, status_msg=null WHERE document_id=$documentId");
+ }
+
+
+
+ /**
+  * Empty the indexing queue and re-queue every document with status_id=1.
+  *
+  * Entries are attributed to the session user, or to user id 1 when the session has none.
+  */
+ public static function indexAll()
+ {
+     $queuedBy = $_SESSION['userID'];
+     if (empty($queuedBy))
+     {
+         $queuedBy = 1;
+     }
+
+     DBUtil::runQuery("DELETE FROM index_files");
+
+     DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) SELECT id, $queuedBy, 'A' FROM documents WHERE status_id=1 and id not in (select document_id from index_files)");
+ }
+
+ /**
+  * Queue every document with status_id=1 under a folder's full path that is not already queued.
+  *
+  * @param Folder|FolderProxy $folder
+  * @throws Exception When $folder is neither a Folder nor a FolderProxy.
+  */
+ public static function indexFolder($folder)
+ {
+     $queuedBy = $_SESSION['userID'];
+     if (empty($queuedBy))
+     {
+         $queuedBy = 1;
+     }
+
+     $isFolder = ($folder instanceof Folder) || ($folder instanceof FolderProxy);
+     if (!$isFolder)
+     {
+         throw new Exception('Folder expected');
+     }
+
+     $pathPrefix = $folder->getFullPath();
+
+     DBUtil::runQuery("INSERT INTO index_files(document_id, user_id, what) SELECT id, $queuedBy, 'A' FROM documents WHERE full_path like '{$pathPrefix}/%' AND status_id=1 and id not in (select document_id from index_files)");
+ }
+
+ /**
+  * Clear out the scheduling of documents that no longer exist.
+  *
+  * Deletes queue entries whose document has status_id=3 or has no row in the documents table.
+  */
+ public static function clearoutDeleted()
+ {
+     global $default;
+
+     $purgeSql = 'DELETE FROM
+ index_files
+ WHERE
+ document_id in (SELECT d.id FROM documents AS d WHERE d.status_id=3) OR
+ NOT EXISTS(SELECT index_files.document_id FROM documents WHERE index_files.document_id=documents.id)';
+
+     DBUtil::runQuery($purgeSql);
+
+     $default->log->debug("Indexer::clearoutDeleted: removed documents from indexing queue that have been deleted");
+ }
+
+
+ /**
+  * Check if a document is scheduled to be indexed.
+  *
+  * @param mixed $document This may be a document or document id
+  * @return boolean False for any other argument type, otherwise whether index_files has a row for the document.
+  */
+ public static function isDocumentScheduled($document)
+ {
+     if (!is_numeric($document) && !($document instanceof Document))
+     {
+         return false;
+     }
+
+     $docid = is_numeric($document) ? $document : $document->getId();
+
+     $matches = DBUtil::getResultArray("SELECT 1 FROM index_files WHERE document_id=$docid");
+
+     return count($matches) > 0;
+ }
+
+ /**
+  * Filters text removing redundant characters such as continuous newlines and spaces.
+  *
+  * Every carriage return, newline and tab ends up as a space, and runs of spaces are
+  * collapsed to a single space. The file is rewritten in place.
+  *
+  * @param string $filename
+  * @return boolean True when the filtered text was written back, false on a read, regex or write failure.
+  */
+ private function filterText($filename)
+ {
+     $content = file_get_contents($filename);
+     if ($content === false)
+     {
+         // Unreadable file: report failure instead of overwriting it with an empty string.
+         return false;
+     }
+
+     $src = array("([\r\n])","([\n][\n])","([\n])","([\t])",'([ ][ ])');
+     $tgt = array("\n","\n",' ',' ',' ');
+
+     // shrink what is being stored.
+     do
+     {
+         $orig = $content;
+         $content = preg_replace($src, $tgt, $content);
+         if ($content === null)
+         {
+             // preg_replace() signals an internal error with null.
+             return false;
+         }
+         // Strict comparison: a loose != compares numeric-looking strings as numbers
+         // (e.g. "    10" and "  10"), which would end the loop before the text is stable.
+     } while ($content !== $orig);
+
+     return file_put_contents($filename, $content) !== false;
+ }
+
+ /**
+  * Load hooks for text extraction process.
+  *
+  * Scans the hook directory for '*Hook.inc.php' files, instantiates the class named after each
+  * file and keeps the instances that are ExtractorHook objects. Hooks whose registerMimeTypes()
+  * returns null go into the general cache; the others are filed under each mime type they return.
+  */
+ private function loadExtractorHooks()
+ {
+     $this->generalHookCache = array();
+     $this->mimeHookCache = array();
+
+     $dir = opendir(SearchHelper::correctPath($this->hookPath));
+     if ($dir === false)
+     {
+         // No directory handle means nothing to read; leave both caches empty.
+         return;
+     }
+
+     while (($file = readdir($dir)) !== false)
+     {
+         if (substr($file,-12) != 'Hook.inc.php')
+         {
+             continue;
+         }
+
+         require_once($this->hookPath . '/' . $file);
+         // Strip the trailing '.inc.php' to get the class name.
+         $class = substr($file, 0, -8);
+
+         if (!class_exists($class))
+         {
+             continue;
+         }
+
+         $hook = new $class;
+         // Test the instance; the class name is a string and can never be an instanceof anything.
+         if (!($hook instanceof ExtractorHook))
+         {
+             continue;
+         }
+
+         // Hooks are stored by value: binding array entries by reference to the loop
+         // variable would leave every entry pointing at the last hook created.
+         $mimeTypes = $hook->registerMimeTypes();
+         if (is_null($mimeTypes))
+         {
+             $this->generalHookCache[] = $hook;
+         }
+         else
+         {
+             foreach($mimeTypes as $type)
+             {
+                 $this->mimeHookCache[$type][] = $hook;
+             }
+         }
+     }
+     closedir($dir);
+ }
+
+ /**
+  * Call one phase method on every applicable hook.
+  *
+  * @param DocumentExtractor $extractor Passed to each hook.
+  * @param string $phase Name of the hook method to invoke.
+  * @param string $mimeType Optional. If set, only the hooks registered for that mime type run; otherwise the general hooks run.
+  */
+ private function executeHook($extractor, $phase, $mimeType = null)
+ {
+     if (is_null($mimeType))
+     {
+         $selected = $this->generalHookCache;
+     }
+     elseif (array_key_exists($mimeType, $this->mimeHookCache))
+     {
+         $selected = $this->mimeHookCache[$mimeType];
+     }
+     else
+     {
+         $selected = array();
+     }
+
+     if (empty($selected))
+     {
+         return;
+     }
+
+     foreach ($selected as $callback)
+     {
+         $callback->$phase($extractor);
+     }
+ }
+
+ /**
+  * Run the indexer diagnostics and, unless $simple is set, the extractor diagnostics.
+  *
+  * A lock file in the cache directory keeps the same problem from being logged on every run:
+  * problems are logged only while the lock file is absent, it is touched whenever a problem is
+  * found, and it is removed again once the full set of diagnostics passes.
+  *
+  * @param boolean $simple Optional. When true, only diagnose() is consulted.
+  * @return boolean True when no problem was reported.
+  */
+ private function doesDiagnosticsPass($simple=false)
+ {
+     global $default;
+
+     $config = KTConfig::getSingleton();
+     $lockPath = $config->get('cache/cacheDirectory') . '/index.log.lock';
+
+     $indexerProblem = $this->diagnose();
+     if (!is_null($indexerProblem))
+     {
+         if (!is_file($lockPath))
+         {
+             $default->log->error(_kt('Indexer problem: ') . $indexerProblem);
+         }
+         touch($lockPath);
+         return false;
+     }
+
+     if ($simple)
+     {
+         return true;
+     }
+
+     $extractorProblems = $this->diagnoseExtractors();
+     if (!empty($extractorProblems))
+     {
+         if (!is_file($lockPath))
+         {
+             foreach ($extractorProblems as $problem)
+             {
+                 $default->log->error(sprintf(_kt('%s problem: %s'), $problem['name'],$problem['diagnosis']));
+             }
+         }
+         touch($lockPath);
+         return false;
+     }
+
+     if (is_file($lockPath))
+     {
+         $default->log->info(_kt('Issues with the indexer have been resolved!'));
+         unlink($lockPath);
+     }
+
+     return true;
+ }
+
+ /**
+  * This does the initial mime type association between mime types and text extractors.
+  *
+  * Runs only while the 'mimeTypesRegistered' system setting is unset. It resets the run time of
+  * all scheduler tasks, re-registers the extractor types from scratch, deactivates the extractors
+  * listed for the current OS and reloads the enabled-extractor list.
+  */
+ public function checkForRegisteredTypes()
+ {
+     global $default;
+
+     // we are only doing this once!
+     $initRegistered = KTUtil::getSystemSetting('mimeTypesRegistered', false);
+     if ($initRegistered)
+     {
+         return;
+     }
+     if ($this->debug) $default->log->debug('checkForRegisteredTypes: start');
+
+     $date = date('Y-m-d H:i');
+     $sql = "UPDATE scheduler_tasks SET run_time='$date'";
+     DBUtil::runQuery($sql);
+
+     $this->registerTypes(true);
+
+     $disable = array(
+         'windows'=>array('PSExtractor'),
+         'unix' => array()
+     );
+
+     $disableForOS = OS_WINDOWS?$disable['windows']:$disable['unix'];
+
+     if (!empty($disableForOS))
+     {
+         // Keep the name list intact so each disabled extractor can be logged by name.
+         $quotedNames = '\'' . implode("','", $disableForOS) .'\'';
+
+         $sql = "UPDATE mime_extractors SET active=0 WHERE name in ($quotedNames)";
+         DBUtil::runQuery($sql);
+
+         foreach ($disableForOS as $extractor)
+         {
+             $default->log->info("checkForRegisteredTypes: disabled '$extractor'");
+         }
+     }
+     $this->loadExtractorStatus();
+
+     if ($this->debug) $default->log->debug('checkForRegisteredTypes: done');
+     KTUtil::setSystemSetting('mimeTypesRegistered', true);
+ }
+
+ /**
+  * Append a line to the running indexing history and store the whole history
+  * as the status message of the document's queue entry.
+  *
+  * @param int $documentId
+  * @param string $message
+  * @param string $level Prefixed to the message in the history.
+  */
+ private function updatePendingDocumentStatus($documentId, $message, $level)
+ {
+     $this->indexingHistory .= "\n{$level}: {$message}";
+
+     $escapedHistory = sanitizeForSQL($this->indexingHistory);
+
+     DBUtil::runQuery("UPDATE index_files SET status_msg='{$escapedHistory}' WHERE document_id={$documentId}");
+ }
+
+ /**
+  * Set by restartBatch(); while true, indexDocuments() re-queues each remaining
+  * document of the batch instead of indexing it.
+  *
+  * @var boolean
+  */
+ private $restartCurrentBatch = false;
+
+ /**
+  * Request that the documents remaining in the current batch be re-queued.
+  */
+ public function restartBatch()
+ {
+     $this->restartCurrentBatch = (bool) true;
+ }
+
+ /**
+  * Record a message against a queued document and send it to the log.
+  *
+  * The message is always stored in the queue entry; 'debug' messages reach the log only
+  * when debugging is enabled, any other level is logged unconditionally.
+  *
+  * @param int $documentId
+  * @param string $message
+  * @param string $level This may be info, error, debug
+  */
+ private function logPendingDocumentInfoStatus($documentId, $message, $level)
+ {
+     $this->updatePendingDocumentStatus($documentId, $message, $level);
+     global $default;
+
+     if ($level == 'debug')
+     {
+         if ($this->debug)
+         {
+             $default->log->debug($message);
+         }
+         return;
+     }
+
+     // The level doubles as the name of the logger method.
+     $default->log->$level($message);
+ }
+
+
+
+ /**
+  * Load and instantiate an extractor class.
+  *
+  * The class is expected in SEARCH2_INDEXER_DIR . 'extractors/<class>.inc.php'.
+  *
+  * @param string $extractorClass
+  * @return DocumentExtractor|null Null when no class name is given.
+  * @throws Exception When the file is missing, does not define the class, or the class is not a DocumentExtractor.
+  */
+ public function getExtractor($extractorClass)
+ {
+     if (empty($extractorClass))
+     {
+         return null;
+     }
+
+     $includeFile = SEARCH2_INDEXER_DIR . 'extractors/' . $extractorClass . '.inc.php';
+     if (!file_exists($includeFile))
+     {
+         throw new Exception("Extractor file does not exist: $includeFile");
+     }
+
+     require_once($includeFile);
+
+     if (!class_exists($extractorClass))
+     {
+         throw new Exception("Extractor '$extractorClass' not defined in file: $includeFile");
+     }
+
+     $extractor = new $extractorClass();
+
+     if (!($extractor instanceof DocumentExtractor))
+     {
+         throw new Exception("Class $extractorClass was expected to be of type DocumentExtractor");
+     }
+
+     return $extractor;
+ }
+
+ /**
+  * List the queue entries of documents with status_id=1, ordered by indexdate.
+  *
+  * @param boolean $problemItemsOnly Optional. True (default) returns the entries that carry a
+  *        status message; false returns the entries without one.
+  * @return mixed The value returned by DBUtil::getResultArray().
+  */
+ public static function getIndexingQueue($problemItemsOnly=true)
+ {
+     // The two listings differ only in this predicate.
+     if ($problemItemsOnly)
+     {
+         $statusFilter = "(iff.status_msg IS NOT NULL AND iff.status_msg <> '')";
+     }
+     else
+     {
+         $statusFilter = "(iff.status_msg IS NULL or iff.status_msg = '')";
+     }
+
+     $sql = "SELECT
+ iff.document_id, iff.indexdate, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what, iff.status_msg, dcv.filename
+ FROM
+ index_files iff
+ INNER JOIN documents d ON iff.document_id=d.id
+ INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
+ INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
+ INNER JOIN mime_types mt ON dcv.mime_id=mt.id
+ LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
+ WHERE
+ $statusFilter AND d.status_id=1
+ ORDER BY indexdate ";
+
+     return DBUtil::getResultArray($sql);
+ }
+
+ /**
+  * List the queue entries that carry no status message.
+  *
+  * @return mixed The value returned by getIndexingQueue(false).
+  */
+ public static function getPendingIndexingQueue()
+ {
+     $problemItemsOnly = false;
+
+     return Indexer::getIndexingQueue($problemItemsOnly);
+ }
+
+ /**
+  * Recompute the indexer statistics and diagnostics and store them in the system settings.
+  *
+  * Writes three serialised settings: 'indexerStats' (dates, counts and coverage percentages),
+  * 'indexerDiagnostics' and 'extractorDiagnostics'.
+  */
+ public function updateIndexStats()
+ {
+     $optimisationDate = KTUtil::getSystemSetting('luceneOptimisationDate', '');
+
+     $noOptimisation = false;
+     if ($optimisationDate == '')
+     {
+         $optimisationDate = _kt('N/A');
+         $optimisationPeriod = $optimisationDate;
+     }
+     else
+     {
+         $optimisationPeriod = KTUtil::computePeriodToDate($optimisationDate, null, true);
+         // Flag the index as unoptimised once the last optimisation is more than two days old.
+         $noOptimisation = $optimisationPeriod['days'] > 2;
+         $optimisationPeriod = $optimisationPeriod['str'];
+         $optimisationDate = date('Y-m-d H:i:s', $optimisationDate);
+     }
+
+     $indexingDate = KTUtil::getSystemSetting('luceneIndexingDate', '');
+     if ($indexingDate == '')
+     {
+         $indexingDate = _kt('N/A');
+         $indexingPeriod = $indexingDate;
+     }
+     else
+     {
+         $indexingPeriod = KTUtil::computePeriodToDate($indexingDate);
+         $indexingDate = date('Y-m-d H:i:s', $indexingDate);
+     }
+
+     $indexer = Indexer::get();
+     $docsInIndex = $indexer->getDocumentsInIndex();
+
+     // we are only interested in documents that are active
+     $sql = "SELECT count(*) as docsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is null or i.status_msg = '') and d.status_id=1";
+     $docsInQueue = DBUtil::getOneResultKey($sql, 'docsInQueue');
+
+     // An entry is in error only when its status message is both set and non-empty
+     // (the same test getIndexingQueue() uses); with OR, empty messages would count as errors too.
+     $sql = "SELECT count(*) as errorsInQueue FROM index_files i inner join documents d on i.document_id = d.id where (i.status_msg is not null and i.status_msg <> '') and d.status_id=1";
+     $errorsInQueue = DBUtil::getOneResultKey($sql, 'errorsInQueue');
+
+     $sql = "SELECT count(*) as docsInRepository FROM documents where status_id=1";
+     $docsInRepository = DBUtil::getOneResultKey($sql, 'docsInRepository');
+
+     if ($docsInRepository == 0)
+     {
+         // Avoid dividing by zero on an empty repository.
+         $indexingCoverage = '0.00%';
+         $queueCoverage = $indexingCoverage;
+     }
+     else
+     {
+         // compute indexing coverage
+         $indexingCoverage = _kt('Not Available');
+         if (is_numeric($docsInIndex))
+         {
+             $indexingCoverage = ($docsInIndex * 100) / $docsInRepository;
+             $indexingCoverage = number_format($indexingCoverage, 2, '.',',') . '%';
+         }
+
+         // compute queue coverage
+         $queueCoverage = _kt('Not Available');
+         if (is_numeric($docsInQueue))
+         {
+             $queueCoverage = ($docsInQueue * 100) / $docsInRepository;
+             $queueCoverage = number_format($queueCoverage, 2, '.',',') . '%';
+         }
+     }
+
+     $stats = array(
+         'optimisationDate'=>$optimisationDate,
+         'optimisationPeriod'=>$optimisationPeriod,
+         'indexingDate'=>$indexingDate,
+         'indexingPeriod'=>$indexingPeriod,
+         'docsInIndex'=>$docsInIndex,
+         'docsInQueue'=>$docsInQueue,
+         'errorsInQueue'=>$errorsInQueue,
+         'docsInRepository'=>$docsInRepository,
+         'indexingCoverage'=>$indexingCoverage,
+         'queueCoverage'=>$queueCoverage,
+         'noOptimisation'=>$noOptimisation
+     );
+
+     KTUtil::setSystemSetting('indexerStats', serialize($stats));
+
+     $diagnosis = $indexer->diagnose();
+     KTUtil::setSystemSetting('indexerDiagnostics', serialize($diagnosis));
+
+     $extractorDiagnosis = $indexer->diagnoseExtractors();
+
+     KTUtil::setSystemSetting('extractorDiagnostics', serialize($extractorDiagnosis));
+ }
+
+ /**
+ * The main function that may be called repeatedly to index documents.
+ *
+ * Each call takes one batch from the index_files queue (entries never processed, or last
+ * processed more than a day ago), stamps the batch with the current time, and then for every
+ * entry extracts the text with the mime type's extractor and hands it to the index. Entries
+ * that complete, or that can never be indexed, are removed from the queue; entries that fail
+ * stay queued with the collected messages in status_msg.
+ *
+ * @param int $max Optional. Batch size; when null the 'indexer/batchDocuments' setting is used (default 20).
+ */
+ public function indexDocuments($max=null)
+ {
+ global $default;
+ $config =& KTConfig::getSingleton();
+
+ /*$indexLockFile = $config->get('cache/cacheDirectory') . '/main.index.lock';
+ if (is_file($indexLockFile))
+ {
+ $default->log->info('indexDocuments: main.index.lock seems to exist. it could be that the indexing is still underway.');
+ $default->log->info('indexDocuments: Remove "' . $indexLockFile . '" if the indexing is not running or extend the frequency at which the background task runs!');
+ return;
+ }
+ touch($indexLockFile);*/
+
+
+ $this->checkForRegisteredTypes();
+
+ if ($this->debug) $default->log->debug('indexDocuments: start');
+ if (!$this->doesDiagnosticsPass())
+ {
+ //unlink($indexLockFile);
+ if ($this->debug) $default->log->debug('indexDocuments: stopping - diagnostics problem. The dashboard will provide more information.');
+ return;
+ }
+
+ if (is_null($max))
+ {
+ $max = $config->get('indexer/batchDocuments',20);
+ }
+
+ $this->loadExtractorHooks();
+
+ Indexer::clearoutDeleted();
+
+ $date = date('Y-m-d H:i:s');
+ // identify the indexers that must run
+ // mysql specific limit!
+ $sql = "SELECT
+ iff.document_id, mt.filetypes, mt.mimetypes, me.name as extractor, iff.what
+ FROM
+ index_files iff
+ INNER JOIN documents d ON iff.document_id=d.id
+ INNER JOIN document_metadata_version dmv ON d.metadata_version_id=dmv.id
+ INNER JOIN document_content_version dcv ON dmv.content_version_id=dcv.id
+ INNER JOIN mime_types mt ON dcv.mime_id=mt.id
+ LEFT JOIN mime_extractors me ON mt.extractor_id=me.id
+ WHERE
+ (iff.processdate IS NULL or iff.processdate < date_sub('$date', interval 1 day)) AND dmv.status_id=1
+ ORDER BY indexdate
+ LIMIT $max";
+ $result = DBUtil::getResultArray($sql);
+ if (PEAR::isError($result))
+ {
+ //unlink($indexLockFile);
+ if ($this->debug) $default->log->debug('indexDocuments: stopping - db error');
+ return;
+ }
+ // Record the time of this run; updateIndexStats() reports it as the indexing date.
+ KTUtil::setSystemSetting('luceneIndexingDate', time());
+
+ // bail if no work to do
+ if (count($result) == 0)
+ {
+ //unlink($indexLockFile);
+ if ($this->debug) $default->log->debug('indexDocuments: stopping - no work to be done');
+ return;
+ }
+
+ // identify any documents that need indexing and mark them
+ // so they are not taken in a followup run
+ $ids = array();
+ foreach($result as $docinfo)
+ {
+ $ids[] = $docinfo['document_id'];
+ }
+
+ // mark the documents as being processed
+
+ $ids=implode(',',$ids);
+ $sql = "UPDATE index_files SET processdate='$date' WHERE document_id in ($ids)";
+ DBUtil::runQuery($sql);
+
+ // Extractor instances are reused across the documents of this batch.
+ $extractorCache = array();
+ $storageManager = KTStorageManagerUtil::getSingleton();
+
+ $tempPath = $config->get("urls/tmpDirectory");
+
+ foreach($result as $docinfo)
+ {
+ // increment indexed documents count
+ // (done for every queue entry visited, before it is known whether indexing succeeds)
+ Indexer::incrementCount();
+
+ $docId=$docinfo['document_id'];
+ $extension=$docinfo['filetypes'];
+ $mimeType=$docinfo['mimetypes'];
+ $extractorClass=$docinfo['extractor'];
+ // 'what' selects the parts to index: 'A' both, 'C' content only, 'D' discussion only.
+ $indexDocument = in_array($docinfo['what'], array('A','C'));
+ $indexDiscussion = in_array($docinfo['what'], array('A','D'));
+ // Start a fresh message history for this document (see updatePendingDocumentStatus()).
+ $this->indexingHistory = '';
+
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Indexing docid: %d extension: '%s' mimetype: '%s' extractor: '%s'"), $docId, $extension,$mimeType,$extractorClass), 'debug');
+
+ if (empty($extractorClass))
+ {
+ /*
+
+ if no extractor is found and we don't need to index discussions, then we can remove the item from the queue.
+
+ */
+ if ($indexDiscussion)
+ {
+ $indexDocument = false;
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Not indexing docid: %d content because extractor could not be resolve. Still indexing discussion."), $docId), 'info');
+ }
+ else
+ {
+ Indexer::unqueueDocument($docId, sprintf(_kt("No extractor for docid: %d"),$docId));
+ continue;
+ }
+ }
+ else
+ {
+ /*
+
+ If an extractor is available, we must ensure it is enabled.
+
+ */
+
+ if (!$this->isExtractorEnabled($extractorClass))
+ {
+ // The entry stays in the queue; only the status message is updated.
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("diagnose: Not indexing docid: %d because extractor '%s' is disabled."), $docId, $extractorClass), 'info');
+ continue;
+ }
+ }
+
+ if ($this->debug)
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Processing docid: %d.\n"),$docId), 'info');
+ }
+
+ $document = Document::get($docId);
+ if (PEAR::isError($document))
+ {
+ Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Cannot resolve document id %d: %s."),$docId, $document->getMessage()), 'error');
+ continue;
+ }
+
+ // restartBatch() was called: put the document back on the queue instead of indexing it.
+ if ($this->restartCurrentBatch)
+ {
+ Indexer::unqueueDocument($docId);
+ Indexer::index($docId, 'A');
+ continue;
+ }
+
+
+ // File names that start or end with a tilde are treated as temporary files and dropped.
+ $filename = $document->getFileName();
+ if (substr($filename,0,1) == '~' || substr($filename,-1) == '~')
+ {
+ Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: Filename for document id %d starts with a tilde (~). This is assumed to be a temporary file. This is ignored."),$docId), 'error');
+ continue;
+ }
+
+ $removeFromQueue = true;
+ if ($indexDocument)
+ {
+ if (array_key_exists($extractorClass, $extractorCache))
+ {
+ $extractor = $extractorCache[$extractorClass];
+ }
+ else
+ {
+ $extractor = $extractorCache[$extractorClass] = $this->getExtractor($extractorClass);
+ }
+
+ if (!($extractor instanceof DocumentExtractor))
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("indexDocuments: extractor '%s' is not a document extractor class."),$extractorClass), 'error');
+ continue;
+ }
+
+
+
+ $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
+ $sourceFile = $storageManager->temporaryFile($document);
+
+ if (empty($sourceFile) || !is_file($sourceFile))
+ {
+ Indexer::unqueueDocument($docId,sprintf(_kt("indexDocuments: source file '%s' for document %d does not exist."),$sourceFile,$docId), 'error');
+ continue;
+ }
+
+ // Some extractors work on a copy of the source named '<docId>.<extension>' in the temp directory.
+ if ($extractor->needsIntermediateSourceFile())
+ {
+ //$extension = pathinfo($document->getFileName(), PATHINFO_EXTENSION);
+
+ $intermediate = $tempPath . '/'. $docId . '.' . $extension;
+ // NOTE: this reuses the name $result, which also holds the batch being iterated.
+ $result = @copy($sourceFile, $intermediate);
+ if ($result === false)
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not create intermediate file from document %d"),$docId), 'error');
+ // problem. lets try again later. probably permission related. log the issue.
+ continue;
+ }
+ $sourceFile = $intermediate;
+ }
+
+ $targetFile = tempnam($tempPath, 'ktindexer');
+
+ $extractor->setSourceFile($sourceFile);
+ $extractor->setMimeType($mimeType);
+ $extractor->setExtension($extension);
+ $extractor->setTargetFile($targetFile);
+ $extractor->setDocument($document);
+ $extractor->setIndexingStatus(null);
+ $extractor->setExtractionStatus(null);
+
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Extra Info docid: %d Source File: '%s' Target File: '%s'"),$docId,$sourceFile,$targetFile), 'debug');
+
+ // Before extraction the general hooks run first, then the mime specific ones;
+ // the post_* phases below run them in the opposite order.
+ $this->executeHook($extractor, 'pre_extract');
+ $this->executeHook($extractor, 'pre_extract', $mimeType);
+ // From here on the entry is removed from the queue only if indexing reports success.
+ $removeFromQueue = false;
+
+ if ($extractor->extractTextContent())
+ {
+ // the extractor may need to create another target file
+ $targetFile = $extractor->getTargetFile();
+
+ $extractor->setExtractionStatus(true);
+ $this->executeHook($extractor, 'pre_index');
+ $this->executeHook($extractor, 'pre_index', $mimeType);
+
+ $title = $document->getName();
+ if ($indexDiscussion)
+ {
+ if (!$this->filterText($targetFile))
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
+ }
+ else
+ {
+ $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
+ $removeFromQueue = $indexStatus;
+ if (!$indexStatus)
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocumentAndDiscussion"),$docId), 'error');
+ }
+
+ $extractor->setIndexingStatus($indexStatus);
+ }
+ }
+ else
+ {
+ if (!$this->filterText($targetFile))
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem filtering document %d"),$docId), 'error');
+ }
+ else
+ {
+ $indexStatus = $this->indexDocument($docId, $targetFile, $title, $version);
+ $removeFromQueue = $indexStatus;
+
+ if (!$indexStatus)
+ {
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Problem indexing document %d - indexDocument"),$docId), 'error');
+ $this->logPendingDocumentInfoStatus($docId, '', 'error');
+ }
+
+ $extractor->setIndexingStatus($indexStatus);
+ }
+ }
+
+ $this->executeHook($extractor, 'post_index', $mimeType);
+ $this->executeHook($extractor, 'post_index');
+ }
+ else
+ {
+ $extractor->setExtractionStatus(false);
+ $this->logPendingDocumentInfoStatus($docId, sprintf(_kt("Could not extract contents from document %d"),$docId), 'error');
+ $this->logPendingDocumentInfoStatus($docId, '', 'error');
+ }
+
+ $this->executeHook($extractor, 'post_extract', $mimeType);
+ $this->executeHook($extractor, 'post_extract');
+
+ // Clean up the intermediate copy (if one was made) and the extracted text file.
+ if ($extractor->needsIntermediateSourceFile())
+ {
+ @unlink($sourceFile);
+ }
+
+ @unlink($targetFile);
+
+ }
+ else
+ {
+ // Content is not being indexed for this entry; index the discussion only.
+ $indexStatus = $this->indexDiscussion($docId);
+ $removeFromQueue = $indexStatus;
+ }
+
+ if ($removeFromQueue)
+ {
+ Indexer::unqueueDocument($docId, sprintf(_kt("Done indexing docid: %d"),$docId));
+ }
+ else
+ {
+ if ($this->debug) $default->log->debug(sprintf(_kt("Document docid: %d was not removed from the queue as it looks like there was a problem with the extraction process"),$docId));
+ }
+ }
+ if ($this->debug) $default->log->debug('indexDocuments: done');
+ //unlink($indexLockFile);
+ }
+
+ /**
+  * Move text stored in the document_text table into the index.
+  *
+  * Makes up to five passes per call. Each pass reads a batch of document_text rows, indexes them
+  * through indexDocumentAndDiscussion() and deletes the rows that were indexed (rows whose
+  * document cannot be loaded are deleted as well). A lock file in the cache directory prevents
+  * overlapping runs. When a pass finds no rows left, the migration is flagged complete and the
+  * 'Index Migration' scheduler entry is removed.
+  *
+  * @param int $max Optional. Documents per call; when null the 'indexer/batchMigrateDocument' setting is used (default 500).
+  */
+ public function migrateDocuments($max=null)
+ {
+     global $default;
+
+     $default->log->info(_kt('migrateDocuments: starting'));
+
+     if (!$this->doesDiagnosticsPass(true))
+     {
+         $default->log->info(_kt('migrateDocuments: stopping - diagnostics problem. The dashboard will provide more information.'));
+         return;
+     }
+
+     if (KTUtil::getSystemSetting('migrationComplete') == 'true')
+     {
+         $default->log->info(_kt('migrateDocuments: stopping - migration is complete.'));
+         return;
+     }
+
+     $config =& KTConfig::getSingleton();
+     if (is_null($max))
+     {
+         $max = $config->get('indexer/batchMigrateDocument',500);
+     }
+
+     // Directory for the temporary text files, the same setting indexDocuments() uses.
+     $tempPath = $config->get("urls/tmpDirectory");
+
+     $lockFile = $config->get('cache/cacheDirectory') . '/migration.lock';
+     if (is_file($lockFile))
+     {
+         $default->log->info(_kt('migrateDocuments: stopping - migration lockfile detected.'));
+         return;
+     }
+     touch($lockFile);
+
+     $startTime = KTUtil::getSystemSetting('migrationStarted');
+     if (is_null($startTime))
+     {
+         KTUtil::setSystemSetting('migrationStarted', time());
+     }
+
+     $maxLoops = 5;
+
+     // Spread the requested number of documents over the passes.
+     $max = ceil($max / $maxLoops);
+
+     $start =KTUtil::getBenchmarkTime();
+     $noDocs = false;
+     $numDocs = 0;
+
+     for($loop=0;$loop<$maxLoops;$loop++)
+     {
+         $sql = "SELECT
+ document_id, document_text
+ FROM
+ document_text
+ ORDER BY document_id
+ LIMIT $max";
+         $result = DBUtil::getResultArray($sql);
+         if (PEAR::isError($result))
+         {
+             $default->log->info(_kt('migrateDocuments: db error'));
+             break;
+         }
+
+         $docs = count($result);
+         if ($docs == 0)
+         {
+             $noDocs = true;
+             break;
+         }
+         $numDocs += $docs;
+
+         foreach($result as $docinfo)
+         {
+             $docId = $docinfo['document_id'];
+
+             $document = Document::get($docId);
+             if (PEAR::isError($document) || is_null($document))
+             {
+                 $sql = "DELETE FROM document_text WHERE document_id=$docId";
+                 DBUtil::runQuery($sql);
+                 $default->log->error(sprintf(_kt('migrateDocuments: Could not get document %d\'s document! Removing content!'),$docId));
+                 continue;
+             }
+
+             $version = $document->getMajorVersionNumber() . '.' . $document->getMinorVersionNumber();
+
+             $targetFile = tempnam($tempPath, 'ktindexer');
+
+             if (file_put_contents($targetFile, $docinfo['document_text']) === false)
+             {
+                 $default->log->error(sprintf(_kt('migrateDocuments: Cannot write to \'%s\' for document id %d'), $targetFile, $docId));
+                 // tempnam() may already have created the file; do not leave it behind.
+                 @unlink($targetFile);
+                 continue;
+             }
+             // free memory asap ;)
+             unset($docinfo['document_text']);
+
+             $title = $document->getName();
+
+             $indexStatus = $this->indexDocumentAndDiscussion($docId, $targetFile, $title, $version);
+
+             if ($indexStatus)
+             {
+                 $sql = "DELETE FROM document_text WHERE document_id=$docId";
+                 DBUtil::runQuery($sql);
+             }
+             else
+             {
+                 $default->log->error(sprintf(_kt("migrateDocuments: Problem indexing document %d"), $docId));
+             }
+
+             @unlink($targetFile);
+         }
+     }
+
+     @unlink($lockFile);
+
+     $time = KTUtil::getBenchmarkTime() - $start;
+
+     // Accumulate the totals across calls.
+     KTUtil::setSystemSetting('migrationTime', KTUtil::getSystemSetting('migrationTime',0) + $time);
+     KTUtil::setSystemSetting('migratedDocuments', KTUtil::getSystemSetting('migratedDocuments',0) + $numDocs);
+
+     $default->log->info(sprintf(_kt('migrateDocuments: stopping - done in %d seconds!'), $time));
+     if ($noDocs)
+     {
+         $default->log->info(_kt('migrateDocuments: Completed!'));
+         KTUtil::setSystemSetting('migrationComplete', 'true');
+         schedulerUtil::deleteByName('Index Migration');
+         $default->log->debug(_kt('migrateDocuments: Disabling \'Index Migration\' task by removing scheduler entry.'));
+     }
+ }
+
+ /**
+ * Index a document. The derived class must implement this function.
+ *
+ * Callers in this class treat the return value as the indexing status (truthy on success).
+ *
+ * @param int $docId
+ * @param string $textFile Path of the file holding the text to index.
+ * @param string $title
+ * @param string $version
+ */
+ protected abstract function indexDocument($docId, $textFile, $title, $version);
+
+
+ /**
+  * Index the supplied text for a document.
+  *
+  * The text is written to a temporary file, passed to indexDocument() and the file is removed again.
+  *
+  * @param int $docId
+  * @param string $text
+  * @return mixed The value returned by indexDocument(), or false when the document cannot be
+  *         loaded or the temporary file cannot be written.
+  */
+ public function updateDocumentIndex($docId, $text)
+ {
+     $document = Document::get($docId);
+     if (PEAR::isError($document))
+     {
+         // Same guard as the other Document::get() callers in this class.
+         return false;
+     }
+
+     $config = KTConfig::getSingleton();
+     $tempPath = $config->get("urls/tmpDirectory");
+     $tempFile = tempnam($tempPath,'ud_');
+
+     if (file_put_contents($tempFile, $text) === false)
+     {
+         // Nothing was written, so there is nothing to index.
+         @unlink($tempFile);
+         return false;
+     }
+
+     $title = $document->getDescription();
+     $version = $document->getVersion();
+
+     $result = $this->indexDocument($docId, $tempFile, $title, $version);
+
+     if (file_exists($tempFile))
+     {
+         unlink($tempFile);
+     }
+
+     return $result;
+ }
+
+ /**
+ * Index a discussion. The derived class must implement this function.
+ *
+ * indexDocuments() treats the return value as the indexing status (truthy on success).
+ *
+ * @param int $docId
+ */
+ protected abstract function indexDiscussion($docId);
+
+ /**
+ * Diagnose the indexer. e.g. Check that the indexing server is running.
+ *
+ * doesDiagnosticsPass() treats a null return value as "no problem" and logs
+ * any other value as the problem description.
+ */
+ public abstract function diagnose();
+
+ /**
+  * Diagnose the extractors and the extractor hooks.
+  *
+  * @return array The combined diagnoses of both directories, as produced by _diagnose().
+  */
+ public function diagnoseExtractors()
+ {
+     $extractorProblems = $this->_diagnose($this->extractorPath, 'DocumentExtractor', 'Extractor.inc.php');
+     $hookProblems = $this->_diagnose($this->hookPath, 'Hook', 'Hook.inc.php');
+
+     return array_merge($extractorProblems, $hookProblems);
+ }
+
+ /**
+  * This is a refactored diagnose function.
+  *
+  * Loads every file in $path that ends with $extension, instantiates the class named after the
+  * file and collects the diagnosis of each enabled instance that supports at least one mime type.
+  *
+  * @param string $path Directory to scan.
+  * @param string $baseclass Class the instances must derive from.
+  * @param string $extension File name suffix, e.g. 'Extractor.inc.php'.
+  * @return array class name => array('name' => display name, 'diagnosis' => diagnosis); empty when nothing was reported.
+  */
+ private function _diagnose($path, $baseclass, $extension)
+ {
+     global $default;
+
+     $diagnoses = array();
+
+     $dir = opendir(SearchHelper::correctPath($path));
+     if ($dir === false)
+     {
+         // Without a directory handle there is nothing to inspect.
+         $default->log->error(sprintf(_kt("diagnose: cannot open directory '%s'."), $path));
+         return $diagnoses;
+     }
+     $extlen = - strlen($extension);
+
+     while (($file = readdir($dir)) !== false)
+     {
+         // Skip '.', '..' and hidden files.
+         if (substr($file,0,1) == '.')
+         {
+             continue;
+         }
+         if (substr($file,$extlen) != $extension)
+         {
+             $default->log->error(sprintf(_kt("diagnose: '%s' does not have extension '%s'."), $file, $extension));
+             continue;
+         }
+
+         require_once($path . '/' . $file);
+
+         // Strip the trailing '.inc.php' to get the class name.
+         $class = substr($file, 0, -8);
+         if (!class_exists($class))
+         {
+             $default->log->error(sprintf(_kt("diagnose: class '%s' does not exist."), $class));
+             continue;
+         }
+
+         if (!$this->isExtractorEnabled($class))
+         {
+             $default->log->debug(sprintf(_kt("diagnose: extractor '%s' is disabled."), $class));
+             continue;
+         }
+
+         $extractor = new $class();
+         if (!is_a($extractor, $baseclass))
+         {
+             // Report the base class that was actually required.
+             $default->log->error(sprintf(_kt("diagnose(): '%s' is not of type %s"), $class, $baseclass));
+             continue;
+         }
+
+         $types = $extractor->getSupportedMimeTypes();
+         if (empty($types))
+         {
+             if ($this->debug) $default->log->debug(sprintf(_kt("diagnose: class '%s' does not support any types."), $class));
+             continue;
+         }
+
+         $diagnosis=$extractor->diagnose();
+         if (empty($diagnosis))
+         {
+             continue;
+         }
+         $diagnoses[$class] = array(
+             'name'=>$extractor->getDisplayName(),
+             'diagnosis'=>$diagnosis
+         );
+     }
+     closedir($dir);
+
+     return $diagnoses;
+ }
+
+
+ /**
+  * Register the mime types of every extractor found in the extractor directory.
+  *
+  * Each file ending in 'Extractor.inc.php' is loaded, the class named after
+  * the file (without '.inc.php') is instantiated, and registerMimeTypes()
+  * is called on it when it is a DocumentExtractor.
+  *
+  * @param boolean $clear Optional. Defaults to false. When true,
+  *                       clearExtractors() is called before scanning.
+  */
+ public function registerTypes($clear=false)
+ {
+     if ($clear)
+     {
+         $this->clearExtractors();
+     }
+
+     $dir = opendir(SearchHelper::correctPath($this->extractorPath));
+     if ($dir === false)
+     {
+         // opendir() signals failure by returning false; readdir()/closedir()
+         // must never be handed that value, so report it and stop here.
+         global $default;
+         $default->log->error("Indexer: could not open extractor directory '{$this->extractorPath}'");
+         return;
+     }
+
+     while (($file = readdir($dir)) !== false)
+     {
+         // only files named '...Extractor.inc.php' (17 trailing characters) qualify
+         if (substr($file,-17) != 'Extractor.inc.php')
+         {
+             continue;
+         }
+
+         require_once($this->extractorPath . '/' . $file);
+
+         // the class name is the file name without its last 8 characters ('.inc.php')
+         $class = substr($file, 0, -8);
+         if (!class_exists($class))
+         {
+             // if the class does not exist, we can't do anything.
+             continue;
+         }
+
+         $extractor = new $class;
+         if ($extractor instanceof DocumentExtractor)
+         {
+             $extractor->registerMimeTypes();
+         }
+     }
+     closedir($dir);
+ }
+
+ /**
+  * Index a document's text and then its discussion.
+  *
+  * Exists as a possible optimisation point: an indexer that can handle both
+  * in a single step may override this method.
+  *
+  * @param int $docId
+  * @param string $textFile File handed to indexDocument() together with the id.
+  * @param string $title    Passed through to indexDocument().
+  * @param string $version  Passed through to indexDocument().
+  */
+ protected function indexDocumentAndDiscussion($docId, $textFile, $title, $version)
+ {
+     // document content first ...
+     $this->indexDocument($docId, $textFile, $title, $version);
+
+     // ... then the discussion that belongs to it
+     $this->indexDiscussion($docId);
+ }
+
+ /**
+  * Remove the document from the queue. This is normally called when it has been processed.
+  *
+  * @param int $docid            Id of the document whose index_files row is deleted.
+  * @param string|boolean $reason Optional. When not false, the removal is logged with this text.
+  * @param string $level         Optional. Name of the logger method used for the message. Defaults to 'debug'.
+  */
+ public static function unqueueDocument($docid, $reason=false, $level='debug')
+ {
+     // The id is spliced directly into the SQL text below, so force it to an
+     // integer first; anything else could alter the statement.
+     $docid = (int) $docid;
+
+     $sql = "DELETE FROM index_files WHERE document_id=$docid";
+     DBUtil::runQuery($sql);
+
+     if ($reason !== false)
+     {
+         global $default;
+         // $level selects the logger method dynamically (e.g. debug, error)
+         $default->log->$level("Indexer: removing document $docid from the queue - $reason");
+     }
+ }
+
+ /**
+ * Run a query on the index.
+ *
+ * Declared abstract: every concrete indexer must implement it.
+ *
+ * @param string $query
+ * @return array
+ */
+ public abstract function query($query);
+
+ /**
+  * Encode an integer as a fixed-width string of the letters 'a'..'j'.
+  *
+  * The value is left-padded with zeros to 14 characters and every digit
+  * 0-9 is then mapped to a-j, so equal-length results compare in the same
+  * order as the numbers. stringToLong() reverses the mapping.
+  *
+  * @param int $int
+  * @return string
+  */
+ public static function longToString($int)
+ {
+     $width = 14;
+
+     // str_pad() returns the value unchanged when it already has $width characters or more
+     $padded = str_pad("$int", $width, '0', STR_PAD_LEFT);
+
+     // character-for-character translation: 0->a, 1->b, ... 9->j
+     return strtr($padded, '0123456789', 'abcdefghij');
+ }
+
+ /**
+  * Decode a string produced by longToString() back into a number.
+  *
+  * The letters a-j are mapped back to the digits 0-9 and the resulting
+  * digit string is converted numerically.
+  *
+  * @param string $str
+  * @return int
+  */
+ public static function stringToLong($str)
+ {
+     // character-for-character translation: a->0, b->1, ... j->9
+     $digits = strtr($str, 'abcdefghij', '0123456789');
+
+     // adding zero performs the string-to-number conversion (leading zeros are dropped)
+     return $digits + 0;
+ }
+
+ /**
+  * Record that the index has been optimised.
+  *
+  * The base implementation only stores the current timestamp in the
+  * 'luceneOptimisationDate' system setting. Indexers that can really
+  * optimise their index should override this method, and the overriding
+  * method must call the parent!
+  *
+  */
+ public function optimise()
+ {
+     $optimisedAt = time();
+     KTUtil::setSystemSetting('luceneOptimisationDate', $optimisedAt);
+ }
+
+ /**
+ * Shuts down the indexer.
+ *
+ * The default implementation does nothing; it is public and not abstract,
+ * so an indexer only needs to override it when it has something to release.
+ */
+ public function shutdown()
+ {
+ // intentionally empty: nothing to shut down in the general case
+ }
+
+ /**
+ * Returns the name of the indexer.
+ *
+ * Declared abstract: every concrete indexer must implement it.
+ *
+ * @return string
+ */
+ public abstract function getDisplayName();
+
+
+ /**
+ * Returns the number of non-deleted documents in the index.
+ *
+ * Declared abstract: every concrete indexer must implement it.
+ *
+ * @return int
+ */
+ public abstract function getDocumentsInIndex();
+
+ /**
+ * Reports whether the given document is present in the index.
+ *
+ * Declared abstract: every concrete indexer must implement it.
+ * NOTE(review): presumably returns a boolean - confirm against the implementations.
+ *
+ * @param int $documentId
+ */
+ public abstract function isDocumentIndexed($documentId);
+
+ /**
+  * Returns the path to the index directory, as configured under the
+  * 'indexer/luceneDirectory' key.
+  *
+  * @return string
+  */
+ public function getIndexDirectory()
+ {
+     return KTConfig::getSingleton()->get('indexer/luceneDirectory');
+ }
+}
+
+?>