diff --git a/config/config.ini b/config/config.ini index 192577d..b004b85 100644 --- a/config/config.ini +++ b/config/config.ini @@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp graphicsUrl = ${rootUrl}/graphics uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree +; files +stopwordsFile = ${fileSystemRoot}/config/stopwords.txt + [session] ; session timeout (in seconds) sessionTimeout = 1200 diff --git a/config/stopwords.txt b/config/stopwords.txt new file mode 100644 index 0000000..92c520c --- /dev/null +++ b/config/stopwords.txt @@ -0,0 +1,544 @@ +a's +able +about +above +according +accordingly +across +actually +after +afterwards +again +against +ain't +all +allow +allows +almost +alone +along +already +also +although +always +am +among +amongst +an +and +another +any +anybody +anyhow +anyone +anything +anyway +anyways +anywhere +apart +appear +appreciate +appropriate +are +aren't +around +as +aside +ask +asking +associated +at +available +away +awfully +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +believe +below +beside +besides +best +better +between +beyond +both +brief +but +by +c'mon +c's +came +can +can't +cannot +cant +cause +causes +certain +certainly +changes +clearly +co +com +come +comes +concerning +consequently +consider +considering +contain +containing +contains +corresponding +could +couldn't +course +currently +definitely +described +despite +did +didn't +different +do +does +doesn't +doing +don't +done +down +downwards +during +each +edu +eg +eight +either +else +elsewhere +enough +entirely +especially +et +etc +even +ever +every +everybody +everyone +everything +everywhere +ex +exactly +example +except +far +few +fifth +first +five +followed +following +follows +for +former +formerly +forth +four +from +further +furthermore +get +gets +getting +given +gives +go +goes +going +gone +got +gotten +greetings +had +hadn't +happens +hardly +has +hasn't +have +haven't +having +he +he's +hello +help +hence +her +here +here's +hereafter +hereby +herein +hereupon +hers +herself +hi +him +himself +his +hither +hopefully +how +howbeit +however +i'd +i'll +i'm +i've +ie +if +ignored +immediate +in +inasmuch +inc +indeed +indicate +indicated +indicates +inner +insofar +instead +into +inward +is +isn't +it +it'd +it'll +it's +its +itself +just +keep +keeps +kept +know +knows +known +last +lately +later +latter +latterly +least +less +lest +let +let's +like +liked +likely +little +look +looking +looks +ltd +mainly +many +may +maybe +me +mean +meanwhile +merely +might +more +moreover +most +mostly +much +must +my +myself +name +namely +nd +near +nearly +necessary +need +needs +neither +never +nevertheless +new +next +nine +no +nobody +non +none +noone +nor +normally +not +nothing +novel +now +nowhere +obviously +of +off +often +oh +ok +okay +old +on +once +one +ones +only +onto +or +other +others +otherwise +ought +our +ours +ourselves +out +outside +over +overall +own +particular +particularly +per +perhaps +placed +please +plus +possible +presumably +probably +provides +que +quite +qv +rather +rd +re +really +reasonably +regarding +regardless +regards +relatively +respectively +right +said +same +saw +say +saying +says +second +secondly +see +seeing +seem +seemed +seeming +seems +seen +self +selves +sensible +sent +serious +seriously +seven +several +shall +she +should +shouldn't +since +six +so +some +somebody +somehow +someone +something +sometime +sometimes +somewhat +somewhere +soon +sorry +specified +specify +specifying +still +sub +such +sup +sure +t's +take +taken +tell +tends +th +than +thank +thanks +thanx +that +that's +thats +the +their +theirs +them +themselves +then +thence +there +there's +thereafter +thereby +therefore +therein +theres +thereupon +these +they +they'd +they'll +they're +they've +think +third +this +thorough +thoroughly +those +though +three +through +throughout +thru +thus +to +together +too +took +toward +towards +tried +tries +truly +try +trying +twice +two +un +under +unfortunately +unless +unlikely +until +unto +up +upon +us +use +used +useful +uses +using +usually +value +various +very +via +viz +vs +want +wants +was +wasn't +way +we +we'd +we'll +we're +we've +welcome +well +went +were +weren't +what +what's +whatever +when +whence +whenever +where +where's +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +who's +whoever +whole +whom +whose +why +will +willing +wish +with +within +without +won't +wonder +would +would +wouldn't +yes +yet +you +you'd +you'll +you're +you've +your +yours +yourself +yourselves +zero diff --git a/lib/browse/Criteria.inc b/lib/browse/Criteria.inc index e84b68f..ecc0e15 100644 --- a/lib/browse/Criteria.inc +++ b/lib/browse/Criteria.inc @@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php'); require_once(KT_LIB_DIR . '/browse/criteriaregistry.php'); -$RESTRICTING_SEARCH = true; - class BrowseCriterion { var $sDisplay; var $sDocumentField; @@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion { $p = array(); $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)"; - + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); - - if ($RESTRICTING_SEARCH) { - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); - $temp = $q_set; - foreach ($temp as $k => $v) { - $t = array(); - foreach ($v as $part) { - $t[] = sprintf('+"%s"', $part); - } - $q_set[$k] = join(' ', $t); - } - $p[1] = implode(' ',$q_set); - } else { - $p[1] = $aRequest[$this->getWidgetBase()]; - } + // var_dump($p[1]);exit(0); // handle the boolean "not" stuff. $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); @@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion { $p = array(); $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)"; - - - - if ($RESTRICTING_SEARCH) { - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); - $temp = $q_set; - foreach ($temp as $k => $v) { - $t = array(); - foreach ($v as $part) { - $t[] = sprintf('+"%s"', $part); - } - $q_set[$k] = join(' ', $t); - } - $p[1] = implode(' ',$q_set); - } else { - $p[1] = $aRequest[$this->getWidgetBase()]; - } + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); // handle the boolean "not" stuff. $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); @@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion { $p = array(); $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)"; - - - - if ($RESTRICTING_SEARCH) { - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); - $temp = $q_set; - foreach ($temp as $k => $v) { - $t = array(); - foreach ($v as $part) { - $t[] = sprintf('+"%s"', $part); - } - $q_set[$k] = join(' ', $t); - } - $p[1] = implode(' ',$q_set); - } else { - $p[1] = $aRequest[$this->getWidgetBase()]; - } + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); // handle the boolean "not" stuff. $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); @@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion { $p = array(); $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)"; - - - - if ($RESTRICTING_SEARCH) { - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); - $temp = $q_set; - foreach ($temp as $k => $v) { - $t = array(); - foreach ($v as $part) { - $t[] = sprintf('+"%s"', $part); - } - $q_set[$k] = join(' ', $t); - } - $p[1] = implode(' ',$q_set); - } else { - $p[1] = $aRequest[$this->getWidgetBase()]; - } + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); // handle the boolean "not" stuff. $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); diff --git a/lib/util/KTStopwords.php b/lib/util/KTStopwords.php new file mode 100644 index 0000000..816c70f --- /dev/null +++ b/lib/util/KTStopwords.php @@ -0,0 +1,68 @@ +words = unserialize($cache_str); + return true; + } + + function createCache($filename) { + file_put_contents($filename, serialize($this->words)); + } + + function loadFile($filename) { + $this->words = array(); + foreach(file($filename) as $line) { + $this->words[] = trim($line); + } + } + + function isStopword($sWord) { + return in_array($sWord, $this->words); + } + + function &getSingleton() { + if (!KTUtil::arrayGet($GLOBALS, 'KTStopwords')) { + $GLOBALS['KTStopwords'] =& new KTStopwords; + $oConfig = KTConfig::getSingleton(); + $GLOBALS['KTStopwords']->loadFile($oConfig->get('stopwordsFile')); + } + return $GLOBALS['KTStopwords']; + } +} + + +?> diff --git a/lib/util/ktutil.inc b/lib/util/ktutil.inc index c3a9ed4..587dd69 100644 --- a/lib/util/ktutil.inc +++ b/lib/util/ktutil.inc @@ -28,6 +28,8 @@ * @author Neil Blakey-Milner , Jam Warehouse (Pty) Ltd, South Africa */ +require_once(KT_LIB_DIR . '/util/KTStopwords.php'); + class KTUtil { function extractGPC () { foreach (func_get_args() as $var) { @@ -550,7 +552,16 @@ class KTUtil { return ((float) $microtime_simple[1] + (float) $microtime_simple[0]); } - function phraseSplit($sSearchString) { + function phraseSplit($sSearchString) { + // this should probably be moved to a DBUtil method + + $sMinWord = DBUtil::getOneResultKey("SHOW VARIABLES LIKE 'ft_min_word_len'", "Value"); + if(is_numeric($sMinWord)) { + $iMinWord = (int)$sMinWord; + } else { + $iMinWord = 4; + } + $a = preg_split('#"#', $sSearchString); $i = 0; $phrases = array(); @@ -564,20 +575,33 @@ class KTUtil { $i += 1; } + $oStopwords =& KTStopwords::getSingleton(); + $words = array(); foreach ($word_parts as $part) { $w = (array) explode(' ', $part); - foreach ($w as $potential) { if (!empty($potential)) { $words[] = $potential; }} - } - - // XXX: filter each subword (including in phrases) to remove whitespace-broken items and sub-4 character words. - + foreach ($w as $potential) { + if (strlen($potential) >= $iMinWord && !$oStopwords->isStopword($potential)) { + $words[] = $potential; + } + } + } return array( 'words' => $words, 'phrases' => $phrases, ); + } + function phraseQuote($sQuery) { + foreach(KTUtil::phraseSplit($sQuery) as $k => $v) { + $t = array(); + foreach ($v as $part) { + $t[] = sprintf('+"%s"', $part); + } + $q_set[$k] = join(' ', $t); + } + return implode(' ',$q_set); } function running_user() {