Commit b77bfde8af4388f6308f4016d857efdc442bc3ef

Authored by Bryn Divey
1 parent 40304ef7

Added stopword removal


git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@5800 c91229c3-7414-0410-bfa2-8a42b809f60b
config/config.ini
@@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp @@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp
155 graphicsUrl = ${rootUrl}/graphics 155 graphicsUrl = ${rootUrl}/graphics
156 uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree 156 uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree
157 157
  158 +; files
  159 +stopwordsFile = ${fileSystemRoot}/config/stopwords.txt
  160 +
158 [session] 161 [session]
159 ; session timeout (in seconds) 162 ; session timeout (in seconds)
160 sessionTimeout = 1200 163 sessionTimeout = 1200
config/stopwords.txt 0 → 100644
  1 +a's
  2 +able
  3 +about
  4 +above
  5 +according
  6 +accordingly
  7 +across
  8 +actually
  9 +after
  10 +afterwards
  11 +again
  12 +against
  13 +ain't
  14 +all
  15 +allow
  16 +allows
  17 +almost
  18 +alone
  19 +along
  20 +already
  21 +also
  22 +although
  23 +always
  24 +am
  25 +among
  26 +amongst
  27 +an
  28 +and
  29 +another
  30 +any
  31 +anybody
  32 +anyhow
  33 +anyone
  34 +anything
  35 +anyway
  36 +anyways
  37 +anywhere
  38 +apart
  39 +appear
  40 +appreciate
  41 +appropriate
  42 +are
  43 +aren't
  44 +around
  45 +as
  46 +aside
  47 +ask
  48 +asking
  49 +associated
  50 +at
  51 +available
  52 +away
  53 +awfully
  54 +be
  55 +became
  56 +because
  57 +become
  58 +becomes
  59 +becoming
  60 +been
  61 +before
  62 +beforehand
  63 +behind
  64 +being
  65 +believe
  66 +below
  67 +beside
  68 +besides
  69 +best
  70 +better
  71 +between
  72 +beyond
  73 +both
  74 +brief
  75 +but
  76 +by
  77 +c'mon
  78 +c's
  79 +came
  80 +can
  81 +can't
  82 +cannot
  83 +cant
  84 +cause
  85 +causes
  86 +certain
  87 +certainly
  88 +changes
  89 +clearly
  90 +co
  91 +com
  92 +come
  93 +comes
  94 +concerning
  95 +consequently
  96 +consider
  97 +considering
  98 +contain
  99 +containing
  100 +contains
  101 +corresponding
  102 +could
  103 +couldn't
  104 +course
  105 +currently
  106 +definitely
  107 +described
  108 +despite
  109 +did
  110 +didn't
  111 +different
  112 +do
  113 +does
  114 +doesn't
  115 +doing
  116 +don't
  117 +done
  118 +down
  119 +downwards
  120 +during
  121 +each
  122 +edu
  123 +eg
  124 +eight
  125 +either
  126 +else
  127 +elsewhere
  128 +enough
  129 +entirely
  130 +especially
  131 +et
  132 +etc
  133 +even
  134 +ever
  135 +every
  136 +everybody
  137 +everyone
  138 +everything
  139 +everywhere
  140 +ex
  141 +exactly
  142 +example
  143 +except
  144 +far
  145 +few
  146 +fifth
  147 +first
  148 +five
  149 +followed
  150 +following
  151 +follows
  152 +for
  153 +former
  154 +formerly
  155 +forth
  156 +four
  157 +from
  158 +further
  159 +furthermore
  160 +get
  161 +gets
  162 +getting
  163 +given
  164 +gives
  165 +go
  166 +goes
  167 +going
  168 +gone
  169 +got
  170 +gotten
  171 +greetings
  172 +had
  173 +hadn't
  174 +happens
  175 +hardly
  176 +has
  177 +hasn't
  178 +have
  179 +haven't
  180 +having
  181 +he
  182 +he's
  183 +hello
  184 +help
  185 +hence
  186 +her
  187 +here
  188 +here's
  189 +hereafter
  190 +hereby
  191 +herein
  192 +hereupon
  193 +hers
  194 +herself
  195 +hi
  196 +him
  197 +himself
  198 +his
  199 +hither
  200 +hopefully
  201 +how
  202 +howbeit
  203 +however
  204 +i'd
  205 +i'll
  206 +i'm
  207 +i've
  208 +ie
  209 +if
  210 +ignored
  211 +immediate
  212 +in
  213 +inasmuch
  214 +inc
  215 +indeed
  216 +indicate
  217 +indicated
  218 +indicates
  219 +inner
  220 +insofar
  221 +instead
  222 +into
  223 +inward
  224 +is
  225 +isn't
  226 +it
  227 +it'd
  228 +it'll
  229 +it's
  230 +its
  231 +itself
  232 +just
  233 +keep
  234 +keeps
  235 +kept
  236 +know
  237 +knows
  238 +known
  239 +last
  240 +lately
  241 +later
  242 +latter
  243 +latterly
  244 +least
  245 +less
  246 +lest
  247 +let
  248 +let's
  249 +like
  250 +liked
  251 +likely
  252 +little
  253 +look
  254 +looking
  255 +looks
  256 +ltd
  257 +mainly
  258 +many
  259 +may
  260 +maybe
  261 +me
  262 +mean
  263 +meanwhile
  264 +merely
  265 +might
  266 +more
  267 +moreover
  268 +most
  269 +mostly
  270 +much
  271 +must
  272 +my
  273 +myself
  274 +name
  275 +namely
  276 +nd
  277 +near
  278 +nearly
  279 +necessary
  280 +need
  281 +needs
  282 +neither
  283 +never
  284 +nevertheless
  285 +new
  286 +next
  287 +nine
  288 +no
  289 +nobody
  290 +non
  291 +none
  292 +noone
  293 +nor
  294 +normally
  295 +not
  296 +nothing
  297 +novel
  298 +now
  299 +nowhere
  300 +obviously
  301 +of
  302 +off
  303 +often
  304 +oh
  305 +ok
  306 +okay
  307 +old
  308 +on
  309 +once
  310 +one
  311 +ones
  312 +only
  313 +onto
  314 +or
  315 +other
  316 +others
  317 +otherwise
  318 +ought
  319 +our
  320 +ours
  321 +ourselves
  322 +out
  323 +outside
  324 +over
  325 +overall
  326 +own
  327 +particular
  328 +particularly
  329 +per
  330 +perhaps
  331 +placed
  332 +please
  333 +plus
  334 +possible
  335 +presumably
  336 +probably
  337 +provides
  338 +que
  339 +quite
  340 +qv
  341 +rather
  342 +rd
  343 +re
  344 +really
  345 +reasonably
  346 +regarding
  347 +regardless
  348 +regards
  349 +relatively
  350 +respectively
  351 +right
  352 +said
  353 +same
  354 +saw
  355 +say
  356 +saying
  357 +says
  358 +second
  359 +secondly
  360 +see
  361 +seeing
  362 +seem
  363 +seemed
  364 +seeming
  365 +seems
  366 +seen
  367 +self
  368 +selves
  369 +sensible
  370 +sent
  371 +serious
  372 +seriously
  373 +seven
  374 +several
  375 +shall
  376 +she
  377 +should
  378 +shouldn't
  379 +since
  380 +six
  381 +so
  382 +some
  383 +somebody
  384 +somehow
  385 +someone
  386 +something
  387 +sometime
  388 +sometimes
  389 +somewhat
  390 +somewhere
  391 +soon
  392 +sorry
  393 +specified
  394 +specify
  395 +specifying
  396 +still
  397 +sub
  398 +such
  399 +sup
  400 +sure
  401 +t's
  402 +take
  403 +taken
  404 +tell
  405 +tends
  406 +th
  407 +than
  408 +thank
  409 +thanks
  410 +thanx
  411 +that
  412 +that's
  413 +thats
  414 +the
  415 +their
  416 +theirs
  417 +them
  418 +themselves
  419 +then
  420 +thence
  421 +there
  422 +there's
  423 +thereafter
  424 +thereby
  425 +therefore
  426 +therein
  427 +theres
  428 +thereupon
  429 +these
  430 +they
  431 +they'd
  432 +they'll
  433 +they're
  434 +they've
  435 +think
  436 +third
  437 +this
  438 +thorough
  439 +thoroughly
  440 +those
  441 +though
  442 +three
  443 +through
  444 +throughout
  445 +thru
  446 +thus
  447 +to
  448 +together
  449 +too
  450 +took
  451 +toward
  452 +towards
  453 +tried
  454 +tries
  455 +truly
  456 +try
  457 +trying
  458 +twice
  459 +two
  460 +un
  461 +under
  462 +unfortunately
  463 +unless
  464 +unlikely
  465 +until
  466 +unto
  467 +up
  468 +upon
  469 +us
  470 +use
  471 +used
  472 +useful
  473 +uses
  474 +using
  475 +usually
  476 +value
  477 +various
  478 +very
  479 +via
  480 +viz
  481 +vs
  482 +want
  483 +wants
  484 +was
  485 +wasn't
  486 +way
  487 +we
  488 +we'd
  489 +we'll
  490 +we're
  491 +we've
  492 +welcome
  493 +well
  494 +went
  495 +were
  496 +weren't
  497 +what
  498 +what's
  499 +whatever
  500 +when
  501 +whence
  502 +whenever
  503 +where
  504 +where's
  505 +whereafter
  506 +whereas
  507 +whereby
  508 +wherein
  509 +whereupon
  510 +wherever
  511 +whether
  512 +which
  513 +while
  514 +whither
  515 +who
  516 +who's
  517 +whoever
  518 +whole
  519 +whom
  520 +whose
  521 +why
  522 +will
  523 +willing
  524 +wish
  525 +with
  526 +within
  527 +without
  528 +won't
  529 +wonder
  530 +would
  531 +would
  532 +wouldn't
  533 +yes
  534 +yet
  535 +you
  536 +you'd
  537 +you'll
  538 +you're
  539 +you've
  540 +your
  541 +yours
  542 +yourself
  543 +yourselves
  544 +zero
lib/browse/Criteria.inc
@@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php'); @@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php');
43 require_once(KT_LIB_DIR . '/browse/criteriaregistry.php'); 43 require_once(KT_LIB_DIR . '/browse/criteriaregistry.php');
44 44
45 45
46 -$RESTRICTING_SEARCH = true;  
47 -  
48 class BrowseCriterion { 46 class BrowseCriterion {
49 var $sDisplay; 47 var $sDisplay;
50 var $sDocumentField; 48 var $sDocumentField;
@@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion { @@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion {
689 687
690 $p = array(); 688 $p = array();
691 $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)"; 689 $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)";
692 - 690 + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]);
693 691
694 -  
695 - if ($RESTRICTING_SEARCH) {  
696 - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]);  
697 - $temp = $q_set;  
698 - foreach ($temp as $k => $v) {  
699 - $t = array();  
700 - foreach ($v as $part) {  
701 - $t[] = sprintf('+"%s"', $part);  
702 - }  
703 - $q_set[$k] = join(' ', $t);  
704 - }  
705 - $p[1] = implode(' ',$q_set);  
706 - } else {  
707 - $p[1] = $aRequest[$this->getWidgetBase()];  
708 - } 692 + // var_dump($p[1]);exit(0);
709 693
710 // handle the boolean "not" stuff. 694 // handle the boolean "not" stuff.
711 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); 695 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not');
@@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion { @@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion {
815 799
816 $p = array(); 800 $p = array();
817 $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)"; 801 $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)";
818 -  
819 -  
820 -  
821 - if ($RESTRICTING_SEARCH) {  
822 - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]);  
823 - $temp = $q_set;  
824 - foreach ($temp as $k => $v) {  
825 - $t = array();  
826 - foreach ($v as $part) {  
827 - $t[] = sprintf('+"%s"', $part);  
828 - }  
829 - $q_set[$k] = join(' ', $t);  
830 - }  
831 - $p[1] = implode(' ',$q_set);  
832 - } else {  
833 - $p[1] = $aRequest[$this->getWidgetBase()];  
834 - } 802 + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]);
835 803
836 // handle the boolean "not" stuff. 804 // handle the boolean "not" stuff.
837 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); 805 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not');
@@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion { @@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion {
891 859
892 $p = array(); 860 $p = array();
893 $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)"; 861 $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)";
894 -  
895 -  
896 -  
897 - if ($RESTRICTING_SEARCH) {  
898 - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]);  
899 - $temp = $q_set;  
900 - foreach ($temp as $k => $v) {  
901 - $t = array();  
902 - foreach ($v as $part) {  
903 - $t[] = sprintf('+"%s"', $part);  
904 - }  
905 - $q_set[$k] = join(' ', $t);  
906 - }  
907 - $p[1] = implode(' ',$q_set);  
908 - } else {  
909 - $p[1] = $aRequest[$this->getWidgetBase()];  
910 - } 862 + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]);
911 863
912 // handle the boolean "not" stuff. 864 // handle the boolean "not" stuff.
913 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); 865 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not');
@@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion { @@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion {
956 908
957 $p = array(); 909 $p = array();
958 $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)"; 910 $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)";
959 -  
960 -  
961 -  
962 - if ($RESTRICTING_SEARCH) {  
963 - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]);  
964 - $temp = $q_set;  
965 - foreach ($temp as $k => $v) {  
966 - $t = array();  
967 - foreach ($v as $part) {  
968 - $t[] = sprintf('+"%s"', $part);  
969 - }  
970 - $q_set[$k] = join(' ', $t);  
971 - }  
972 - $p[1] = implode(' ',$q_set);  
973 - } else {  
974 - $p[1] = $aRequest[$this->getWidgetBase()];  
975 - } 911 + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]);
976 912
977 // handle the boolean "not" stuff. 913 // handle the boolean "not" stuff.
978 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); 914 $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not');
lib/util/KTStopwords.php 0 → 100644
  1 +<?php
  2 +
  3 +/**
  4 + * $Id$
  5 + *
  6 + * The contents of this file are subject to the KnowledgeTree Public
  7 + * License Version 1.1 ("License"); You may not use this file except in
  8 + * compliance with the License. You may obtain a copy of the License at
  9 + * http://www.ktdms.com/KPL
  10 + *
  11 + * Software distributed under the License is distributed on an "AS IS"
  12 + * basis,
  13 + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  14 + * for the specific language governing rights and limitations under the
  15 + * License.
  16 + *
  17 + * The Original Code is: KnowledgeTree Open Source
  18 + *
  19 + * The Initial Developer of the Original Code is The Jam Warehouse Software
  20 + * (Pty) Ltd, trading as KnowledgeTree.
  21 + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright
  22 + * (C) 2006 The Jam Warehouse Software (Pty) Ltd;
  23 + * All Rights Reserved.
  24 + *
  25 + */
  26 +
  27 +class KTStopwords {
  28 + var $words = array();
  29 +
  30 + var $conf = array();
  31 + var $aSectionFile;
  32 + var $aFileRoot;
  33 + var $flat = array();
  34 + var $flatns = array();
  35 +
  36 + function loadCache($filename) {
  37 + $cache_str = file_get_contents($filename);
  38 + $this->words = unserialize($cache_str);
  39 + return true;
  40 + }
  41 +
  42 + function createCache($filename) {
  43 + file_put_contents($filename, serialize($this->words));
  44 + }
  45 +
  46 + function loadFile($filename) {
  47 + $this->words = array();
  48 + foreach(file($filename) as $line) {
  49 + $this->words[] = trim($line);
  50 + }
  51 + }
  52 +
  53 + function isStopword($sWord) {
  54 + return in_array($sWord, $this->words);
  55 + }
  56 +
  57 + function &getSingleton() {
  58 + if (!KTUtil::arrayGet($GLOBALS, 'KTStopwords')) {
  59 + $GLOBALS['KTStopwords'] =& new KTStopwords;
  60 + $oConfig = KTConfig::getSingleton();
  61 + $GLOBALS['KTStopwords']->loadFile($oConfig->get('stopwordsFile'));
  62 + }
  63 + return $GLOBALS['KTStopwords'];
  64 + }
  65 +}
  66 +
  67 +
  68 +?>
lib/util/ktutil.inc
@@ -28,6 +28,8 @@ @@ -28,6 +28,8 @@
28 * @author Neil Blakey-Milner <nbm@jamwarehouse.com>, Jam Warehouse (Pty) Ltd, South Africa 28 * @author Neil Blakey-Milner <nbm@jamwarehouse.com>, Jam Warehouse (Pty) Ltd, South Africa
29 */ 29 */
30 30
  31 +require_once(KT_LIB_DIR . '/util/KTStopwords.php');
  32 +
31 class KTUtil { 33 class KTUtil {
32 function extractGPC () { 34 function extractGPC () {
33 foreach (func_get_args() as $var) { 35 foreach (func_get_args() as $var) {
@@ -550,7 +552,16 @@ class KTUtil { @@ -550,7 +552,16 @@ class KTUtil {
550 return ((float) $microtime_simple[1] + (float) $microtime_simple[0]); 552 return ((float) $microtime_simple[1] + (float) $microtime_simple[0]);
551 } 553 }
552 554
553 - function phraseSplit($sSearchString) { 555 + function phraseSplit($sSearchString) {
  556 + // this should probably be moved to a DBUtil method
  557 +
  558 + $sMinWord = DBUtil::getOneResultKey("SHOW VARIABLES LIKE 'ft_min_word_len'", "Value");
  559 + if(is_numeric($sMinWord)) {
  560 + $iMinWord = (int)$sMinWord;
  561 + } else {
  562 + $iMinWord = 4;
  563 + }
  564 +
554 $a = preg_split('#"#', $sSearchString); 565 $a = preg_split('#"#', $sSearchString);
555 $i = 0; 566 $i = 0;
556 $phrases = array(); 567 $phrases = array();
@@ -564,20 +575,33 @@ class KTUtil { @@ -564,20 +575,33 @@ class KTUtil {
564 $i += 1; 575 $i += 1;
565 } 576 }
566 577
  578 + $oStopwords =& KTStopwords::getSingleton();
  579 +
567 $words = array(); 580 $words = array();
568 foreach ($word_parts as $part) { 581 foreach ($word_parts as $part) {
569 $w = (array) explode(' ', $part); 582 $w = (array) explode(' ', $part);
570 - foreach ($w as $potential) { if (!empty($potential)) { $words[] = $potential; }}  
571 - }  
572 -  
573 - // XXX: filter each subword (including in phrases) to remove whitespace-broken items and sub-4 character words.  
574 - 583 + foreach ($w as $potential) {
  584 + if (strlen($potential) >= $iMinWord && !$oStopwords->isStopword($potential)) {
  585 + $words[] = $potential;
  586 + }
  587 + }
  588 + }
575 589
576 return array( 590 return array(
577 'words' => $words, 591 'words' => $words,
578 'phrases' => $phrases, 592 'phrases' => $phrases,
579 ); 593 );
  594 + }
580 595
  596 + function phraseQuote($sQuery) {
  597 + foreach(KTUtil::phraseSplit($sQuery) as $k => $v) {
  598 + $t = array();
  599 + foreach ($v as $part) {
  600 + $t[] = sprintf('+"%s"', $part);
  601 + }
  602 + $q_set[$k] = join(' ', $t);
  603 + }
  604 + return implode(' ',$q_set);
581 } 605 }
582 606
583 function running_user() { 607 function running_user() {