Commit b77bfde8af4388f6308f4016d857efdc442bc3ef
1 parent
40304ef7
Added stopword removal
git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@5800 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing
5 changed files
with
650 additions
and
75 deletions
config/config.ini
| ... | ... | @@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp |
| 155 | 155 | graphicsUrl = ${rootUrl}/graphics |
| 156 | 156 | uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree |
| 157 | 157 | |
| 158 | +; files | |
| 159 | +stopwordsFile = ${fileSystemRoot}/config/stopwords.txt | |
| 160 | + | |
| 158 | 161 | [session] |
| 159 | 162 | ; session timeout (in seconds) |
| 160 | 163 | sessionTimeout = 1200 | ... | ... |
config/stopwords.txt
0 → 100644
| 1 | +a's | |
| 2 | +able | |
| 3 | +about | |
| 4 | +above | |
| 5 | +according | |
| 6 | +accordingly | |
| 7 | +across | |
| 8 | +actually | |
| 9 | +after | |
| 10 | +afterwards | |
| 11 | +again | |
| 12 | +against | |
| 13 | +ain't | |
| 14 | +all | |
| 15 | +allow | |
| 16 | +allows | |
| 17 | +almost | |
| 18 | +alone | |
| 19 | +along | |
| 20 | +already | |
| 21 | +also | |
| 22 | +although | |
| 23 | +always | |
| 24 | +am | |
| 25 | +among | |
| 26 | +amongst | |
| 27 | +an | |
| 28 | +and | |
| 29 | +another | |
| 30 | +any | |
| 31 | +anybody | |
| 32 | +anyhow | |
| 33 | +anyone | |
| 34 | +anything | |
| 35 | +anyway | |
| 36 | +anyways | |
| 37 | +anywhere | |
| 38 | +apart | |
| 39 | +appear | |
| 40 | +appreciate | |
| 41 | +appropriate | |
| 42 | +are | |
| 43 | +aren't | |
| 44 | +around | |
| 45 | +as | |
| 46 | +aside | |
| 47 | +ask | |
| 48 | +asking | |
| 49 | +associated | |
| 50 | +at | |
| 51 | +available | |
| 52 | +away | |
| 53 | +awfully | |
| 54 | +be | |
| 55 | +became | |
| 56 | +because | |
| 57 | +become | |
| 58 | +becomes | |
| 59 | +becoming | |
| 60 | +been | |
| 61 | +before | |
| 62 | +beforehand | |
| 63 | +behind | |
| 64 | +being | |
| 65 | +believe | |
| 66 | +below | |
| 67 | +beside | |
| 68 | +besides | |
| 69 | +best | |
| 70 | +better | |
| 71 | +between | |
| 72 | +beyond | |
| 73 | +both | |
| 74 | +brief | |
| 75 | +but | |
| 76 | +by | |
| 77 | +c'mon | |
| 78 | +c's | |
| 79 | +came | |
| 80 | +can | |
| 81 | +can't | |
| 82 | +cannot | |
| 83 | +cant | |
| 84 | +cause | |
| 85 | +causes | |
| 86 | +certain | |
| 87 | +certainly | |
| 88 | +changes | |
| 89 | +clearly | |
| 90 | +co | |
| 91 | +com | |
| 92 | +come | |
| 93 | +comes | |
| 94 | +concerning | |
| 95 | +consequently | |
| 96 | +consider | |
| 97 | +considering | |
| 98 | +contain | |
| 99 | +containing | |
| 100 | +contains | |
| 101 | +corresponding | |
| 102 | +could | |
| 103 | +couldn't | |
| 104 | +course | |
| 105 | +currently | |
| 106 | +definitely | |
| 107 | +described | |
| 108 | +despite | |
| 109 | +did | |
| 110 | +didn't | |
| 111 | +different | |
| 112 | +do | |
| 113 | +does | |
| 114 | +doesn't | |
| 115 | +doing | |
| 116 | +don't | |
| 117 | +done | |
| 118 | +down | |
| 119 | +downwards | |
| 120 | +during | |
| 121 | +each | |
| 122 | +edu | |
| 123 | +eg | |
| 124 | +eight | |
| 125 | +either | |
| 126 | +else | |
| 127 | +elsewhere | |
| 128 | +enough | |
| 129 | +entirely | |
| 130 | +especially | |
| 131 | +et | |
| 132 | +etc | |
| 133 | +even | |
| 134 | +ever | |
| 135 | +every | |
| 136 | +everybody | |
| 137 | +everyone | |
| 138 | +everything | |
| 139 | +everywhere | |
| 140 | +ex | |
| 141 | +exactly | |
| 142 | +example | |
| 143 | +except | |
| 144 | +far | |
| 145 | +few | |
| 146 | +fifth | |
| 147 | +first | |
| 148 | +five | |
| 149 | +followed | |
| 150 | +following | |
| 151 | +follows | |
| 152 | +for | |
| 153 | +former | |
| 154 | +formerly | |
| 155 | +forth | |
| 156 | +four | |
| 157 | +from | |
| 158 | +further | |
| 159 | +furthermore | |
| 160 | +get | |
| 161 | +gets | |
| 162 | +getting | |
| 163 | +given | |
| 164 | +gives | |
| 165 | +go | |
| 166 | +goes | |
| 167 | +going | |
| 168 | +gone | |
| 169 | +got | |
| 170 | +gotten | |
| 171 | +greetings | |
| 172 | +had | |
| 173 | +hadn't | |
| 174 | +happens | |
| 175 | +hardly | |
| 176 | +has | |
| 177 | +hasn't | |
| 178 | +have | |
| 179 | +haven't | |
| 180 | +having | |
| 181 | +he | |
| 182 | +he's | |
| 183 | +hello | |
| 184 | +help | |
| 185 | +hence | |
| 186 | +her | |
| 187 | +here | |
| 188 | +here's | |
| 189 | +hereafter | |
| 190 | +hereby | |
| 191 | +herein | |
| 192 | +hereupon | |
| 193 | +hers | |
| 194 | +herself | |
| 195 | +hi | |
| 196 | +him | |
| 197 | +himself | |
| 198 | +his | |
| 199 | +hither | |
| 200 | +hopefully | |
| 201 | +how | |
| 202 | +howbeit | |
| 203 | +however | |
| 204 | +i'd | |
| 205 | +i'll | |
| 206 | +i'm | |
| 207 | +i've | |
| 208 | +ie | |
| 209 | +if | |
| 210 | +ignored | |
| 211 | +immediate | |
| 212 | +in | |
| 213 | +inasmuch | |
| 214 | +inc | |
| 215 | +indeed | |
| 216 | +indicate | |
| 217 | +indicated | |
| 218 | +indicates | |
| 219 | +inner | |
| 220 | +insofar | |
| 221 | +instead | |
| 222 | +into | |
| 223 | +inward | |
| 224 | +is | |
| 225 | +isn't | |
| 226 | +it | |
| 227 | +it'd | |
| 228 | +it'll | |
| 229 | +it's | |
| 230 | +its | |
| 231 | +itself | |
| 232 | +just | |
| 233 | +keep | |
| 234 | +keeps | |
| 235 | +kept | |
| 236 | +know | |
| 237 | +knows | |
| 238 | +known | |
| 239 | +last | |
| 240 | +lately | |
| 241 | +later | |
| 242 | +latter | |
| 243 | +latterly | |
| 244 | +least | |
| 245 | +less | |
| 246 | +lest | |
| 247 | +let | |
| 248 | +let's | |
| 249 | +like | |
| 250 | +liked | |
| 251 | +likely | |
| 252 | +little | |
| 253 | +look | |
| 254 | +looking | |
| 255 | +looks | |
| 256 | +ltd | |
| 257 | +mainly | |
| 258 | +many | |
| 259 | +may | |
| 260 | +maybe | |
| 261 | +me | |
| 262 | +mean | |
| 263 | +meanwhile | |
| 264 | +merely | |
| 265 | +might | |
| 266 | +more | |
| 267 | +moreover | |
| 268 | +most | |
| 269 | +mostly | |
| 270 | +much | |
| 271 | +must | |
| 272 | +my | |
| 273 | +myself | |
| 274 | +name | |
| 275 | +namely | |
| 276 | +nd | |
| 277 | +near | |
| 278 | +nearly | |
| 279 | +necessary | |
| 280 | +need | |
| 281 | +needs | |
| 282 | +neither | |
| 283 | +never | |
| 284 | +nevertheless | |
| 285 | +new | |
| 286 | +next | |
| 287 | +nine | |
| 288 | +no | |
| 289 | +nobody | |
| 290 | +non | |
| 291 | +none | |
| 292 | +noone | |
| 293 | +nor | |
| 294 | +normally | |
| 295 | +not | |
| 296 | +nothing | |
| 297 | +novel | |
| 298 | +now | |
| 299 | +nowhere | |
| 300 | +obviously | |
| 301 | +of | |
| 302 | +off | |
| 303 | +often | |
| 304 | +oh | |
| 305 | +ok | |
| 306 | +okay | |
| 307 | +old | |
| 308 | +on | |
| 309 | +once | |
| 310 | +one | |
| 311 | +ones | |
| 312 | +only | |
| 313 | +onto | |
| 314 | +or | |
| 315 | +other | |
| 316 | +others | |
| 317 | +otherwise | |
| 318 | +ought | |
| 319 | +our | |
| 320 | +ours | |
| 321 | +ourselves | |
| 322 | +out | |
| 323 | +outside | |
| 324 | +over | |
| 325 | +overall | |
| 326 | +own | |
| 327 | +particular | |
| 328 | +particularly | |
| 329 | +per | |
| 330 | +perhaps | |
| 331 | +placed | |
| 332 | +please | |
| 333 | +plus | |
| 334 | +possible | |
| 335 | +presumably | |
| 336 | +probably | |
| 337 | +provides | |
| 338 | +que | |
| 339 | +quite | |
| 340 | +qv | |
| 341 | +rather | |
| 342 | +rd | |
| 343 | +re | |
| 344 | +really | |
| 345 | +reasonably | |
| 346 | +regarding | |
| 347 | +regardless | |
| 348 | +regards | |
| 349 | +relatively | |
| 350 | +respectively | |
| 351 | +right | |
| 352 | +said | |
| 353 | +same | |
| 354 | +saw | |
| 355 | +say | |
| 356 | +saying | |
| 357 | +says | |
| 358 | +second | |
| 359 | +secondly | |
| 360 | +see | |
| 361 | +seeing | |
| 362 | +seem | |
| 363 | +seemed | |
| 364 | +seeming | |
| 365 | +seems | |
| 366 | +seen | |
| 367 | +self | |
| 368 | +selves | |
| 369 | +sensible | |
| 370 | +sent | |
| 371 | +serious | |
| 372 | +seriously | |
| 373 | +seven | |
| 374 | +several | |
| 375 | +shall | |
| 376 | +she | |
| 377 | +should | |
| 378 | +shouldn't | |
| 379 | +since | |
| 380 | +six | |
| 381 | +so | |
| 382 | +some | |
| 383 | +somebody | |
| 384 | +somehow | |
| 385 | +someone | |
| 386 | +something | |
| 387 | +sometime | |
| 388 | +sometimes | |
| 389 | +somewhat | |
| 390 | +somewhere | |
| 391 | +soon | |
| 392 | +sorry | |
| 393 | +specified | |
| 394 | +specify | |
| 395 | +specifying | |
| 396 | +still | |
| 397 | +sub | |
| 398 | +such | |
| 399 | +sup | |
| 400 | +sure | |
| 401 | +t's | |
| 402 | +take | |
| 403 | +taken | |
| 404 | +tell | |
| 405 | +tends | |
| 406 | +th | |
| 407 | +than | |
| 408 | +thank | |
| 409 | +thanks | |
| 410 | +thanx | |
| 411 | +that | |
| 412 | +that's | |
| 413 | +thats | |
| 414 | +the | |
| 415 | +their | |
| 416 | +theirs | |
| 417 | +them | |
| 418 | +themselves | |
| 419 | +then | |
| 420 | +thence | |
| 421 | +there | |
| 422 | +there's | |
| 423 | +thereafter | |
| 424 | +thereby | |
| 425 | +therefore | |
| 426 | +therein | |
| 427 | +theres | |
| 428 | +thereupon | |
| 429 | +these | |
| 430 | +they | |
| 431 | +they'd | |
| 432 | +they'll | |
| 433 | +they're | |
| 434 | +they've | |
| 435 | +think | |
| 436 | +third | |
| 437 | +this | |
| 438 | +thorough | |
| 439 | +thoroughly | |
| 440 | +those | |
| 441 | +though | |
| 442 | +three | |
| 443 | +through | |
| 444 | +throughout | |
| 445 | +thru | |
| 446 | +thus | |
| 447 | +to | |
| 448 | +together | |
| 449 | +too | |
| 450 | +took | |
| 451 | +toward | |
| 452 | +towards | |
| 453 | +tried | |
| 454 | +tries | |
| 455 | +truly | |
| 456 | +try | |
| 457 | +trying | |
| 458 | +twice | |
| 459 | +two | |
| 460 | +un | |
| 461 | +under | |
| 462 | +unfortunately | |
| 463 | +unless | |
| 464 | +unlikely | |
| 465 | +until | |
| 466 | +unto | |
| 467 | +up | |
| 468 | +upon | |
| 469 | +us | |
| 470 | +use | |
| 471 | +used | |
| 472 | +useful | |
| 473 | +uses | |
| 474 | +using | |
| 475 | +usually | |
| 476 | +value | |
| 477 | +various | |
| 478 | +very | |
| 479 | +via | |
| 480 | +viz | |
| 481 | +vs | |
| 482 | +want | |
| 483 | +wants | |
| 484 | +was | |
| 485 | +wasn't | |
| 486 | +way | |
| 487 | +we | |
| 488 | +we'd | |
| 489 | +we'll | |
| 490 | +we're | |
| 491 | +we've | |
| 492 | +welcome | |
| 493 | +well | |
| 494 | +went | |
| 495 | +were | |
| 496 | +weren't | |
| 497 | +what | |
| 498 | +what's | |
| 499 | +whatever | |
| 500 | +when | |
| 501 | +whence | |
| 502 | +whenever | |
| 503 | +where | |
| 504 | +where's | |
| 505 | +whereafter | |
| 506 | +whereas | |
| 507 | +whereby | |
| 508 | +wherein | |
| 509 | +whereupon | |
| 510 | +wherever | |
| 511 | +whether | |
| 512 | +which | |
| 513 | +while | |
| 514 | +whither | |
| 515 | +who | |
| 516 | +who's | |
| 517 | +whoever | |
| 518 | +whole | |
| 519 | +whom | |
| 520 | +whose | |
| 521 | +why | |
| 522 | +will | |
| 523 | +willing | |
| 524 | +wish | |
| 525 | +with | |
| 526 | +within | |
| 527 | +without | |
| 528 | +won't | |
| 529 | +wonder | |
| 530 | +would | |
| 531 | +would | |
| 532 | +wouldn't | |
| 533 | +yes | |
| 534 | +yet | |
| 535 | +you | |
| 536 | +you'd | |
| 537 | +you'll | |
| 538 | +you're | |
| 539 | +you've | |
| 540 | +your | |
| 541 | +yours | |
| 542 | +yourself | |
| 543 | +yourselves | |
| 544 | +zero | ... | ... |
lib/browse/Criteria.inc
| ... | ... | @@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php'); |
| 43 | 43 | require_once(KT_LIB_DIR . '/browse/criteriaregistry.php'); |
| 44 | 44 | |
| 45 | 45 | |
| 46 | -$RESTRICTING_SEARCH = true; | |
| 47 | - | |
| 48 | 46 | class BrowseCriterion { |
| 49 | 47 | var $sDisplay; |
| 50 | 48 | var $sDocumentField; |
| ... | ... | @@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion { |
| 689 | 687 | |
| 690 | 688 | $p = array(); |
| 691 | 689 | $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)"; |
| 692 | - | |
| 690 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); | |
| 693 | 691 | |
| 694 | - | |
| 695 | - if ($RESTRICTING_SEARCH) { | |
| 696 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | |
| 697 | - $temp = $q_set; | |
| 698 | - foreach ($temp as $k => $v) { | |
| 699 | - $t = array(); | |
| 700 | - foreach ($v as $part) { | |
| 701 | - $t[] = sprintf('+"%s"', $part); | |
| 702 | - } | |
| 703 | - $q_set[$k] = join(' ', $t); | |
| 704 | - } | |
| 705 | - $p[1] = implode(' ',$q_set); | |
| 706 | - } else { | |
| 707 | - $p[1] = $aRequest[$this->getWidgetBase()]; | |
| 708 | - } | |
| 692 | + // var_dump($p[1]);exit(0); | |
| 709 | 693 | |
| 710 | 694 | // handle the boolean "not" stuff. |
| 711 | 695 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| ... | ... | @@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion { |
| 815 | 799 | |
| 816 | 800 | $p = array(); |
| 817 | 801 | $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)"; |
| 818 | - | |
| 819 | - | |
| 820 | - | |
| 821 | - if ($RESTRICTING_SEARCH) { | |
| 822 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | |
| 823 | - $temp = $q_set; | |
| 824 | - foreach ($temp as $k => $v) { | |
| 825 | - $t = array(); | |
| 826 | - foreach ($v as $part) { | |
| 827 | - $t[] = sprintf('+"%s"', $part); | |
| 828 | - } | |
| 829 | - $q_set[$k] = join(' ', $t); | |
| 830 | - } | |
| 831 | - $p[1] = implode(' ',$q_set); | |
| 832 | - } else { | |
| 833 | - $p[1] = $aRequest[$this->getWidgetBase()]; | |
| 834 | - } | |
| 802 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); | |
| 835 | 803 | |
| 836 | 804 | // handle the boolean "not" stuff. |
| 837 | 805 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| ... | ... | @@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion { |
| 891 | 859 | |
| 892 | 860 | $p = array(); |
| 893 | 861 | $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)"; |
| 894 | - | |
| 895 | - | |
| 896 | - | |
| 897 | - if ($RESTRICTING_SEARCH) { | |
| 898 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | |
| 899 | - $temp = $q_set; | |
| 900 | - foreach ($temp as $k => $v) { | |
| 901 | - $t = array(); | |
| 902 | - foreach ($v as $part) { | |
| 903 | - $t[] = sprintf('+"%s"', $part); | |
| 904 | - } | |
| 905 | - $q_set[$k] = join(' ', $t); | |
| 906 | - } | |
| 907 | - $p[1] = implode(' ',$q_set); | |
| 908 | - } else { | |
| 909 | - $p[1] = $aRequest[$this->getWidgetBase()]; | |
| 910 | - } | |
| 862 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); | |
| 911 | 863 | |
| 912 | 864 | // handle the boolean "not" stuff. |
| 913 | 865 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| ... | ... | @@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion { |
| 956 | 908 | |
| 957 | 909 | $p = array(); |
| 958 | 910 | $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)"; |
| 959 | - | |
| 960 | - | |
| 961 | - | |
| 962 | - if ($RESTRICTING_SEARCH) { | |
| 963 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | |
| 964 | - $temp = $q_set; | |
| 965 | - foreach ($temp as $k => $v) { | |
| 966 | - $t = array(); | |
| 967 | - foreach ($v as $part) { | |
| 968 | - $t[] = sprintf('+"%s"', $part); | |
| 969 | - } | |
| 970 | - $q_set[$k] = join(' ', $t); | |
| 971 | - } | |
| 972 | - $p[1] = implode(' ',$q_set); | |
| 973 | - } else { | |
| 974 | - $p[1] = $aRequest[$this->getWidgetBase()]; | |
| 975 | - } | |
| 911 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); | |
| 976 | 912 | |
| 977 | 913 | // handle the boolean "not" stuff. |
| 978 | 914 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); | ... | ... |
lib/util/KTStopwords.php
0 → 100644
| 1 | +<?php | |
| 2 | + | |
| 3 | +/** | |
| 4 | + * $Id: config.inc.php 5758 2006-07-27 10:17:43Z bshuttle $ | |
| 5 | + * | |
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | |
| 7 | + * License Version 1.1 ("License"); You may not use this file except in | |
| 8 | + * compliance with the License. You may obtain a copy of the License at | |
| 9 | + * http://www.ktdms.com/KPL | |
| 10 | + * | |
| 11 | + * Software distributed under the License is distributed on an "AS IS" | |
| 12 | + * basis, | |
| 13 | + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | |
| 14 | + * for the specific language governing rights and limitations under the | |
| 15 | + * License. | |
| 16 | + * | |
| 17 | + * The Original Code is: KnowledgeTree Open Source | |
| 18 | + * | |
| 19 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | |
| 20 | + * (Pty) Ltd, trading as KnowledgeTree. | |
| 21 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | |
| 22 | + * (C) 2006 The Jam Warehouse Software (Pty) Ltd; | |
| 23 | + * All Rights Reserved. | |
| 24 | + * | |
| 25 | + */ | |
| 26 | + | |
/**
 * Holds the list of search stopwords and answers "is this word a stopword?".
 *
 * The list is read from a plain text file (one word per line) or from a
 * serialized cache file. Words are stored trimmed and lower-cased so that
 * lookups are case-insensitive.
 */
class KTStopwords {
    /** List of stopwords (trimmed, lower-cased strings). */
    var $words = array();

    // NOTE(review): the properties below are not referenced anywhere in this
    // class. They are kept only so that any outside code touching them keeps
    // working -- confirm they are unused before removing them.
    var $conf = array();
    var $aSectionFile;
    var $aFileRoot;
    var $flat = array();
    var $flatns = array();

    /**
     * Normalise a raw list of words: trim, lower-case, drop empty entries
     * and anything that is not a string.
     *
     * @param array $aRaw raw word list
     * @return array cleaned, sequentially indexed word list
     */
    function _normalise($aRaw) {
        $aClean = array();
        foreach ($aRaw as $sRaw) {
            if (!is_string($sRaw)) {
                continue;
            }
            $sWord = strtolower(trim($sRaw));
            if ($sWord !== '') {
                $aClean[] = $sWord;
            }
        }
        return $aClean;
    }

    /**
     * Load the word list from a cache file written by createCache().
     *
     * The current list is left untouched when the cache cannot be used.
     *
     * @param string $filename path of the cache file
     * @return bool true when the cache was loaded, false otherwise
     */
    function loadCache($filename) {
        if (!is_readable($filename)) {
            return false;
        }
        $cache_str = file_get_contents($filename);
        if ($cache_str === false) {
            return false;
        }
        // NOTE(review): unserialize() on a file that somebody else can write
        // to is unsafe (object injection); the cache file must live in a
        // location only the application can write.
        $aCached = unserialize($cache_str);
        if (!is_array($aCached)) {
            // Corrupt or foreign cache: keep whatever list we already have.
            return false;
        }
        $this->words = $this->_normalise($aCached);
        return true;
    }

    /**
     * Write the current word list to a cache file in serialized form.
     *
     * @param string $filename path of the cache file to write
     * @return bool true when the file was written, false otherwise
     */
    function createCache($filename) {
        return file_put_contents($filename, serialize($this->words)) !== false;
    }

    /**
     * Replace the word list with the contents of a text file, one word per
     * line. Blank lines are ignored.
     *
     * @param string $filename path of the stopword file
     * @return bool true when the file was read; false when it could not be
     *              read, in which case the list is left empty
     */
    function loadFile($filename) {
        $this->words = array();
        if (!is_string($filename) || !is_readable($filename)) {
            return false;
        }
        $aLines = file($filename);
        if ($aLines === false) {
            return false;
        }
        $this->words = $this->_normalise($aLines);
        return true;
    }

    /**
     * Check whether a word is in the stopword list (case-insensitive).
     *
     * @param string $sWord word to test
     * @return bool true when the word is a stopword
     */
    function isStopword($sWord) {
        // Strict comparison: the list only ever holds strings, and loose
        // comparison would let numeric-looking input match unrelated entries.
        return in_array(strtolower((string) $sWord), $this->words, true);
    }

    /**
     * Return the shared instance, creating it on first use and loading the
     * file named by the 'stopwordsFile' configuration entry.
     *
     * @return KTStopwords the shared instance stored in $GLOBALS
     */
    function &getSingleton() {
        if (!KTUtil::arrayGet($GLOBALS, 'KTStopwords')) {
            // Plain "= new": "=& new" is rejected by PHP 7 and later.
            $GLOBALS['KTStopwords'] = new KTStopwords();
            $oConfig = KTConfig::getSingleton();
            $GLOBALS['KTStopwords']->loadFile($oConfig->get('stopwordsFile'));
        }
        return $GLOBALS['KTStopwords'];
    }
}
| 66 | + | |
| 67 | + | |
| 68 | +?> | ... | ... |
lib/util/ktutil.inc
| ... | ... | @@ -28,6 +28,8 @@ |
| 28 | 28 | * @author Neil Blakey-Milner <nbm@jamwarehouse.com>, Jam Warehouse (Pty) Ltd, South Africa |
| 29 | 29 | */ |
| 30 | 30 | |
| 31 | +require_once(KT_LIB_DIR . '/util/KTStopwords.php'); | |
| 32 | + | |
| 31 | 33 | class KTUtil { |
| 32 | 34 | function extractGPC () { |
| 33 | 35 | foreach (func_get_args() as $var) { |
| ... | ... | @@ -550,7 +552,16 @@ class KTUtil { |
| 550 | 552 | return ((float) $microtime_simple[1] + (float) $microtime_simple[0]); |
| 551 | 553 | } |
| 552 | 554 | |
| 553 | - function phraseSplit($sSearchString) { | |
| 555 | + function phraseSplit($sSearchString) { | |
| 556 | + // this should probably be moved to a DBUtil method | |
| 557 | + | |
| 558 | + $sMinWord = DBUtil::getOneResultKey("SHOW VARIABLES LIKE 'ft_min_word_len'", "Value"); | |
| 559 | + if(is_numeric($sMinWord)) { | |
| 560 | + $iMinWord = (int)$sMinWord; | |
| 561 | + } else { | |
| 562 | + $iMinWord = 4; | |
| 563 | + } | |
| 564 | + | |
| 554 | 565 | $a = preg_split('#"#', $sSearchString); |
| 555 | 566 | $i = 0; |
| 556 | 567 | $phrases = array(); |
| ... | ... | @@ -564,20 +575,33 @@ class KTUtil { |
| 564 | 575 | $i += 1; |
| 565 | 576 | } |
| 566 | 577 | |
| 578 | + $oStopwords =& KTStopwords::getSingleton(); | |
| 579 | + | |
| 567 | 580 | $words = array(); |
| 568 | 581 | foreach ($word_parts as $part) { |
| 569 | 582 | $w = (array) explode(' ', $part); |
| 570 | - foreach ($w as $potential) { if (!empty($potential)) { $words[] = $potential; }} | |
| 571 | - } | |
| 572 | - | |
| 573 | - // XXX: filter each subword (including in phrases) to remove whitespace-broken items and sub-4 character words. | |
| 574 | - | |
| 583 | + foreach ($w as $potential) { | |
| 584 | + if (strlen($potential) >= $iMinWord && !$oStopwords->isStopword($potential)) { | |
| 585 | + $words[] = $potential; | |
| 586 | + } | |
| 587 | + } | |
| 588 | + } | |
| 575 | 589 | |
| 576 | 590 | return array( |
| 577 | 591 | 'words' => $words, |
| 578 | 592 | 'phrases' => $phrases, |
| 579 | 593 | ); |
| 594 | + } | |
| 580 | 595 | |
/**
 * Build a full-text query string in which every term of the search
 * string is required.
 *
 * The search string is split by KTUtil::phraseSplit() into groups of
 * terms ('words' and 'phrases'); each term is wrapped as +"term" and all
 * of them are joined with single spaces. Groups that end up with no
 * terms (for example when every word was filtered out by phraseSplit)
 * contribute nothing, so the result has no stray separators and is an
 * empty string when there are no terms at all.
 *
 * @param string $sQuery raw search string as entered by the user
 * @return string query text to bind to MATCH ... AGAINST
 */
function phraseQuote($sQuery) {
    // Always defined, even if phraseSplit() were to return no groups.
    $aGroups = array();
    foreach (KTUtil::phraseSplit($sQuery) as $aTerms) {
        $aQuoted = array();
        foreach ((array) $aTerms as $sTerm) {
            $aQuoted[] = sprintf('+"%s"', $sTerm);
        }
        if (!empty($aQuoted)) {
            $aGroups[] = join(' ', $aQuoted);
        }
    }
    return implode(' ', $aGroups);
}
| 582 | 606 | |
| 583 | 607 | function running_user() { | ... | ... |