Commit b77bfde8af4388f6308f4016d857efdc442bc3ef
1 parent: 40304ef7
Added stopword removal
git-svn-id: https://kt-dms.svn.sourceforge.net/svnroot/kt-dms/trunk@5800 c91229c3-7414-0410-bfa2-8a42b809f60b
Showing 5 changed files with 650 additions and 75 deletions
config/config.ini
| @@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp | @@ -155,6 +155,9 @@ tmpDirectory = ${varDirectory}/tmp | ||
| 155 | graphicsUrl = ${rootUrl}/graphics | 155 | graphicsUrl = ${rootUrl}/graphics |
| 156 | uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree | 156 | uiUrl = ${rootUrl}/presentation/lookAndFeel/knowledgeTree |
| 157 | 157 | ||
| 158 | +; files | ||
| 159 | +stopwordsFile = ${fileSystemRoot}/config/stopwords.txt | ||
| 160 | + | ||
| 158 | [session] | 161 | [session] |
| 159 | ; session timeout (in seconds) | 162 | ; session timeout (in seconds) |
| 160 | sessionTimeout = 1200 | 163 | sessionTimeout = 1200 |
config/stopwords.txt
0 → 100644
| 1 | +a's | ||
| 2 | +able | ||
| 3 | +about | ||
| 4 | +above | ||
| 5 | +according | ||
| 6 | +accordingly | ||
| 7 | +across | ||
| 8 | +actually | ||
| 9 | +after | ||
| 10 | +afterwards | ||
| 11 | +again | ||
| 12 | +against | ||
| 13 | +ain't | ||
| 14 | +all | ||
| 15 | +allow | ||
| 16 | +allows | ||
| 17 | +almost | ||
| 18 | +alone | ||
| 19 | +along | ||
| 20 | +already | ||
| 21 | +also | ||
| 22 | +although | ||
| 23 | +always | ||
| 24 | +am | ||
| 25 | +among | ||
| 26 | +amongst | ||
| 27 | +an | ||
| 28 | +and | ||
| 29 | +another | ||
| 30 | +any | ||
| 31 | +anybody | ||
| 32 | +anyhow | ||
| 33 | +anyone | ||
| 34 | +anything | ||
| 35 | +anyway | ||
| 36 | +anyways | ||
| 37 | +anywhere | ||
| 38 | +apart | ||
| 39 | +appear | ||
| 40 | +appreciate | ||
| 41 | +appropriate | ||
| 42 | +are | ||
| 43 | +aren't | ||
| 44 | +around | ||
| 45 | +as | ||
| 46 | +aside | ||
| 47 | +ask | ||
| 48 | +asking | ||
| 49 | +associated | ||
| 50 | +at | ||
| 51 | +available | ||
| 52 | +away | ||
| 53 | +awfully | ||
| 54 | +be | ||
| 55 | +became | ||
| 56 | +because | ||
| 57 | +become | ||
| 58 | +becomes | ||
| 59 | +becoming | ||
| 60 | +been | ||
| 61 | +before | ||
| 62 | +beforehand | ||
| 63 | +behind | ||
| 64 | +being | ||
| 65 | +believe | ||
| 66 | +below | ||
| 67 | +beside | ||
| 68 | +besides | ||
| 69 | +best | ||
| 70 | +better | ||
| 71 | +between | ||
| 72 | +beyond | ||
| 73 | +both | ||
| 74 | +brief | ||
| 75 | +but | ||
| 76 | +by | ||
| 77 | +c'mon | ||
| 78 | +c's | ||
| 79 | +came | ||
| 80 | +can | ||
| 81 | +can't | ||
| 82 | +cannot | ||
| 83 | +cant | ||
| 84 | +cause | ||
| 85 | +causes | ||
| 86 | +certain | ||
| 87 | +certainly | ||
| 88 | +changes | ||
| 89 | +clearly | ||
| 90 | +co | ||
| 91 | +com | ||
| 92 | +come | ||
| 93 | +comes | ||
| 94 | +concerning | ||
| 95 | +consequently | ||
| 96 | +consider | ||
| 97 | +considering | ||
| 98 | +contain | ||
| 99 | +containing | ||
| 100 | +contains | ||
| 101 | +corresponding | ||
| 102 | +could | ||
| 103 | +couldn't | ||
| 104 | +course | ||
| 105 | +currently | ||
| 106 | +definitely | ||
| 107 | +described | ||
| 108 | +despite | ||
| 109 | +did | ||
| 110 | +didn't | ||
| 111 | +different | ||
| 112 | +do | ||
| 113 | +does | ||
| 114 | +doesn't | ||
| 115 | +doing | ||
| 116 | +don't | ||
| 117 | +done | ||
| 118 | +down | ||
| 119 | +downwards | ||
| 120 | +during | ||
| 121 | +each | ||
| 122 | +edu | ||
| 123 | +eg | ||
| 124 | +eight | ||
| 125 | +either | ||
| 126 | +else | ||
| 127 | +elsewhere | ||
| 128 | +enough | ||
| 129 | +entirely | ||
| 130 | +especially | ||
| 131 | +et | ||
| 132 | +etc | ||
| 133 | +even | ||
| 134 | +ever | ||
| 135 | +every | ||
| 136 | +everybody | ||
| 137 | +everyone | ||
| 138 | +everything | ||
| 139 | +everywhere | ||
| 140 | +ex | ||
| 141 | +exactly | ||
| 142 | +example | ||
| 143 | +except | ||
| 144 | +far | ||
| 145 | +few | ||
| 146 | +fifth | ||
| 147 | +first | ||
| 148 | +five | ||
| 149 | +followed | ||
| 150 | +following | ||
| 151 | +follows | ||
| 152 | +for | ||
| 153 | +former | ||
| 154 | +formerly | ||
| 155 | +forth | ||
| 156 | +four | ||
| 157 | +from | ||
| 158 | +further | ||
| 159 | +furthermore | ||
| 160 | +get | ||
| 161 | +gets | ||
| 162 | +getting | ||
| 163 | +given | ||
| 164 | +gives | ||
| 165 | +go | ||
| 166 | +goes | ||
| 167 | +going | ||
| 168 | +gone | ||
| 169 | +got | ||
| 170 | +gotten | ||
| 171 | +greetings | ||
| 172 | +had | ||
| 173 | +hadn't | ||
| 174 | +happens | ||
| 175 | +hardly | ||
| 176 | +has | ||
| 177 | +hasn't | ||
| 178 | +have | ||
| 179 | +haven't | ||
| 180 | +having | ||
| 181 | +he | ||
| 182 | +he's | ||
| 183 | +hello | ||
| 184 | +help | ||
| 185 | +hence | ||
| 186 | +her | ||
| 187 | +here | ||
| 188 | +here's | ||
| 189 | +hereafter | ||
| 190 | +hereby | ||
| 191 | +herein | ||
| 192 | +hereupon | ||
| 193 | +hers | ||
| 194 | +herself | ||
| 195 | +hi | ||
| 196 | +him | ||
| 197 | +himself | ||
| 198 | +his | ||
| 199 | +hither | ||
| 200 | +hopefully | ||
| 201 | +how | ||
| 202 | +howbeit | ||
| 203 | +however | ||
| 204 | +i'd | ||
| 205 | +i'll | ||
| 206 | +i'm | ||
| 207 | +i've | ||
| 208 | +ie | ||
| 209 | +if | ||
| 210 | +ignored | ||
| 211 | +immediate | ||
| 212 | +in | ||
| 213 | +inasmuch | ||
| 214 | +inc | ||
| 215 | +indeed | ||
| 216 | +indicate | ||
| 217 | +indicated | ||
| 218 | +indicates | ||
| 219 | +inner | ||
| 220 | +insofar | ||
| 221 | +instead | ||
| 222 | +into | ||
| 223 | +inward | ||
| 224 | +is | ||
| 225 | +isn't | ||
| 226 | +it | ||
| 227 | +it'd | ||
| 228 | +it'll | ||
| 229 | +it's | ||
| 230 | +its | ||
| 231 | +itself | ||
| 232 | +just | ||
| 233 | +keep | ||
| 234 | +keeps | ||
| 235 | +kept | ||
| 236 | +know | ||
| 237 | +knows | ||
| 238 | +known | ||
| 239 | +last | ||
| 240 | +lately | ||
| 241 | +later | ||
| 242 | +latter | ||
| 243 | +latterly | ||
| 244 | +least | ||
| 245 | +less | ||
| 246 | +lest | ||
| 247 | +let | ||
| 248 | +let's | ||
| 249 | +like | ||
| 250 | +liked | ||
| 251 | +likely | ||
| 252 | +little | ||
| 253 | +look | ||
| 254 | +looking | ||
| 255 | +looks | ||
| 256 | +ltd | ||
| 257 | +mainly | ||
| 258 | +many | ||
| 259 | +may | ||
| 260 | +maybe | ||
| 261 | +me | ||
| 262 | +mean | ||
| 263 | +meanwhile | ||
| 264 | +merely | ||
| 265 | +might | ||
| 266 | +more | ||
| 267 | +moreover | ||
| 268 | +most | ||
| 269 | +mostly | ||
| 270 | +much | ||
| 271 | +must | ||
| 272 | +my | ||
| 273 | +myself | ||
| 274 | +name | ||
| 275 | +namely | ||
| 276 | +nd | ||
| 277 | +near | ||
| 278 | +nearly | ||
| 279 | +necessary | ||
| 280 | +need | ||
| 281 | +needs | ||
| 282 | +neither | ||
| 283 | +never | ||
| 284 | +nevertheless | ||
| 285 | +new | ||
| 286 | +next | ||
| 287 | +nine | ||
| 288 | +no | ||
| 289 | +nobody | ||
| 290 | +non | ||
| 291 | +none | ||
| 292 | +noone | ||
| 293 | +nor | ||
| 294 | +normally | ||
| 295 | +not | ||
| 296 | +nothing | ||
| 297 | +novel | ||
| 298 | +now | ||
| 299 | +nowhere | ||
| 300 | +obviously | ||
| 301 | +of | ||
| 302 | +off | ||
| 303 | +often | ||
| 304 | +oh | ||
| 305 | +ok | ||
| 306 | +okay | ||
| 307 | +old | ||
| 308 | +on | ||
| 309 | +once | ||
| 310 | +one | ||
| 311 | +ones | ||
| 312 | +only | ||
| 313 | +onto | ||
| 314 | +or | ||
| 315 | +other | ||
| 316 | +others | ||
| 317 | +otherwise | ||
| 318 | +ought | ||
| 319 | +our | ||
| 320 | +ours | ||
| 321 | +ourselves | ||
| 322 | +out | ||
| 323 | +outside | ||
| 324 | +over | ||
| 325 | +overall | ||
| 326 | +own | ||
| 327 | +particular | ||
| 328 | +particularly | ||
| 329 | +per | ||
| 330 | +perhaps | ||
| 331 | +placed | ||
| 332 | +please | ||
| 333 | +plus | ||
| 334 | +possible | ||
| 335 | +presumably | ||
| 336 | +probably | ||
| 337 | +provides | ||
| 338 | +que | ||
| 339 | +quite | ||
| 340 | +qv | ||
| 341 | +rather | ||
| 342 | +rd | ||
| 343 | +re | ||
| 344 | +really | ||
| 345 | +reasonably | ||
| 346 | +regarding | ||
| 347 | +regardless | ||
| 348 | +regards | ||
| 349 | +relatively | ||
| 350 | +respectively | ||
| 351 | +right | ||
| 352 | +said | ||
| 353 | +same | ||
| 354 | +saw | ||
| 355 | +say | ||
| 356 | +saying | ||
| 357 | +says | ||
| 358 | +second | ||
| 359 | +secondly | ||
| 360 | +see | ||
| 361 | +seeing | ||
| 362 | +seem | ||
| 363 | +seemed | ||
| 364 | +seeming | ||
| 365 | +seems | ||
| 366 | +seen | ||
| 367 | +self | ||
| 368 | +selves | ||
| 369 | +sensible | ||
| 370 | +sent | ||
| 371 | +serious | ||
| 372 | +seriously | ||
| 373 | +seven | ||
| 374 | +several | ||
| 375 | +shall | ||
| 376 | +she | ||
| 377 | +should | ||
| 378 | +shouldn't | ||
| 379 | +since | ||
| 380 | +six | ||
| 381 | +so | ||
| 382 | +some | ||
| 383 | +somebody | ||
| 384 | +somehow | ||
| 385 | +someone | ||
| 386 | +something | ||
| 387 | +sometime | ||
| 388 | +sometimes | ||
| 389 | +somewhat | ||
| 390 | +somewhere | ||
| 391 | +soon | ||
| 392 | +sorry | ||
| 393 | +specified | ||
| 394 | +specify | ||
| 395 | +specifying | ||
| 396 | +still | ||
| 397 | +sub | ||
| 398 | +such | ||
| 399 | +sup | ||
| 400 | +sure | ||
| 401 | +t's | ||
| 402 | +take | ||
| 403 | +taken | ||
| 404 | +tell | ||
| 405 | +tends | ||
| 406 | +th | ||
| 407 | +than | ||
| 408 | +thank | ||
| 409 | +thanks | ||
| 410 | +thanx | ||
| 411 | +that | ||
| 412 | +that's | ||
| 413 | +thats | ||
| 414 | +the | ||
| 415 | +their | ||
| 416 | +theirs | ||
| 417 | +them | ||
| 418 | +themselves | ||
| 419 | +then | ||
| 420 | +thence | ||
| 421 | +there | ||
| 422 | +there's | ||
| 423 | +thereafter | ||
| 424 | +thereby | ||
| 425 | +therefore | ||
| 426 | +therein | ||
| 427 | +theres | ||
| 428 | +thereupon | ||
| 429 | +these | ||
| 430 | +they | ||
| 431 | +they'd | ||
| 432 | +they'll | ||
| 433 | +they're | ||
| 434 | +they've | ||
| 435 | +think | ||
| 436 | +third | ||
| 437 | +this | ||
| 438 | +thorough | ||
| 439 | +thoroughly | ||
| 440 | +those | ||
| 441 | +though | ||
| 442 | +three | ||
| 443 | +through | ||
| 444 | +throughout | ||
| 445 | +thru | ||
| 446 | +thus | ||
| 447 | +to | ||
| 448 | +together | ||
| 449 | +too | ||
| 450 | +took | ||
| 451 | +toward | ||
| 452 | +towards | ||
| 453 | +tried | ||
| 454 | +tries | ||
| 455 | +truly | ||
| 456 | +try | ||
| 457 | +trying | ||
| 458 | +twice | ||
| 459 | +two | ||
| 460 | +un | ||
| 461 | +under | ||
| 462 | +unfortunately | ||
| 463 | +unless | ||
| 464 | +unlikely | ||
| 465 | +until | ||
| 466 | +unto | ||
| 467 | +up | ||
| 468 | +upon | ||
| 469 | +us | ||
| 470 | +use | ||
| 471 | +used | ||
| 472 | +useful | ||
| 473 | +uses | ||
| 474 | +using | ||
| 475 | +usually | ||
| 476 | +value | ||
| 477 | +various | ||
| 478 | +very | ||
| 479 | +via | ||
| 480 | +viz | ||
| 481 | +vs | ||
| 482 | +want | ||
| 483 | +wants | ||
| 484 | +was | ||
| 485 | +wasn't | ||
| 486 | +way | ||
| 487 | +we | ||
| 488 | +we'd | ||
| 489 | +we'll | ||
| 490 | +we're | ||
| 491 | +we've | ||
| 492 | +welcome | ||
| 493 | +well | ||
| 494 | +went | ||
| 495 | +were | ||
| 496 | +weren't | ||
| 497 | +what | ||
| 498 | +what's | ||
| 499 | +whatever | ||
| 500 | +when | ||
| 501 | +whence | ||
| 502 | +whenever | ||
| 503 | +where | ||
| 504 | +where's | ||
| 505 | +whereafter | ||
| 506 | +whereas | ||
| 507 | +whereby | ||
| 508 | +wherein | ||
| 509 | +whereupon | ||
| 510 | +wherever | ||
| 511 | +whether | ||
| 512 | +which | ||
| 513 | +while | ||
| 514 | +whither | ||
| 515 | +who | ||
| 516 | +who's | ||
| 517 | +whoever | ||
| 518 | +whole | ||
| 519 | +whom | ||
| 520 | +whose | ||
| 521 | +why | ||
| 522 | +will | ||
| 523 | +willing | ||
| 524 | +wish | ||
| 525 | +with | ||
| 526 | +within | ||
| 527 | +without | ||
| 528 | +won't | ||
| 529 | +wonder | ||
| 530 | +would | ||
| 531 | +would | ||
| 532 | +wouldn't | ||
| 533 | +yes | ||
| 534 | +yet | ||
| 535 | +you | ||
| 536 | +you'd | ||
| 537 | +you'll | ||
| 538 | +you're | ||
| 539 | +you've | ||
| 540 | +your | ||
| 541 | +yours | ||
| 542 | +yourself | ||
| 543 | +yourselves | ||
| 544 | +zero |
lib/browse/Criteria.inc
| @@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php'); | @@ -43,8 +43,6 @@ require_once(KT_LIB_DIR . '/workflow/workflow.inc.php'); | ||
| 43 | require_once(KT_LIB_DIR . '/browse/criteriaregistry.php'); | 43 | require_once(KT_LIB_DIR . '/browse/criteriaregistry.php'); |
| 44 | 44 | ||
| 45 | 45 | ||
| 46 | -$RESTRICTING_SEARCH = true; | ||
| 47 | - | ||
| 48 | class BrowseCriterion { | 46 | class BrowseCriterion { |
| 49 | var $sDisplay; | 47 | var $sDisplay; |
| 50 | var $sDocumentField; | 48 | var $sDocumentField; |
| @@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion { | @@ -689,23 +687,9 @@ class ContentCriterion extends BrowseCriterion { | ||
| 689 | 687 | ||
| 690 | $p = array(); | 688 | $p = array(); |
| 691 | $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)"; | 689 | $p[0] = "MATCH(DT.document_text) AGAINST (? $boolean_mode)"; |
| 692 | - | 690 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); |
| 693 | 691 | ||
| 694 | - | ||
| 695 | - if ($RESTRICTING_SEARCH) { | ||
| 696 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | ||
| 697 | - $temp = $q_set; | ||
| 698 | - foreach ($temp as $k => $v) { | ||
| 699 | - $t = array(); | ||
| 700 | - foreach ($v as $part) { | ||
| 701 | - $t[] = sprintf('+"%s"', $part); | ||
| 702 | - } | ||
| 703 | - $q_set[$k] = join(' ', $t); | ||
| 704 | - } | ||
| 705 | - $p[1] = implode(' ',$q_set); | ||
| 706 | - } else { | ||
| 707 | - $p[1] = $aRequest[$this->getWidgetBase()]; | ||
| 708 | - } | 692 | + // var_dump($p[1]);exit(0); |
| 709 | 693 | ||
| 710 | // handle the boolean "not" stuff. | 694 | // handle the boolean "not" stuff. |
| 711 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); | 695 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| @@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion { | @@ -815,23 +799,7 @@ class DiscussionTextCriterion extends BrowseCriterion { | ||
| 815 | 799 | ||
| 816 | $p = array(); | 800 | $p = array(); |
| 817 | $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)"; | 801 | $p[0] = "MATCH(DDCT.body) AGAINST (? $boolean_mode)"; |
| 818 | - | ||
| 819 | - | ||
| 820 | - | ||
| 821 | - if ($RESTRICTING_SEARCH) { | ||
| 822 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | ||
| 823 | - $temp = $q_set; | ||
| 824 | - foreach ($temp as $k => $v) { | ||
| 825 | - $t = array(); | ||
| 826 | - foreach ($v as $part) { | ||
| 827 | - $t[] = sprintf('+"%s"', $part); | ||
| 828 | - } | ||
| 829 | - $q_set[$k] = join(' ', $t); | ||
| 830 | - } | ||
| 831 | - $p[1] = implode(' ',$q_set); | ||
| 832 | - } else { | ||
| 833 | - $p[1] = $aRequest[$this->getWidgetBase()]; | ||
| 834 | - } | 802 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); |
| 835 | 803 | ||
| 836 | // handle the boolean "not" stuff. | 804 | // handle the boolean "not" stuff. |
| 837 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); | 805 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| @@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion { | @@ -891,23 +859,7 @@ class SearchableTextCriterion extends BrowseCriterion { | ||
| 891 | 859 | ||
| 892 | $p = array(); | 860 | $p = array(); |
| 893 | $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)"; | 861 | $p[0] = "MATCH(DST.document_text) AGAINST (? $boolean_mode)"; |
| 894 | - | ||
| 895 | - | ||
| 896 | - | ||
| 897 | - if ($RESTRICTING_SEARCH) { | ||
| 898 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | ||
| 899 | - $temp = $q_set; | ||
| 900 | - foreach ($temp as $k => $v) { | ||
| 901 | - $t = array(); | ||
| 902 | - foreach ($v as $part) { | ||
| 903 | - $t[] = sprintf('+"%s"', $part); | ||
| 904 | - } | ||
| 905 | - $q_set[$k] = join(' ', $t); | ||
| 906 | - } | ||
| 907 | - $p[1] = implode(' ',$q_set); | ||
| 908 | - } else { | ||
| 909 | - $p[1] = $aRequest[$this->getWidgetBase()]; | ||
| 910 | - } | 862 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); |
| 911 | 863 | ||
| 912 | // handle the boolean "not" stuff. | 864 | // handle the boolean "not" stuff. |
| 913 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); | 865 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
| @@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion { | @@ -956,23 +908,7 @@ class TransactionTextCriterion extends BrowseCriterion { | ||
| 956 | 908 | ||
| 957 | $p = array(); | 909 | $p = array(); |
| 958 | $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)"; | 910 | $p[0] = "MATCH(DTT.document_text) AGAINST (? $boolean_mode)"; |
| 959 | - | ||
| 960 | - | ||
| 961 | - | ||
| 962 | - if ($RESTRICTING_SEARCH) { | ||
| 963 | - $q_set = KTUtil::phraseSplit($aRequest[$this->getWidgetBase()]); | ||
| 964 | - $temp = $q_set; | ||
| 965 | - foreach ($temp as $k => $v) { | ||
| 966 | - $t = array(); | ||
| 967 | - foreach ($v as $part) { | ||
| 968 | - $t[] = sprintf('+"%s"', $part); | ||
| 969 | - } | ||
| 970 | - $q_set[$k] = join(' ', $t); | ||
| 971 | - } | ||
| 972 | - $p[1] = implode(' ',$q_set); | ||
| 973 | - } else { | ||
| 974 | - $p[1] = $aRequest[$this->getWidgetBase()]; | ||
| 975 | - } | 911 | + $p[1] = KTUtil::phraseQuote($aRequest[$this->getWidgetBase()]); |
| 976 | 912 | ||
| 977 | // handle the boolean "not" stuff. | 913 | // handle the boolean "not" stuff. |
| 978 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); | 914 | $want_invert = KTUtil::arrayGet($aRequest, $this->getWidgetBase() . '_not'); |
lib/util/KTStopwords.php
0 → 100644
| 1 | +<?php | ||
| 2 | + | ||
| 3 | +/** | ||
| 4 | + * $Id: config.inc.php 5758 2006-07-27 10:17:43Z bshuttle $ | ||
| 5 | + * | ||
| 6 | + * The contents of this file are subject to the KnowledgeTree Public | ||
| 7 | + * License Version 1.1 ("License"); You may not use this file except in | ||
| 8 | + * compliance with the License. You may obtain a copy of the License at | ||
| 9 | + * http://www.ktdms.com/KPL | ||
| 10 | + * | ||
| 11 | + * Software distributed under the License is distributed on an "AS IS" | ||
| 12 | + * basis, | ||
| 13 | + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | ||
| 14 | + * for the specific language governing rights and limitations under the | ||
| 15 | + * License. | ||
| 16 | + * | ||
| 17 | + * The Original Code is: KnowledgeTree Open Source | ||
| 18 | + * | ||
| 19 | + * The Initial Developer of the Original Code is The Jam Warehouse Software | ||
| 20 | + * (Pty) Ltd, trading as KnowledgeTree. | ||
| 21 | + * Portions created by The Jam Warehouse Software (Pty) Ltd are Copyright | ||
| 22 | + * (C) 2006 The Jam Warehouse Software (Pty) Ltd; | ||
| 23 | + * All Rights Reserved. | ||
| 24 | + * | ||
| 25 | + */ | ||
| 26 | + | ||
| 27 | +class KTStopwords { | ||
| 28 | + var $words = array(); | ||
| 29 | + | ||
| 30 | + var $conf = array(); | ||
| 31 | + var $aSectionFile; | ||
| 32 | + var $aFileRoot; | ||
| 33 | + var $flat = array(); | ||
| 34 | + var $flatns = array(); | ||
| 35 | + | ||
| 36 | + function loadCache($filename) { | ||
| 37 | + $cache_str = file_get_contents($filename); | ||
| 38 | + $this->words = unserialize($cache_str); | ||
| 39 | + return true; | ||
| 40 | + } | ||
| 41 | + | ||
| 42 | + function createCache($filename) { | ||
| 43 | + file_put_contents($filename, serialize($this->words)); | ||
| 44 | + } | ||
| 45 | + | ||
| 46 | + function loadFile($filename) { | ||
| 47 | + $this->words = array(); | ||
| 48 | + foreach(file($filename) as $line) { | ||
| 49 | + $this->words[] = trim($line); | ||
| 50 | + } | ||
| 51 | + } | ||
| 52 | + | ||
| 53 | + function isStopword($sWord) { | ||
| 54 | + return in_array($sWord, $this->words); | ||
| 55 | + } | ||
| 56 | + | ||
| 57 | + function &getSingleton() { | ||
| 58 | + if (!KTUtil::arrayGet($GLOBALS, 'KTStopwords')) { | ||
| 59 | + $GLOBALS['KTStopwords'] =& new KTStopwords; | ||
| 60 | + $oConfig = KTConfig::getSingleton(); | ||
| 61 | + $GLOBALS['KTStopwords']->loadFile($oConfig->get('stopwordsFile')); | ||
| 62 | + } | ||
| 63 | + return $GLOBALS['KTStopwords']; | ||
| 64 | + } | ||
| 65 | +} | ||
| 66 | + | ||
| 67 | + | ||
| 68 | +?> |
lib/util/ktutil.inc
| @@ -28,6 +28,8 @@ | @@ -28,6 +28,8 @@ | ||
| 28 | * @author Neil Blakey-Milner <nbm@jamwarehouse.com>, Jam Warehouse (Pty) Ltd, South Africa | 28 | * @author Neil Blakey-Milner <nbm@jamwarehouse.com>, Jam Warehouse (Pty) Ltd, South Africa |
| 29 | */ | 29 | */ |
| 30 | 30 | ||
| 31 | +require_once(KT_LIB_DIR . '/util/KTStopwords.php'); | ||
| 32 | + | ||
| 31 | class KTUtil { | 33 | class KTUtil { |
| 32 | function extractGPC () { | 34 | function extractGPC () { |
| 33 | foreach (func_get_args() as $var) { | 35 | foreach (func_get_args() as $var) { |
| @@ -550,7 +552,16 @@ class KTUtil { | @@ -550,7 +552,16 @@ class KTUtil { | ||
| 550 | return ((float) $microtime_simple[1] + (float) $microtime_simple[0]); | 552 | return ((float) $microtime_simple[1] + (float) $microtime_simple[0]); |
| 551 | } | 553 | } |
| 552 | 554 | ||
| 553 | - function phraseSplit($sSearchString) { | 555 | + function phraseSplit($sSearchString) { |
| 556 | + // this should probably be moved to a DBUtil method | ||
| 557 | + | ||
| 558 | + $sMinWord = DBUtil::getOneResultKey("SHOW VARIABLES LIKE 'ft_min_word_len'", "Value"); | ||
| 559 | + if(is_numeric($sMinWord)) { | ||
| 560 | + $iMinWord = (int)$sMinWord; | ||
| 561 | + } else { | ||
| 562 | + $iMinWord = 4; | ||
| 563 | + } | ||
| 564 | + | ||
| 554 | $a = preg_split('#"#', $sSearchString); | 565 | $a = preg_split('#"#', $sSearchString); |
| 555 | $i = 0; | 566 | $i = 0; |
| 556 | $phrases = array(); | 567 | $phrases = array(); |
| @@ -564,20 +575,33 @@ class KTUtil { | @@ -564,20 +575,33 @@ class KTUtil { | ||
| 564 | $i += 1; | 575 | $i += 1; |
| 565 | } | 576 | } |
| 566 | 577 | ||
| 578 | + $oStopwords =& KTStopwords::getSingleton(); | ||
| 579 | + | ||
| 567 | $words = array(); | 580 | $words = array(); |
| 568 | foreach ($word_parts as $part) { | 581 | foreach ($word_parts as $part) { |
| 569 | $w = (array) explode(' ', $part); | 582 | $w = (array) explode(' ', $part); |
| 570 | - foreach ($w as $potential) { if (!empty($potential)) { $words[] = $potential; }} | ||
| 571 | - } | ||
| 572 | - | ||
| 573 | - // XXX: filter each subword (including in phrases) to remove whitespace-broken items and sub-4 character words. | ||
| 574 | - | 583 | + foreach ($w as $potential) { |
| 584 | + if (strlen($potential) >= $iMinWord && !$oStopwords->isStopword($potential)) { | ||
| 585 | + $words[] = $potential; | ||
| 586 | + } | ||
| 587 | + } | ||
| 588 | + } | ||
| 575 | 589 | ||
| 576 | return array( | 590 | return array( |
| 577 | 'words' => $words, | 591 | 'words' => $words, |
| 578 | 'phrases' => $phrases, | 592 | 'phrases' => $phrases, |
| 579 | ); | 593 | ); |
| 594 | + } | ||
| 580 | 595 | ||
| 596 | + function phraseQuote($sQuery) { | ||
| 597 | + foreach(KTUtil::phraseSplit($sQuery) as $k => $v) { | ||
| 598 | + $t = array(); | ||
| 599 | + foreach ($v as $part) { | ||
| 600 | + $t[] = sprintf('+"%s"', $part); | ||
| 601 | + } | ||
| 602 | + $q_set[$k] = join(' ', $t); | ||
| 603 | + } | ||
| 604 | + return implode(' ',$q_set); | ||
| 581 | } | 605 | } |
| 582 | 606 | ||
| 583 | function running_user() { | 607 | function running_user() { |