Analyzer.php
4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_StopWords */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/StopWords.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_ShortWords */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/ShortWords.php';
/**
* An Analyzer is used to analyze text.
* It thus represents a policy for extracting index terms from text.
*
* Note:
* The Lucene Java implementation is stream-oriented, which lets it work
* efficiently with huge documents (more than 20 MB).
* The engine itself, however, is not geared towards such documents.
* Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays).
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2007 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Analyzer instance handed out by getDefault().
     * Stays unset until setDefault() is called or getDefault() creates one.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    private static $_defaultImpl;

    /**
     * Text currently being tokenized (null until setInput() is called).
     *
     * @var string
     */
    protected $_input = null;

    /**
     * Encoding of the current input string, exactly as passed to setInput().
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize a whole string in one call.
     *
     * Feeds $data to setInput() and then drains nextToken() until it
     * returns null, collecting every token in order.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data; stored unchanged for the
     *                         concrete analyzer to interpret
     * @return array List of Zend_Search_Lucene_Analysis_Token objects
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $tokens = array();
        // Strict comparison: only a real null marks the end of the stream.
        for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Tokenization stream API
     * Store the input string and its encoding, then reset the token stream.
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data
     */
    public function setInput($data, $encoding = '')
    {
        $this->_encoding = $encoding;
        $this->_input    = $data;

        // reset() runs last so that it sees the new input and encoding.
        $this->reset();
    }

    /**
     * Tokenization stream API
     * Reset the token stream; called by setInput() after new input is stored.
     */
    abstract public function reset();

    /**
     * Tokenization stream API
     * Get the next token.
     * Returns null at the end of the stream.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();

    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }

    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When no analyzer has been registered yet, a
     * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
     * instance is created, remembered and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        if (self::$_defaultImpl instanceof self) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}