Version Description
Download this release
Release Info
Developer | RobMarsh |
Plugin | ![]() |
Version | 2.6.0.0 |
Comparing to | |
See all releases |
Version 2.6.0.0
- languages/de/stemmer.php +315 -0
- languages/de/stemmer.php.bak +315 -0
- languages/de/stopwords.php +4 -0
- languages/en/stemmer.php +335 -0
- languages/en/stopwords.php +4 -0
- languages/es/stemmer.php +381 -0
- languages/es/stopwords.php +4 -0
- languages/fr/stemmer.php +513 -0
- languages/fr/stopwords.php +4 -0
- languages/it/stemmer.php +341 -0
- languages/it/stopwords.php +4 -0
- readme.txt +114 -0
- similar-posts-admin.php +702 -0
- similar-posts.php +584 -0
languages/de/stemmer.php
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
/*
|
3 |
+
Adapted from a drupal module -- see details below
|
4 |
+
*/
|
5 |
+
|
6 |
+
/*
|
7 |
+
Content:
|
8 |
+
Drupal module to improve searching in german texts (Porter stemmer)
|
9 |
+
Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
|
10 |
+
Author:
|
11 |
+
Reiner Miericke 10.10.2007
|
12 |
+
References:
|
13 |
+
Algorithm:
|
14 |
+
http://www.clef-campaign.org/workshop2002/WN/3.pdf
|
15 |
+
http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
|
16 |
+
http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
|
17 |
+
http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
|
18 |
+
http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
|
19 |
+
For lists of stopwords see
|
20 |
+
http://members.unine.ch/jacques.savoy/clef/index.html
|
21 |
+
Small parts were stolen from dutchstemmer.module
|
22 |
+
*/
|
23 |
+
|
24 |
+
|
25 |
+
define("DE_STEMMER_VOKALE", "aeiouyäöü");
|
26 |
+
|
27 |
+
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
|
28 |
+
mb_internal_encoding($enc);
|
29 |
+
|
30 |
+
function _de_stemmer_split_text(&$text) {
|
31 |
+
// Split words from noise
|
32 |
+
return preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_NO_EMPTY);
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
/**
|
37 |
+
* Implementation of hook_search_preprocess
|
38 |
+
*/
|
39 |
+
function de_stemmer_search_preprocess(&$text) {
|
40 |
+
// Split words from noise and remove apostrophes
|
41 |
+
$words = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
|
42 |
+
|
43 |
+
// Process each word
|
44 |
+
$odd = true;
|
45 |
+
foreach ($words as $k => $word) {
|
46 |
+
if ($odd) {
|
47 |
+
$words[$k] = _de_stemmer_wortstamm($word);
|
48 |
+
}
|
49 |
+
$odd = !$odd;
|
50 |
+
}
|
51 |
+
|
52 |
+
// Put it all back together
|
53 |
+
return implode('', $words);
|
54 |
+
|
55 |
+
/* alte Version
|
56 |
+
$words = _de_stemmer_split_text($text);
|
57 |
+
|
58 |
+
// Process each word
|
59 |
+
foreach ($words as $k => $word) {
|
60 |
+
if (!_de_stemmer_stoppwort(strtolower($word))) {
|
61 |
+
$words[$k] = _de_stemmer_wortstamm($word);
|
62 |
+
}
|
63 |
+
}
|
64 |
+
|
65 |
+
// Put it all back together
|
66 |
+
return implode(' ', $words);
|
67 |
+
*/
|
68 |
+
}
|
69 |
+
|
70 |
+
|
71 |
+
/**
|
72 |
+
* Implementation of hook_help().
|
73 |
+
*/
|
74 |
+
function de_stemmer_help($section = 'admin/help#search') {
|
75 |
+
switch ($section) {
|
76 |
+
case 'admin/modules#description':
|
77 |
+
return t('Implements a German stemming algorithm (Porter) to improve searching.');
|
78 |
+
}
|
79 |
+
}
|
80 |
+
|
81 |
+
|
82 |
+
/*
|
83 |
+
* Function gets as text (parameter) and splits the text into words.
|
84 |
+
* Then each word is stemmed and the word together with its stem is
|
85 |
+
* stored in an array (hash).
|
86 |
+
* As a result the hash is returned and can be used as a lookup table
|
87 |
+
* to identify words which transform to the same stem.
|
88 |
+
* For details please compare 'search.module-stem.patch'
|
89 |
+
*/
|
90 |
+
function de_stemmer_stem_list($text) {
|
91 |
+
// Split words from noise and remove apostrophes
|
92 |
+
$words = _de_stemmer_split_text($text);
|
93 |
+
|
94 |
+
$stem_list = array();
|
95 |
+
foreach ($words as $word) {
|
96 |
+
$stem_list[$word] = _de_stemmer_wortstamm($word);
|
97 |
+
}
|
98 |
+
return $stem_list;
|
99 |
+
}
|
100 |
+
|
101 |
+
|
102 |
+
function _de_stemmer_region_n($wort) {
|
103 |
+
$r = strcspn($wort, DE_STEMMER_VOKALE);
|
104 |
+
return $r + strspn($wort, DE_STEMMER_VOKALE, $r) + 1;
|
105 |
+
}
|
106 |
+
|
107 |
+
function de_stemmer_preprocess($wort) {
|
108 |
+
$wort = mb_strtolower($wort);
|
109 |
+
$wort = str_replace("ß", "ss", $wort);
|
110 |
+
// replace ß by ss, and put u and y between vowels into upper case
|
111 |
+
|
112 |
+
$wort = preg_replace( array( '/ß/',
|
113 |
+
'/(?<=['. DE_STEMMER_VOKALE .'])u(?=['. DE_STEMMER_VOKALE .'])/u',
|
114 |
+
'/(?<=['. DE_STEMMER_VOKALE .'])y(?=['. DE_STEMMER_VOKALE .'])/u'
|
115 |
+
),
|
116 |
+
array( 'ss', 'U', 'Y' ),
|
117 |
+
$wort
|
118 |
+
);
|
119 |
+
return $wort;
|
120 |
+
}
|
121 |
+
|
122 |
+
|
123 |
+
function _de_stemmer_postprocess($wort) {
|
124 |
+
$wort = mb_strtolower($wort);
|
125 |
+
|
126 |
+
if (!_de_stemmer_ausnahme($wort)) // check for exceptions
|
127 |
+
{
|
128 |
+
$wort = strtr($wort, array('ä' => 'a', 'á' => 'a',
|
129 |
+
'ë' => 'e', 'é' => 'e',
|
130 |
+
'ï' => 'i', 'í' => 'i',
|
131 |
+
'ö' => 'o', 'ó' => 'o',
|
132 |
+
'ü' => "u", 'ú' => 'u'
|
133 |
+
));
|
134 |
+
}
|
135 |
+
return $wort;
|
136 |
+
}
|
137 |
+
|
138 |
+
|
139 |
+
function _de_stemmer_wortstamm($wort) {
|
140 |
+
$stamm = de_stemmer_preprocess($wort);
|
141 |
+
|
142 |
+
/*
|
143 |
+
* R1 is the region after the first non-vowel following a vowel,
|
144 |
+
or is the null region at the end of the word if there is no such non-vowel.
|
145 |
+
* R2 is the region after the first non-vowel following a vowel in R1,
|
146 |
+
or is the null region at the end of the word if there is no such non-vowel.
|
147 |
+
*/
|
148 |
+
|
149 |
+
$l = strlen($stamm);
|
150 |
+
$r1 = _de_stemmer_region_n($stamm);
|
151 |
+
$r2 = $r1 == $l ? $r1 : $r1 + _de_stemmer_region_n(mb_substr($stamm, $r1));
|
152 |
+
// unsure about interpreting the following rule:
|
153 |
+
// "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
|
154 |
+
if ($r1 < 3) {
|
155 |
+
$r1 = 3;
|
156 |
+
}
|
157 |
+
|
158 |
+
/* Step 1
|
159 |
+
Search for the longest among the following suffixes,
|
160 |
+
(a) e em en ern er es
|
161 |
+
(b) s (preceded by a valid s-ending)
|
162 |
+
and delete if in R1.
|
163 |
+
(Of course the letter of the valid s-ending is not necessarily in R1)
|
164 |
+
*/
|
165 |
+
|
166 |
+
if (preg_match('/(e|em|en|ern|er|es)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
167 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
168 |
+
}
|
169 |
+
elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
170 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
171 |
+
}
|
172 |
+
|
173 |
+
|
174 |
+
/*
|
175 |
+
Step 2
|
176 |
+
Search for the longest among the following suffixes,
|
177 |
+
(a) en er est
|
178 |
+
(b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
|
179 |
+
and delete if in R1.
|
180 |
+
*/
|
181 |
+
|
182 |
+
if (preg_match('/(en|er|est)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
183 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
184 |
+
}
|
185 |
+
elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
186 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
187 |
+
}
|
188 |
+
|
189 |
+
|
190 |
+
/*
|
191 |
+
Step 3: d-suffixes ( see http://snowball.tartarus.org/texts/glossary.html )
|
192 |
+
Search for the longest among the following suffixes, and perform the action indicated.
|
193 |
+
end ung
|
194 |
+
delete if in R2
|
195 |
+
if preceded by ig, delete if in R2 and not preceded by e
|
196 |
+
ig ik isch
|
197 |
+
delete if in R2 and not preceded by e
|
198 |
+
lich heit
|
199 |
+
delete if in R2
|
200 |
+
if preceded by er or en, delete if in R1
|
201 |
+
keit
|
202 |
+
delete if in R2
|
203 |
+
if preceded by lich or ig, delete if in R2
|
204 |
+
^ means R1 ?
|
205 |
+
*/
|
206 |
+
|
207 |
+
if (preg_match('/(?<=eig)(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
208 |
+
;
|
209 |
+
}
|
210 |
+
elseif (preg_match('/(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
211 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
212 |
+
}
|
213 |
+
elseif (preg_match('/(?<![e])(ig|ik|isch)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
214 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
215 |
+
}
|
216 |
+
elseif (preg_match('/(?<=(er|en))(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
217 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
218 |
+
}
|
219 |
+
elseif (preg_match('/(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
220 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
221 |
+
}
|
222 |
+
elseif (preg_match('/(?<=lich)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
223 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
224 |
+
}
|
225 |
+
elseif (preg_match('/(?<=ig)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
226 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
227 |
+
}
|
228 |
+
elseif (preg_match('/keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
229 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
230 |
+
}
|
231 |
+
|
232 |
+
|
233 |
+
/* Was ist mit
|
234 |
+
chen, lein, bar, schaft, ... ?
|
235 |
+
*/
|
236 |
+
return _de_stemmer_postprocess($stamm);
|
237 |
+
}
|
238 |
+
|
239 |
+
|
240 |
+
function _de_stemmer_stoppwort($wort) {
|
241 |
+
|
242 |
+
static $stoppworte = array(
|
243 |
+
'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
|
244 |
+
'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
|
245 |
+
'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
|
246 |
+
'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
|
247 |
+
'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
|
248 |
+
'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
|
249 |
+
'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
|
250 |
+
'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
|
251 |
+
'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
|
252 |
+
'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
|
253 |
+
'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
|
254 |
+
'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
|
255 |
+
'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
|
256 |
+
'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
|
257 |
+
'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
|
258 |
+
'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
|
259 |
+
'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
|
260 |
+
'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
|
261 |
+
'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
|
262 |
+
'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
|
263 |
+
'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
|
264 |
+
);
|
265 |
+
|
266 |
+
return in_array($wort, $stoppworte);
|
267 |
+
}
|
268 |
+
|
269 |
+
|
270 |
+
/*
|
271 |
+
first try to set up a list of exceptions
|
272 |
+
*/
|
273 |
+
function _de_stemmer_ausnahme(&$wort)
|
274 |
+
{ static $de_stemmer_ausnahmen = array (
|
275 |
+
'schön' => 'schön', // !schon
|
276 |
+
'blüt' => 'blüt', // Blüte (NICHT Blut)
|
277 |
+
'kannt' => 'kenn',
|
278 |
+
'küch' => 'küch', // Küchen (NICHT Kuchen)
|
279 |
+
'mög' => 'mög',
|
280 |
+
'mocht' => 'mög',
|
281 |
+
'mag' => 'mög',
|
282 |
+
'ging' => 'geh',
|
283 |
+
'lief' => 'lauf',
|
284 |
+
'änd' => 'änd' // ändern (NICHT andern)
|
285 |
+
);
|
286 |
+
|
287 |
+
//return FALSE;
|
288 |
+
if ( array_key_exists($wort, $de_stemmer_ausnahmen) )
|
289 |
+
{ $wort = $de_stemmer_ausnahmen[$wort];
|
290 |
+
return TRUE;
|
291 |
+
}
|
292 |
+
else
|
293 |
+
return FALSE;
|
294 |
+
}
|
295 |
+
|
296 |
+
/*
|
297 |
+
Stem caching added by Rob Marsh, SJ
|
298 |
+
http://rmarsh.com
|
299 |
+
*/
|
300 |
+
|
301 |
+
$StemCache = array();
|
302 |
+
|
303 |
+
function stem($word) {
|
304 |
+
global $StemCache;
|
305 |
+
if (!isset($StemCache[$word])) {
|
306 |
+
$stemmedword = _de_stemmer_wortstamm($word);
|
307 |
+
$StemCache[$word] = $stemmedword;
|
308 |
+
}
|
309 |
+
else {
|
310 |
+
$stemmedword = $StemCache[$word] ;
|
311 |
+
}
|
312 |
+
return $stemmedword;
|
313 |
+
}
|
314 |
+
|
315 |
+
?>
|
languages/de/stemmer.php.bak
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
/*
|
3 |
+
Adapted from a drupal module -- see details below
|
4 |
+
*/
|
5 |
+
|
6 |
+
/*
|
7 |
+
Content:
|
8 |
+
Drupal module to improve searching in german texts (Porter stemmer)
|
9 |
+
Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
|
10 |
+
Author:
|
11 |
+
Reiner Miericke 10.10.2007
|
12 |
+
References:
|
13 |
+
Algorithm:
|
14 |
+
http://www.clef-campaign.org/workshop2002/WN/3.pdf
|
15 |
+
http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
|
16 |
+
http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
|
17 |
+
http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
|
18 |
+
http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
|
19 |
+
For lists of stopwords see
|
20 |
+
http://members.unine.ch/jacques.savoy/clef/index.html
|
21 |
+
Small parts were stolen from dutchstemmer.module
|
22 |
+
*/
|
23 |
+
|
24 |
+
|
25 |
+
define("DE_STEMMER_VOKALE", "aeiouyäöü");
|
26 |
+
|
27 |
+
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
|
28 |
+
mb_internal_encoding($enc);
|
29 |
+
|
30 |
+
function _de_stemmer_split_text(&$text) {
|
31 |
+
// Split words from noise
|
32 |
+
return preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_NO_EMPTY);
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
/**
|
37 |
+
* Implementation of hook_search_preprocess
|
38 |
+
*/
|
39 |
+
function de_stemmer_search_preprocess(&$text) {
|
40 |
+
// Split words from noise and remove apostrophes
|
41 |
+
$words = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
|
42 |
+
|
43 |
+
// Process each word
|
44 |
+
$odd = true;
|
45 |
+
foreach ($words as $k => $word) {
|
46 |
+
if ($odd) {
|
47 |
+
$words[$k] = _de_stemmer_wortstamm($word);
|
48 |
+
}
|
49 |
+
$odd = !$odd;
|
50 |
+
}
|
51 |
+
|
52 |
+
// Put it all back together
|
53 |
+
return implode('', $words);
|
54 |
+
|
55 |
+
/* alte Version
|
56 |
+
$words = _de_stemmer_split_text($text);
|
57 |
+
|
58 |
+
// Process each word
|
59 |
+
foreach ($words as $k => $word) {
|
60 |
+
if (!_de_stemmer_stoppwort(strtolower($word))) {
|
61 |
+
$words[$k] = _de_stemmer_wortstamm($word);
|
62 |
+
}
|
63 |
+
}
|
64 |
+
|
65 |
+
// Put it all back together
|
66 |
+
return implode(' ', $words);
|
67 |
+
*/
|
68 |
+
}
|
69 |
+
|
70 |
+
|
71 |
+
/**
|
72 |
+
* Implementation of hook_help().
|
73 |
+
*/
|
74 |
+
function de_stemmer_help($section = 'admin/help#search') {
|
75 |
+
switch ($section) {
|
76 |
+
case 'admin/modules#description':
|
77 |
+
return t('Implements a German stemming algorithm (Porter) to improve searching.');
|
78 |
+
}
|
79 |
+
}
|
80 |
+
|
81 |
+
|
82 |
+
/*
|
83 |
+
* Function gets as text (parameter) and splits the text into words.
|
84 |
+
* Then each word is stemmed and the word together with its stem is
|
85 |
+
* stored in an array (hash).
|
86 |
+
* As a result the hash is returned and can be used as a lookup table
|
87 |
+
* to identify words which transform to the same stem.
|
88 |
+
* For details please compare 'search.module-stem.patch'
|
89 |
+
*/
|
90 |
+
function de_stemmer_stem_list($text) {
|
91 |
+
// Split words from noise and remove apostrophes
|
92 |
+
$words = _de_stemmer_split_text($text);
|
93 |
+
|
94 |
+
$stem_list = array();
|
95 |
+
foreach ($words as $word) {
|
96 |
+
$stem_list[$word] = _de_stemmer_wortstamm($word);
|
97 |
+
}
|
98 |
+
return $stem_list;
|
99 |
+
}
|
100 |
+
|
101 |
+
|
102 |
+
function _de_stemmer_region_n($wort) {
|
103 |
+
$r = strcspn($wort, DE_STEMMER_VOKALE);
|
104 |
+
return $r + strspn($wort, DE_STEMMER_VOKALE, $r) + 1;
|
105 |
+
}
|
106 |
+
|
107 |
+
function de_stemmer_preprocess($wort) {
|
108 |
+
$wort = mb_strtolower($wort);
|
109 |
+
$wort = str_replace("ß", "ss", $wort);
|
110 |
+
// replace ß by ss, and put u and y between vowels into upper case
|
111 |
+
|
112 |
+
$wort = preg_replace( array( '/ß/',
|
113 |
+
'/(?<=['. DE_STEMMER_VOKALE .'])u(?=['. DE_STEMMER_VOKALE .'])/u',
|
114 |
+
'/(?<=['. DE_STEMMER_VOKALE .'])y(?=['. DE_STEMMER_VOKALE .'])/u'
|
115 |
+
),
|
116 |
+
array( 'ss', 'U', 'Y' ),
|
117 |
+
$wort
|
118 |
+
);
|
119 |
+
return $wort;
|
120 |
+
}
|
121 |
+
|
122 |
+
|
123 |
+
function _de_stemmer_postprocess($wort) {
|
124 |
+
$wort = mb_strtolower($wort);
|
125 |
+
|
126 |
+
if (!_de_stemmer_ausnahme($wort)) // check for exceptions
|
127 |
+
{
|
128 |
+
$wort = strtr($wort, array('ä' => 'a', 'á' => 'a',
|
129 |
+
'ë' => 'e', 'é' => 'e',
|
130 |
+
'ï' => 'i', 'í' => 'i',
|
131 |
+
'ö' => 'o', 'ó' => 'o',
|
132 |
+
'ü' => "u", 'ú' => 'u'
|
133 |
+
));
|
134 |
+
}
|
135 |
+
return $wort;
|
136 |
+
}
|
137 |
+
|
138 |
+
|
139 |
+
function _de_stemmer_wortstamm($wort) {
|
140 |
+
$stamm = de_stemmer_preprocess($wort);
|
141 |
+
|
142 |
+
/*
|
143 |
+
* R1 is the region after the first non-vowel following a vowel,
|
144 |
+
or is the null region at the end of the word if there is no such non-vowel.
|
145 |
+
* R2 is the region after the first non-vowel following a vowel in R1,
|
146 |
+
or is the null region at the end of the word if there is no such non-vowel.
|
147 |
+
*/
|
148 |
+
|
149 |
+
$l = strlen($stamm);
|
150 |
+
$r1 = _de_stemmer_region_n($stamm);
|
151 |
+
$r2 = $r1 == $l ? $r1 : $r1 + _de_stemmer_region_n(mb_substr($stamm, $r1));
|
152 |
+
// unsure about interpreting the following rule:
|
153 |
+
// "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
|
154 |
+
if ($r1 < 3) {
|
155 |
+
$r1 = 3;
|
156 |
+
}
|
157 |
+
|
158 |
+
/* Step 1
|
159 |
+
Search for the longest among the following suffixes,
|
160 |
+
(a) e em en ern er es
|
161 |
+
(b) s (preceded by a valid s-ending)
|
162 |
+
and delete if in R1.
|
163 |
+
(Of course the letter of the valid s-ending is not necessarily in R1)
|
164 |
+
*/
|
165 |
+
|
166 |
+
if (preg_match('/(e|em|en|ern|er|es)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
167 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
168 |
+
}
|
169 |
+
elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
170 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
171 |
+
}
|
172 |
+
|
173 |
+
|
174 |
+
/*
|
175 |
+
Step 2
|
176 |
+
Search for the longest among the following suffixes,
|
177 |
+
(a) en er est
|
178 |
+
(b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
|
179 |
+
and delete if in R1.
|
180 |
+
*/
|
181 |
+
|
182 |
+
if (preg_match('/(en|er|est)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
183 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
184 |
+
}
|
185 |
+
elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
186 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
187 |
+
}
|
188 |
+
|
189 |
+
|
190 |
+
/*
|
191 |
+
Step 3: d-suffixes ( see http://snowball.tartarus.org/texts/glossary.html )
|
192 |
+
Search for the longest among the following suffixes, and perform the action indicated.
|
193 |
+
end ung
|
194 |
+
delete if in R2
|
195 |
+
if preceded by ig, delete if in R2 and not preceded by e
|
196 |
+
ig ik isch
|
197 |
+
delete if in R2 and not preceded by e
|
198 |
+
lich heit
|
199 |
+
delete if in R2
|
200 |
+
if preceded by er or en, delete if in R1
|
201 |
+
keit
|
202 |
+
delete if in R2
|
203 |
+
if preceded by lich or ig, delete if in R2
|
204 |
+
^ means R1 ?
|
205 |
+
*/
|
206 |
+
|
207 |
+
if (preg_match('/(?<=eig)(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
208 |
+
;
|
209 |
+
}
|
210 |
+
elseif (preg_match('/(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
211 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
212 |
+
}
|
213 |
+
elseif (preg_match('/(?<![e])(ig|ik|isch)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
214 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
215 |
+
}
|
216 |
+
elseif (preg_match('/(?<=(er|en))(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
217 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
218 |
+
}
|
219 |
+
elseif (preg_match('/(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
220 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
221 |
+
}
|
222 |
+
elseif (preg_match('/(?<=lich)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
223 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
224 |
+
}
|
225 |
+
elseif (preg_match('/(?<=ig)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
|
226 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
227 |
+
}
|
228 |
+
elseif (preg_match('/keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
|
229 |
+
$stamm = mb_substr($stamm, 0, $hits[0][1]);
|
230 |
+
}
|
231 |
+
|
232 |
+
|
233 |
+
/* Was ist mit
|
234 |
+
chen, lein, bar, schaft, ... ?
|
235 |
+
*/
|
236 |
+
return _de_stemmer_postprocess($stamm);
|
237 |
+
}
|
238 |
+
|
239 |
+
|
240 |
+
function _de_stemmer_stoppwort($wort) {
|
241 |
+
|
242 |
+
static $stoppworte = array(
|
243 |
+
'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
|
244 |
+
'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
|
245 |
+
'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
|
246 |
+
'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
|
247 |
+
'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
|
248 |
+
'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
|
249 |
+
'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
|
250 |
+
'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
|
251 |
+
'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
|
252 |
+
'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
|
253 |
+
'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
|
254 |
+
'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
|
255 |
+
'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
|
256 |
+
'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
|
257 |
+
'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
|
258 |
+
'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
|
259 |
+
'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
|
260 |
+
'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
|
261 |
+
'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
|
262 |
+
'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
|
263 |
+
'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
|
264 |
+
);
|
265 |
+
|
266 |
+
return in_array($wort, $stoppworte);
|
267 |
+
}
|
268 |
+
|
269 |
+
|
270 |
+
/*
|
271 |
+
first try to set up a list of exceptions
|
272 |
+
*/
|
273 |
+
/**
 * Replace a word by its hand-maintained exception stem, if one exists.
 *
 * The lookup table maps a word form to the stem that should be used for it.
 * Several entries map a form to itself (e.g. 'schön'); per the inline notes
 * these keep umlauted forms apart from their unaccented look-alikes.
 *
 * @param string $wort Word to look up; overwritten in place with the mapped
 *                     stem when the word is a key of the table.
 * @return bool TRUE when $wort was found (and replaced), FALSE otherwise.
 */
function _de_stemmer_ausnahme(&$wort)
{
    static $de_stemmer_ausnahmen = array (
        'schön' => 'schön', // not "schon"
        'blüt' => 'blüt', // Blüte (NOT Blut)
        'kannt' => 'kenn',
        'küch' => 'küch', // Küchen (NOT Kuchen)
        'mög' => 'mög',
        'mocht' => 'mög',
        'mag' => 'mög',
        'ging' => 'geh',
        'lief' => 'lauf',
        'änd' => 'änd' // ändern (NOT andern)
    );

    // Unknown word: leave $wort untouched and report "no exception applied".
    if ( !array_key_exists($wort, $de_stemmer_ausnahmen) )
    {
        return FALSE;
    }

    $wort = $de_stemmer_ausnahmen[$wort];
    return TRUE;
}
|
295 |
+
|
296 |
+
/*
|
297 |
+
Stem caching added by Rob Marsh, SJ
|
298 |
+
http://rmarsh.com
|
299 |
+
*/
|
300 |
+
|
301 |
+
// Per-request memo of word => stem, shared with stem() through `global`.
$StemCache = array();

/**
 * Return the stem of $word, computing it at most once per request.
 *
 * Results of _de_stemmer_wortstamm() are stored in the global $StemCache and
 * served from there on later calls with the same word.
 *
 * @param string $word Word to stem.
 * @return mixed Whatever _de_stemmer_wortstamm() produced for $word.
 */
function stem($word) {
	global $StemCache;

	// Cache hit: hand back the stored stem without re-running the stemmer.
	if (isset($StemCache[$word])) {
		return $StemCache[$word];
	}

	$computed = _de_stemmer_wortstamm($word);
	$StemCache[$word] = $computed;
	return $computed;
}
|
314 |
+
|
315 |
+
?>
|
languages/de/stopwords.php
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
// the list of common words we want to ignore. NB anything shorter than 4 characters is knocked by the plugin and doesn't need to figure here
|
3 |
+
// NOTE(review): the umlauted entries are written as UTF-8 here (as in the German stemmer's own literals); confirm this matches the encoding of the text the plugin compares against.
$overusedwords = array("aber", "alle", "allem", "allen", "aller", "alles", "also", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "bist", "damit", "dann", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures", "gegen", "gewesen", "habe", "haben", "hatte", "hatten", "hier", "hinter", "mich", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "indem", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "können", "könnte", "machen", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "muss", "musste", "nach", "nicht", "nichts", "noch", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "ihnen", "sind", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "über", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "während", "waren", "warst", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wieder", "will", "wird", "wirst", "wollen", "wollte", "würde", "würden", "zwar", "zwischen");
|
4 |
+
?>
|
languages/en/stemmer.php
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
/*
|
3 |
+
Creado por Cesar Rodas para el proyecto Saddor.com
|
4 |
+
Este Stemmer esta basado en el argoritmo de Snowball Stemmer.
|
5 |
+
saddor@gmail.com
|
6 |
+
Este programa esta bajo licencia GNU
|
7 |
+
*/
|
8 |
+
if (!defined("ENGLISHSTEMMER"))
{
    // Guard constant: keeps the class from being declared twice if the file is included again.
    define("ENGLISHSTEMMER", 1);

    /**
     * Suffix-stripping stemmer for lower-case English words.
     *
     * Stem() runs the word through step1ab(), step1c() and step2() .. step5()
     * in that order. The helpers m(), cvc() and doubleConsonant() classify the
     * shape of a (partial) word using the two regex fragments below.
     * NOTE(review): the step structure looks like the classic Porter
     * algorithm; confirm against the reference before relying on exact parity.
     */
    class EnglishStemmer
    {
        /** Regex fragment for one consonant; 'y' counts only after a vowel or at the start. */
        var $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

        /** Regex fragment for one vowel; 'y' counts only when not preceded by a vowel. */
        var $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

        /**
         * Stem a single word.
         *
         * @param string $word Word to stem.
         * @return string The stem; words of two bytes or fewer are returned unchanged.
         */
        function Stem($word)
        {
            if (strlen($word) <= 2) {
                return $word;
            }

            $word = $this->step1ab($word);
            $word = $this->step1c($word);
            $word = $this->step2($word);
            $word = $this->step3($word);
            $word = $this->step4($word);
            $word = $this->step5($word);

            // Added by Cesar Rodas: do not leave an apostrophe at the end of the stem.
            if (substr($word, -1, 1) == "'")
                $word = substr($word, 0, strlen($word) - 1);

            return $word;
        }


        /**
         * Strip plural endings (sses/ies/ss/s) and the eed/ed/ing endings,
         * then tidy the remainder (at->ate, bl->ble, iz->ize, undouble a final
         * consonant, or append 'e' to a short consonant-vowel-consonant word).
         *
         * @param string $word
         * @return string
         */
        function step1ab($word)
        {
            if (substr($word, -1) == 's') {

                   $this->replace($word, 'sses', 'ss')
                OR $this->replace($word, 'ies', 'i')
                OR $this->replace($word, 'ss', 'ss')
                OR $this->replace($word, 's', '');
            }

            // replace() reports a suffix match even when the measure test keeps the word
            // unchanged, so an '...eed' word never falls through to the ed/ing rules.
            if (substr($word, -2, 1) != 'e' OR !$this->replace($word, 'eed', 'ee', 0)) { // First rule
                $v = $this->regex_vowel;

                // 'ing' / 'ed' are removed only when the part before them contains a vowel.
                if (   preg_match("#$v+#", substr($word, 0, -3)) && $this->replace($word, 'ing', '')
                    OR preg_match("#$v+#", substr($word, 0, -2)) && $this->replace($word, 'ed', '')) {
                    if (    !$this->replace($word, 'at', 'ate')
                        AND !$this->replace($word, 'bl', 'ble')
                        AND !$this->replace($word, 'iz', 'ize')) {

                        if (    $this->doubleConsonant($word)
                            AND substr($word, -2) != 'll'
                            AND substr($word, -2) != 'ss'
                            AND substr($word, -2) != 'zz') {

                            $word = substr($word, 0, -1);

                        } else if ($this->m($word) == 1 AND $this->cvc($word)) {
                            $word .= 'e';
                        }
                    }
                }
            }

            return $word;
        }

        /**
         * Turn a final 'y' into 'i' when the rest of the word contains a vowel.
         *
         * @param string $word
         * @return string
         */
        function step1c($word)
        {
            $v = $this->regex_vowel;

            if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
                $this->replace($word, 'y', 'i');
            }

            return $word;
        }


        /**
         * Map longer derivational suffixes to shorter ones (e.g. 'ational' -> 'ate').
         * Rules are grouped by the word's penultimate letter; each applies only
         * when the measure m() of the remaining stem is greater than 0.
         *
         * @param string $word
         * @return string
         */
        function step2($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                       $this->replace($word, 'ational', 'ate', 0)
                    OR $this->replace($word, 'tional', 'tion', 0);
                    break;

                case 'c':
                       $this->replace($word, 'enci', 'ence', 0)
                    OR $this->replace($word, 'anci', 'ance', 0);
                    break;

                case 'e':
                    $this->replace($word, 'izer', 'ize', 0);
                    break;

                case 'g':
                    $this->replace($word, 'logi', 'log', 0);
                    break;

                case 'l':
                       $this->replace($word, 'entli', 'ent', 0)
                    OR $this->replace($word, 'ousli', 'ous', 0)
                    OR $this->replace($word, 'alli', 'al', 0)
                    OR $this->replace($word, 'bli', 'ble', 0)
                    OR $this->replace($word, 'eli', 'e', 0);
                    break;

                case 'o':
                       $this->replace($word, 'ization', 'ize', 0)
                    OR $this->replace($word, 'ation', 'ate', 0)
                    OR $this->replace($word, 'ator', 'ate', 0);
                    break;

                case 's':
                       $this->replace($word, 'iveness', 'ive', 0)
                    OR $this->replace($word, 'fulness', 'ful', 0)
                    OR $this->replace($word, 'ousness', 'ous', 0)
                    OR $this->replace($word, 'alism', 'al', 0);
                    break;

                case 't':
                       $this->replace($word, 'biliti', 'ble', 0)
                    OR $this->replace($word, 'aliti', 'al', 0)
                    OR $this->replace($word, 'iviti', 'ive', 0);
                    break;
            }

            return $word;
        }


        /**
         * Shorten or remove a second group of suffixes (ical, ness, icate,
         * iciti, ful, ative, alize) when the remaining stem has m() > 0.
         *
         * @param string $word
         * @return string
         */
        function step3($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                    $this->replace($word, 'ical', 'ic', 0);
                    break;

                case 's':
                    $this->replace($word, 'ness', '', 0);
                    break;

                case 't':
                       $this->replace($word, 'icate', 'ic', 0)
                    OR $this->replace($word, 'iciti', 'ic', 0);
                    break;

                case 'u':
                    $this->replace($word, 'ful', '', 0);
                    break;

                case 'v':
                    $this->replace($word, 'ative', '', 0);
                    break;

                case 'z':
                    $this->replace($word, 'alize', 'al', 0);
                    break;
            }

            return $word;
        }


        /**
         * Remove residual suffixes (al, ance, ence, er, ic, able, ...) when
         * the remaining stem has m() > 1.
         *
         * @param string $word
         * @return string
         */
        function step4($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                    $this->replace($word, 'al', '', 1);
                    break;

                case 'c':
                       $this->replace($word, 'ance', '', 1)
                    OR $this->replace($word, 'ence', '', 1);
                    break;

                case 'e':
                    $this->replace($word, 'er', '', 1);
                    break;

                case 'i':
                    $this->replace($word, 'ic', '', 1);
                    break;

                case 'l':
                       $this->replace($word, 'able', '', 1)
                    OR $this->replace($word, 'ible', '', 1);
                    break;

                case 'n':
                       $this->replace($word, 'ant', '', 1)
                    OR $this->replace($word, 'ement', '', 1)
                    OR $this->replace($word, 'ment', '', 1)
                    OR $this->replace($word, 'ent', '', 1);
                    break;

                case 'o':
                    // 'ion' is only dropped after 's' or 't'.
                    if (substr($word, -4) == 'tion' OR substr($word, -4) == 'sion') {
                        $this->replace($word, 'ion', '', 1);
                    } else {
                        $this->replace($word, 'ou', '', 1);
                    }
                    break;

                case 's':
                    $this->replace($word, 'ism', '', 1);
                    break;

                case 't':
                       $this->replace($word, 'ate', '', 1)
                    OR $this->replace($word, 'iti', '', 1);
                    break;

                case 'u':
                    $this->replace($word, 'ous', '', 1);
                    break;

                case 'v':
                    $this->replace($word, 'ive', '', 1);
                    break;

                case 'z':
                    $this->replace($word, 'ize', '', 1);
                    break;
            }

            return $word;
        }

        /**
         * Final clean-up: drop a trailing 'e' (subject to the measure / cvc
         * tests below) and reduce a trailing 'll' to 'l' when m() > 1.
         *
         * @param string $word
         * @return string
         */
        function step5($word)
        {
            if (substr($word, -1) == 'e') {
                if ($this->m(substr($word, 0, -1)) > 1) {
                    $this->replace($word, 'e', '');

                } else if ($this->m(substr($word, 0, -1)) == 1) {

                    if (!$this->cvc(substr($word, 0, -1))) {
                        $this->replace($word, 'e', '');
                    }
                }
            }

            // Part b
            if ($this->m($word) > 1 AND $this->doubleConsonant($word) AND substr($word, -1) == 'l') {
                $word = substr($word, 0, -1);
            }

            return $word;
        }

        /**
         * Replace the suffix $check of $str by $repl.
         *
         * @param string   $str   Word, modified in place when the rule fires.
         * @param string   $check Suffix to look for.
         * @param string   $repl  Replacement for the suffix.
         * @param int|null $m     When given, the replacement is applied only if
         *                        the measure of the stem before the suffix
         *                        exceeds this value.
         * @return bool TRUE when $str ends in $check — even if the measure test
         *              prevented the replacement — FALSE otherwise.
         */
        function replace(&$str, $check, $repl, $m = null)
        {
            $len = 0 - strlen($check);

            if (substr($str, $len) == $check) {
                $substr = substr($str, 0, $len);
                if (is_null($m) OR $this->m($substr) > $m) {
                    $str = $substr . $repl;
                }

                return true;
            }

            return false;
        }



        /**
         * Measure of a word: the number of vowel-run + consonant-run pairs
         * left after removing leading consonants and trailing vowels.
         *
         * @param string $str
         * @return int
         */
        function m($str)
        {
            $c = $this->regex_consonant;
            $v = $this->regex_vowel;

            $str = preg_replace("#^$c+#", '', $str);
            $str = preg_replace("#$v+$#", '', $str);

            preg_match_all("#($v+$c+)#", $str, $matches);

            return count($matches[1]);
        }



        /**
         * Tell whether $str ends in the same consonant twice (e.g. 'pp').
         *
         * @param string $str
         * @return bool
         */
        function doubleConsonant($str)
        {
            $c = $this->regex_consonant;

            // Square-bracket offsets: the curly-brace form is rejected by PHP 8.
            return preg_match("#$c{2}$#", $str, $matches) AND $matches[0][0] == $matches[0][1];
        }



        /**
         * Tell whether $str ends consonant-vowel-consonant where the final
         * consonant is not 'w', 'x' or 'y'.
         *
         * @param string $str
         * @return bool
         */
        function cvc($str)
        {
            $c = $this->regex_consonant;
            $v = $this->regex_vowel;

            return     preg_match("#($c$v$c)$#", $str, $matches)
                   AND strlen($matches[1]) == 3
                   AND $matches[1][2] != 'w'
                   AND $matches[1][2] != 'x'
                   AND $matches[1][2] != 'y';
        }
    }
}
|
314 |
+
|
315 |
+
/*
|
316 |
+
Stem caching added by Rob Marsh, SJ
|
317 |
+
http://rmarsh.com
|
318 |
+
*/
|
319 |
+
|
320 |
+
// One shared stemmer instance and a per-request memo of word => stem,
// both reached from stem() through `global`.
$Stemmer = new EnglishStemmer();
$StemCache = array();

/**
 * Return the stem of $word, computing it at most once per request.
 *
 * Results of $Stemmer->Stem() are stored in the global $StemCache and served
 * from there on later calls with the same word.
 *
 * @param string $word Word to stem.
 * @return mixed Whatever $Stemmer->Stem() produced for $word.
 */
function stem($word) {
	global $Stemmer, $StemCache;

	// Cache hit: hand back the stored stem without re-running the stemmer.
	if (isset($StemCache[$word])) {
		return $StemCache[$word];
	}

	$computed = $Stemmer->Stem($word);
	$StemCache[$word] = $computed;
	return $computed;
}
|
334 |
+
|
335 |
+
?>
|
languages/en/stopwords.php
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
// the list of common words we want to ignore. NB anything shorter than 4 characters is knocked by the plugin and doesn't need to figure here
|
3 |
+
$overusedwords = array("able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again", "against", "ain't", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "another", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "aren't", "around", "aside", "asking", "associated", "available", "away", "awfully", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "both", "brief", "came", "cannot", "can't", "cause", "causes", "certain", "certainly", "changes", "clearly", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn't", "course", "currently", "definitely", "described", "despite", "didn't", "different", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "during", "each", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "exactly", "example", "except", "fifth", "first", "five", "followed", "following", "follows", "former", "formerly", "forth", "four", "from", "further", "furthermore", "gets", "getting", "given", "gives", "goes", "going", "gone", "gotten", "greetings", "hadn't", "happens", "hardly", "hasn't", "have", "haven't", "having", "hello", "help", "hence", "here", "hereafter", "hereby", "herein", "hereupon", "he's", "hers", "herself", "himself", "hither", "hopefully", "howbeit", "however", "ignored", "i'll", "it'd", "it's", "i've", "immediate", "inasmuch", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "isn't", "itself", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", 
"latter", "latterly", "least", "less", "lest", "like", "liked", "likely", "little", "look", "looking", "looks", "mainly", "many", "maybe", "mean", "meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much", "must", "mustn't", "myself", "name", "namely", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "next", "nine", "nobody", "none", "noone", "normally", "nothing", "novel", "nowhere", "obviously", "often", "okay", "once", "ones", "one's", "only", "onto", "other", "others", "otherwise", "ought", "ours", "ourselves", "outside", "over", "overall", "particular", "particularly", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "quite", "rather", "really", "reasonably", "regarding", "regardless", "regards", "relatively", "respectively", "right", "said", "same", "saying", "says", "second", "secondly", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "should", "shouldn't", "since", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "such", "sure", "take", "taken", "tell", "tends", "than", "thank", "thanks", "that", "that's", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "there's", "thereupon", "these", "they", "think", "third", "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus", "together", "took", "toward", "towards", "tried", "tries", "truly", "trying", "twice", "under", "unfortunately", "unless", "unlikely", "until", "unto", "upon", "used", "useful", "uses", "using", "usually", "value", "various", "very", "want", "wants", "wasn't", "welcome", "we'd", "well", "went", "were", "weren't", "what", "whatever", "when", "whence", "whenever", "where", 
"whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "whoever", "whole", "whom", "whose", "will", "willing", "wish", "with", "within", "without", "wonder", "would", "wouldn't", "your", "yours", "yourself", "yourselves", "zero");
|
4 |
+
?>
|
languages/es/stemmer.php
ADDED
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
/*
|
3 |
+
Creado por Cesar Rodas para el proyecto Saddor.com
|
4 |
+
Este Stemmer esta basado en el argoritmo de Snowball Stemmer.
|
5 |
+
saddor@gmail.com
|
6 |
+
Este programa esta bajo licencia GNU
|
7 |
+
*/
|
8 |
+
if (!defined("SPANISHSTEMMER"))
{
    // Character classes returned by PorterStemmer::char_is().
    define("vocal", 1);
    define("consonante", 2);
    // Guard constant: keeps the class from being declared twice if the file is included again.
    define("SPANISHSTEMMER", 1);

    /**
     * Suffix-stripping stemmer for Spanish words.
     *
     * The word being stemmed is kept in $word; splitword() derives the regions
     * $R1, $R2 and $RV from it, and the step_*() methods strip suffixes found
     * in those regions.
     *
     * All string handling is byte-wise (strlen/substr/[] offsets). Accented
     * letters are therefore written as single-byte ISO-8859-1 escapes
     * (\xE1 = a-acute, \xE9 = e-acute, \xED = i-acute, \xF3 = o-acute,
     * \xFA = u-acute, \xFC = u-diaeresis, \xF1 = n-tilde).
     * NOTE(review): this presumes callers pass ISO-8859-1 text; UTF-8 input
     * will not match the accented rules — confirm against the caller.
     */
    class PorterStemmer
    {
        var $R1;
        var $R2;
        var $RV;
        var $word;

        /**
         * Stem a single word.
         *
         * @param string $word Word to stem.
         * @return string The stem; words shorter than two bytes are returned
         *                unchanged.
         */
        function Stem($word)
        {
            $this->word = $word;
            if (strlen($word) < 2)
                return $word;

            $this->step_0();
            // step_1() reports whether it changed the word; repeat until it no longer does.
            while ($this->step_1());
            $this->step_2();
            $this->step_3();
            return $this->word;
        }

        /**
         * Strip attached pronouns (me, se, lo, ...) when RV contains one of
         * the gerund/infinitive endings listed in $prefix.
         *
         * NOTE(review): RV is computed once on entry and not refreshed while
         * $this->word is shortened inside the loops — verify this is intended.
         *
         * @return bool TRUE when at least one pronoun was removed.
         */
        function step_0()
        {
            $this->splitword();
            $search = array(
                "me","se","sela","selo","selas","selos","la","le","lo","les",
                "los","nos"
            );

            $prefix = array(
                "i\xE9ndo","\xE1ndo","\xE1r","\xE9r","\xEDr", /* first case: accented endings */
                "iendo","ando","ar","er","ir",                /* second case: plain endings */
                "yendo"                                       /* third case */
            );

            $return = false;
            foreach ($prefix as $id => $pref)
            {
                $found = (strstr($this->RV, $pref) !== false)
                      /* special case for "yendo": must follow a "u" */
                      || ($pref == "yendo" && strstr($this->word, "uyendo") !== false);
                if (!$found)
                    continue;

                /* The ending was found; now look for a pronoun to delete. */
                foreach ($search as $pronoun)
                {
                    $len = strlen($pronoun);
                    if ($pronoun != substr($this->RV, -1 * $len, $len))
                        continue;

                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    if ($id < 5)
                    {
                        /* first case only: swap the accented ending for its plain twin */
                        $this->word = str_replace($pref, $prefix[$id + 5], $this->word);
                    }
                    $return = true;
                }
            }
            return $return;
        }

        /**
         * Remove or rewrite standard derivational suffixes found at the end of R2.
         *
         * @return bool TRUE when one of the suffix lists matched.
         */
        function step_1()
        {
            $return = false;
            $this->splitword();

            /* suffixes deleted when R2 ends with them (longest first) */
            $search = array(
                "abilidades","iblemente","icaciones","ablemente","antemente","ivamente","atamente",
                "amientos","icadoras","icadores","icancias","imientos","icamente",
                "osamente","abilidad","icidades","ividades","adamente","icantes",
                "icancia","imiemto","icadora","icaci\xF3n","amiento","imiento","aciones",
                "ativos","ativas","ividad","idades","icidad","icante",
                "icador","adoras","adores","ancias","mente","ables",
                "ismos","anzas","ativa","ativo","istas","ibles",
                "aci\xF3n","antes","adora","ancia","ismo","anza",
                "icos","ivas","osos","ivos","ante","osas",
                "ador","ible","ista","idad","able","ico",
                "osa","oso","iva","ica","ica","ivo",
            );

            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->R2, $len * (-1), $len) == $suffix)
                {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    $return = true;
                    break;
                }
            }

            /* Original author's note: "I think this is wrong, I think the search should be done in R1." */
            if ($this->R1 == "amente")
            {
                $this->word = str_replace("amente", "", $this->word);
            }

            /* suffixes rewritten (not just deleted) when R2 ends with them */
            $search = array
            (
                "log\xEDa","log\xEDas",/**/"uci\xF3n","uciones",/**/"encia","encias"
            );
            $replace = array
            (
                "log","log","u","u","ente","ente"
            );
            foreach ($search as $i => $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->R2, $len * (-1), $len) == $suffix)
                {
                    $this->word = str_replace($suffix, $replace[$i], $this->word);
                    $return = true;
                    break;
                }
            }
            return $return;
        }

        /**
         * Remove verb endings starting with "y" when they follow a "u";
         * when none was removed, defer to step_2b().
         *
         * @return void
         */
        function step_2()
        {
            $this->splitword();
            $return = false;
            $search = array(
                "ya","ye","yan","yen","yeron","yendo","yo","y\xF3","yas","yes","yais","yamos"
            );
            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->RV, $len * (-1), $len) == $suffix)
                    if (substr($this->word, -1 * ($len + 1), $len + 1) == "u" . $suffix)
                    {
                        $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                        $return = true;
                    }
            }

            if ($return == false)
                $this->step_2b();
        }

        /**
         * Remove the remaining verb endings found at the end of RV.
         *
         * @return void
         */
        function step_2b()
        {
            $this->splitword();
            $search = array(
                "en","es","\xE9is","emos"
            );

            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->RV, $len * (-1), $len) == $suffix)
                {
                    if (substr($this->word, (-1) * ($len + 2), $len + 2) == "gu" . $suffix)
                    {
                        /* preceded by "gu": drop the "u" together with the suffix */
                        $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                    }
                    /*
                        Fix contributed by Diego Enrique Finol <dfinol at cantv dot net>:
                        the suffix has to be removed in both cases; the "u" is removed
                        additionally only when the suffix is preceded by "gu".
                    */
                    else
                    {
                        $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    }
                }
            }

            $search = array(
                "i\xE9ramos","ar\xEDamos","ir\xEDamos","i\xE9semos","er\xEDamos","er\xEDais","eremos",
                "isteis","ir\xEDais","ierais","iremos","\xE1bamos","ieseis",
                "asteis","\xE1ramos","\xE1semos","aremos","ar\xEDais","abais",
                "\xEDamos","arais","ieses","ar\xEDan","iesen","ieron",
                "iendo","ieras","ir\xE9is","ar\xEDas","er\xEDas","aseis",
                "er\xE9is","er\xEDan","ir\xEDan","ar\xE9is","ir\xEDas","ieran",
                "ando","amos","aron","asen","aras","ados",
                "\xEDais","ases","imos","adas","idas","abas",
                "iste","ir\xE1n","er\xE1n","ar\xEDa","er\xEDa","iera",
                "ir\xE1s","ir\xEDa","aran","ar\xE1s","er\xE1s","aste",
                "iese","aban","ar\xE1n","\xE1is","ada","ir\xE1",
                "\xEDan","ir\xE9","er\xE1","aba","ara","ido",
                "ar\xE1","ar\xE9","ado","er\xE9","ase","\xEDas",
                "ida","\xEDa","er","ar","i\xF3","an",
                "ir","as","ad","ed","id","\xEDs",
            );

            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->RV, $len * (-1), $len) == $suffix)
                {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    /* regions must follow the shortened word before the next suffix is tried */
                    $this->splitword();
                }
            }
        }

        /**
         * Remove residual vowel endings found in RV and finally replace
         * accented vowels in the word by their plain counterparts.
         *
         * @return bool TRUE when a residual ending was removed.
         */
        function step_3()
        {
            $this->splitword();
            $return = false;
            $search = array(
                "os","a","o","\xE1","\xED","\xF3"
            );

            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->RV, $len * (-1), $len) == $suffix)
                {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    $return = true;
                }
            }

            $search = array(
                "e","\xE9"
            );

            foreach ($search as $suffix)
            {
                $len = strlen($suffix);
                if (substr($this->RV, $len * (-1), $len) == $suffix)
                {
                    if (substr($this->RV, -1 * ($len + 2), $len + 2) == "gu" . $suffix)
                    {
                        /* preceded by "gu": drop the "u" together with the suffix */
                        $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                        $return = true;
                    }
                    else
                    {
                        $this->word = substr($this->word, 0, strlen($this->word) - $len);
                        $return = true;
                    }
                }
            }

            /* strip accents / diaeresis from whatever is left */
            $this->word = str_replace("\xE1", "a", $this->word);
            $this->word = str_replace("\xE9", "e", $this->word);
            $this->word = str_replace("\xED", "i", $this->word);
            $this->word = str_replace("\xF3", "o", $this->word);
            $this->word = str_replace("\xFA", "u", $this->word);
            $this->word = str_replace("\xFC", "u", $this->word);
            return $return;
        }


        /* helper functions */

        /**
         * Comparator ordering strings by descending byte length.
         *
         * @param string $a
         * @param string $b
         * @return int 0 for equal lengths, 1 when $a is shorter, -1 when $a is longer.
         */
        function saddorsort($a, $b)
        {
            if (strlen($a) == strlen($b)) {
                return 0;
            }
            return (strlen($a) < strlen($b)) ? 1 : -1;
        }

        /**
         * Recompute the regions R1, R2 and RV from the current $this->word.
         *
         * R1 collects the bytes after the first consonant that is directly
         * followed by a vowel (scanning from offset 1); R2 does the same
         * starting from the next such consonant. RV depends on the class of
         * the first two letters (see the branches below).
         *
         * @return void
         */
        function splitword()
        {
            $flag1 = false;
            $flag2 = false;
            $this->R1 = "";
            $this->R2 = "";
            $this->RV = "";

            $length = strlen($this->word);
            if ($length < 2)
            {
                /* nothing to split; also avoids reading offset 1 of a too-short word */
                return;
            }

            for ($i = 1; $i < $length; $i++)
            {
                if ($flag1)
                    $this->R1 .= $this->word[$i];
                if ($flag2)
                    $this->R2 .= $this->word[$i];

                if ($i + 1 >= $length)
                    break;

                $consonantThenVowel = $this->char_is($this->word[$i]) == consonante
                                   && $this->char_is($this->word[$i + 1]) == vocal;

                /* R2 must be tested before R1 so both cannot start on the same letter */
                if ($consonantThenVowel && $flag1 == true && $flag2 == false)
                    $flag2 = true;

                if ($consonantThenVowel && $flag1 == false)
                    $flag1 = true;
            }

            /* Locate RV */
            if ($this->char_is($this->word[1]) == consonante)
            {
                /* second letter is a consonant: RV starts after the next vowel */
                for ($i = 2; $i < $length; $i++)
                    if ($this->char_is($this->word[$i]) == vocal)
                        break;
                $i++;
                $this->RV = substr($this->word, $i);
            }
            else if ($this->char_is($this->word[1]) == vocal && $this->char_is($this->word[0]) == vocal)
            {
                /* first two letters are vowels: RV starts after the next consonant */
                for ($i = 2; $i < $length; $i++)
                    if ($this->char_is($this->word[$i]) == consonante)
                        break;
                $i++;
                $this->RV = substr($this->word, $i);
            }
            else if ($length > 2)
                $this->RV = substr($this->word, 3);
        }

        /**
         * Classify a single character.
         *
         * @param string $char One character (one byte).
         * @return int|null vocal, consonante, or null for an empty string or
         *                  a character in neither list.
         */
        function char_is($char)
        {
            $char = strtolower($char);
            if ($char == "")
                return;
            $vowel = "aeiou\xE1\xE9\xED\xF3\xFA\xFC";
            $consonant = "bcdfghijklmn\xF1opqrsvtxwyz";
            /* vowels are tested first, so the stray "i" in $consonant is never reached */
            if (strstr($vowel, $char))
                return vocal;
            if (strstr($consonant, $char))
                return consonante;
        }
    }
}
|
360 |
+
|
361 |
+
/*
|
362 |
+
Stem caching added by Rob Marsh, SJ
|
363 |
+
http://rmarsh.com
|
364 |
+
*/
|
365 |
+
|
366 |
+
$Stemmer = new PorterStemmer();
|
367 |
+
$StemCache = array();
|
368 |
+
|
369 |
+
/**
 * Return the stem of a word, consulting the global cache first.
 *
 * On a cache miss the word is passed to the global $Stemmer object and the
 * result is stored in the global $StemCache under the original word.
 *
 * @param string $word Word to stem.
 * @return mixed Whatever $Stemmer->Stem() produced for this word.
 */
function stem($word) {
	global $Stemmer, $StemCache;

	// Cache hit: skip the stemmer entirely.
	if (isset($StemCache[$word])) {
		return $StemCache[$word];
	}

	$result = $Stemmer->Stem($word);
	$StemCache[$word] = $result;

	return $result;
}
|
380 |
+
|
381 |
+
?>
|
languages/es/stopwords.php
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
1 |
+
<?php
|
2 |
+
// The list of common words we want to ignore. NB: anything shorter than 4 characters is discarded by the plugin and doesn't need to be listed here.
|
3 |
+
// Spanish stop words, one word per entry. Accented forms are stored as UTF-8
// (NOTE(review): assumes the text these are compared against is UTF-8 -- confirm).
// Entries are kept in their existing order; "primero" and "verdadera", "cierto"
// are separate entries so that each one can match a single token.
$overusedwords = array(
	"algo", "alguna", "algunas", "alguno", "algunos", "algún", "ambos", "ampleamos", "ante", "antes",
	"aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada",
	"cierta", "ciertas", "ciertos", "como", "conseguimos", "conseguir", "consigo", "consigue",
	"consiguen", "consigues", "contra", "cual", "cuando", "dentro", "desde", "donde", "durante",
	"ella", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "encima",
	"entonces", "entre", "erais", "eramos", "eran", "eras", "eres", "esas", "esos", "esta", "estaba",
	"estabais", "estaban", "estabas", "estad", "estada", "estadas", "estado", "estados", "estais",
	"estamos", "estan", "estando", "estar", "estaremos", "estará", "estarán", "estarás", "estaré",
	"estaréis", "estaría", "estaríais", "estaríamos", "estarían", "estarías", "estas", "este",
	"estemos", "esto", "estos", "estoy", "estuve", "estuviera", "estuvierais", "estuvieran",
	"estuvieras", "estuvieron", "estuviese", "estuvieseis", "estuviesen", "estuvieses", "estuvimos",
	"estuviste", "estuvisteis", "estuviéramos", "estuviésemos", "estuvo", "está", "estábamos",
	"estáis", "están", "estás", "esté", "estéis", "estén", "estés", "fuera", "fuerais", "fueran",
	"fueras", "fueron", "fuese", "fueseis", "fuesen", "fueses", "fuimos", "fuiste", "fuisteis",
	"fuéramos", "fuésemos", "gueno", "habida", "habidas", "habido", "habidos", "habiendo",
	"habremos", "habrá", "habrán", "habrás", "habré", "habréis", "habría", "habríais", "habríamos",
	"habrían", "habrías", "habéis", "había", "habíais", "habíamos", "habían", "habías", "hace",
	"haceis", "hacemos", "hacen", "hacer", "haces", "hago", "hasta", "haya", "hayamos", "hayan",
	"hayas", "hayáis", "hemos", "hube", "hubiera", "hubierais", "hubieran", "hubieras", "hubieron",
	"hubiese", "hubieseis", "hubiesen", "hubieses", "hubimos", "hubiste", "hubisteis", "hubiéramos",
	"hubiésemos", "hubo", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar",
	"intentas", "intento", "largo", "mientras", "modo", "mucho", "muchos", "mías", "míos", "nada",
	"nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "otra", "otras", "otro",
	"otros", "para", "pero", "poco", "podeis", "podemos", "poder", "podria", "podriais", "podriamos",
	"podrian", "podrias", "porque", "primero", "puede", "pueden", "puedo", "quien", "quienes",
	"sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "seamos", "sean", "seas", "sentid",
	"sentida", "sentidas", "sentido", "sentidos", "seremos", "será", "serán", "serás", "seré",
	"seréis", "sería", "seríais", "seríamos", "serían", "serías", "seáis", "siendo", "siente",
	"sintiendo", "sobre", "sois", "solamente", "solo", "somos", "suya", "suyas", "suyo", "suyos",
	"también", "tanto", "tendremos", "tendrá", "tendrán", "tendrás", "tendré", "tendréis", "tendría",
	"tendríais", "tendríamos", "tendrían", "tendrías", "tened", "teneis", "tenemos", "tener", "tenga",
	"tengamos", "tengan", "tengas", "tengo", "tengáis", "tenida", "tenidas", "tenido", "tenidos",
	"teniendo", "tenéis", "tenía", "teníais", "teníamos", "tenían", "tenías", "tiempo", "tiene",
	"tienen", "tienes", "todo", "todos", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar",
	"trabajas", "trabajo", "tras", "tuve", "tuviera", "tuvierais", "tuvieran", "tuvieras", "tuvieron",
	"tuviese", "tuvieseis", "tuviesen", "tuvieses", "tuvimos", "tuviste", "tuvisteis", "tuviéramos",
	"tuviésemos", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "ultimo", "unas", "unos", "usais",
	"usamos", "usan", "usar", "usas", "vais", "valor", "vamos", "vaya", "verdad", "verdadera",
	"cierto", "verdadero", "vosostras", "vosostros", "vosotras", "vosotros", "vuestra", "vuestras",
	"vuestro", "vuestros", "éramos"
);
|
4 |
+
?>
|
languages/fr/stemmer.php
ADDED
@@ -0,0 +1,513 @@
|
|
|
|
|
|
|