Similar Posts – Best Related Posts Plugin for WordPress - Version 2.6.0.0

Version Description

Download this release

Release Info

Developer RobMarsh
Plugin Icon 128x128 Similar Posts – Best Related Posts Plugin for WordPress
Version 2.6.0.0
Comparing to
See all releases

Version 2.6.0.0

languages/de/stemmer.php ADDED
@@ -0,0 +1,315 @@
1
+ <?php
2
+ /*
3
+ Adapted from a drupal module -- see details below
4
+ */
5
+
6
+ /*
7
+ Content:
8
+ Drupal module to improve searching in german texts (Porter stemmer)
9
+ Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
10
+ Author:
11
+ Reiner Miericke 10.10.2007
12
+ References:
13
+ Algorithm:
14
+ http://www.clef-campaign.org/workshop2002/WN/3.pdf
15
+ http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
16
+ http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
17
+ http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
18
+ http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
19
+ For lists of stopwords see
20
+ http://members.unine.ch/jacques.savoy/clef/index.html
21
+ Small parts were stolen from dutchstemmer.module
22
+ */
23
+
24
+
25
// Vowels recognised by the stemmer (UTF-8 encoded, including the German umlauts).
define("DE_STEMMER_VOKALE", "aeiouyäöü");

// Detect the encoding this file is stored in from a literal that holds every
// non-ASCII letter the stemmer handles, and make it the mbstring default.
// mb_detect_encoding() returns false when none of its candidate encodings
// fits; fall back to UTF-8 then, because the stemmer's regular expressions
// are compiled with the /u (UTF-8) modifier.
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
if ($enc === false) {
  $enc = 'UTF-8';
}
mb_internal_encoding($enc);
29
+
30
/**
 * Break a text into its words.
 *
 * Every run of characters that is neither an ASCII letter nor one of the
 * accented letters listed in the pattern counts as a separator and is dropped.
 *
 * @param string $text Text to split (taken by reference, but not modified).
 * @return array|false The non-empty words, or whatever preg_split() reports on failure.
 */
function _de_stemmer_split_text(&$text) {
  $separator = '/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u';
  return preg_split($separator, $text, -1, PREG_SPLIT_NO_EMPTY);
}
34
+
35
+
36
/**
 * Implementation of hook_search_preprocess.
 *
 * Replaces every word of $text by its stem and leaves the separators between
 * the words untouched.
 *
 * An older variant (previously kept here as commented-out code after the
 * return) skipped stop words via _de_stemmer_stoppwort() and joined the stems
 * with single spaces.
 *
 * @param string $text Text to process (taken by reference, but not modified).
 * @return string The text with each word replaced by its stem.
 */
function de_stemmer_search_preprocess(&$text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates
  // word, separator, word, ... so the words sit at the even indexes.
  $pieces = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  foreach ($pieces as $index => $piece) {
    if ($index % 2 == 0) {
      $pieces[$index] = _de_stemmer_wortstamm($piece);
    }
  }

  // Put it all back together
  return implode('', $pieces);
}
69
+
70
+
71
/**
 * Implementation of hook_help().
 *
 * @param string $section Help path being requested.
 * @return string|null The module description for 'admin/modules#description',
 *                     null for every other section.
 */
function de_stemmer_help($section = 'admin/help#search') {
  switch ($section) {
    case 'admin/modules#description':
      $text = 'Implements a German stemming algorithm (Porter) to improve searching.';
      // t() is Drupal's translation helper. Where it is not defined, return
      // the untranslated text instead of dying on an undefined function.
      return function_exists('t') ? t($text) : $text;
  }
  return null;
}
80
+
81
+
82
/**
 * Build a lookup table that maps every word of a text to its stem.
 *
 * The text is split into words with _de_stemmer_split_text(); each word is
 * stemmed with _de_stemmer_wortstamm(). The resulting hash can be used to
 * identify words which reduce to the same stem.
 * For details please compare 'search.module-stem.patch'.
 *
 * @param string $text Text to analyse.
 * @return array Hash of word => stem (a repeated word simply overwrites its entry).
 */
function de_stemmer_stem_list($text) {
  $stems = array();
  foreach (_de_stemmer_split_text($text) as $token) {
    $stems[$token] = _de_stemmer_wortstamm($token);
  }
  return $stems;
}
100
+
101
+
102
/**
 * Offset at which the next stemming region of $wort begins.
 *
 * Skips the leading run of non-vowels, then the run of vowels that follows,
 * then one more position. strcspn()/strspn() count bytes, so the result is a
 * byte offset; it can be strlen($wort) + 1 when the word has no vowel or ends
 * in its first run of vowels.
 *
 * @param string $wort Word (or remainder of a word) to inspect.
 * @return int Byte offset of the region start.
 */
function _de_stemmer_region_n($wort) {
  $vowel_start = strcspn($wort, DE_STEMMER_VOKALE);
  $vowel_run = strspn($wort, DE_STEMMER_VOKALE, $vowel_start);
  return $vowel_start + $vowel_run + 1;
}
106
+
107
/**
 * Normalise a word before the suffix rules are applied.
 *
 * The word is lower-cased, every "ß" becomes "ss", and a "u" or "y" standing
 * between two vowels is put into upper case. Upper-case U/Y are not part of
 * DE_STEMMER_VOKALE, so they no longer count as vowels afterwards.
 *
 * @param string $wort Raw word.
 * @return string Normalised word.
 */
function de_stemmer_preprocess($wort) {
  $wort = str_replace("ß", "ss", mb_strtolower($wort));

  // The two replacements run one after the other (first u, then y), so a "y"
  // that follows a freshly upper-cased "U" is left alone.
  $vowel = '[' . DE_STEMMER_VOKALE . ']';
  $wort = preg_replace('/(?<=' . $vowel . ')u(?=' . $vowel . ')/u', 'U', $wort);
  $wort = preg_replace('/(?<=' . $vowel . ')y(?=' . $vowel . ')/u', 'Y', $wort);
  return $wort;
}
121
+
122
+
123
/**
 * Final clean-up of a stem.
 *
 * The stem is lower-cased (undoing the U/Y markers set during preprocessing).
 * If it is listed as an exception, _de_stemmer_ausnahme() replaces it in place
 * and it is returned as is; otherwise umlauts and accented vowels are reduced
 * to their plain vowel.
 *
 * @param string $wort Stem produced by the suffix rules.
 * @return string Final stem.
 */
function _de_stemmer_postprocess($wort) {
  $wort = mb_strtolower($wort);

  // check for exceptions ($wort is passed by reference and may be rewritten)
  if (_de_stemmer_ausnahme($wort)) {
    return $wort;
  }

  return strtr($wort, array(
    'ä' => 'a', 'á' => 'a',
    'ë' => 'e', 'é' => 'e',
    'ï' => 'i', 'í' => 'i',
    'ö' => 'o', 'ó' => 'o',
    'ü' => 'u', 'ú' => 'u',
  ));
}
137
+
138
+
139
/**
 * Reduce a single German word to its stem.
 *
 * The word is normalised by de_stemmer_preprocess(), run through three
 * suffix-stripping steps and finally handed to _de_stemmer_postprocess().
 *
 * R1 is the region after the first non-vowel following a vowel, or the null
 * region at the end of the word if there is no such non-vowel.
 * R2 is the region after the first non-vowel following a vowel in R1, or the
 * null region at the end of the word if there is no such non-vowel.
 *
 * @param string $wort Word to stem.
 * @return string The stem.
 */
function _de_stemmer_wortstamm($wort) {
  $stamm = de_stemmer_preprocess($wort);

  // Every offset below is a BYTE offset: _de_stemmer_region_n() counts bytes
  // and preg_match() both expects and reports byte offsets. They are
  // therefore combined with strlen()/substr() only; mb_substr() counts
  // characters and would cut at the wrong place as soon as the word contains
  // an umlaut. Offsets are clamped to the string length so preg_match() is
  // never asked to start beyond the end of its subject.
  $l = strlen($stamm);
  $r1 = min(_de_stemmer_region_n($stamm), $l);
  $r2 = $r1 == $l ? $r1 : min($r1 + _de_stemmer_region_n(substr($stamm, $r1)), $l);

  // "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
  // -- measured as the byte length of the first three characters, which also
  // keeps the offset on a character boundary.
  $min_r1 = strlen(mb_substr($stamm, 0, 3, 'UTF-8'));
  if ($r1 < $min_r1) {
    $r1 = $min_r1;
  }

  // Each rule is array(pattern, start offset of the region, strip the match?).
  // Within a step the rules are tried in order and only the first hit counts.
  $steps = array(
    // Step 1: (a) e em en ern er es
    //         (b) s (preceded by a valid s-ending)
    //         delete if in R1. (The letter of the valid s-ending is not
    //         necessarily in R1; the lookbehind may look in front of the offset.)
    array(
      array('/(e|em|en|ern|er|es)$/u', $r1, TRUE),
      array('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $r1, TRUE),
    ),
    // Step 2: (a) en er est
    //         (b) st (preceded by a valid st-ending)
    //         delete if in R1.
    array(
      array('/(en|er|est)$/u', $r1, TRUE),
      array('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $r1, TRUE),
    ),
    // Step 3: d-suffixes (see http://snowball.tartarus.org/texts/glossary.html)
    //         end/ung after "eig" is recognised but left in place; the
    //         remaining suffixes are deleted in the region given per rule.
    array(
      array('/(?<=eig)(end|ung)$/u', $r2, FALSE),
      array('/(end|ung)$/u', $r2, TRUE),
      array('/(?<![e])(ig|ik|isch)$/u', $r2, TRUE),
      array('/(?<=(er|en))(lich|heit)$/u', $r1, TRUE),
      array('/(lich|heit)$/u', $r2, TRUE),
      array('/(?<=lich)keit$/u', $r1, TRUE),
      array('/(?<=ig)keit$/u', $r1, TRUE),
      array('/keit$/u', $r2, TRUE),
    ),
  );

  foreach ($steps as $rules) {
    foreach ($rules as $rule) {
      list($pattern, $offset, $strip) = $rule;
      // An earlier step may have shortened the stem below the region start.
      $offset = min($offset, strlen($stamm));
      if (preg_match($pattern, $stamm, $hits, PREG_OFFSET_CAPTURE, $offset)) {
        if ($strip) {
          $stamm = substr($stamm, 0, $hits[0][1]);
        }
        break;
      }
    }
  }

  // Open question from the original author: what about
  // chen, lein, bar, schaft, ... ?
  return _de_stemmer_postprocess($stamm);
}
238
+
239
+
240
/**
 * Tell whether a word is on the German stop word list.
 *
 * The comparison is exact (strict), so the caller has to pass the word in
 * lower case; non-string arguments never match.
 *
 * @param string $wort Lower-cased word.
 * @return bool TRUE if the word is a stop word.
 */
function _de_stemmer_stoppwort($wort) {

  static $stoppworte = array(
    'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
    'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
    'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
    'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
    'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
    'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
    'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
    'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
    'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
    'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
    'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
    'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
    'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
    'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
    'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
    'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
    'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
    'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
    'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
    'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
    'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
  );

  // Strict comparison: a loose in_array() would report TRUE for the boolean
  // true (and, before PHP 8, for the integer 0).
  return in_array($wort, $stoppworte, true);
}
268
+
269
+
270
/**
 * First attempt at a list of exceptions to the generic stemming rules.
 *
 * If $wort is a key of the exception table it is replaced (by reference) with
 * the stem stored for it.
 *
 * @param string $wort Stem to check; overwritten when it is an exception.
 * @return bool TRUE if $wort was found in the table, FALSE otherwise.
 */
function _de_stemmer_ausnahme(&$wort) {
  static $de_stemmer_ausnahmen = array(
    'schön' => 'schön',  // not "schon"
    'blüt'  => 'blüt',   // Blüte (NOT Blut)
    'kannt' => 'kenn',
    'küch'  => 'küch',   // Küchen (NOT Kuchen)
    'mög'   => 'mög',
    'mocht' => 'mög',
    'mag'   => 'mög',
    'ging'  => 'geh',
    'lief'  => 'lauf',
    'änd'   => 'änd'     // ändern (NOT andern)
  );

  if (!array_key_exists($wort, $de_stemmer_ausnahmen)) {
    return FALSE;
  }

  $wort = $de_stemmer_ausnahmen[$wort];
  return TRUE;
}
295
+
296
/*
  Stem caching added by Rob Marsh, SJ
  http://rmarsh.com
*/

// Stems computed so far, keyed by the original word.
$StemCache = array();

/**
 * Return the stem of $word, computing it at most once.
 *
 * @param string $word Word to stem.
 * @return string The stem as produced by _de_stemmer_wortstamm().
 */
function stem($word) {
  global $StemCache;

  if (isset($StemCache[$word])) {
    return $StemCache[$word];
  }

  $result = _de_stemmer_wortstamm($word);
  $StemCache[$word] = $result;
  return $result;
}
314
+
315
+ ?>
languages/de/stemmer.php.bak ADDED
@@ -0,0 +1,315 @@
1
+ <?php
2
+ /*
3
+ Adapted from a drupal module -- see details below
4
+ */
5
+
6
+ /*
7
+ Content:
8
+ Drupal module to improve searching in german texts (Porter stemmer)
9
+ Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
10
+ Author:
11
+ Reiner Miericke 10.10.2007
12
+ References:
13
+ Algorithm:
14
+ http://www.clef-campaign.org/workshop2002/WN/3.pdf
15
+ http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
16
+ http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
17
+ http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
18
+ http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
19
+ For lists of stopwords see
20
+ http://members.unine.ch/jacques.savoy/clef/index.html
21
+ Small parts were stolen from dutchstemmer.module
22
+ */
23
+
24
+
25
// Vowels recognised by the stemmer (UTF-8 encoded, including the German umlauts).
define("DE_STEMMER_VOKALE", "aeiouyäöü");

// Detect the encoding this file is stored in from a literal that holds every
// non-ASCII letter the stemmer handles, and make it the mbstring default.
// mb_detect_encoding() returns false when none of its candidate encodings
// fits; fall back to UTF-8 then, because the stemmer's regular expressions
// are compiled with the /u (UTF-8) modifier.
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
if ($enc === false) {
  $enc = 'UTF-8';
}
mb_internal_encoding($enc);
29
+
30
/**
 * Break a text into its words.
 *
 * Every run of characters that is neither an ASCII letter nor one of the
 * accented letters listed in the pattern counts as a separator and is dropped.
 *
 * @param string $text Text to split (taken by reference, but not modified).
 * @return array|false The non-empty words, or whatever preg_split() reports on failure.
 */
function _de_stemmer_split_text(&$text) {
  $separator = '/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u';
  return preg_split($separator, $text, -1, PREG_SPLIT_NO_EMPTY);
}
34
+
35
+
36
/**
 * Implementation of hook_search_preprocess.
 *
 * Replaces every word of $text by its stem and leaves the separators between
 * the words untouched.
 *
 * An older variant (previously kept here as commented-out code after the
 * return) skipped stop words via _de_stemmer_stoppwort() and joined the stems
 * with single spaces.
 *
 * @param string $text Text to process (taken by reference, but not modified).
 * @return string The text with each word replaced by its stem.
 */
function de_stemmer_search_preprocess(&$text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates
  // word, separator, word, ... so the words sit at the even indexes.
  $pieces = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

  foreach ($pieces as $index => $piece) {
    if ($index % 2 == 0) {
      $pieces[$index] = _de_stemmer_wortstamm($piece);
    }
  }

  // Put it all back together
  return implode('', $pieces);
}
69
+
70
+
71
/**
 * Implementation of hook_help().
 *
 * @param string $section Help path being requested.
 * @return string|null The module description for 'admin/modules#description',
 *                     null for every other section.
 */
function de_stemmer_help($section = 'admin/help#search') {
  switch ($section) {
    case 'admin/modules#description':
      $text = 'Implements a German stemming algorithm (Porter) to improve searching.';
      // t() is Drupal's translation helper. Where it is not defined, return
      // the untranslated text instead of dying on an undefined function.
      return function_exists('t') ? t($text) : $text;
  }
  return null;
}
80
+
81
+
82
/**
 * Build a lookup table that maps every word of a text to its stem.
 *
 * The text is split into words with _de_stemmer_split_text(); each word is
 * stemmed with _de_stemmer_wortstamm(). The resulting hash can be used to
 * identify words which reduce to the same stem.
 * For details please compare 'search.module-stem.patch'.
 *
 * @param string $text Text to analyse.
 * @return array Hash of word => stem (a repeated word simply overwrites its entry).
 */
function de_stemmer_stem_list($text) {
  $stems = array();
  foreach (_de_stemmer_split_text($text) as $token) {
    $stems[$token] = _de_stemmer_wortstamm($token);
  }
  return $stems;
}
100
+
101
+
102
/**
 * Offset at which the next stemming region of $wort begins.
 *
 * Skips the leading run of non-vowels, then the run of vowels that follows,
 * then one more position. strcspn()/strspn() count bytes, so the result is a
 * byte offset; it can be strlen($wort) + 1 when the word has no vowel or ends
 * in its first run of vowels.
 *
 * @param string $wort Word (or remainder of a word) to inspect.
 * @return int Byte offset of the region start.
 */
function _de_stemmer_region_n($wort) {
  $vowel_start = strcspn($wort, DE_STEMMER_VOKALE);
  $vowel_run = strspn($wort, DE_STEMMER_VOKALE, $vowel_start);
  return $vowel_start + $vowel_run + 1;
}
106
+
107
/**
 * Normalise a word before the suffix rules are applied.
 *
 * The word is lower-cased, every "ß" becomes "ss", and a "u" or "y" standing
 * between two vowels is put into upper case. Upper-case U/Y are not part of
 * DE_STEMMER_VOKALE, so they no longer count as vowels afterwards.
 *
 * @param string $wort Raw word.
 * @return string Normalised word.
 */
function de_stemmer_preprocess($wort) {
  $wort = str_replace("ß", "ss", mb_strtolower($wort));

  // The two replacements run one after the other (first u, then y), so a "y"
  // that follows a freshly upper-cased "U" is left alone.
  $vowel = '[' . DE_STEMMER_VOKALE . ']';
  $wort = preg_replace('/(?<=' . $vowel . ')u(?=' . $vowel . ')/u', 'U', $wort);
  $wort = preg_replace('/(?<=' . $vowel . ')y(?=' . $vowel . ')/u', 'Y', $wort);
  return $wort;
}
121
+
122
+
123
/**
 * Final clean-up of a stem.
 *
 * The stem is lower-cased (undoing the U/Y markers set during preprocessing).
 * If it is listed as an exception, _de_stemmer_ausnahme() replaces it in place
 * and it is returned as is; otherwise umlauts and accented vowels are reduced
 * to their plain vowel.
 *
 * @param string $wort Stem produced by the suffix rules.
 * @return string Final stem.
 */
function _de_stemmer_postprocess($wort) {
  $wort = mb_strtolower($wort);

  // check for exceptions ($wort is passed by reference and may be rewritten)
  if (_de_stemmer_ausnahme($wort)) {
    return $wort;
  }

  return strtr($wort, array(
    'ä' => 'a', 'á' => 'a',
    'ë' => 'e', 'é' => 'e',
    'ï' => 'i', 'í' => 'i',
    'ö' => 'o', 'ó' => 'o',
    'ü' => 'u', 'ú' => 'u',
  ));
}
137
+
138
+
139
/**
 * Reduce a single German word to its stem.
 *
 * The word is normalised by de_stemmer_preprocess(), run through three
 * suffix-stripping steps and finally handed to _de_stemmer_postprocess().
 *
 * R1 is the region after the first non-vowel following a vowel, or the null
 * region at the end of the word if there is no such non-vowel.
 * R2 is the region after the first non-vowel following a vowel in R1, or the
 * null region at the end of the word if there is no such non-vowel.
 *
 * @param string $wort Word to stem.
 * @return string The stem.
 */
function _de_stemmer_wortstamm($wort) {
  $stamm = de_stemmer_preprocess($wort);

  // Every offset below is a BYTE offset: _de_stemmer_region_n() counts bytes
  // and preg_match() both expects and reports byte offsets. They are
  // therefore combined with strlen()/substr() only; mb_substr() counts
  // characters and would cut at the wrong place as soon as the word contains
  // an umlaut. Offsets are clamped to the string length so preg_match() is
  // never asked to start beyond the end of its subject.
  $l = strlen($stamm);
  $r1 = min(_de_stemmer_region_n($stamm), $l);
  $r2 = $r1 == $l ? $r1 : min($r1 + _de_stemmer_region_n(substr($stamm, $r1)), $l);

  // "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
  // -- measured as the byte length of the first three characters, which also
  // keeps the offset on a character boundary.
  $min_r1 = strlen(mb_substr($stamm, 0, 3, 'UTF-8'));
  if ($r1 < $min_r1) {
    $r1 = $min_r1;
  }

  // Each rule is array(pattern, start offset of the region, strip the match?).
  // Within a step the rules are tried in order and only the first hit counts.
  $steps = array(
    // Step 1: (a) e em en ern er es
    //         (b) s (preceded by a valid s-ending)
    //         delete if in R1. (The letter of the valid s-ending is not
    //         necessarily in R1; the lookbehind may look in front of the offset.)
    array(
      array('/(e|em|en|ern|er|es)$/u', $r1, TRUE),
      array('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $r1, TRUE),
    ),
    // Step 2: (a) en er est
    //         (b) st (preceded by a valid st-ending)
    //         delete if in R1.
    array(
      array('/(en|er|est)$/u', $r1, TRUE),
      array('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $r1, TRUE),
    ),
    // Step 3: d-suffixes (see http://snowball.tartarus.org/texts/glossary.html)
    //         end/ung after "eig" is recognised but left in place; the
    //         remaining suffixes are deleted in the region given per rule.
    array(
      array('/(?<=eig)(end|ung)$/u', $r2, FALSE),
      array('/(end|ung)$/u', $r2, TRUE),
      array('/(?<![e])(ig|ik|isch)$/u', $r2, TRUE),
      array('/(?<=(er|en))(lich|heit)$/u', $r1, TRUE),
      array('/(lich|heit)$/u', $r2, TRUE),
      array('/(?<=lich)keit$/u', $r1, TRUE),
      array('/(?<=ig)keit$/u', $r1, TRUE),
      array('/keit$/u', $r2, TRUE),
    ),
  );

  foreach ($steps as $rules) {
    foreach ($rules as $rule) {
      list($pattern, $offset, $strip) = $rule;
      // An earlier step may have shortened the stem below the region start.
      $offset = min($offset, strlen($stamm));
      if (preg_match($pattern, $stamm, $hits, PREG_OFFSET_CAPTURE, $offset)) {
        if ($strip) {
          $stamm = substr($stamm, 0, $hits[0][1]);
        }
        break;
      }
    }
  }

  // Open question from the original author: what about
  // chen, lein, bar, schaft, ... ?
  return _de_stemmer_postprocess($stamm);
}
238
+
239
+
240
/**
 * Tell whether a word is on the German stop word list.
 *
 * The comparison is exact (strict), so the caller has to pass the word in
 * lower case; non-string arguments never match.
 *
 * @param string $wort Lower-cased word.
 * @return bool TRUE if the word is a stop word.
 */
function _de_stemmer_stoppwort($wort) {

  static $stoppworte = array(
    'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
    'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
    'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
    'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
    'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
    'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
    'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
    'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
    'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
    'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
    'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
    'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
    'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
    'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
    'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
    'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
    'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
    'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
    'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
    'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
    'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
  );

  // Strict comparison: a loose in_array() would report TRUE for the boolean
  // true (and, before PHP 8, for the integer 0).
  return in_array($wort, $stoppworte, true);
}
268
+
269
+
270
/**
 * First attempt at a list of exceptions to the generic stemming rules.
 *
 * If $wort is a key of the exception table it is replaced (by reference) with
 * the stem stored for it.
 *
 * @param string $wort Stem to check; overwritten when it is an exception.
 * @return bool TRUE if $wort was found in the table, FALSE otherwise.
 */
function _de_stemmer_ausnahme(&$wort) {
  static $de_stemmer_ausnahmen = array(
    'schön' => 'schön',  // not "schon"
    'blüt'  => 'blüt',   // Blüte (NOT Blut)
    'kannt' => 'kenn',
    'küch'  => 'küch',   // Küchen (NOT Kuchen)
    'mög'   => 'mög',
    'mocht' => 'mög',
    'mag'   => 'mög',
    'ging'  => 'geh',
    'lief'  => 'lauf',
    'änd'   => 'änd'     // ändern (NOT andern)
  );

  if (!array_key_exists($wort, $de_stemmer_ausnahmen)) {
    return FALSE;
  }

  $wort = $de_stemmer_ausnahmen[$wort];
  return TRUE;
}
295
+
296
/*
  Stem caching added by Rob Marsh, SJ
  http://rmarsh.com
*/

// Stems computed so far, keyed by the original word.
$StemCache = array();

/**
 * Return the stem of $word, computing it at most once.
 *
 * @param string $word Word to stem.
 * @return string The stem as produced by _de_stemmer_wortstamm().
 */
function stem($word) {
  global $StemCache;

  if (isset($StemCache[$word])) {
    return $StemCache[$word];
  }

  $result = _de_stemmer_wortstamm($word);
  $StemCache[$word] = $result;
  return $result;
}
314
+
315
+ ?>
languages/de/stopwords.php ADDED
@@ -0,0 +1,4 @@
1
+ <?php
2
// The list of common words we want to ignore. NB: anything shorter than
// 4 characters is knocked out by the plugin and doesn't need to figure here.
$overusedwords = array(
  "aber", "alle", "allem", "allen", "aller", "alles", "also", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch",
  "bist",
  "damit", "dann", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch",
  "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures",
  "gegen", "gewesen",
  "habe", "haben", "hatte", "hatten", "hier", "hinter",
  "mich", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "indem",
  "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt",
  "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "können", "könnte",
  "machen", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "muss", "musste",
  "nach", "nicht", "nichts", "noch",
  "oder", "ohne",
  "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "ihnen", "sind", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst",
  "über", "unse", "unsem", "unsen", "unser", "unses", "unter",
  "viel",
  "während", "waren", "warst", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wieder", "will", "wird", "wirst", "wollen", "wollte", "würde", "würden",
  "zwar", "zwischen"
);
4
+ ?>
languages/en/stemmer.php ADDED
@@ -0,0 +1,335 @@
1
+ <?php
2
+ /*
3
+ Creado por Cesar Rodas para el proyecto Saddor.com
4
+ Este Stemmer esta basado en el algoritmo de Snowball Stemmer.
5
+ saddor@gmail.com
6
+ Este programa esta bajo licencia GNU
7
+ */
8
if (!defined("ENGLISHSTEMMER"))
{
    define("ENGLISHSTEMMER", 1);

    /**
     * Suffix-stripping stemmer for English words.
     *
     * The word is passed through a fixed sequence of steps (1a/1b, 1c, 2, 3,
     * 4, 5); each step removes or rewrites one family of suffixes. Input is
     * expected to be a lower-case word: the consonant/vowel patterns below
     * only list lower-case letters.
     */
    class EnglishStemmer
    {
        /** Regex fragment matching one consonant ("y" counts when it follows a vowel or starts the word). */
        var $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

        /** Regex fragment matching one vowel ("y" counts when it does not follow a vowel). */
        var $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

        /**
         * Stems a single word.
         *
         * @param string $word Word to stem.
         * @return string The stem; words of two bytes or fewer are returned unchanged.
         */
        function Stem($word)
        {
            if (strlen($word) <= 2) {
                return $word;
            }

            $word = $this->step1ab($word);
            $word = $this->step1c($word);
            $word = $this->step2($word);
            $word = $this->step3($word);
            $word = $this->step4($word);
            $word = $this->step5($word);

            // Added by Cesar Rodas: never hand back a stem that ends in an apostrophe.
            if (substr($word, -1, 1) == "'") {
                $word = substr($word, 0, strlen($word) - 1);
            }

            return $word;
        }

        /**
         * Step 1a/1b: plural endings, then "eed" / "ing" / "ed".
         *
         * @param string $word
         * @return string
         */
        function step1ab($word)
        {
            // Step 1a: the first matching rule wins, so "ss" is matched (and
            // left unchanged) before the bare "s" rule can strip it.
            if (substr($word, -1) == 's') {
                $this->replace($word, 'sses', 'ss')
                    || $this->replace($word, 'ies', 'i')
                    || $this->replace($word, 'ss', 'ss')
                    || $this->replace($word, 's', '');
            }

            // Step 1b: when the word ends in "eed" that rule is the only one
            // tried, even if the measure condition prevents a rewrite.
            if (substr($word, -2, 1) != 'e' || !$this->replace($word, 'eed', 'ee', 0)) {
                $v = $this->regex_vowel;

                // "ing" / "ed" are only removed when the remaining stem contains a vowel.
                if (preg_match('#' . $v . '+#', substr($word, 0, -3)) && $this->replace($word, 'ing', '')
                    || preg_match('#' . $v . '+#', substr($word, 0, -2)) && $this->replace($word, 'ed', '')) {

                    // Tidy up the stem left after removing "ing" / "ed".
                    if (!$this->replace($word, 'at', 'ate')
                        && !$this->replace($word, 'bl', 'ble')
                        && !$this->replace($word, 'iz', 'ize')) {

                        if ($this->doubleConsonant($word)
                            && substr($word, -2) != 'll'
                            && substr($word, -2) != 'ss'
                            && substr($word, -2) != 'zz') {
                            // Undouble the final consonant (e.g. "runn" -> "run").
                            $word = substr($word, 0, -1);
                        } else if ($this->m($word) == 1 && $this->cvc($word)) {
                            // Short stem ending consonant-vowel-consonant: restore the "e".
                            $word .= 'e';
                        }
                    }
                }
            }

            return $word;
        }

        /**
         * Step 1c: a final "y" becomes "i" when the rest of the word contains a vowel.
         *
         * @param string $word
         * @return string
         */
        function step1c($word)
        {
            $v = $this->regex_vowel;

            if (substr($word, -1) == 'y' && preg_match('#' . $v . '+#', substr($word, 0, -1))) {
                $this->replace($word, 'y', 'i');
            }

            return $word;
        }

        /**
         * Step 2: maps double suffixes to single ones (measure of the stem must be > 0).
         *
         * The penultimate letter selects which group of rules is tried.
         *
         * @param string $word
         * @return string
         */
        function step2($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                    $this->replace($word, 'ational', 'ate', 0)
                        || $this->replace($word, 'tional', 'tion', 0);
                    break;

                case 'c':
                    $this->replace($word, 'enci', 'ence', 0)
                        || $this->replace($word, 'anci', 'ance', 0);
                    break;

                case 'e':
                    $this->replace($word, 'izer', 'ize', 0);
                    break;

                case 'g':
                    $this->replace($word, 'logi', 'log', 0);
                    break;

                case 'l':
                    $this->replace($word, 'entli', 'ent', 0)
                        || $this->replace($word, 'ousli', 'ous', 0)
                        || $this->replace($word, 'alli', 'al', 0)
                        || $this->replace($word, 'bli', 'ble', 0)
                        || $this->replace($word, 'eli', 'e', 0);
                    break;

                case 'o':
                    $this->replace($word, 'ization', 'ize', 0)
                        || $this->replace($word, 'ation', 'ate', 0)
                        || $this->replace($word, 'ator', 'ate', 0);
                    break;

                case 's':
                    $this->replace($word, 'iveness', 'ive', 0)
                        || $this->replace($word, 'fulness', 'ful', 0)
                        || $this->replace($word, 'ousness', 'ous', 0)
                        || $this->replace($word, 'alism', 'al', 0);
                    break;

                case 't':
                    $this->replace($word, 'biliti', 'ble', 0)
                        || $this->replace($word, 'aliti', 'al', 0)
                        || $this->replace($word, 'iviti', 'ive', 0);
                    break;
            }

            return $word;
        }

        /**
         * Step 3: handles "-ical", "-ness", "-icate", "-iciti", "-ful", "-ative", "-alize"
         * (measure of the stem must be > 0).
         *
         * @param string $word
         * @return string
         */
        function step3($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                    $this->replace($word, 'ical', 'ic', 0);
                    break;

                case 's':
                    $this->replace($word, 'ness', '', 0);
                    break;

                case 't':
                    $this->replace($word, 'icate', 'ic', 0)
                        || $this->replace($word, 'iciti', 'ic', 0);
                    break;

                case 'u':
                    $this->replace($word, 'ful', '', 0);
                    break;

                case 'v':
                    $this->replace($word, 'ative', '', 0);
                    break;

                case 'z':
                    $this->replace($word, 'alize', 'al', 0);
                    break;
            }

            return $word;
        }

        /**
         * Step 4: removes residual suffixes when the measure of the stem is > 1.
         *
         * @param string $word
         * @return string
         */
        function step4($word)
        {
            switch (substr($word, -2, 1)) {
                case 'a':
                    $this->replace($word, 'al', '', 1);
                    break;

                case 'c':
                    $this->replace($word, 'ance', '', 1)
                        || $this->replace($word, 'ence', '', 1);
                    break;

                case 'e':
                    $this->replace($word, 'er', '', 1);
                    break;

                case 'i':
                    $this->replace($word, 'ic', '', 1);
                    break;

                case 'l':
                    $this->replace($word, 'able', '', 1)
                        || $this->replace($word, 'ible', '', 1);
                    break;

                case 'n':
                    $this->replace($word, 'ant', '', 1)
                        || $this->replace($word, 'ement', '', 1)
                        || $this->replace($word, 'ment', '', 1)
                        || $this->replace($word, 'ent', '', 1);
                    break;

                case 'o':
                    // "ion" is only removed after "s" or "t".
                    if (substr($word, -4) == 'tion' || substr($word, -4) == 'sion') {
                        $this->replace($word, 'ion', '', 1);
                    } else {
                        $this->replace($word, 'ou', '', 1);
                    }
                    break;

                case 's':
                    $this->replace($word, 'ism', '', 1);
                    break;

                case 't':
                    $this->replace($word, 'ate', '', 1)
                        || $this->replace($word, 'iti', '', 1);
                    break;

                case 'u':
                    $this->replace($word, 'ous', '', 1);
                    break;

                case 'v':
                    $this->replace($word, 'ive', '', 1);
                    break;

                case 'z':
                    $this->replace($word, 'ize', '', 1);
                    break;
            }

            return $word;
        }

        /**
         * Step 5: drops a final "e" and undoubles a final "ll" where the measure allows it.
         *
         * @param string $word
         * @return string
         */
        function step5($word)
        {
            // Part a
            if (substr($word, -1) == 'e') {
                $stem = substr($word, 0, -1);
                $measure = $this->m($stem);

                if ($measure > 1) {
                    $this->replace($word, 'e', '');
                } else if ($measure == 1) {
                    if (!$this->cvc($stem)) {
                        $this->replace($word, 'e', '');
                    }
                }
            }

            // Part b
            if ($this->m($word) > 1 && $this->doubleConsonant($word) && substr($word, -1) == 'l') {
                $word = substr($word, 0, -1);
            }

            return $word;
        }

        /**
         * Replaces a suffix in place.
         *
         * @param string   $str   Word to modify (by reference).
         * @param string   $check Suffix to look for at the end of $str.
         * @param string   $repl  Replacement for the suffix.
         * @param int|null $m     When not null, the rewrite only happens if the
         *                        measure of the remaining stem is greater than $m.
         * @return bool True whenever $str ends in $check, even if the measure
         *              condition prevented the rewrite; callers rely on this to
         *              stop trying further rules.
         */
        function replace(&$str, $check, $repl, $m = null)
        {
            $len = 0 - strlen($check);

            if (substr($str, $len) == $check) {
                $substr = substr($str, 0, $len);
                if (is_null($m) || $this->m($substr) > $m) {
                    $str = $substr . $repl;
                }

                return true;
            }

            return false;
        }

        /**
         * Computes the measure of a string: the number of vowel-run/consonant-run
         * pairs left after dropping leading consonants and trailing vowels.
         *
         * @param string $str
         * @return int
         */
        function m($str)
        {
            $c = $this->regex_consonant;
            $v = $this->regex_vowel;

            $str = preg_replace('#^' . $c . '+#', '', $str);
            $str = preg_replace('#' . $v . '+$#', '', $str);

            preg_match_all('#(' . $v . '+' . $c . '+)#', $str, $matches);

            return count($matches[1]);
        }

        /**
         * Tells whether the string ends in the same consonant twice.
         *
         * @param string $str
         * @return bool
         */
        function doubleConsonant($str)
        {
            $c = $this->regex_consonant;

            // Square-bracket offsets: the "$s{0}" form is a parse error on PHP 8.
            return preg_match('#' . $c . '{2}$#', $str, $matches)
                && $matches[0][0] == $matches[0][1];
        }

        /**
         * Tells whether the string ends consonant-vowel-consonant, where the
         * final consonant is not "w", "x" or "y".
         *
         * @param string $str
         * @return bool
         */
        function cvc($str)
        {
            $c = $this->regex_consonant;
            $v = $this->regex_vowel;

            return preg_match('#(' . $c . $v . $c . ')$#', $str, $matches)
                && strlen($matches[1]) == 3
                && $matches[1][2] != 'w'
                && $matches[1][2] != 'x'
                && $matches[1][2] != 'y';
        }
    }
}
314
+
315
+ /*
316
+ Stem caching added by Rob Marsh, SJ
317
+ http://rmarsh.com
318
+ */
319
+
320
// One shared stemmer instance plus a memo table of words already stemmed.
$Stemmer = new EnglishStemmer();
$StemCache = array();

/**
 * Returns the stem of a word, consulting the global memo table first.
 *
 * @param string $word Word to stem.
 * @return string Stem as produced by the global $Stemmer.
 */
function stem($word) {
    global $Stemmer, $StemCache;

    // Cache hit: reuse the stored stem.
    if (isset($StemCache[$word])) {
        return $StemCache[$word];
    }

    // Cache miss: compute once, remember, and return.
    $stemmed = $Stemmer->Stem($word);
    $StemCache[$word] = $stemmed;

    return $stemmed;
}
334
+
335
+ ?>
languages/en/stopwords.php ADDED
@@ -0,0 +1,4 @@
1
+ <?php
2
+ // the list of common words we want to ignore. NB anything shorter than 4 characters is knocked by the plugin and doesn't need to figure here
3
// English stop words. Two entries had an apostrophe where "q" belongs
// ("conse'uently", "'uite") and could never match; they are spelled correctly here.
$overusedwords = array(
    "able", "about", "above", "according", "accordingly", "across", "actually", "after", "afterwards", "again",
    "against", "ain't", "allow", "allows", "almost", "alone", "along", "already", "also", "although",
    "always", "among", "amongst", "another", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
    "anywhere", "apart", "appear", "appreciate", "appropriate", "aren't", "around", "aside", "asking", "associated",
    "available", "away", "awfully", "became", "because", "become", "becomes", "becoming", "been", "before",
    "beforehand", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between",
    "beyond", "both", "brief", "came", "cannot", "can't", "cause", "causes", "certain", "certainly",
    "changes", "clearly", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
    "contains", "corresponding", "could", "couldn't", "course", "currently", "definitely", "described", "despite", "didn't",
    "different", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "during", "each",
    "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "even", "ever", "every",
    "everybody", "everyone", "everything", "everywhere", "exactly", "example", "except", "fifth", "first", "five",
    "followed", "following", "follows", "former", "formerly", "forth", "four", "from", "further", "furthermore",
    "gets", "getting", "given", "gives", "goes", "going", "gone", "gotten", "greetings", "hadn't",
    "happens", "hardly", "hasn't", "have", "haven't", "having", "hello", "help", "hence", "here",
    "hereafter", "hereby", "herein", "hereupon", "he's", "hers", "herself", "himself", "hither", "hopefully",
    "howbeit", "however", "ignored", "i'll", "it'd", "it's", "i've", "immediate", "inasmuch", "indeed",
    "indicate", "indicated", "indicates", "inner", "insofar", "instead", "into", "inward", "isn't", "itself",
    "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later",
    "latter", "latterly", "least", "less", "lest", "like", "liked", "likely", "little", "look",
    "looking", "looks", "mainly", "many", "maybe", "mean", "meanwhile", "merely", "might", "more",
    "moreover", "most", "mostly", "much", "must", "mustn't", "myself", "name", "namely", "near",
    "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "next", "nine", "nobody",
    "none", "noone", "normally", "nothing", "novel", "nowhere", "obviously", "often", "okay", "once",
    "ones", "one's", "only", "onto", "other", "others", "otherwise", "ought", "ours", "ourselves",
    "outside", "over", "overall", "particular", "particularly", "perhaps", "placed", "please", "plus", "possible",
    "presumably", "probably", "provides", "quite", "rather", "really", "reasonably", "regarding", "regardless", "regards",
    "relatively", "respectively", "right", "said", "same", "saying", "says", "second", "secondly", "seeing",
    "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious",
    "seriously", "seven", "several", "shall", "should", "shouldn't", "since", "some", "somebody", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
    "specifying", "still", "such", "sure", "take", "taken", "tell", "tends", "than", "thank",
    "thanks", "that", "that's", "their", "theirs", "them", "themselves", "then", "thence", "there",
    "thereafter", "thereby", "therefore", "therein", "there's", "thereupon", "these", "they", "think", "third",
    "this", "thorough", "thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus",
    "together", "took", "toward", "towards", "tried", "tries", "truly", "trying", "twice", "under",
    "unfortunately", "unless", "unlikely", "until", "unto", "upon", "used", "useful", "uses", "using",
    "usually", "value", "various", "very", "want", "wants", "wasn't", "welcome", "we'd", "well",
    "went", "were", "weren't", "what", "whatever", "when", "whence", "whenever", "where",
    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "whoever", "whole", "whom", "whose", "will", "willing", "wish", "with", "within", "without",
    "wonder", "would", "wouldn't", "your", "yours", "yourself", "yourselves", "zero"
);
4
+ ?>
languages/es/stemmer.php ADDED
@@ -0,0 +1,381 @@
1
+ <?php
2
+ /*
3
+ Creado por Cesar Rodas para el proyecto Saddor.com
4
+ Este Stemmer esta basado en el algoritmo de Snowball Stemmer.
5
+ saddor@gmail.com
6
+ Este programa esta bajo licencia GNU
7
+ */
8
if (!defined("SPANISHSTEMMER"))
{
    // Character classes returned by PorterStemmer::char_is().
    define("vocal", 1);
    define("consonante", 2);
    define("SPANISHSTEMMER", 1);

    /**
     * Suffix-stripping stemmer for Spanish words.
     *
     * The word being stemmed is kept in $this->word; each step recomputes
     * the regions R1, R2 and RV with splitword() and then strips suffixes
     * found in those regions.
     *
     * NOTE(review): several string literals below contain U+FFFD replacement
     * characters where accented letters are expected. They are reproduced
     * exactly as found; the file's original encoding should be restored from
     * a clean copy before relying on the accent-specific rules.
     */
    class PorterStemmer
    {
        var $R1;
        var $R2;
        var $RV;
        var $word;

        /**
         * Stems a single word.
         *
         * @param string $word Word to stem.
         * @return string The stem; words shorter than two bytes are returned unchanged.
         */
        function Stem($word)
        {
            $this->word = $word;
            if (strlen($word) < 2) {
                // Return the word itself rather than nothing, so callers always get a string.
                return $word;
            }

            $this->step_0();
            // step_1() reports whether it changed the word; repeat until it no longer does.
            while ($this->step_1()) {
            }
            $this->step_2();
            $this->step_3();

            return $this->word;
        }

        /**
         * Step 0: removes an attached pronoun when RV contains one of the
         * gerund/infinitive endings.
         *
         * @return bool Whether a pronoun was removed while handling the last
         *              ending in the list (the flag is reset for every ending).
         */
        function step_0()
        {
            $this->splitword();
            $search = array(
                "me","se","sela","selo","selas","selos","la","le","lo","les",
                "los","nos"
            );

            $prefix = array(
                "i�ndo","�ndo","�r","�r","�r",  /* first case: accented endings */
                "iendo","ando","ar","er","ir",   /* second case: plain endings */
                "yendo"                          /* third case */
            );

            $return = false;
            foreach ($prefix as $id => $pref) {
                $return = false;
                $found = (strstr($this->RV, $pref) != NULL)
                    /* special case for "yendo": it must follow a "u" */
                    || ($pref == "yendo" && strstr($this->word, "uyendo"));
                if (!$found) {
                    continue;
                }

                // The ending was found; now look for a pronoun to delete.
                foreach ($search as $pronoun) {
                    $len = strlen($pronoun);
                    if ($pronoun == substr($this->RV, -1 * $len, $len)) {
                        $this->word = substr($this->word, 0, strlen($this->word) - $len);
                        if ($id < 5) {
                            // First case only: swap the accented ending for its plain
                            // counterpart, which sits five slots later in $prefix.
                            $this->word = str_replace($prefix[$id], $prefix[$id + 5], $this->word);
                        }
                        $return = true;
                    }
                }
            }

            return $return;
        }

        /**
         * Step 1: removes or rewrites standard suffixes found at the end of R2.
         *
         * @return bool True when a suffix was removed or rewritten.
         */
        function step_1()
        {
            $return = false;
            $this->splitword();

            /* suffixes deleted when R2 ends with them (first match wins) */
            $search = array(
                "abilidades","iblemente","icaciones","ablemente","antemente","ivamente","atamente",
                "amientos","icadoras","icadores","icancias","imientos","icamente",
                "osamente","abilidad","icidades","ividades","adamente","icantes",
                "icancia","imiemto","icadora","icaci�n","amiento","imiento","aciones",
                "ativos","ativas","ividad","idades","icidad","icante",
                "icador","adoras","adores","ancias","mente","ables",
                "ismos","anzas","ativa","ativo","istas","ibles",
                "aci�n","antes","adora","ancia","ismo","anza",
                "icos","ivas","osos","ivos","ante","osas",
                "ador","ible","ista","idad","able","ico",
                "osa","oso","iva","ica","ica","ivo",
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->R2, -1 * $len, $len) == $suffix) {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    $return = true;
                    break;
                }
            }

            /* original author's note: this may be wrong, R1 may need to be searched instead */
            if ($this->R1 == "amente") {
                $this->word = str_replace("amente", "", $this->word);
            }

            /* suffixes rewritten when R2 ends with them */
            $search = array(
                "log�a","log�as",/**/"uci�n","uciones",/**/"encia","encias"
            );
            // "encia"/"encias" map to "ente" as in the Snowball Spanish algorithm.
            $replace = array(
                "log","log","u","u","ente","ente"
            );
            for ($i = 0; $i < count($search); $i++) {
                $len = strlen($search[$i]);
                if (substr($this->R2, -1 * $len, $len) == $search[$i]) {
                    $this->word = str_replace($search[$i], $replace[$i], $this->word);
                    $return = true;
                    break;
                }
            }

            return $return;
        }

        /**
         * Step 2a: removes verb suffixes beginning with "y" when they are
         * preceded by "u" (the "u" is removed as well); when nothing was
         * removed, step_2b() is run instead.
         */
        function step_2()
        {
            $this->splitword();
            $return = false;
            $search = array(
                "ya","ye","yan","yen","yeron","yendo","yo","y�","yas","yes","yais","yamos"
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->RV, -1 * $len, $len) == $suffix
                    && substr($this->word, -1 * ($len + 1), $len + 1) == "u" . $suffix) {
                    $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                    $return = true;
                }
            }

            if ($return == false) {
                $this->step_2b();
            }
        }

        /**
         * Step 2b: removes the remaining verb suffixes found at the end of RV.
         */
        function step_2b()
        {
            $this->splitword();
            $search = array(
                "en","es","�is","emos"
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->RV, -1 * $len, $len) == $suffix) {
                    if (substr($this->word, -1 * ($len + 2), $len + 2) == "gu" . $suffix) {
                        // Preceded by "gu": drop the suffix together with the "u".
                        $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                    } else {
                        // Fix contributed by Diego Enrique Finol <dfinol at cantv dot net>:
                        // the suffix has to be removed in both cases, not only after "gu".
                        $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    }
                }
            }

            $search = array(
                "i�ramos","ar�amos","ir�amos","i�semos","er�amos","er�ais","eremos",
                "isteis","ir�ais","ierais","iremos","�bamos","ieseis",
                "asteis","�ramos","�semos","aremos","ar�ais","abais",
                "�amos","arais","ieses","ar�an","iesen","ieron",
                "iendo","ieras","ir�is","ar�as","er�as","aseis",
                "er�is","er�an","ir�an","ar�is","ir�as","ieran",
                "ando","amos","aron","asen","aras","ados",
                "�ais","ases","imos","adas","idas","abas",
                "iste","ir�n","er�n","ar�a","er�a","iera",
                "ir�s","ir�a","aran","ar�s","er�s","aste",
                "iese","aban","ar�n","�is","ada","ir�",
                "�an","ir�","er�","aba","ara","ido",
                "ar�","ar�","ado","er�","ase","�as",
                "ida","�a","er","ar","i�","an",
                "ir","as","ad","ed","id","�s",
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->RV, -1 * $len, $len) == $suffix) {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    // The regions are refreshed so later suffixes see the shortened word.
                    $this->splitword();
                }
            }
        }

        /**
         * Step 3: removes residual suffixes at the end of RV and finally
         * replaces accented vowels with their plain counterparts.
         *
         * @return bool True when a residual suffix was removed.
         */
        function step_3()
        {
            $this->splitword();
            $return = false;
            $search = array(
                "os","a","o","�","�","�"
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->RV, -1 * $len, $len) == $suffix) {
                    $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    $return = true;
                }
            }

            $search = array(
                "e","�"
            );

            foreach ($search as $suffix) {
                $len = strlen($suffix);
                if (substr($this->RV, -1 * $len, $len) == $suffix) {
                    if (substr($this->RV, -1 * ($len + 2), $len + 2) == "gu" . $suffix) {
                        // Preceded by "gu": drop the suffix together with the "u".
                        $this->word = substr($this->word, 0, strlen($this->word) - ($len + 1));
                    } else {
                        $this->word = substr($this->word, 0, strlen($this->word) - $len);
                    }
                    $return = true;
                }
            }

            // NOTE(review): these were presumably six distinct accented vowels; as
            // found, every needle is the same replacement character, so only the
            // first call can have an effect.
            $this->word = str_replace("�","a",$this->word);
            $this->word = str_replace("�","e",$this->word);
            $this->word = str_replace("�","i",$this->word);
            $this->word = str_replace("�","o",$this->word);
            $this->word = str_replace("�","u",$this->word);
            $this->word = str_replace("�","u",$this->word);

            return $return;
        }

        /* helper functions */

        /**
         * Comparison callback ordering strings from longest to shortest.
         *
         * @param string $a
         * @param string $b
         * @return int 0 for equal lengths, 1 when $a is shorter, -1 otherwise.
         */
        function saddorsort($a, $b)
        {
            if (strlen($a) == strlen($b)) {
                return 0;
            }
            return (strlen($a) < strlen($b)) ? 1 : -1;
        }

        /**
         * Returns the byte of $this->word at $index, or '' when the word is
         * too short; avoids "uninitialized string offset" diagnostics.
         *
         * @param int $index
         * @return string
         */
        function char_at($index)
        {
            return isset($this->word[$index]) ? $this->word[$index] : '';
        }

        /**
         * Recomputes the regions R1, R2 and RV of $this->word.
         *
         * Scanning from the second byte, R1 starts right after the first
         * consonant that is followed by a vowel, and R2 right after the next
         * such consonant. RV depends on the class of the first two bytes.
         */
        function splitword()
        {
            $inR1 = false;
            $inR2 = false;
            $this->R1 = "";
            $this->R2 = "";
            $this->RV = "";
            $length = strlen($this->word);

            for ($i = 1; $i < $length; $i++) {
                if ($inR1) {
                    $this->R1 .= $this->word[$i];
                }
                if ($inR2) {
                    $this->R2 .= $this->word[$i];
                }

                if ($i + 1 >= $length) {
                    break;
                }

                $consonantThenVowel = $this->char_is($this->word[$i]) == consonante
                    && $this->char_is($this->word[$i + 1]) == vocal;

                // R2 is tested first so that it cannot open on the same byte as R1.
                if ($consonantThenVowel && $inR1 && !$inR2) {
                    $inR2 = true;
                }
                if ($consonantThenVowel && !$inR1) {
                    $inR1 = true;
                }
            }

            /* locate RV */
            $secondClass = $this->char_is($this->char_at(1));
            if ($secondClass == consonante) {
                // Second byte is a consonant: RV starts after the next vowel.
                for ($i = 2; $i < $length; $i++) {
                    if ($this->char_is($this->word[$i]) == vocal) {
                        break;
                    }
                }
                $i++;
                $this->RV = substr($this->word, $i);
            } else if ($secondClass == vocal && $this->char_is($this->char_at(0)) == vocal) {
                // First two bytes are vowels: RV starts after the next consonant.
                for ($i = 2; $i < $length; $i++) {
                    if ($this->char_is($this->word[$i]) == consonante) {
                        break;
                    }
                }
                $i++;
                $this->RV = substr($this->word, $i);
            } else if ($length > 2) {
                // Otherwise RV is everything after the third byte.
                $this->RV = substr($this->word, 3);
            }
        }

        /**
         * Classifies a single byte.
         *
         * @param string $char
         * @return int|null vocal, consonante, or null when the byte is empty
         *                  or belongs to neither list.
         */
        function char_is($char)
        {
            $char = strtolower($char);
            if ($char === "") {
                return;
            }
            $vowel = "aeiou������";
            $consonant = "bcdfghijklmn�opqrsvtxwyz";
            if (strstr($vowel, $char)) {
                return vocal;
            }
            if (strstr($consonant, $char)) {
                return consonante;
            }
        }
    }
}
360
+
361
+ /*
362
+ Stem caching added by Rob Marsh, SJ
363
+ http://rmarsh.com
364
+ */
365
+
366
// One shared stemmer instance plus a memo table of words already stemmed.
$Stemmer = new PorterStemmer();
$StemCache = array();

/**
 * Returns the stem of a word, consulting the global memo table first.
 *
 * @param string $word Word to stem.
 * @return mixed Whatever the global $Stemmer returns for the word.
 */
function stem($word) {
    global $Stemmer, $StemCache;

    // Cache hit: reuse the stored stem.
    if (isset($StemCache[$word])) {
        return $StemCache[$word];
    }

    // Cache miss: compute once, remember, and return.
    $stemmed = $Stemmer->Stem($word);
    $StemCache[$word] = $stemmed;

    return $stemmed;
}
380
+
381
+ ?>
languages/es/stopwords.php ADDED
@@ -0,0 +1,4 @@
1
+ <?php
2
+ // the list of common words we want to ignore. NB anything shorter than 4 characters is knocked by the plugin and doesn't need to figure here
3
+ $overusedwords = array("algo", "alguna", "algunas", "alguno", "algunos", "alg�n", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "ciertos", "como", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "contra", "cual", "cuando", "dentro", "desde", "donde", "durante", "ella", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "encima", "entonces", "entre", "erais", "eramos", "eran", "eras", "eres", "esas", "esos", "esta", "estaba", "estabais", "estaban", "estabas", "estad", "estada", "estadas", "estado", "estados", "estais", "estamos", "estan", "estando", "estar", "estaremos", "estar�", "estar�n", "estar�s", "estar�", "estar�is", "estar�a", "estar�ais", "estar�amos", "estar�an", "estar�as", "estas", "este", "estemos", "esto", "estos", "estoy", "estuve", "estuviera", "estuvierais", "estuvieran", "estuvieras", "estuvieron", "estuviese", "estuvieseis", "estuviesen", "estuvieses", "estuvimos", "estuviste", "estuvisteis", "estuvi�ramos", "estuvi�semos", "estuvo", "est�", "est�bamos", "est�is", "est�n", "est�s", "est�", "est�is", "est�n", "est�s", "fuera", "fuerais", "fueran", "fueras", "fueron", "fuese", "fueseis", "fuesen", "fueses", "fuimos", "fuiste", "fuisteis", "fu�ramos", "fu�semos", "gueno", "habida", "habidas", "habido", "habidos", "habiendo", "habremos", "habr�", "habr�n", "habr�s", "habr�", "habr�is", "habr�a", "habr�ais", "habr�amos", "habr�an", "habr�as", "hab�is", "hab�a", "hab�ais", "hab�amos", "hab�an", "hab�as", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "hasta", "haya", "hayamos", "hayan", "hayas", "hay�is", "hemos", "hube", "hubiera", "hubierais", "hubieran", "hubieras", "hubieron", "hubiese", "hubieseis", "hubiesen", "hubieses", "hubimos", "hubiste", "hubisteis", "hubi�ramos", "hubi�semos", "hubo", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", 
"intentas", "intento", "largo", "mientras", "modo", "mucho", "muchos", "m�as", "m�os", "nada", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "otra", "otras", "otro", "otros", "para", "pero", "poco", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "porque", "primero desde", "puede", "pueden", "puedo", "quien", "quienes", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "seamos", "sean", "seas", "sentid", "sentida", "sentidas", "sentido", "sentidos", "seremos", "ser�", "ser�n", "ser�s", "ser�", "ser�is", "ser�a", "ser�ais", "ser�amos", "ser�an", "ser�as", "se�is", "siendo", "siente", "sintiendo", "sobre", "sois", "solamente", "solo", "somos", "suya", "suyas", "suyo", "suyos", "tambi�n", "tanto", "tendremos", "tendr�", "tendr�n", "tendr�s", "tendr�", "tendr�is", "tendr�a", "tendr�ais", "tendr�amos", "tendr�an", "tendr�as", "tened", "teneis", "tenemos", "tener", "tenga", "tengamos", "tengan", "tengas", "tengo", "teng�is", "tenida", "tenidas", "tenido", "tenidos", "teniendo", "ten�is", "ten�a", "ten�ais", "ten�amos", "ten�an", "ten�as", "tiempo", "tiene", "tienen", "tienes", "todo", "todos", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuve", "tuviera", "tuvierais", "tuvieran", "tuvieras", "tuvieron", "tuviese", "tuvieseis", "tuviesen", "tuvieses", "tuvimos", "tuviste", "tuvisteis", "tuvi�ramos", "tuvi�semos", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "ultimo", "unas", "unos", "usais", "usamos", "usan", "usar", "usas", "vais", "valor", "vamos", "vaya", "verdad", "verdadera cierto", "verdadero", "vosostras", "vosostros", "vosotras", "vosotros", "vuestra", "vuestras", "vuestro", "vuestros", "�ramos");
4
+ ?>
languages/fr/stemmer.php ADDED
@@ -0,0 +1,513 @@
1
+ <?php
2
+
3
+ /*
4
+ *
5
+ * implements a Paice/Husk Stemmer written in PHP by Alexis Ulrich (http://alx2002.free.fr)
6
+ *
7
+ * This code is in the public domain.
8
+ *
9
+ */
10
+
11
+
12
+ // the rule patterns include all accented forms for a given language
13
+ $rule_pattern = "/^([a-z������������]*)(\*){0,1}(\d)([a-z������������]*)([.|>])/";
14
+
15
+ $PaiceHuskStemmerRules_fr = array(
16
+ 'esre1>', # { -erse > -ers }
17
+ 'esio1>', # { -oise > -ois }
18
+ 'siol1.', # { -lois > -loi }
19
+ 'siof0.', # { -fois > -fois }
20
+ 'sioe0.', # { -eois > -eois }
21
+ 'sio3>', # { -ois > - }
22
+ 'st1>', # { -ts > -t }
23
+ 'sf1>', # { -fs > -f }
24
+ 'sle1>', # { -els > -el }
25
+ 'slo1>', # { -ols > -ol }
26
+ 's�1>', # { -�s > -� }
27
+ '�tuae5.', # { -eaut� > - }
28
+ '�tuae2.', # { -eaut� > -eau }
29
+ 'tnia0.', # { -aint > -aint }
30
+ 'tniv1.', # { -vint > -vin }
31
+ 'tni3>', # { -int > - }
32
+ 'suor1.', # { -rous > -ou }
33
+ 'suo0.', # { -ous > -ous }
34
+ 'sdrail5.', # { -liards > -l }
35
+ 'sdrai4.', # { -iards > -i }
36
+ 'er�i1>', # { -i�re > -ier }
37
+ 'sesue3x>', # { -euses > -euse }
38
+ 'esuey5i.', # { -yeuse > -i }
39
+ 'esue2x>', # { -euse > -eux }
40
+ 'se1>', # { -es > -e }
41
+ 'er�g3.', # { -g�re > -g }
42
+ 'eca1>', # { -ace > -ac }
43
+ 'esiah0.', # { -haise > - }
44
+ 'esi1>', # { -ise > -is }
45
+ 'siss2.', # { -ssis > -ss }
46
+ 'sir2>', # { -ris > -r }
47
+ 'sit2>', # { -tis > -t }
48
+ 'egan�1.', # { -�nage > -�nag }
49
+ 'egalli6>', # { -illage > - }
50
+ 'egass1.', # { -ssage > -sag }
51
+ 'egas0.', # { -sage > - }
52
+ 'egat3.', # { -tage > - }
53
+ 'ega3>', # { -age > - }
54
+ 'ette4>', # { -ette > - }
55
+ 'ett2>', # { -tte > -t }
56
+ 'etio1.', # { -oite > -oit }
57
+ 'tio�4c.', # { -�oit > -c }
58
+ 'tio0.', # { -oit > -oit }
59
+ 'et1>', # { -te > -t }
60
+ 'eb1>', # { -be > -b }
61
+ 'snia1>', # { -ains > -ain }
62
+ 'eniatnau8>', # { -uantaine > - }
63
+ 'eniatn4.', # { -ntaine > -nt }
64
+ 'enia1>', # { -aine > -ain }
65
+ 'niatnio3.', # { -ointain > -oint }
66
+ 'niatg3.', # { -gtain > -gt }
67
+ 'e�1>', # { -�e > -� }
68
+ '�hcat1.', # { -tach� > -tach }
69
+ '�hca4.', # { -ach� > - }
70
+ '�tila5>', # { -alit� > - }
71
+ '�tici5.', # { -icit� > - }
72
+ '�tir1.', # { -rit� > -rit }
73
+ '�ti3>', # { -it� > - }
74
+ '�gan1.', # { -nag� > -nag }
75
+ '�ga3>', # { -ag� > - }
76
+ '�tehc1.', # { -chet� > -chet }
77
+ '�te3>', # { -et� > - }
78
+ '�it0.', # { -ti� > -ti� }
79
+ '�1>', # { -� > - }
80
+ 'eire4.', # { -erie > - }
81
+ 'eirue5.', # { -eurie > - }
82
+ 'eio1.', # { -oie > -oi }
83
+ 'eia1.', # { -aie > -ai }
84
+ 'ei1>', # { -ie > -i }
85
+ 'eng1.', # { -gne > -gn }
86
+ 'xuaessi7.', # { -isseaux > - }
87
+ 'xuae1>', # { -eaux > -eau }
88
+ 'uaes0.', # { -seau > -seau }
89
+ 'uae3.', # { -eau > - }
90
+ 'xuave2l.', # { -evaux > -eval }
91
+ 'xuav2li>', # { -vaux > -vail }
92
+ 'xua3la>', # { -aux > -al }
93
+ 'ela1>', # { -ale > -al }
94
+ 'lart2.', # { -tral > -tr }
95
+ 'lani2>', # { -inal > -in }
96
+ 'la�2>', # { -�al > -� }
97
+ 'siay4i.', # { -yais > -i }
98
+ 'siassia7.', # { -aissais > - }
99
+ 'siarv1*.', # { -vrais > -vrai if intact }
100
+ 'sia1>', # { -ais > -ai }
101
+ 'tneiayo6i.', # { -oyaient > -oi }
102
+ 'tneiay6i.', # { -yaient > -i }
103
+ 'tneiassia9.', # { -aissaient > - }
104
+ 'tneiareio7.', # { -oieraient > -oi }
105
+ 'tneia5>', # { -aient > - }
106
+ 'tneia4>', # { -aient > -a }
107
+ 'tiario4.', # { -oirait > -oi }
108
+ 'tiarim3.', # { -mirait > -mir }
109
+ 'tiaria3.', # { -airait > -air }
110
+ 'tiaris3.', # { -sirait > -sir }
111
+ 'tiari5.', # { -irait > - }
112
+ 'tiarve6>', # { -evrait > - }
113
+ 'tiare5>', # { -erait > - }
114
+ 'iare4>', # { -erai > - }
115
+ 'are3>', # { -era > - }
116
+ 'tiay4i.', # { -yait > -i }
117
+ 'tia3>', # { -ait > - }
118
+ 'tnay4i.', # { -yant > -i }
119
+ 'em�iu5>', # { -ui�me > - }
120
+ 'em�i4>', # { -i�me > - }
121
+ 'tnaun3.', # { -nuant > -nu }
122
+ 'tnauqo3.', # { -oquant > -oqu }
123
+ 'tnau4>', # { -uant > - }
124
+ 'tnaf0.', # { -fant > -fant }
125
+ 'tnat�2>', # { -�tant > -�t }
126
+ 'tna3>', # { -ant > - }
127
+ 'tno3>', # { -ont > - }
128
+ 'zeiy4i.', # { -yiez > -i }
129
+ 'zey3i.', # { -yez > -i }
130
+ 'zeire5>', # { -eriez > - }
131
+ 'zeird4.', # { -driez > -d }
132
+ 'zeirio4.', # { -oiriez > -oi }
133
+ 'ze2>', # { -ez > - }
134
+ 'ssiab0.', # { -baiss > - }
135
+ 'ssia4.', # { -aiss > - }
136
+ 'ssi3.', # { -iss > - }
137
+ 'tnemma6>', # { -amment > - }
138
+ 'tnemesuey9i.', # { -yeusement > -i }
139
+ 'tnemesue8>', # { -eusement > - }
140
+ 'tnemevi7.', # { -ivement > - }
141
+ 'tnemessia5.', # { -aissement > -aiss }
142
+ 'tnemessi8.', # { -issement > - }
143
+ 'tneme5>', # { -ement > - }
144
+ 'tnemia4.', # { -aiment > -ai }
145
+ 'tnem�5>', # { -�ment > - }
146
+ 'el2l>', # { -le > -l }
147
+ 'lle3le>', # { -ell > -el }
148
+ 'let�0.', # { -�tel > -�tel }
149
+ 'lepp0.', # { -ppel > -ppel }
150
+ 'le2>', # { -el > - }
151
+ 'srei1>', # { -iers > -ier }
152
+ 'reit3.', # { -tier > -t }
153
+ 'reila2.', # { -alier > -ali }
154
+ 'rei3>', # { -ier > - }
155
+ 'ert�e5.', # { -e�tre > - }
156
+ 'ert��1.', # { -��tre > -��tr }
157
+ 'ert�4.', # { -�tre > - }
158
+ 'drai4.', # { -iard > - }
159
+ 'erdro0.', # { -ordre > -ordre }
160
+ 'erute5.', # { -eture > - }
161
+ 'ruta0.', # { -atur > -atur }
162
+ 'eruta1.', # { -ature > -atur }
163
+ 'erutiov1.', # { -voiture > -voitur }
164
+ 'erub3.', # { -bure > -b }
165
+ 'eruh3.', # { -hure > -h }
166
+ 'erul3.', # { -lure > -l }
167
+ 'er2r>', # { -re > -r }
168
+ 'nn1>', # { -nn > -n }
169
+ 'r�i3.', # { -i�r > - }
170
+ 'srev0.', # { -vers > -vers }
171
+ 'sr1>', # { -rs > -r }
172
+ 'rid2>', # { -dir > -d }
173
+ 're2>', # { -er > - }
174
+ 'xuei4.', # { -ieux > - }
175
+ 'esuei5.', # { -ieuse > - }
176
+ 'lbati3.', # { -itabl > -it }
177
+ 'lba3>', # { -abl > - }
178
+ 'rueis0.', # { -sieur > - }
179
+ 'ruehcn4.', # { -ncheur > -nc }
180
+ 'ecirta6.', # { -atrice > - }
181
+ 'ruetai6.', # { -iateur > - }
182
+ 'rueta5.', # { -ateur > - }
183
+ 'rueir0.', # { -rieur > - }
184
+ 'rue3>', # { -eur > - }
185
+ 'esseti6.', # { -itesse > - }
186
+ 'essere6>', # { -eresse > - }
187
+ 'esserd1.', # { -dresse > -dress }
188
+ 'esse4>', # { -esse > - }
189
+ 'essiab1.', # { -baisse > -baiss }
190
+ 'essia5.', # { -aisse > - }
191
+ 'essio1.', # { -oisse > -oiss }
192
+ 'essi4.', # { -isse > - }
193
+ 'essal4.', # { -lasse > -l }
194
+ 'essa1>', # { -asse > -ass }
195
+ 'ssab1.', # { -bass > -bas }
196
+ 'essurp1.', # { -prusse > -uss }
197
+ 'essu4.', # { -usse > - }
198
+ 'essi1.', # { -isse > -ss }
199
+ 'ssor1.', # { -ross > -ros }
200
+ 'essor2.', # { -rosse > -ros }
201
+ 'esso1>', # { -osse > -oss }
202
+ 'ess2>', # { -sse > -s }
203
+ 'tio3.', # { -oit > - }
204
+ 'r�s2re.', # { -s�r > -ser }
205
+ 'r�0e.', # { -�r > -�re }
206
+ 'esn1.', # { -nse > -�ns }
207
+ 'eu1>', # { -ue > -u }
208
+ 'sua0.', # { -aus > -aus }
209
+ 'su1>', # { -us > -u }
210
+ 'utt1>', # { -utt > -tt }
211
+ 'tu�3c.', # { -�ut > -c }
212
+ 'u�2c.', # { -�u > -c }
213
+ 'ur1.', # { -ru > -r }
214
+ 'ehcn2>', # { -nche > -nc }
215
+ 'ehcu1>', # { -uche > -uch }
216
+ 'snorr3.', # { -rrons > -rr }
217
+ 'snoru3.', # { -urons > -ur }
218
+ 'snorua3.', # { -aurons > -aur }
219
+ 'snorv3.', # { -vrons > -vr }
220
+ 'snorio4.', # { -oirons > -oi }
221
+ 'snori5.', # { -irons > - }
222
+ 'snore5>', # { -erons > - }
223
+ 'snortt4>', # { -ttrons > -tt }
224
+ 'snort�a7.', # { -a�trons > - }
225
+ 'snort3.', # { -trons > -tr }
226
+ 'snor4.', # { -rons > - }
227
+ 'snossi6.', # { -issons > - }
228
+ 'snoire6.', # { -erions > - }
229
+ 'snoird5.', # { -drions > -d }
230
+ 'snoitai7.', # { -iations > - }
231
+ 'snoita6.', # { -ations > - }
232
+ 'snoits1>', # { -stions > -stion }
233
+ 'noits0.', # { -stion > -stion }
234
+ 'snoi4>', # { -ions > - }
235
+ 'noitaci7>', # { -ication > - }
236
+ 'noitai6.', # { -iation > - }
237
+ 'noita5.', # { -ation > - }
238
+ 'noitu4.', # { -ution > -u }
239
+ 'noi3>', # { -ion > - }
240
+ 'snoya0.', # { -ayons > -ayons }
241
+ 'snoy4i.', # { -yons > -i }
242
+ 'sno�a1.', # { -a�ons > -a�on }
243
+ 'sno�r1.', # { -r�ons > -r�on }
244
+ 'snoe4.', # { -eons > - }
245
+ 'snosiar1>', # { -raisons > - }
246
+ 'snola1.', # { -alons > -alon }
247
+ 'sno3>', # { -ons > - }
248
+ 'sno1>', # { -ons > -on }
249
+ 'noll2.', # { -llon > -ll }
250
+ 'tnennei4.', # { -iennent > -ien }
251
+ 'ennei2>', # { -ienne > -ien }
252
+ 'snei1>', # { -iens > -ien }
253
+ 'sne�1>', # { -�ens > -�en }
254
+ 'enne�5e.', # { -�enne > -e }
255
+ 'ne�3e.', # { -�en > -e }
256
+ 'neic0.', # { -cien > -cien }
257
+ 'neiv0.', # { -vien > -vien }
258
+ 'nei3.', # { -ien > - }
259
+ 'sc1.', # { -cs > -c }
260
+ 'sd1.', # { -ds > -d }
261
+ 'sg1.', # { -gs > -g }
262
+ 'sni1.', # { -ins > -in }
263
+ 'tiu0.', # { -uit > - }
264
+ 'ti2.', # { -it > - }
265
+ 'sp1>', # { -ps > -p }
266
+ 'sna1>', # { -ans > -an }
267
+ 'sue1.', # { -eus > -eu }
268
+ 'enn2>', # { -nne > -n }
269
+ 'nong2.', # { -gnon > -gn }
270
+ 'noss2.', # { -sson > -ss }
271
+ 'rioe4.', # { -eoir > - }
272
+ 'riot0.', # { -toir > -toir }
273
+ 'riorc1.', # { -croir > -croi }
274
+ 'riovec5.', # { -cevoir > -c }
275
+ 'rio3.', # { -oir > - }
276
+ 'ric2.', # { -cir > -l }
277
+ 'ril2.', # { -lir > -l }
278
+ 'tnerim3.', # { -mirent > -mir }
279
+ 'tneris3>', # { -sirent > -sir }
280
+ 'tneri5.', # { -irent > - }
281
+ 't�a3.', # { -a�t > - }
282
+ 'riss2.', # { -ssir > -ss }
283
+ 't�2.', # { -�t > - }
284
+ 't�2>', # { -�t > - }
285
+ 'ario2.', # { -oira > -oi }
286
+ 'arim1.', # { -mira > -m }
287
+ 'ara1.', # { -ara > -ar }
288
+ 'aris1.', # { -sira > -sir }
289
+ 'ari3.', # { -ira > - }
290
+ 'art1>', # { -tra > -tr }
291
+ 'ardn2.', # { -ndra > -nd }
292
+ 'arr1.', # { -rra > -rr }
293
+ 'arua1.', # { -aura > -aur }
294
+ 'aro1.', # { -ora > -or }
295
+ 'arv1.', # { -vra > -vr }
296
+ 'aru1.', # { -ura > -ur }
297
+ 'ar2.', # { -ra > - }
298
+ 'rd1.', # { -dr > -d }
299
+ 'ud1.', # { -du > - }
300
+ 'ul1.', # { -lu > -l }
301
+ 'ini1.', # { -ini > -in }
302
+ 'rin2.', # { -nir > - }
303
+ 'tnessiab3.', # { -baissent > -baiss }
304
+ 'tnessia7.', # { -aissent > - }
305
+ 'tnessi6.', # { -issent > - }
306
+ 'tnessni4.', # { -inssent > -ins }
307
+ 'sini2.', # { -inis > -in }
308
+ 'sl1.', # { -ls > -l }
309
+ 'iard3.', # { -drai > -d }
310
+ 'iario3.', # { -oirai > -oi }
311
+ 'ia2>', # { -ai > - }
312
+ 'io0.', # { -oi > -oi }
313
+ 'iule2.', # { -elui > -el }
314
+ 'i1>', # { -i > - }
315
+ 'sid2.', # { -dis > -d }
316
+ 'sic2.', # { -cis > -c }
317
+ 'esoi4.', # { -iose > - }
318
+ 'ed1.', # { -de > -d }
319
+ 'ai2>', # { -ia > - }
320
+ 'a1>', # { -a > - }
321
+ 'adr1.', # { -rda > -rd }
322
+ 'tner�5>', # { -�rent > - }
323
+ 'evir1.', # { -rive > -riv }
324
+ 'evio4>', # { -oive > - }
325
+ 'evi3.', # { -ive > - }
326
+ 'fita4.', # { -atif > - }
327
+ 'fi2>', # { -if > - }
328
+ 'enie1.', # { -eine > -ein }
329
+ 'sare4>', # { -eras > - }
330
+ 'sari4>', # { -iras > - }
331
+ 'sard3.', # { -dras > -d }
332
+ 'sart2>', # { -tras > -tr }
333
+ 'sa2.', # { -as > - }
334
+ 'tnessa6>', # { -assent > - }
335
+ 'tnessu6>', # { -ussent > - }
336
+ 'tnegna3.', # { -angent > -ang }
337
+ 'tnegi3.', # { -igent > -ig }
338
+ 'tneg0.', # { -gent > -gent }
339
+ 'tneru5>', # { -urent > - }
340
+ 'tnemg0.', # { -gment > -gment }
341
+ 'tnerni4.', # { -inrent > -in }
342
+ 'tneiv1.', # { -vient > -vien }
343
+ 'tne3>', # { -ent > - }
344
+ 'une1.', # { -enu > -en }
345
+ 'en1>', # { -ne > -n }
346
+ 'nitn2.', # { -ntin > - }
347
+ 'ecnay5i.', # { -yance > -i }
348
+ 'ecnal1.', # { -lance > -lanc }
349
+ 'ecna4.', # { -ance > - }
350
+ 'ec1>', # { -ce > -c }
351
+ 'nn1.', # { -nn > -n }
352
+ 'rit2>', # { -tir > - }
353
+ 'rut2>', # { -tur > -t }
354
+ 'rud2.', # { -dur > -d }
355
+ 'ugn1>', # { -ngu > -ng }
356
+ 'eg1>', # { -ge > -g }
357
+ 'tuo0.', # { -out > -out }
358
+ 'tul2>', # { -lut > -l }
359
+ 't�2>', # { -�t > - }
360
+ 'ev1>', # { -ve > -v }
361
+ 'v�2ve>', # { -�v > -ev }
362
+ 'rtt1>', # { -ttr > -tt }
363
+ 'emissi6.', # { -issime > - }
364
+ 'em1.', # { -me > -m }
365
+ 'ehc1.', # { -che > -ch }
366
+ 'c�i2c�.', # { -i�c > -i�c }
367
+ 'libi2l.', # { -ibil > -ibl }
368
+ 'llie1.', # { -eill > -eil }
369
+ 'liei4i.', # { -ieil > -i }
370
+ 'xuev1.', # { -veux > -veu }
371
+ 'xuey4i.', # { -yeux > -i }
372
+ 'xueni5>', # { -ineux > - }
373
+ 'xuell4.', # { -lleux > -l }
374
+ 'xuere5.', # { -ereux > - }
375
+ 'xue3>', # { -eux > - }
376
+ 'rb�3rb�.', # { -�br > -�br }
377
+ 'tur2.', # { -rut > -r }
378
+ 'rir�4re.', # { -�rir > -er }
379
+ 'rir2.', # { -rir > -r }
380
+ 'c�2ca.', # { -�c > -ac }
381
+ 'snu1.', # { -uns > -un }
382
+ 'rt�a4.', # { -a�tr > - }
383
+ 'long2.', # { -gnol > -gn }
384
+ 'vec2.', # { -cev > -c }
385
+ '�1c>', # { -� > -c }
386
+ 'ssilp3.', # { -pliss > -pl }
387
+ 'silp2.', # { -plis > -pl }
388
+ 't�hc2te.', # { -ch�t > -chet }
389
+ 'n�m2ne.', # { -m�n > -men }
390
+ 'llepp1.', # { -ppell > -ppel }
391
+ 'tan2.', # { -nat > -n }
392
+ 'rv�3rve.', # { -�vr > -evr }
393
+ 'rv�3rve.', # { -�vr > -evr }
394
+ 'r�2re.', # { -�r > -er }
395
+ 'r�2re.', # { -�r > -er }
396
+ 't�2te.', # { -�t > -et }
397
+ 't�2te.', # { -�t > -et }
398
+ 'epp1.', # { -ppe > -pp }
399
+ 'eya2i.', # { -aye > -ai }
400
+ 'ya1i.', # { -ay > -ai }
401
+ 'yo1i.', # { -oy > -oi }
402
+ 'esu1.', # { -use > -us }
403
+ 'ugi1.', # { -igu > -g }
404
+ 'tt1.', # { -tt > -t }
405
+
406
+ # end rule: the stem has already been found
407
+ 'end0.'
408
+ );
409
+
410
/**
 * Finds the first rule, at or after a starting index, whose letter prefix
 * matches the beginning of the reversed word.
 *
 * Each rule is reduced to its leading letters (capture group 1 of the global
 * $rule_pattern) and compared case-insensitively, byte by byte, against the
 * same number of leading bytes of $reversed_form.
 *
 * @param string $reversed_form the word, reversed
 * @param int    $rule_number   index in $PaiceHuskStemmerRules_fr to start from
 * @return int index of the first matching rule, or -1 when none matches
 */
function getFirstRule($reversed_form, $rule_number) {
    global $PaiceHuskStemmerRules_fr, $rule_pattern;

    $total = count($PaiceHuskStemmerRules_fr);
    $index = $rule_number;

    while ($index < $total) {
        // keep only the letters that open the rule
        $letters = preg_replace($rule_pattern, '\\1', $PaiceHuskStemmerRules_fr[$index]);
        if (strncasecmp($letters, $reversed_form, strlen($letters)) === 0) {
            return $index;
        }
        $index++;
    }

    return -1;
}
426
+
427
+
428
/**
 * Checks whether a candidate stem is acceptable.
 *
 * The stem is received reversed, so its last byte is the first letter of the
 * word. It is matched as an ISO-8859-1 byte string, which is what
 * PaiceHuskStemmer() passes in after utf8_decode(); the vowel class is
 * therefore written with byte escapes so it does not depend on the encoding
 * this source file happens to be saved in.
 *
 * @param string $reversed_stem the stem to check, in reversed form
 * @return bool true when the stem may replace the current form
 */
function checkAcceptability($reversed_stem) {
    // a, a-grave, a-circumflex, e, e-grave, e-acute, e-circumflex, e-diaeresis,
    // i, i-circumflex, i-diaeresis, o, o-circumflex, u, u-circumflex, u-grave, y
    $vowels = "a\xE0\xE2e\xE8\xE9\xEA\xEBi\xEE\xEFo\xF4u\xFB\xF9y";

    if (preg_match("/[" . $vowels . "]$/", $reversed_stem) === 1) {
        // The word starts with a vowel: more than two bytes must remain.
        // NOTE(review): the original comment said "at least two letters"
        // (e.g. a two-letter stem), but the code has always required more
        // than two; the threshold is kept as-is to preserve existing stems.
        return strlen($reversed_stem) > 2;
    }

    // The word starts with a consonant: more than two bytes must remain...
    if (strlen($reversed_stem) <= 2) {
        return false;
    }

    // ...and at least one of them must be a vowel or "y".
    return preg_match("/[" . $vowels . "]/", $reversed_stem) === 1;
}
449
+
450
+
451
/**
 * The actual Paice/Husk stemmer: returns the stem of the given word.
 *
 * The word is converted from UTF-8 to ISO-8859-1 and reversed, then the rule
 * table is walked from the top. Every rule has the shape
 * <letters>[*]<count><replacement><. or >>, parsed with the global
 * $rule_pattern: when <letters> match the start of the reversed word, <count>
 * bytes are removed and <replacement> is put in their place. A rule ending in
 * "." stops the process; one ending in ">" lets it continue from that rule.
 * A rule flagged "*" only applies while the word is still intact, i.e. no
 * earlier rule has been applied to it.
 *
 * @param string $form the word for which we want the stem (UTF-8)
 * @return string the stem (UTF-8)
 */
function PaiceHuskStemmer($form) {
    global $PaiceHuskStemmerRules_fr;
    global $rule_pattern;

    // true until a rule has been successfully applied to the word
    $intact = true;
    $reversed_form = strrev(utf8_decode($form));
    $rule_number = 0;

    // goes through the rules until an ending one (".") is applied
    // or no further rule matches
    while (true) {
        $rule_number = getFirstRule($reversed_form, $rule_number);
        if ($rule_number === -1) {
            // no other rule can be applied => the stem has been found
            break;
        }

        $rule = $PaiceHuskStemmerRules_fr[$rule_number];
        if (preg_match($rule_pattern, $rule, $matches) !== 1) {
            // the rule cannot be parsed: skip it rather than read missing groups
            $rule_number++;
            continue;
        }

        if ($matches[2] == '*' && !$intact) {
            // "intact only" rule, but the word has already been altered
            $rule_number++;
            continue;
        }

        // NOTE(review): utf8_decode() assumes the replacement letters in the
        // rule table are UTF-8; confirm against the encoding this file is
        // saved in (only matters for rules with accented replacements).
        $reversed_stem = utf8_decode($matches[4]) . substr($reversed_form, (int) $matches[3]);

        if (!checkAcceptability($reversed_stem)) {
            // rejected stem: try the next rule on the unchanged form
            $rule_number++;
            continue;
        }

        $reversed_form = $reversed_stem;
        // the word has now been modified, so "*" rules no longer apply
        $intact = false;
        if ($matches[5] == '.') {
            break;
        }
    }

    return utf8_encode(strrev($reversed_form));
}
493
+
494
+ /*
495
+ Stem caching added by Rob Marsh, SJ
496
+ http://rmarsh.com
497
+ */
498
+
499
+ $StemCache = array();
500
+
501
/**
 * Returns the stem of a word, computing it at most once per request.
 *
 * Results are memoised in the global $StemCache array, keyed by the word
 * exactly as passed in.
 *
 * @param string $word the word to stem
 * @return string the stem produced by PaiceHuskStemmer()
 */
function stem($word) {
    global $StemCache;

    // cache hit: hand back the stored stem
    if (isset($StemCache[$word])) {
        return $StemCache[$word];
    }

    // cache miss: compute, remember, return
    $StemCache[$word] = PaiceHuskStemmer($word);
    return $StemCache[$word];
}
512
+
513
+ ?>
languages/fr/stopwords.php ADDED
@@ -0,0 +1,4 @@
1
+ <?php
2
+ // The list of common words we want to ignore. NB: anything shorter than 4 characters is knocked out by the plugin and doesn't need to appear here.
3
+ $overusedwords = array("afin", "aient", "aies", "ailleurs", "ainsi", "alentour", "alias", "allaient", "allais", "allait", "allez", "allons", "alors", "apr�s", "apr�s-demain", "arri�re", "assez", "attendu", "au-dedans", "au-dehors", "au-del�", "au-dessous", "au-dessus", "au-devant", "aucun", "aucune", "audit", "aujourd'", "aujourd'hui", "auparavant", "aupr�s", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "aussit�t", "autant", "autour", "autre", "autrefois", "autres", "autrui", "auxdites", "auxdits", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avant-hier", "avec", "avez", "aviez", "avions", "avoir", "avons", "ayant", "ayante", "ayantes", "ayants", "ayez", "ayons", "banco", "beaucoup", "bien", "bient�t", "c'est-�-dire", "c.-�-d.", "cahin-caha", "ceci", "cela", "celle", "celle-ci", "celle-l�", "celles", "celles-ci", "celles-l�", "celui", "celui-ci", "celui-l�", "cent", "cents", "cependant", "certain", "certaine", "certaines", "certains", "certes", "cette", "ceux", "ceux-ci", "ceux-l�", "chacun", "chacune", "chaque", "cher", "chez", "chose", "ci-apr�s", "ci-dessous", "ci-dessus", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "combien", "comme", "comment", "contrario", "contre", "cours", "crescendo", "c�ans", "d'abord", "d'accord", "d'affil�e", "d'ailleurs", "d'apr�s", "d'arrache-pied", "d'embl�e", "d'un", "d'une", "dans", "davantage", "debout", "dedans", "dehors", "del�", "demain", "depuis", "derechef", "derri�re", "desdites", "desdits", "desquelles", "desquels", "dessous", "dessus", "deux", "devant", "devers", "de��", "diff�rentes", "diff�rents", "dire", "disent", "dito", "divers", "diverses", "dix-huit", "dix-neuf", "dix-sept", "donc", "dont", "dor�navant", "douze", "dudit", "duquel", "durant", "d�j�", "d�pit", "d�sormais", 
"elle", "elles", "en-dehors", "encore", "enfin", "ensemble", "ensuite", "entre", "entre-temps", "envers", "environ", "et/ou", "eues", "eurent", "eusse", "eussent", "eusses", "eussiez", "eussions", "expr�s", "extenso", "extremis", "e�mes", "e�tes", "facto", "faire", "fais", "faisaient", "faisais", "faisait", "faisons", "fait", "faites", "fallait", "faudrait", "faut", "faveur", "flac", "fors", "fort", "forte", "fortiori", "frais", "furent", "fusse", "fussent", "fusses", "fussiez", "fussions", "f�mes", "f�tes", "grand-chose", "grosso", "gr�ce", "gu�re", "haut", "hein", "hier", "hol�", "hormis", "hors", "huit", "ibidem", "ici-bas", "idem", "illico", "ipso", "item", "jadis", "jamais", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'�", "jusque", "juste", "l'autre", "l'encontre", "l'instar", "l'insu", "l'issue", "l'occasion", "l'on", "l'un", "l'une", "l'�gard", "ladite", "laquelle", "lequel", "lesquelles", "lesquels", "leur", "leurs", "loin", "longtemps", "lors", "lorsqu'", "lorsque", "l�-bas", "l�-dedans", "l�-dehors", "l�-derri�re", "l�-dessous", "l�-dessus", "l�-devant", "l�-haut", "maint", "mainte", "maintenant", "maintes", "maints", "mais", "malgr�", "marge", "mati�re", "mien", "mienne", "miennes", "miens", "mieux", "mille", "milliards", "millions", "minima", "modo", "moins", "moult", "moyennant", "m�me", "m�mes", "nagu�re", "neuf", "nonante", "nonobstant", "notre", "nous", "nulle", "n�anmoins", "n�tre", "n�tres", "octante", "onze", "ouais", "outre", "par-ci", "par-del�", "par-derri�re", "par-dessous", "par-dessus", "par-devant", "par-l�", "parbleu", "parce", "parfois", "parmi", "part", "partir", "partout", "passim", "pass�", "pendant", "personne", "petto", "peur", "peut", "peut-�tre", "peuvent", "peux", "plus", "plusieurs", "plut�t", "point", "posteriori", "pour", "pourquoi", "pourtant", "pourvu", "presqu'", "presque", "primo", "priori", "prou", "pr�s", "pr�alable", "puis", "puisqu'", "puisque", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", 
"quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quasi", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelquefois", "quelques", "quelques-unes", "quelques-uns", "quels", "quiconque", "quinze", "quoi", "quoiqu'", "quoique", "raison", "rapport", "regard", "revoici", "revoil�", "rien", "sans", "sauf", "secundo", "sein", "seize", "selon", "sensu", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "sien", "sienne", "siennes", "siens", "sine", "sinon", "situ", "sit�t", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "sont", "soudain", "sous", "souvent", "soyez", "soyons", "stricto", "suis", "suite", "sujet", "sur-le-champ", "surtout", "tacatac", "tandis", "tant", "tant�t", "tard", "telle", "telles", "tels", "tien", "tienne", "tiennes", "tiens", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "travers", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", 
"trente-six", "trente-trois", "trois", "trop", "tr�s", "unes", "vais", "vers", "vertu", "veut", "veux", "vice-versa", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-�-vis", "vite", "vitro", "vivo", "voici", "voil�", "voire", "volontiers", "votre", "vous", "v�tre", "v�tres", "z�ro", "�gard", "�taient", "�tais", "�tait", "�tant", "�tante", "�tantes", "�tants", "�tiez", "�tions", "�t�e", "�t�es", "�t�s", "�tes", "�tre");
4
+ ?>
languages/it/stemmer.php ADDED
@@ -0,0 +1,341 @@