Similar Posts – Best Related Posts Plugin for WordPress - Version 2.6.0.0

Version Description

Download this release

Release Info

Developer RobMarsh
Plugin Icon 128x128 Similar Posts – Best Related Posts Plugin for WordPress
Version 2.6.0.0
Comparing to
See all releases

Version 2.6.0.0

languages/de/stemmer.php ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /*
3
+ Adapted from a drupal module -- see details below
4
+ */
5
+
6
+ /*
7
+ Content:
8
+ Drupal module to improve searching in german texts (Porter stemmer)
9
+ Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
10
+ Author:
11
+ Reiner Miericke 10.10.2007
12
+ References:
13
+ Algorithm:
14
+ http://www.clef-campaign.org/workshop2002/WN/3.pdf
15
+ http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
16
+ http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
17
+ http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
18
+ http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
19
+ For lists of stopwords see
20
+ http://members.unine.ch/jacques.savoy/clef/index.html
21
+ Small parts were stolen from dutchstemmer.module
22
+ */
23
+
24
+
25
// Vowels recognised by the stemmer; "y" and the umlauts count as vowels.
define("DE_STEMMER_VOKALE", "aeiouyäöü");

// Align the mb_* functions with the encoding this source file is saved in,
// detected from a literal that contains the non-ASCII letters used below.
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
if ($enc !== false) {
  // mb_detect_encoding() returns false when detection fails; handing that to
  // mb_internal_encoding() would raise an error instead of keeping the default.
  mb_internal_encoding($enc);
}
29
+
30
/**
 * Split a text into its words.
 *
 * Every run of characters outside the accepted letter set (ASCII letters plus
 * the accented letters listed in the pattern) acts as a separator; empty
 * pieces are dropped.
 *
 * @param string $text Text to split (taken by reference, never modified here).
 * @return array List of words; empty when there are none or the split fails.
 */
function _de_stemmer_split_text(&$text) {
  $words = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_NO_EMPTY);

  // preg_split() returns false on failure (for instance when $text is not
  // valid UTF-8, which the /u modifier requires). Callers iterate over the
  // result, so hand back an empty list instead of false.
  return $words === false ? array() : $words;
}
34
+
35
+
36
/**
 * Implementation of hook_search_preprocess.
 *
 * Replaces every word of $text by its stem and leaves the separators between
 * the words untouched.
 *
 * @param string $text Text to process (taken by reference, never modified here).
 * @return string The text with each word stemmed; the unchanged text when it
 *                cannot be split.
 */
function de_stemmer_search_preprocess(&$text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates word, separator,
  // word, ... so the words sit at the even indexes.
  $parts = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($parts === false) {
    // The split failed (e.g. $text is not valid UTF-8 for the /u modifier);
    // return the input as is rather than iterating over false.
    return $text;
  }

  foreach ($parts as $k => $part) {
    if ($k % 2 === 0) {
      $parts[$k] = _de_stemmer_wortstamm($part);
    }
  }

  // Put it all back together
  return implode('', $parts);
}
69
+
70
+
71
/**
 * Implementation of hook_help().
 *
 * @param string $section Help path being requested.
 * @return string|null The module description for 'admin/modules#description',
 *                     null for every other section.
 */
function de_stemmer_help($section = 'admin/help#search') {
  // Loose comparison on purpose: it follows the matching rules of a switch.
  if ($section == 'admin/modules#description') {
    return t('Implements a German stemming algorithm (Porter) to improve searching.');
  }
  return null;
}
80
+
81
+
82
/*
 * Builds a lookup table from the words of a text to their stems.
 *
 * The text is split into words, each word is stemmed, and the pair is stored
 * in an array keyed by the word. The returned array can be used to identify
 * words that reduce to the same stem.
 * For details please compare 'search.module-stem.patch'
 */
function de_stemmer_stem_list($text) {
  $lookup = array();
  foreach (_de_stemmer_split_text($text) as $token) {
    $lookup[$token] = _de_stemmer_wortstamm($token);
  }
  return $lookup;
}
100
+
101
+
102
/**
 * Find where the region after the first "vowel run + one more character" starts.
 *
 * Skips the leading non-vowels, then the following run of vowels, then one
 * further character. strcspn()/strspn() work on bytes, so the result is a
 * byte offset into $wort.
 *
 * @param string $wort Word (or word remainder) to inspect.
 * @return int Start offset of the region; strlen($wort), i.e. the empty
 *             region at the end of the word, when there is no such position.
 */
function _de_stemmer_region_n($wort) {
  $len = strlen($wort);
  $first_vowel = strcspn($wort, DE_STEMMER_VOKALE);
  $start = $first_vowel + strspn($wort, DE_STEMMER_VOKALE, $first_vowel) + 1;

  // Without the clamp a word without vowels, or one ending in its first vowel
  // run, yields strlen + 1, which is not a valid offset and never compares
  // equal to the word length.
  return min($start, $len);
}
106
+
107
/**
 * Normalise a word before the stemming steps run.
 *
 * Lower-cases the word, replaces ß by ss, and puts u and y between two vowels
 * into upper case so the later steps do not treat them as vowels.
 *
 * @param string $wort Word to normalise.
 * @return string The normalised word.
 */
function de_stemmer_preprocess($wort) {
  $vokal = '[' . DE_STEMMER_VOKALE . ']';
  $wort = str_replace("ß", "ss", mb_strtolower($wort));

  $suchen = array(
    '/ß/',
    '/(?<=' . $vokal . ')u(?=' . $vokal . ')/u',
    '/(?<=' . $vokal . ')y(?=' . $vokal . ')/u'
  );
  $ersetzen = array('ss', 'U', 'Y');

  return preg_replace($suchen, $ersetzen, $wort);
}
121
+
122
+
123
/**
 * Finish a stem: lower-case it and strip umlauts/accents.
 *
 * @param string $wort Stem produced by the stemming steps.
 * @return string The exception replacement when the stem is a listed
 *                exception, otherwise the stem with accented vowels mapped to
 *                their base letters.
 */
function _de_stemmer_postprocess($wort) {
  $wort = mb_strtolower($wort);

  // _de_stemmer_ausnahme() rewrites $wort in place when it is a listed
  // exception; such stems keep their umlauts.
  if (_de_stemmer_ausnahme($wort)) {
    return $wort;
  }

  $ohne_akzent = array(
    'ä' => 'a', 'á' => 'a',
    'ë' => 'e', 'é' => 'e',
    'ï' => 'i', 'í' => 'i',
    'ö' => 'o', 'ó' => 'o',
    'ü' => "u", 'ú' => 'u'
  );
  return strtr($wort, $ohne_akzent);
}
137
+
138
+
139
/**
 * Reduce a German word to its stem.
 *
 * Runs the pre-processing, three suffix-stripping steps and the
 * post-processing.
 *
 * @param string $wort Word to stem.
 * @return string The stem as returned by _de_stemmer_postprocess().
 */
function _de_stemmer_wortstamm($wort) {
  $stamm = de_stemmer_preprocess($wort);

  /*
   * R1 is the region after the first non-vowel following a vowel,
   *    or is the null region at the end of the word if there is no such non-vowel.
   * R2 is the region after the first non-vowel following a vowel in R1,
   *    or is the null region at the end of the word if there is no such non-vowel.
   *
   * Every position below is a BYTE offset: strlen(), the strcspn()/strspn()
   * calls in _de_stemmer_region_n(), the $offset argument of preg_match() and
   * the offsets reported by PREG_OFFSET_CAPTURE all count bytes. The word is
   * therefore cut with substr(); mb_substr() counts characters and would cut
   * words containing umlauts at the wrong place under a multi-byte encoding.
   */

  $l = strlen($stamm);
  $r1 = _de_stemmer_region_n($stamm);
  $r2 = $r1 == $l ? $r1 : $r1 + _de_stemmer_region_n((string) substr($stamm, $r1));
  // unsure about interpreting the following rule:
  // "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
  if ($r1 < 3) {
    $r1 = 3;
  }

  /* Step 1
     Search for the longest among the following suffixes,
         (a) e   em   en   ern   er   es
         (b) s (preceded by a valid s-ending)
     and delete if in R1.
     (Of course the letter of the valid s-ending is not necessarily in R1)
  */

  if (preg_match('/(e|em|en|ern|er|es)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /*
    Step 2
    Search for the longest among the following suffixes,
        (a) en   er   est
        (b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
    and delete if in R1.
  */

  if (preg_match('/(en|er|est)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /*
    Step 3: d-suffixes ( see http://snowball.tartarus.org/texts/glossary.html )
    Search for the longest among the following suffixes, and perform the action indicated.
        end   ung
            delete if in R2
            if preceded by ig, delete if in R2 and not preceded by e
        ig   ik   isch
            delete if in R2 and not preceded by e
        lich   heit
            delete if in R2
            if preceded by er or en, delete if in R1
        keit
            delete if in R2
            if preceded by lich or ig, delete if in R2
                                                   ^ means R1 ?
  */

  if (preg_match('/(?<=eig)(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    // end/ung preceded by "eig": the stem is deliberately left unchanged.
    ;
  }
  elseif (preg_match('/(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<![e])(ig|ik|isch)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(er|en))(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=lich)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=ig)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /* What about
     chen, lein, bar, schaft, ... ?
  */
  return _de_stemmer_postprocess($stamm);
}
238
+
239
+
240
/**
 * Tell whether a word is on the German stop-word list.
 *
 * The list is lower case; the comparison is exact, so callers pass the word
 * already lower-cased.
 *
 * @param string $wort Word to look up.
 * @return bool True when $wort is a listed stop word.
 */
function _de_stemmer_stoppwort($wort) {

  static $stoppworte = array(
    'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
    'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
    'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
    'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
    'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
    'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
    'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
    'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
    'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
    'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
    'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
    'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
    'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
    'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
    'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
    'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
    'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
    'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
    'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
    'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
    'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
  );

  // Strict comparison: with the loose default a non-string argument such as
  // true compares equal to any non-empty list entry.
  return in_array($wort, $stoppworte, true);
}
268
+
269
+
270
/*
 * First attempt at a list of exceptions.
 *
 * Looks the (already stemmed) word up in a fixed table. On a hit the word is
 * replaced in place by the table value and TRUE is returned; otherwise the
 * word is left alone and FALSE is returned.
 */
function _de_stemmer_ausnahme(&$wort) {
  static $de_stemmer_ausnahmen = array(
    'schön' => 'schön', // not "schon"
    'blüt' => 'blüt',   // Blüte (NOT Blut)
    'kannt' => 'kenn',
    'küch' => 'küch',   // Küchen (NOT Kuchen)
    'mög' => 'mög',
    'mocht' => 'mög',
    'mag' => 'mög',
    'ging' => 'geh',
    'lief' => 'lauf',
    'änd' => 'änd'      // ändern (NOT andern)
  );

  if (!array_key_exists($wort, $de_stemmer_ausnahmen)) {
    return FALSE;
  }

  $wort = $de_stemmer_ausnahmen[$wort];
  return TRUE;
}
295
+
296
+ /*
297
+ Stem caching added by Rob Marsh, SJ
298
+ http://rmarsh.com
299
+ */
300
+
301
// Stems computed so far, keyed by the word they were computed for.
$StemCache = array();

/**
 * Return the stem of a word, computing it only on first use.
 *
 * @param string $word Word to stem.
 * @return string The stem from _de_stemmer_wortstamm(), served from
 *                $StemCache on repeated calls.
 */
function stem($word) {
  global $StemCache;

  if (isset($StemCache[$word])) {
    return $StemCache[$word];
  }

  $stemmedword = _de_stemmer_wortstamm($word);
  $StemCache[$word] = $stemmedword;
  return $stemmedword;
}
314
+
315
+ ?>
languages/de/stemmer.php.bak ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /*
3
+ Adapted from a drupal module -- see details below
4
+ */
5
+
6
+ /*
7
+ Content:
8
+ Drupal module to improve searching in german texts (Porter stemmer)
9
+ Algorithm based on http://snowball.tartarus.org/algorithms/german/stemmer.html
10
+ Author:
11
+ Reiner Miericke 10.10.2007
12
+ References:
13
+ Algorithm:
14
+ http://www.clef-campaign.org/workshop2002/WN/3.pdf
15
+ http://w3.ub.uni-konstanz.de/v13/volltexte/2003/996//pdf/scherer.pdf
16
+ http://kontext.fraunhofer.de/haenelt/kurs/Referate/Kowatschew_Lang/stemming.pdf
17
+ http://www.cis.uni-muenchen.de/people/Schulz/SeminarSoSe2001IR/FilzmayerMargetic/referat.html
18
+ http://www.ifi.unizh.ch/CL/broder/mue1/porter/stemming/node1.html
19
+ For lists of stopwords see
20
+ http://members.unine.ch/jacques.savoy/clef/index.html
21
+ Small parts were stolen from dutchstemmer.module
22
+ */
23
+
24
+
25
// Vowels recognised by the stemmer; "y" and the umlauts count as vowels.
define("DE_STEMMER_VOKALE", "aeiouyäöü");

// Align the mb_* functions with the encoding this source file is saved in,
// detected from a literal that contains the non-ASCII letters used below.
$enc = mb_detect_encoding('a-zA-ZÄÖÜßäëïöüáéíóúè');
if ($enc !== false) {
  // mb_detect_encoding() returns false when detection fails; handing that to
  // mb_internal_encoding() would raise an error instead of keeping the default.
  mb_internal_encoding($enc);
}
29
+
30
/**
 * Split a text into its words.
 *
 * Every run of characters outside the accepted letter set (ASCII letters plus
 * the accented letters listed in the pattern) acts as a separator; empty
 * pieces are dropped.
 *
 * @param string $text Text to split (taken by reference, never modified here).
 * @return array List of words; empty when there are none or the split fails.
 */
function _de_stemmer_split_text(&$text) {
  $words = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_NO_EMPTY);

  // preg_split() returns false on failure (for instance when $text is not
  // valid UTF-8, which the /u modifier requires). Callers iterate over the
  // result, so hand back an empty list instead of false.
  return $words === false ? array() : $words;
}
34
+
35
+
36
/**
 * Implementation of hook_search_preprocess.
 *
 * Replaces every word of $text by its stem and leaves the separators between
 * the words untouched.
 *
 * @param string $text Text to process (taken by reference, never modified here).
 * @return string The text with each word stemmed; the unchanged text when it
 *                cannot be split.
 */
function de_stemmer_search_preprocess(&$text) {
  // With PREG_SPLIT_DELIM_CAPTURE the result alternates word, separator,
  // word, ... so the words sit at the even indexes.
  $parts = preg_split('/([^a-zA-ZÄÖÜßäëïöüáéíóúè]+)/u', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
  if ($parts === false) {
    // The split failed (e.g. $text is not valid UTF-8 for the /u modifier);
    // return the input as is rather than iterating over false.
    return $text;
  }

  foreach ($parts as $k => $part) {
    if ($k % 2 === 0) {
      $parts[$k] = _de_stemmer_wortstamm($part);
    }
  }

  // Put it all back together
  return implode('', $parts);
}
69
+
70
+
71
/**
 * Implementation of hook_help().
 *
 * @param string $section Help path being requested.
 * @return string|null The module description for 'admin/modules#description',
 *                     null for every other section.
 */
function de_stemmer_help($section = 'admin/help#search') {
  // Loose comparison on purpose: it follows the matching rules of a switch.
  if ($section == 'admin/modules#description') {
    return t('Implements a German stemming algorithm (Porter) to improve searching.');
  }
  return null;
}
80
+
81
+
82
/*
 * Builds a lookup table from the words of a text to their stems.
 *
 * The text is split into words, each word is stemmed, and the pair is stored
 * in an array keyed by the word. The returned array can be used to identify
 * words that reduce to the same stem.
 * For details please compare 'search.module-stem.patch'
 */
function de_stemmer_stem_list($text) {
  $lookup = array();
  foreach (_de_stemmer_split_text($text) as $token) {
    $lookup[$token] = _de_stemmer_wortstamm($token);
  }
  return $lookup;
}
100
+
101
+
102
/**
 * Find where the region after the first "vowel run + one more character" starts.
 *
 * Skips the leading non-vowels, then the following run of vowels, then one
 * further character. strcspn()/strspn() work on bytes, so the result is a
 * byte offset into $wort.
 *
 * @param string $wort Word (or word remainder) to inspect.
 * @return int Start offset of the region; strlen($wort), i.e. the empty
 *             region at the end of the word, when there is no such position.
 */
function _de_stemmer_region_n($wort) {
  $len = strlen($wort);
  $first_vowel = strcspn($wort, DE_STEMMER_VOKALE);
  $start = $first_vowel + strspn($wort, DE_STEMMER_VOKALE, $first_vowel) + 1;

  // Without the clamp a word without vowels, or one ending in its first vowel
  // run, yields strlen + 1, which is not a valid offset and never compares
  // equal to the word length.
  return min($start, $len);
}
106
+
107
/**
 * Normalise a word before the stemming steps run.
 *
 * Lower-cases the word, replaces ß by ss, and puts u and y between two vowels
 * into upper case so the later steps do not treat them as vowels.
 *
 * @param string $wort Word to normalise.
 * @return string The normalised word.
 */
function de_stemmer_preprocess($wort) {
  $vokal = '[' . DE_STEMMER_VOKALE . ']';
  $wort = str_replace("ß", "ss", mb_strtolower($wort));

  $suchen = array(
    '/ß/',
    '/(?<=' . $vokal . ')u(?=' . $vokal . ')/u',
    '/(?<=' . $vokal . ')y(?=' . $vokal . ')/u'
  );
  $ersetzen = array('ss', 'U', 'Y');

  return preg_replace($suchen, $ersetzen, $wort);
}
121
+
122
+
123
/**
 * Finish a stem: lower-case it and strip umlauts/accents.
 *
 * @param string $wort Stem produced by the stemming steps.
 * @return string The exception replacement when the stem is a listed
 *                exception, otherwise the stem with accented vowels mapped to
 *                their base letters.
 */
function _de_stemmer_postprocess($wort) {
  $wort = mb_strtolower($wort);

  // _de_stemmer_ausnahme() rewrites $wort in place when it is a listed
  // exception; such stems keep their umlauts.
  if (_de_stemmer_ausnahme($wort)) {
    return $wort;
  }

  $ohne_akzent = array(
    'ä' => 'a', 'á' => 'a',
    'ë' => 'e', 'é' => 'e',
    'ï' => 'i', 'í' => 'i',
    'ö' => 'o', 'ó' => 'o',
    'ü' => "u", 'ú' => 'u'
  );
  return strtr($wort, $ohne_akzent);
}
137
+
138
+
139
/**
 * Reduce a German word to its stem.
 *
 * Runs the pre-processing, three suffix-stripping steps and the
 * post-processing.
 *
 * @param string $wort Word to stem.
 * @return string The stem as returned by _de_stemmer_postprocess().
 */
function _de_stemmer_wortstamm($wort) {
  $stamm = de_stemmer_preprocess($wort);

  /*
   * R1 is the region after the first non-vowel following a vowel,
   *    or is the null region at the end of the word if there is no such non-vowel.
   * R2 is the region after the first non-vowel following a vowel in R1,
   *    or is the null region at the end of the word if there is no such non-vowel.
   *
   * Every position below is a BYTE offset: strlen(), the strcspn()/strspn()
   * calls in _de_stemmer_region_n(), the $offset argument of preg_match() and
   * the offsets reported by PREG_OFFSET_CAPTURE all count bytes. The word is
   * therefore cut with substr(); mb_substr() counts characters and would cut
   * words containing umlauts at the wrong place under a multi-byte encoding.
   */

  $l = strlen($stamm);
  $r1 = _de_stemmer_region_n($stamm);
  $r2 = $r1 == $l ? $r1 : $r1 + _de_stemmer_region_n((string) substr($stamm, $r1));
  // unsure about interpreting the following rule:
  // "then R1 is ADJUSTED so that the region before it contains at least 3 letters"
  if ($r1 < 3) {
    $r1 = 3;
  }

  /* Step 1
     Search for the longest among the following suffixes,
         (a) e   em   en   ern   er   es
         (b) s (preceded by a valid s-ending)
     and delete if in R1.
     (Of course the letter of the valid s-ending is not necessarily in R1)
  */

  if (preg_match('/(e|em|en|ern|er|es)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|r|t))s$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /*
    Step 2
    Search for the longest among the following suffixes,
        (a) en   er   est
        (b) st (preceded by a valid st-ending, itself preceded by at least 3 letters)
    and delete if in R1.
  */

  if (preg_match('/(en|er|est)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(b|d|f|g|h|k|l|m|n|t))st$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /*
    Step 3: d-suffixes ( see http://snowball.tartarus.org/texts/glossary.html )
    Search for the longest among the following suffixes, and perform the action indicated.
        end   ung
            delete if in R2
            if preceded by ig, delete if in R2 and not preceded by e
        ig   ik   isch
            delete if in R2 and not preceded by e
        lich   heit
            delete if in R2
            if preceded by er or en, delete if in R1
        keit
            delete if in R2
            if preceded by lich or ig, delete if in R2
                                                   ^ means R1 ?
  */

  if (preg_match('/(?<=eig)(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    // end/ung preceded by "eig": the stem is deliberately left unchanged.
    ;
  }
  elseif (preg_match('/(end|ung)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<![e])(ig|ik|isch)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=(er|en))(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(lich|heit)$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=lich)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/(?<=ig)keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r1)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }
  elseif (preg_match('/keit$/u', $stamm, $hits, PREG_OFFSET_CAPTURE, $r2)) {
    $stamm = substr($stamm, 0, $hits[0][1]);
  }


  /* What about
     chen, lein, bar, schaft, ... ?
  */
  return _de_stemmer_postprocess($stamm);
}
238
+
239
+
240
/**
 * Tell whether a word is on the German stop-word list.
 *
 * The list is lower case; the comparison is exact, so callers pass the word
 * already lower-cased.
 *
 * @param string $wort Word to look up.
 * @return bool True when $wort is a listed stop word.
 */
function _de_stemmer_stoppwort($wort) {

  static $stoppworte = array(
    'ab', 'aber', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'als', 'also', 'am', 'an', 'andere', 'anderen', 'andern', 'anders', 'au', 'auch', 'auch', 'auf', 'aus', 'ausser', 'außer', 'ausserdem', 'außerdem',
    'bald', 'bei', 'beide', 'beiden', 'beim', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist',
    'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darüber', 'darum', 'darunter', 'das', 'das', 'dasein', 'daselbst', 'dass', 'daß', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen', 'derselbe', 'derselben', 'des', 'deshalb', 'desselben', 'dessen', 'deswegen', 'd.h', 'dich', 'die', 'diejenige', 'diejenigen', 'dies', 'diese', 'dieselbe', 'dieselben', 'diesem', 'diesen', 'dieser', 'dieses', 'dir', 'doch', 'dort', 'drei', 'drin', 'dritte', 'dritten', 'dritter', 'drittes', 'du', 'durch', 'durchaus',
    'eben', 'ebenso', 'eigen', 'eigene', 'eigenen', 'eigener', 'eigenes', 'ein', 'einander', 'eine', 'einem', 'einen', 'einer', 'eines', 'einige', 'einigen', 'einiger', 'einiges', 'einmal', 'einmal', 'eins', 'elf', 'en', 'ende', 'endlich', 'entweder', 'entweder', 'er', 'ernst', 'erst', 'erste', 'ersten', 'erster', 'erstes', 'es', 'etwa', 'etwas', 'euch',
    'früher', 'fünf', 'fünfte', 'fünften', 'fünfter', 'fünftes', 'für',
    'gab', 'ganz', 'ganze', 'ganzen', 'ganzer', 'ganzes', 'gar', 'gedurft', 'gegen', 'gegenüber', 'gehabt', 'gehen', 'geht', 'gekannt', 'gekonnt', 'gemacht', 'gemocht', 'gemusst', 'genug', 'gerade', 'gern', 'gesagt', 'gesagt', 'geschweige', 'gewesen', 'gewollt', 'geworden', 'gibt', 'ging', 'gleich', 'gott', 'gross', 'groß', 'grosse', 'große', 'grossen', 'großen', 'grosser', 'großer', 'grosses', 'großes', 'gut', 'gute', 'guter', 'gutes',
    'habe', 'haben', 'habt', 'hast', 'hat', 'hatte', 'hätte', 'hatten', 'hätten', 'heisst', 'her', 'heute', 'hier', 'hin', 'hinter', 'hoch',
    'ich', 'ihm', 'ihn', 'ihnen', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'im', 'im', 'immer', 'in', 'in', 'indem', 'infolgedessen', 'ins', 'irgend', 'ist',
    'ja', 'ja', 'jahr', 'jahre', 'jahren', 'je', 'jede', 'jedem', 'jeden', 'jeder', 'jedermann', 'jedermanns', 'jedoch', 'jemand', 'jemandem', 'jemanden', 'jene', 'jenem', 'jenen', 'jener', 'jenes', 'jetzt',
    'kam', 'kann', 'kannst', 'kaum', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'kleine', 'kleinen', 'kleiner', 'kleines', 'kommen', 'kommt', 'können', 'könnt', 'konnte', 'könnte', 'konnten', 'kurz',
    'lang', 'lange', 'lange', 'leicht', 'leide', 'lieber', 'los',
    'machen', 'macht', 'machte', 'mag', 'magst', 'mahn', 'man', 'manche', 'manchem', 'manchen', 'mancher', 'manches', 'mann', 'mehr', 'mein', 'meine', 'meinem', 'meinen', 'meiner', 'meines', 'mich', 'mir', 'mit', 'mittel', 'mochte', 'möchte', 'mochten', 'mögen', 'möglich', 'mögt', 'morgen', 'muss', 'muß', 'müssen', 'musst', 'müsst', 'musste', 'mussten',
    'na', 'nach', 'nachdem', 'nahm', 'natürlich', 'neben', 'nein', 'neue', 'neuen', 'neun', 'neunte', 'neunten', 'neunter', 'neuntes', 'nicht', 'nicht', 'nichts', 'nie', 'niemand', 'niemandem', 'niemanden', 'noch', 'nun', 'nun', 'nur',
    'ob', 'oben', 'oder', 'oder', 'offen', 'oft', 'oft', 'ohne',
    'recht', 'rechte', 'rechten', 'rechter', 'rechtes', 'richtig', 'rund',
    'sa', 'sache', 'sagt', 'sagte', 'sah', 'satt', 'schon', 'sechs', 'sechste', 'sechsten', 'sechster', 'sechstes', 'sehr', 'sei', 'sei', 'seid', 'seien', 'sein', 'seine', 'seinem', 'seinen', 'seiner', 'seines', 'seit', 'seitdem', 'selbst', 'selbst', 'sich', 'sie', 'sieben', 'siebente', 'siebenten', 'siebenter', 'siebentes', 'sind', 'so', 'solang', 'solche', 'solchem', 'solchen', 'solcher', 'solches', 'soll', 'sollen', 'sollte', 'sollten', 'sondern', 'sonst', 'sowie', 'später', 'statt',
    'tat', 'teil', 'tel', 'tritt', 'trotzdem', 'tun',
    'über', 'überhaupt', 'übrigens', 'uhr', 'um', 'und', 'und?', 'uns', 'unser', 'unsere', 'unserer', 'unter',
    'vergangenen', 'viel', 'viele', 'vielem', 'vielen', 'vielleicht', 'vier', 'vierte', 'vierten', 'vierter', 'viertes', 'vom', 'von', 'vor',
    'wahr?', 'während', 'währenddem', 'währenddessen', 'wann', 'war', 'wäre', 'waren', 'wart', 'warum', 'was', 'wegen', 'weil', 'weit', 'weiter', 'weitere', 'weiteren', 'weiteres', 'welche', 'welchem', 'welchen', 'welcher', 'welches', 'wem', 'wen', 'wenig', 'wenig', 'wenige', 'weniger', 'weniges', 'wenigstens', 'wenn', 'wenn', 'wer', 'werde', 'werden', 'werdet', 'wessen', 'wie', 'wie', 'wieder', 'will', 'willst', 'wir', 'wird', 'wirklich', 'wirst', 'wo', 'wohl', 'wollen', 'wollt', 'wollte', 'wollten', 'worden', 'wurde', 'würde', 'wurden', 'würden',
    'z.b', 'zehn', 'zehnte', 'zehnten', 'zehnter', 'zehntes', 'zeit', 'zu', 'zuerst', 'zugleich', 'zum', 'zum', 'zunächst', 'zur', 'zurück', 'zusammen', 'zwanzig', 'zwar', 'zwar', 'zwei', 'zweite', 'zweiten', 'zweiter', 'zweites', 'zwischen', 'zwölf'
  );

  // Strict comparison: with the loose default a non-string argument such as
  // true compares equal to any non-empty list entry.
  return in_array($wort, $stoppworte, true);
}
268
+
269
+
270
/*
 * First attempt at a list of exceptions.
 *
 * Looks the (already stemmed) word up in a fixed table. On a hit the word is
 * replaced in place by the table value and TRUE is returned; otherwise the
 * word is left alone and FALSE is returned.
 */
function _de_stemmer_ausnahme(&$wort) {
  static $de_stemmer_ausnahmen = array(
    'schön' => 'schön', // not "schon"
    'blüt' => 'blüt',   // Blüte (NOT Blut)
    'kannt' => 'kenn',
    'küch' => 'küch',   // Küchen (NOT Kuchen)
    'mög' => 'mög',
    'mocht' => 'mög',
    'mag' => 'mög',
    'ging' => 'geh',
    'lief' => 'lauf',
    'änd' => 'änd'      // ändern (NOT andern)
  );

  if (!array_key_exists($wort, $de_stemmer_ausnahmen)) {
    return FALSE;
  }

  $wort = $de_stemmer_ausnahmen[$wort];
  return TRUE;
}
295
+
296
+ /*
297
+ Stem caching added by Rob Marsh, SJ
298
+ http://rmarsh.com
299
+ */
300
+
301
// Stems computed so far, keyed by the word they were computed for.
$StemCache = array();

/**
 * Return the stem of a word, computing it only on first use.
 *
 * @param string $word Word to stem.
 * @return string The stem from _de_stemmer_wortstamm(), served from
 *                $StemCache on repeated calls.
 */
function stem($word) {
  global $StemCache;

  if (isset($StemCache[$word])) {
    return $StemCache[$word];
  }

  $stemmedword = _de_stemmer_wortstamm($word);
  $StemCache[$word] = $stemmedword;
  return $stemmedword;
}
314
+
315
+ ?>
languages/de/stopwords.php ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ <?php
2
+ // The list of common words we want to ignore. NB: anything shorter than 4 characters is already discarded by the plugin and doesn't need to appear here.
3
$overusedwords = array("aber", "alle", "allem", "allen", "aller", "alles", "also", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "auch", "bist", "damit", "dann", "derselbe", "derselben", "denselben", "desselben", "demselben", "dieselbe", "dieselben", "dasselbe", "dazu", "dein", "deine", "deinem", "deinen", "deiner", "deines", "denn", "derer", "dessen", "dich", "dies", "diese", "diesem", "diesen", "dieser", "dieses", "doch", "dort", "durch", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "etwas", "euer", "eure", "eurem", "euren", "eurer", "eures", "gegen", "gewesen", "habe", "haben", "hatte", "hatten", "hier", "hinter", "mich", "ihre", "ihrem", "ihren", "ihrer", "ihres", "euch", "indem", "jede", "jedem", "jeden", "jeder", "jedes", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "kann", "kein", "keine", "keinem", "keinen", "keiner", "keines", "können", "könnte", "machen", "manche", "manchem", "manchen", "mancher", "manches", "mein", "meine", "meinem", "meinen", "meiner", "meines", "muss", "musste", "nach", "nicht", "nichts", "noch", "oder", "ohne", "sehr", "sein", "seine", "seinem", "seinen", "seiner", "seines", "selbst", "sich", "ihnen", "sind", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollte", "sondern", "sonst", "über", "unse", "unsem", "unsen", "unser", "unses", "unter", "viel", "während", "waren", "warst", "weil", "weiter", "welche", "welchem", "welchen", "welcher", "welches", "wenn", "werde", "werden", "wieder", "will", "wird", "wirst", "wollen", "wollte", "würde", "würden", "zwar", "zwischen");
4
+ ?>
languages/en/stemmer.php ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /*
3
+ Creado por Cesar Rodas para el proyecto Saddor.com
4
+ Este Stemmer esta basado en el argoritmo de Snowball Stemmer.
5
+ saddor@gmail.com
6
+ Este programa esta bajo licencia GNU
7
+ */
8
if (!defined("ENGLISHSTEMMER"))
{
	define("ENGLISHSTEMMER", 1);

	/**
	 * Suffix-stripping stemmer for English words.
	 *
	 * The word is passed through step1ab, step1c, step2, step3, step4 and
	 * step5 in that order; each step removes or rewrites one family of
	 * suffixes. All string handling is byte-wise and the character classes
	 * only contain lowercase ASCII letters.
	 */
	class EnglishStemmer
	{
		/** Regex fragment for one consonant: a non-vowel letter, or a "y" that follows a vowel or starts the word. */
		var $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

		/** Regex fragment for one vowel: a, e, i, o, u, or a "y" that does not follow a vowel. */
		var $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

		/**
		 * Stems a single word.
		 *
		 * @param string $word Word to stem.
		 * @return string The stem; words of two bytes or fewer are returned unchanged.
		 */
		function Stem($word)
		{
			if (strlen($word) <= 2) {
				return $word;
			}

			$word = $this->step1ab($word);
			$word = $this->step1c($word);
			$word = $this->step2($word);
			$word = $this->step3($word);
			$word = $this->step4($word);
			$word = $this->step5($word);

			// Added by Cesar Rodas: do not leave an apostrophe at the end of the stem.
			if (substr($word, -1, 1) == "'") {
				$word = substr($word, 0, strlen($word) - 1);
			}

			return $word;
		}

		/**
		 * Removes plural endings, then "eed"/"ed"/"ing" endings, and tidies
		 * what is left (restores a final "e" or drops a doubled consonant).
		 *
		 * @param string $word
		 * @return string
		 */
		function step1ab($word)
		{
			// Plurals: only the first matching suffix is handled.
			if (substr($word, -1) == 's') {
				$this->replace($word, 'sses', 'ss')
				|| $this->replace($word, 'ies', 'i')
				|| $this->replace($word, 'ss', 'ss')
				|| $this->replace($word, 's', '');
			}

			// "eed" -> "ee" takes priority; "ed"/"ing" are only tried when the word does not end in "eed".
			if (substr($word, -2, 1) != 'e' || !$this->replace($word, 'eed', 'ee', 0)) {
				$v = $this->regex_vowel;

				// The part before the suffix must contain a vowel.
				if (   (preg_match("#$v+#", substr($word, 0, -3)) && $this->replace($word, 'ing', ''))
					|| (preg_match("#$v+#", substr($word, 0, -2)) && $this->replace($word, 'ed', ''))) {

					if (   !$this->replace($word, 'at', 'ate')
						&& !$this->replace($word, 'bl', 'ble')
						&& !$this->replace($word, 'iz', 'ize')) {

						if (   $this->doubleConsonant($word)
							&& substr($word, -2) != 'll'
							&& substr($word, -2) != 'ss'
							&& substr($word, -2) != 'zz') {

							// Doubled consonant other than ll/ss/zz: keep one letter.
							$word = substr($word, 0, -1);

						} else if ($this->m($word) == 1 && $this->cvc($word)) {
							$word .= 'e';
						}
					}
				}
			}

			return $word;
		}

		/**
		 * Turns a final "y" into "i" when the rest of the word contains a vowel.
		 *
		 * @param string $word
		 * @return string
		 */
		function step1c($word)
		{
			$v = $this->regex_vowel;

			if (substr($word, -1) == 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
				$this->replace($word, 'y', 'i');
			}

			return $word;
		}

		/**
		 * Maps longer derivational suffixes to shorter ones when the
		 * remaining stem has a measure greater than 0. The switch on the
		 * second-to-last letter only narrows down which suffixes to try.
		 *
		 * @param string $word
		 * @return string
		 */
		function step2($word)
		{
			switch (substr($word, -2, 1)) {
				case 'a':
					$this->replace($word, 'ational', 'ate', 0)
					|| $this->replace($word, 'tional', 'tion', 0);
					break;

				case 'c':
					$this->replace($word, 'enci', 'ence', 0)
					|| $this->replace($word, 'anci', 'ance', 0);
					break;

				case 'e':
					$this->replace($word, 'izer', 'ize', 0);
					break;

				case 'g':
					$this->replace($word, 'logi', 'log', 0);
					break;

				case 'l':
					$this->replace($word, 'entli', 'ent', 0)
					|| $this->replace($word, 'ousli', 'ous', 0)
					|| $this->replace($word, 'alli', 'al', 0)
					|| $this->replace($word, 'bli', 'ble', 0)
					|| $this->replace($word, 'eli', 'e', 0);
					break;

				case 'o':
					$this->replace($word, 'ization', 'ize', 0)
					|| $this->replace($word, 'ation', 'ate', 0)
					|| $this->replace($word, 'ator', 'ate', 0);
					break;

				case 's':
					$this->replace($word, 'iveness', 'ive', 0)
					|| $this->replace($word, 'fulness', 'ful', 0)
					|| $this->replace($word, 'ousness', 'ous', 0)
					|| $this->replace($word, 'alism', 'al', 0);
					break;

				case 't':
					$this->replace($word, 'biliti', 'ble', 0)
					|| $this->replace($word, 'aliti', 'al', 0)
					|| $this->replace($word, 'iviti', 'ive', 0);
					break;
			}

			return $word;
		}

		/**
		 * Shortens or removes a second group of suffixes when the remaining
		 * stem has a measure greater than 0.
		 *
		 * @param string $word
		 * @return string
		 */
		function step3($word)
		{
			switch (substr($word, -2, 1)) {
				case 'a':
					$this->replace($word, 'ical', 'ic', 0);
					break;

				case 's':
					$this->replace($word, 'ness', '', 0);
					break;

				case 't':
					$this->replace($word, 'icate', 'ic', 0)
					|| $this->replace($word, 'iciti', 'ic', 0);
					break;

				case 'u':
					$this->replace($word, 'ful', '', 0);
					break;

				case 'v':
					$this->replace($word, 'ative', '', 0);
					break;

				case 'z':
					$this->replace($word, 'alize', 'al', 0);
					break;
			}

			return $word;
		}

		/**
		 * Removes a final group of suffixes when the remaining stem has a
		 * measure greater than 1.
		 *
		 * @param string $word
		 * @return string
		 */
		function step4($word)
		{
			switch (substr($word, -2, 1)) {
				case 'a':
					$this->replace($word, 'al', '', 1);
					break;

				case 'c':
					$this->replace($word, 'ance', '', 1)
					|| $this->replace($word, 'ence', '', 1);
					break;

				case 'e':
					$this->replace($word, 'er', '', 1);
					break;

				case 'i':
					$this->replace($word, 'ic', '', 1);
					break;

				case 'l':
					$this->replace($word, 'able', '', 1)
					|| $this->replace($word, 'ible', '', 1);
					break;

				case 'n':
					$this->replace($word, 'ant', '', 1)
					|| $this->replace($word, 'ement', '', 1)
					|| $this->replace($word, 'ment', '', 1)
					|| $this->replace($word, 'ent', '', 1);
					break;

				case 'o':
					// "ion" is only removed when it follows "s" or "t".
					if (substr($word, -4) == 'tion' || substr($word, -4) == 'sion') {
						$this->replace($word, 'ion', '', 1);
					} else {
						$this->replace($word, 'ou', '', 1);
					}
					break;

				case 's':
					$this->replace($word, 'ism', '', 1);
					break;

				case 't':
					$this->replace($word, 'ate', '', 1)
					|| $this->replace($word, 'iti', '', 1);
					break;

				case 'u':
					$this->replace($word, 'ous', '', 1);
					break;

				case 'v':
					$this->replace($word, 'ive', '', 1);
					break;

				case 'z':
					$this->replace($word, 'ize', '', 1);
					break;
			}

			return $word;
		}

		/**
		 * Drops a final "e" (depending on the measure of the rest of the
		 * word) and reduces a final "ll" to "l" when the measure exceeds 1.
		 *
		 * @param string $word
		 * @return string
		 */
		function step5($word)
		{
			// Part a
			if (substr($word, -1) == 'e') {
				if ($this->m(substr($word, 0, -1)) > 1) {
					$this->replace($word, 'e', '');

				} else if ($this->m(substr($word, 0, -1)) == 1) {

					if (!$this->cvc(substr($word, 0, -1))) {
						$this->replace($word, 'e', '');
					}
				}
			}

			// Part b
			if ($this->m($word) > 1 && $this->doubleConsonant($word) && substr($word, -1) == 'l') {
				$word = substr($word, 0, -1);
			}

			return $word;
		}

		/**
		 * Replaces the suffix $check of $str with $repl.
		 *
		 * @param string   $str   Word, modified in place.
		 * @param string   $check Suffix to look for.
		 * @param string   $repl  Replacement for the suffix.
		 * @param int|null $m     When not null, the replacement only happens if the
		 *                        measure of the word without the suffix is greater than $m.
		 * @return bool True whenever $str ends with $check, even if the measure
		 *              condition prevented the replacement; false otherwise.
		 */
		function replace(&$str, $check, $repl, $m = null)
		{
			$len = 0 - strlen($check);

			if (substr($str, $len) == $check) {
				$substr = substr($str, 0, $len);
				if (is_null($m) || $this->m($substr) > $m) {
					$str = $substr . $repl;
				}

				return true;
			}

			return false;
		}

		/**
		 * Computes the measure of $str: the number of vowel-run/consonant-run
		 * pairs left after stripping leading consonants and trailing vowels.
		 *
		 * @param string $str
		 * @return int
		 */
		function m($str)
		{
			$c = $this->regex_consonant;
			$v = $this->regex_vowel;

			$str = preg_replace("#^$c+#", '', $str);
			$str = preg_replace("#$v+$#", '', $str);

			preg_match_all("#($v+$c+)#", $str, $matches);

			return count($matches[1]);
		}

		/**
		 * Tells whether $str ends with the same consonant twice.
		 *
		 * @param string $str
		 * @return bool
		 */
		function doubleConsonant($str)
		{
			$c = $this->regex_consonant;

			// Square-bracket offsets: the curly-brace form ($s{0}) is a parse error in PHP 8.
			return (preg_match('#' . $c . '{2}$#', $str, $matches)
				&& $matches[0][0] == $matches[0][1]);
		}

		/**
		 * Tells whether $str ends consonant-vowel-consonant (three bytes)
		 * where the last consonant is not w, x or y.
		 *
		 * @param string $str
		 * @return bool
		 */
		function cvc($str)
		{
			$c = $this->regex_consonant;
			$v = $this->regex_vowel;

			// Square-bracket offsets: the curly-brace form ($s{2}) is a parse error in PHP 8.
			return (preg_match("#($c$v$c)$#", $str, $matches)
				&& strlen($matches[1]) == 3
				&& $matches[1][2] != 'w'
				&& $matches[1][2] != 'x'
				&& $matches[1][2] != 'y');
		}
	}
}
314
+
315
+ /*
316
+ Stem caching added by Rob Marsh, SJ
317
+ http://rmarsh.com
318
+ */
319
+
320
+ $Stemmer = new EnglishStemmer();
321
+ $StemCache = array();
322
+
323
/**
 * Returns the stem of $word, asking the global $Stemmer only the first time
 * a given word is seen and serving later requests from the global $StemCache.
 */
function stem($word) {
	global $Stemmer, $StemCache;

	// Cache hit: reuse the stem computed earlier.
	if (isset($StemCache[$word])) {
		return $StemCache[$word];
	}

	$result = $Stemmer->Stem($word);
	$StemCache[$word] = $result;

	return $result;
}
334
+
335
+ ?>
languages/en/stopwords.php ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ <?php
2
+ // The list of common words we want to ignore. NB: anything shorter than 4 characters is already discarded by the plugin and doesn't need to be listed here.
3
// English stop words. "consequently" and "quite" were previously stored with the
// "q" replaced by an apostrophe ("conse'uently", "'uite") and could never match.
$overusedwords = array(
	"able", "about", "above", "according", "accordingly", "across", "actually", "after",
	"afterwards", "again", "against", "ain't", "allow", "allows", "almost", "alone",
	"along", "already", "also", "although", "always", "among", "amongst", "another",
	"anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart",
	"appear", "appreciate", "appropriate", "aren't", "around", "aside", "asking", "associated",
	"available", "away", "awfully", "became", "because", "become", "becomes", "becoming",
	"been", "before", "beforehand", "behind", "being", "believe", "below", "beside",
	"besides", "best", "better", "between", "beyond", "both", "brief", "came",
	"cannot", "can't", "cause", "causes", "certain", "certainly", "changes", "clearly",
	"come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
	"contains", "corresponding", "could", "couldn't", "course", "currently", "definitely", "described",
	"despite", "didn't", "different", "does", "doesn't", "doing", "done", "don't",
	"down", "downwards", "during", "each", "eight", "either", "else", "elsewhere",
	"enough", "entirely", "especially", "even", "ever", "every", "everybody", "everyone",
	"everything", "everywhere", "exactly", "example", "except", "fifth", "first", "five",
	"followed", "following", "follows", "former", "formerly", "forth", "four", "from",
	"further", "furthermore", "gets", "getting", "given", "gives", "goes", "going",
	"gone", "gotten", "greetings", "hadn't", "happens", "hardly", "hasn't", "have",
	"haven't", "having", "hello", "help", "hence", "here", "hereafter", "hereby",
	"herein", "hereupon", "he's", "hers", "herself", "himself", "hither", "hopefully",
	"howbeit", "however", "ignored", "i'll", "it'd", "it's", "i've", "immediate",
	"inasmuch", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead",
	"into", "inward", "isn't", "itself", "just", "keep", "keeps", "kept",
	"know", "known", "knows", "last", "lately", "later",
	"latter", "latterly", "least", "less", "lest", "like", "liked", "likely",
	"little", "look", "looking", "looks", "mainly", "many", "maybe", "mean",
	"meanwhile", "merely", "might", "more", "moreover", "most", "mostly", "much",
	"must", "mustn't", "myself", "name", "namely", "near", "nearly", "necessary",
	"need", "needs", "neither", "never", "nevertheless", "next", "nine", "nobody",
	"none", "noone", "normally", "nothing", "novel", "nowhere", "obviously", "often",
	"okay", "once", "ones", "one's", "only", "onto", "other", "others",
	"otherwise", "ought", "ours", "ourselves", "outside", "over", "overall", "particular",
	"particularly", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably",
	"provides", "quite", "rather", "really", "reasonably", "regarding", "regardless", "regards",
	"relatively", "respectively", "right", "said", "same", "saying", "says", "second",
	"secondly", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self",
	"selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall",
	"should", "shouldn't", "since", "some", "somebody", "somehow", "someone", "something",
	"sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
	"specifying", "still", "such", "sure", "take", "taken", "tell", "tends",
	"than", "thank", "thanks", "that", "that's", "their", "theirs", "them",
	"themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein",
	"there's", "thereupon", "these", "they", "think", "third", "this", "thorough",
	"thoroughly", "those", "though", "three", "through", "throughout", "thru", "thus",
	"together", "took", "toward", "towards", "tried", "tries", "truly", "trying",
	"twice", "under", "unfortunately", "unless", "unlikely", "until", "unto", "upon",
	"used", "useful", "uses", "using", "usually", "value", "various", "very",
	"want", "wants", "wasn't", "welcome", "we'd", "well", "went", "were",
	"weren't", "what", "whatever", "when", "whence", "whenever", "where",
	"whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which",
	"while", "whither", "whoever", "whole", "whom", "whose", "will", "willing",
	"wish", "with", "within", "without", "wonder", "would", "wouldn't", "your",
	"yours", "yourself", "yourselves", "zero"
);
4
+ ?>
languages/es/stemmer.php ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?php
2
+ /*
3
+ Creado por Cesar Rodas para el proyecto Saddor.com
4
+ Este Stemmer esta basado en el argoritmo de Snowball Stemmer.
5
+ saddor@gmail.com
6
+ Este programa esta bajo licencia GNU
7
+ */
8
if (!defined("SPANISHSTEMMER"))
{
	// Character classes returned by PorterStemmer::char_is().
	define("vocal", 1);
	define("consonante", 2);
	define("SPANISHSTEMMER", 1);

	/**
	 * Suffix-stripping stemmer for Spanish words.
	 *
	 * The word being stemmed is kept in $this->word and shortened in place by
	 * step_0 .. step_3; splitword() recomputes the regions R1, R2 and RV that
	 * the steps test suffixes against. All string handling is byte-wise.
	 *
	 * NOTE(review): the non-ASCII literals in this class appear as U+FFFD
	 * replacement characters in this copy of the file. They are reproduced as
	 * found; restore them from a correctly encoded original before relying on
	 * the accented-suffix rules.
	 */
	class PorterStemmer
	{
		var $R1;
		var $R2;
		var $RV;
		var $word;

		/**
		 * Stems a single word.
		 *
		 * @param string $word Word to stem.
		 * @return string The stem; words shorter than two bytes are returned unchanged.
		 */
		function Stem($word)
		{
			$this->word = $word;

			// Too short to stem: hand the word back instead of returning nothing.
			if (strlen($word) < 2)
				return $word;

			$this->step_0();
			// step_1 reports whether it changed the word; repeat until it no longer does.
			while ($this->step_1());
			$this->step_2();
			$this->step_3();
			return $this->word;
		}

		/**
		 * Removes an attached pronoun when RV contains one of the gerund or
		 * infinitive forms listed in $prefix.
		 *
		 * @return bool Whether a pronoun was removed while handling the last
		 *              entry of $prefix.
		 */
		function step_0()
		{
			$this->splitword();
			$search = array(
				"me","se","sela","selo","selas","selos","la","le","lo","les",
				"los","nos"
			);

			$prefix = array(
				"i�ndo","�ndo","�r","�r","�r", /* first case */
				"iendo","ando","ar","er","ir", /* second case */
				"yendo"
			);

			foreach ($prefix as $id => $pref)
			{
				$return = false;
				if ( (strstr($this->RV,$pref) != NULL) or
					/* special case for "yendo" */
					($pref == "yendo" && strstr($this->word,"uyendo")) )
				{
					/* The form was found; now look for a pronoun to remove. */
					foreach ($search as $word)
					{
						$len = strlen($word);
						if ($word != substr($this->RV, -1 * $len, $len))
							continue;

						$this->word = substr($this->word, 0, strlen($this->word) - $len);

						// First case only (entries 0-4): swap the form for its
						// counterpart five positions later in $prefix. A plain
						// range test is used because "switch ($id)" with boolean
						// case expressions sent $id 0 to the wrong branch.
						if ($id < 5)
							$this->word = str_replace($prefix[$id], $prefix[$id + 5], $this->word);

						$return = true;
					}
				}
			}

			return $return;
		}

		/**
		 * Removes one of the listed suffixes when R2 ends with it, then
		 * rewrites a handful of suffixes to a shorter form.
		 *
		 * @return bool True when a suffix from either list matched.
		 */
		function step_1()
		{
			$return = false;
			$this->splitword();

			/* Suffixes deleted when found at the end of R2. */
			$search = array(
				"abilidades","iblemente","icaciones","ablemente","antemente","ivamente","atamente",
				"amientos","icadoras","icadores","icancias","imientos","icamente",
				"osamente","abilidad","icidades","ividades","adamente","icantes",
				"icancia","imiemto","icadora","icaci�n","amiento","imiento","aciones",
				"ativos","ativas","ividad","idades","icidad","icante",
				"icador","adoras","adores","ancias","mente","ables",
				"ismos","anzas","ativa","ativo","istas","ibles",
				"aci�n","antes","adora","ancia","ismo","anza",
				"icos","ivas","osos","ivos","ante","osas",
				"ador","ible","ista","idad","able","ico",
				"osa","oso","iva","ica","ica","ivo",
			);

			for ($i = 0; $i < count($search); $i++)
				if (substr($this->R2,strlen($search[$i]) * (-1),strlen($search[$i])) == $search[$i])
				{
					$this->word = substr($this->word,0,strlen($this->word) - strlen($search[$i]) );
					$return = true;
					break;
				}

			/* Original author's note: this is probably wrong; the search should likely be done in R1. */
			if ($this->R1 == "amente")
			{
				$this->word = str_replace("amente","",$this->word);
			}

			$search = array
			(
				"log�a","log�as",/**/"uci�n","uciones",/**/"encia","encias"
			);
			/*
				NOTE(review): the published Snowball rules map "encia"/"encias" to
				"ente"; "entre" looks like a typo. Confirm before changing it, because
				Stem() re-runs this step until it returns false.
			*/
			$replace = array
			(
				"log","log","u","u","entre","entre"
			);
			for ($i = 0; $i < count($search); $i++)
				if (substr($this->R2,strlen($search[$i]) * (-1),strlen($search[$i])) == $search[$i])
				{
					$this->word = str_replace($search[$i],$replace[$i],$this->word);
					$return = true;
					break;
				}

			return $return;
		}

		/**
		 * Removes a "y..." verb ending, together with the letter before it,
		 * when RV ends with the ending and the word has a "u" just before it.
		 * Falls through to step_2b() when nothing was removed.
		 */
		function step_2()
		{
			$this->splitword();
			$return = false;
			$search = array(
				"ya","ye","yan","yen","yeron","yendo","yo","y�","yas","yes","yais","yamos"
			);
			foreach ($search as $word)
			{
				if (substr($this->RV,strlen($word) * (-1),strlen($word)) == $word)
					if (substr($this->word,-1*(strlen($word) + 1), strlen($word) + 1) == "u".$word)
					{
						$this->word = substr($this->word,0, strlen($this->word) -(strlen($word) + 1));
						$return = true;
					}
			}

			if ($return == false)
				$this->step_2b();
		}

		/**
		 * Removes the remaining verb endings found at the end of RV.
		 */
		function step_2b()
		{
			$this->splitword();
			$search = array(
				"en","es","�is","emos"
			);

			foreach ($search as $word)
			{
				if (substr($this->RV,strlen($word) * (-1),strlen($word)) == $word)
				{
					if (substr($this->word,(-1)*(strlen($word) + 2), strlen($word) + 2) == "gu".$word)
					{
						// Preceded by "gu": drop the ending and the "u".
						$this->word = substr($this->word,0, strlen($this->word) -(strlen($word) + 1) );
					}
					/*
						Fix contributed by Diego Enrique Finol <dfinol at cantv dot net>:
						the ending has to be removed whether or not it is preceded by "gu";
						before the fix nothing was removed in the non-"gu" case.
					*/
					else
					{
						$this->word = substr($this->word,0, strlen($this->word) -(strlen($word)) );
					}
				}
			}

			$search = array(
				"i�ramos","ar�amos","ir�amos","i�semos","er�amos","er�ais","eremos",
				"isteis","ir�ais","ierais","iremos","�bamos","ieseis",
				"asteis","�ramos","�semos","aremos","ar�ais","abais",
				"�amos","arais","ieses","ar�an","iesen","ieron",
				"iendo","ieras","ir�is","ar�as","er�as","aseis",
				"er�is","er�an","ir�an","ar�is","ir�as","ieran",
				"ando","amos","aron","asen","aras","ados",
				"�ais","ases","imos","adas","idas","abas",
				"iste","ir�n","er�n","ar�a","er�a","iera",
				"ir�s","ir�a","aran","ar�s","er�s","aste",
				"iese","aban","ar�n","�is","ada","ir�",
				"�an","ir�","er�","aba","ara","ido",
				"ar�","ar�","ado","er�","ase","�as",
				"ida","�a","er","ar","i�","an",
				"ir","as","ad","ed","id","�s",
			);

			// The regions are recomputed after every removal, so later entries
			// are tested against the already shortened word.
			foreach ($search as $word)
				if (substr($this->RV,strlen($word) * (-1),strlen($word)) == $word)
				{
					$this->word = substr($this->word,0, strlen($this->word) -(strlen($word)));
					$this->splitword();
				}
		}

		/**
		 * Removes residual endings found at the end of RV and finally
		 * replaces accented vowels in the word with plain ones.
		 *
		 * @return bool True when an ending was removed.
		 */
		function step_3()
		{
			$this->splitword();
			$return = false;
			$search = array(
				"os","a","o","�","�","�"
			);

			foreach ($search as $word)
				if (substr($this->RV,strlen($word) * (-1),strlen($word)) == $word)
				{
					$this->word = substr($this->word,0, strlen($this->word) -(strlen($word)));
					$return = true;
				}

			$search = array(
				"e","�"
			);

			foreach ($search as $word)
			{
				if (substr($this->RV,strlen($word) * (-1),strlen($word)) == $word)
				{
					if (substr($this->RV,-1*(strlen($word) + 2), strlen($word) + 2) == "gu".$word)
					{
						// Preceded by "gu" inside RV: drop the ending and the "u".
						$this->word = substr($this->word,0, strlen($this->word) -(strlen($word) + 1) );
					}
					else
					{
						$this->word = substr($this->word,0, strlen($this->word) -(strlen($word)) );
					}
					$return = true;
				}
			}

			$this->word = str_replace("�","a",$this->word);
			$this->word = str_replace("�","e",$this->word);
			$this->word = str_replace("�","i",$this->word);
			$this->word = str_replace("�","o",$this->word);
			$this->word = str_replace("�","u",$this->word);
			$this->word = str_replace("�","u",$this->word);
			return $return;
		}


		/* Helper functions */

		/**
		 * Comparison callback that orders strings from longest to shortest.
		 *
		 * @return int 0 for equal lengths, 1 when $a is shorter, -1 when $a is longer.
		 */
		function saddorsort($a, $b)
		{
			if (strlen($a) == strlen($b)) {
				return 0;
			}
			return (strlen($a) < strlen($b)) ? 1 : -1;
		}

		/**
		 * Recomputes $this->R1, $this->R2 and $this->RV from $this->word.
		 *
		 * Scanning from the second character, R1 collects everything after the
		 * first consonant that is directly followed by a vowel, and R2
		 * everything after the next such consonant. RV depends on the class of
		 * the first two characters (see the branches below).
		 */
		function splitword()
		{
			$flag1=false;
			$flag2=false;
			$this->R1="";
			$this->R2="";
			$this->RV="";
			for ($i = 1; $i < strlen($this->word); $i++)
			{
				if ($flag1)
					$this->R1.=$this->word[$i];
				if ($flag2)
					$this->R2.=$this->word[$i];

				// Last character: there is no following character to look at.
				if ($i+1 >= strlen($this->word))
					break;

				if ($this->char_is($this->word[$i]) == consonante &&
					$this->char_is($this->word[$i+1]) == vocal &&
					$flag1 == true && $flag2 == false)
					$flag2=true;

				if ($this->char_is($this->word[$i]) == consonante &&
					$this->char_is($this->word[$i+1]) == vocal &&
					$flag1 == false)
					$flag1=true;
			}


			/* Locate RV */
			if ($this->char_is($this->word[1]) == consonante)
			{
				// Second letter is a consonant: RV starts after the next vowel.
				for ($i = 2; $i < strlen($this->word); $i++)
					if ($this->char_is($this->word[$i]) == vocal)
						break;
				$i++;
				$this->RV = substr($this->word,$i);
			}
			else if ($this->char_is($this->word[1]) == vocal && $this->char_is($this->word[0]) == vocal)
			{
				// First two letters are vowels: RV starts after the next consonant.
				for ($i = 2; $i < strlen($this->word); $i++)
					if ($this->char_is($this->word[$i]) == consonante)
						break;
				$i++;
				$this->RV = substr($this->word,$i);
			}
			else if (strlen($this->word) > 2)
				$this->RV = substr($this->word,3);
		}

		/**
		 * Classifies a character.
		 *
		 * @param string $char
		 * @return int|null vocal, consonante, or null when $char is empty or in neither list.
		 */
		function char_is($char)
		{
			$char = strtolower($char);
			if ($char == "")
				return;
			$vowel = "aeiou������";
			$consonant = "bcdfghijklmn�opqrsvtxwyz";
			if (strstr($vowel,$char))
				return vocal;
			if (strstr($consonant,$char))
				return consonante;
		}
	}
}
360
+
361
+ /*
362
+ Stem caching added by Rob Marsh, SJ
363
+ http://rmarsh.com
364
+ */
365
+
366
+ $Stemmer = new PorterStemmer();
367
+ $StemCache = array();
368
+
369
/**
 * Returns the stem of $word, asking the global $Stemmer only the first time
 * a given word is seen and serving later requests from the global $StemCache.
 */
function stem($word) {
	global $Stemmer, $StemCache;

	// Cache hit: reuse the stem computed earlier.
	if (isset($StemCache[$word])) {
		return $StemCache[$word];
	}

	$result = $Stemmer->Stem($word);
	$StemCache[$word] = $result;

	return $result;
}
380
+
381
+ ?>
languages/es/stopwords.php ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ <?php
2
+ // The list of common words we want to ignore. NB: anything shorter than 4 characters is already discarded by the plugin and doesn't need to be listed here.
3
// Spanish stop words. "primero desde" and "verdadera cierto" were each stored as a
// single two-word string; they are split so that every entry is one word.
// NOTE(review): this assumes entries are compared against single words — confirm against the caller.
// NOTE(review): accented characters appear as U+FFFD in this copy and are reproduced as found.
$overusedwords = array(
	"algo", "alguna", "algunas", "alguno", "algunos", "alg�n", "ambos", "ampleamos",
	"ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras",
	"bajo", "bastante", "bien", "cada", "cierta", "ciertas", "ciertos", "como",
	"conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "contra", "cual",
	"cuando", "dentro", "desde", "donde", "durante", "ella", "ellas", "ellos",
	"empleais", "emplean", "emplear", "empleas", "empleo", "encima", "entonces", "entre",
	"erais", "eramos", "eran", "eras", "eres", "esas", "esos", "esta",
	"estaba", "estabais", "estaban", "estabas", "estad", "estada", "estadas", "estado",
	"estados", "estais", "estamos", "estan", "estando", "estar", "estaremos", "estar�",
	"estar�n", "estar�s", "estar�", "estar�is", "estar�a", "estar�ais", "estar�amos", "estar�an",
	"estar�as", "estas", "este", "estemos", "esto", "estos", "estoy", "estuve",
	"estuviera", "estuvierais", "estuvieran", "estuvieras", "estuvieron", "estuviese", "estuvieseis", "estuviesen",
	"estuvieses", "estuvimos", "estuviste", "estuvisteis", "estuvi�ramos", "estuvi�semos", "estuvo", "est�",
	"est�bamos", "est�is", "est�n", "est�s", "est�", "est�is", "est�n", "est�s",
	"fuera", "fuerais", "fueran", "fueras", "fueron", "fuese", "fueseis", "fuesen",
	"fueses", "fuimos", "fuiste", "fuisteis", "fu�ramos", "fu�semos", "gueno", "habida",
	"habidas", "habido", "habidos", "habiendo", "habremos", "habr�", "habr�n", "habr�s",
	"habr�", "habr�is", "habr�a", "habr�ais", "habr�amos", "habr�an", "habr�as", "hab�is",
	"hab�a", "hab�ais", "hab�amos", "hab�an", "hab�as", "hace", "haceis", "hacemos",
	"hacen", "hacer", "haces", "hago", "hasta", "haya", "hayamos", "hayan",
	"hayas", "hay�is", "hemos", "hube", "hubiera", "hubierais", "hubieran", "hubieras",
	"hubieron", "hubiese", "hubieseis", "hubiesen", "hubieses", "hubimos", "hubiste", "hubisteis",
	"hubi�ramos", "hubi�semos", "hubo", "incluso", "intenta", "intentais", "intentamos", "intentan",
	"intentar",
	"intentas", "intento", "largo", "mientras", "modo", "mucho", "muchos", "m�as",
	"m�os", "nada", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros",
	"otra", "otras", "otro", "otros", "para", "pero", "poco", "podeis",
	"podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "porque",
	"primero", "desde", "puede", "pueden", "puedo", "quien", "quienes", "sabe",
	"sabeis", "sabemos", "saben", "saber", "sabes", "seamos", "sean", "seas",
	"sentid", "sentida", "sentidas", "sentido", "sentidos", "seremos", "ser�", "ser�n",
	"ser�s", "ser�", "ser�is", "ser�a", "ser�ais", "ser�amos", "ser�an", "ser�as",
	"se�is", "siendo", "siente", "sintiendo", "sobre", "sois", "solamente", "solo",
	"somos", "suya", "suyas", "suyo", "suyos", "tambi�n", "tanto", "tendremos",
	"tendr�", "tendr�n", "tendr�s", "tendr�", "tendr�is", "tendr�a", "tendr�ais", "tendr�amos",
	"tendr�an", "tendr�as", "tened", "teneis", "tenemos", "tener", "tenga", "tengamos",
	"tengan", "tengas", "tengo", "teng�is", "tenida", "tenidas", "tenido", "tenidos",
	"teniendo", "ten�is", "ten�a", "ten�ais", "ten�amos", "ten�an", "ten�as", "tiempo",
	"tiene", "tienen", "tienes", "todo", "todos", "trabaja", "trabajais", "trabajamos",
	"trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuve", "tuviera", "tuvierais",
	"tuvieran", "tuvieras", "tuvieron", "tuviese", "tuvieseis", "tuviesen", "tuvieses", "tuvimos",
	"tuviste", "tuvisteis", "tuvi�ramos", "tuvi�semos", "tuvo", "tuya", "tuyas", "tuyo",
	"tuyos", "ultimo", "unas", "unos", "usais", "usamos", "usan", "usar",
	"usas", "vais", "valor", "vamos", "vaya", "verdad", "verdadera", "cierto",
	"verdadero", "vosostras", "vosostros", "vosotras", "vosotros", "vuestra", "vuestras", "vuestro",
	"vuestros", "�ramos"
);
4
+ ?>
languages/fr/stemmer.php ADDED
@@ -0,0 +1,513 @@