4 * author: Evren (Kapusuz) Çilden
5 * email: evren.kapusuz at gmail.com
6 * version: 1.0 (15.01.2007)
9 * stems nominal verb suffixes
10 * stems nominal inflections
11 * more than one syllable word check
12 * (y,n,s,U) context check
14 * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
16 * The stemming algorithm is based on the paper "An Affix Stripping
17 * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
18 * Eşref Adalı (Proceedings of the IASTED International Conference
19 * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
22 * Turkish is an agglutinative language and has a very rich morphological
23 * structure. In Turkish, you can form many different words from a single stem
24 * by appending a sequence of suffixes. E.g., the word "doktoruymuşsunuz" means
25 * "You had been his doctor". The stem of the word is "doktor" and it
26 * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
27 * the order in which suffixes are appended can be clearly described as FSMs.
28 * The paper referenced above defines some FSMs for right-to-left
29 * morphological analysis. I devised a method for constructing Snowball
30 * expressions from right-to-left FSMs for stemming suffixes.
// Routine declarations. Each name is followed by a short description of its role.
34 append_U_to_stems_ending_with_d_or_g // prevents some overstemmings by appending a U vowel
35 check_vowel_harmony // tests vowel harmony for suffixes
36 is_reserved_word // tests whether current string is a reserved word ('ad', 'soyad')
37 mark_cAsInA // nominal verb suffix
38 mark_DA // noun suffix
39 mark_DAn // noun suffix
40 mark_DUr // nominal verb suffix
41 mark_ki // noun suffix
42 mark_lAr // noun suffix, nominal verb suffix
43 mark_lArI // noun suffix
44 mark_nA // noun suffix
45 mark_ncA // noun suffix
46 mark_ndA // noun suffix
47 mark_ndAn // noun suffix
48 mark_nU // noun suffix
49 mark_nUn // noun suffix
50 mark_nUz // nominal verb suffix
51 mark_sU // noun suffix
52 mark_sUn // nominal verb suffix
53 mark_sUnUz // nominal verb suffix
54 mark_possessives // possessive suffixes: -(U)m, -(U)n, -(U)mUz, -(U)nUz
55 mark_yA // noun suffix
56 mark_ylA // noun suffix
57 mark_yU // noun suffix
58 mark_yUm // nominal verb suffix
59 mark_yUz // nominal verb suffix
60 mark_yDU // nominal verb suffix
61 mark_yken // nominal verb suffix
62 mark_ymUs_ // nominal verb suffix
63 mark_ysA // nominal verb suffix
65 mark_suffix_with_optional_y_consonant // context check for an optional 'y' before a suffix
66 mark_suffix_with_optional_U_vowel // context check for an optional U vowel before a suffix
67 mark_suffix_with_optional_n_consonant // context check for an optional 'n' before a suffix
68 mark_suffix_with_optional_s_consonant // context check for an optional 's' before a suffix
70 more_than_one_syllable_word // tests whether the word has more than one syllable
72 post_process_last_consonants // last consonant check and conversion
75 stem_nominal_verb_suffixes // stems nominal verb suffixes
77 stem_suffix_chain_before_ki // stems noun suffix chains ending with -ki
80 /* Special characters in Unicode Latin-1 and Latin Extended-A */
// Each stringdef binds a short name to a code point given in hex; the name is
// then written inside string literals in braces, e.g. {i'}.
81 stringdef c. hex 'E7' // U+00E7 LATIN SMALL LETTER C WITH CEDILLA
82 stringdef g~ hex '011F' // U+011F LATIN SMALL LETTER G WITH BREVE
83 stringdef i' hex '0131' // U+0131 LATIN SMALL LETTER DOTLESS I
84 stringdef o" hex 'F6' // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS
85 stringdef s. hex '015F' // U+015F LATIN SMALL LETTER S WITH CEDILLA
86 stringdef u" hex 'FC' // U+00FC LATIN SMALL LETTER U WITH DIAERESIS
// Variable and grouping declarations used by the routines in this file.
90 integers ( strlen ) // length of a string
// flag set and unset inside stem_nominal_verb_suffixes and tested before
// stem_noun_suffixes is run
92 booleans ( continue_stemming_noun_suffixes )
// character classes: all vowels, the U vowels, and the six vowel-harmony classes
94 groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
// all eight vowels: a e {i'} i o {o"} u {u"}
96 define vowel 'ae{i'}io{o"}u{u"}'
99 // the vowel groupings below are used for checking vowel harmony:
// each one lists the stem vowels that are compatible with a given suffix vowel
100 define vowel1 'a{i'}ou' // stem vowels compatible with a suffix containing 'a'
101 define vowel2 'ei{o"}{u"}' // stem vowels compatible with a suffix containing 'e'
102 define vowel3 'a{i'}' // stem vowels compatible with a suffix containing {i'} (dotless i)
103 define vowel4 'ei' // stem vowels compatible with a suffix containing 'i'
104 define vowel5 'ou' // stem vowels compatible with a suffix containing 'o' or 'u'
105 define vowel6 '{o"}{u"}' // stem vowels compatible with a suffix containing {o"} or {u"}
110 // Checks vowel harmony for a possible suffix:
111 // helps to detect whether the suffix candidate conforms to vowel harmony.
112 // This rule was added to prevent overstemming.
113 define check_vowel_harmony as (
// locate a vowel to start the comparison from
116 (goto vowel) // if there is a vowel
// alternatives: a literal vowel, then a search (goto) for a vowel belonging
// to the harmony grouping that is compatible with it
120 ('{i'}' goto vowel3) or
123 ('{o"}' goto vowel6) or
130 // if a vowel followed by 'n' precedes the suffix: advance over the 'n', then delete
131 // if a non-vowel followed by 'n' precedes the suffix: do nothing
132 // if the character before the suffix is not 'n': delete only the suffix
133 // assumption: the slice beginning has already been set correctly
134 define mark_suffix_with_optional_n_consonant as (
// case "no 'n' at the cursor": the cursor must not be at an 'n', and a
// lookahead (test, so the cursor is not moved) skips one character and
// then requires a vowel
137 ((not(test 'n')) test(next vowel))
141 // if a vowel followed by 's' precedes the suffix: advance over the 's', then delete
142 // if a non-vowel followed by 's' precedes the suffix: do nothing
143 // if the character before the suffix is not 's': delete only the suffix
144 // assumption: the slice beginning has already been set correctly
145 define mark_suffix_with_optional_s_consonant as (
// case "no 's' at the cursor": the cursor must not be at an 's', and a
// lookahead (test, so the cursor is not moved) skips one character and
// then requires a vowel
148 ((not(test 's')) test(next vowel))
151 // if a vowel followed by 'y' precedes the suffix: advance over the 'y', then delete
152 // if a non-vowel followed by 'y' precedes the suffix: do nothing
153 // if the character before the suffix is not 'y': delete only the suffix
154 // assumption: the slice beginning has already been set correctly
155 define mark_suffix_with_optional_y_consonant as (
// case "no 'y' at the cursor": the cursor must not be at a 'y', and a
// lookahead (test, so the cursor is not moved) skips one character and
// then requires a vowel
158 ((not(test 'y')) test(next vowel))
// Context check for a suffix that may be preceded by an optional U vowel.
161 define mark_suffix_with_optional_U_vowel as (
// case "no U at the cursor": the cursor must not be at a character of the
// U grouping, and a lookahead (test, so the cursor is not moved) skips one
// character and then requires a non-vowel
164 ((not(test U)) test(next non-vowel))
// Matches the possessive suffixes -(U)m, -(U)n, -(U)mUz, -(U)nUz.
168 define mark_possessives as (
// the four harmony variants of -mUz and of -nUz, plus the bare -m and -n
169 among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
170 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
// then apply the optional-U context check in front of the matched suffix
171 (mark_suffix_with_optional_U_vowel)
177 (mark_suffix_with_optional_s_consonant)
// Matches the noun suffix -lArI.
180 define mark_lArI as (
// the two spellings of the suffix: 'leri' and 'lar{i'}' (with dotless i)
181 among ('leri' 'lar{i'}')
187 (mark_suffix_with_optional_y_consonant)
192 among ('n{i'}' 'ni' 'nu' 'n{u"}')
197 among ('{i'}n' 'in' 'un' '{u"}n')
198 (mark_suffix_with_optional_n_consonant)
204 (mark_suffix_with_optional_y_consonant)
214 among('da' 'de' 'ta' 'te')
224 among('dan' 'den' 'tan' 'ten')
// Matches the noun suffix -ndAn.
227 define mark_ndAn as (
235 (mark_suffix_with_optional_y_consonant)
245 (mark_suffix_with_optional_n_consonant)
250 among ('{i'}m' 'im' 'um' '{u"}m')
251 (mark_suffix_with_optional_y_consonant)
256 among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
261 among ('{i'}z' 'iz' 'uz' '{u"}z')
262 (mark_suffix_with_optional_y_consonant)
// Matches the nominal verb suffix -sUnUz.
265 define mark_sUnUz as (
// the four vowel variants of the suffix ({i'}, i, u, {u"})
266 among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
276 among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
281 among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
// Matches the nominal verb suffix -cAsInA.
284 define mark_cAsInA as (
// the two spellings of the suffix: 'cas{i'}na' and 'cesine'
285 among ('cas{i'}na' 'cesine')
290 among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
291 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
292 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
293 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
294 (mark_suffix_with_optional_y_consonant)
297 // does not fully obey vowel harmony
299 among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
300 (mark_suffix_with_optional_y_consonant)
// Matches the nominal verb suffix -(y)mUs.
303 define mark_ymUs_ as (
// the four vowel variants of the suffix ({i'}, i, u, {u"}), each ending in {s.}
305 among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
// then apply the optional-'y' context check in front of the matched suffix
306 (mark_suffix_with_optional_y_consonant)
// Matches the nominal verb suffix -(y)ken.
309 define mark_yken as (
// literal 'ken', then the optional-'y' context check in front of it
310 'ken' (mark_suffix_with_optional_y_consonant)
// Stems nominal verb suffixes.
// Also maintains the flag continue_stemming_noun_suffixes: it is set on entry
// and unset after the -lAr alternative below.
// In the patterns below, `]` closes the slice around the suffix just matched,
// `delete` removes that slice, and `try([ ...` opens a new slice for an
// optional further suffix.
313 define stem_nominal_verb_suffixes as (
315 set continue_stemming_noun_suffixes
// one of -ymUs, -yDU, -ysA, -yken
316 (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
// -cAsInA, then one of -sUnUz/-lAr/-yUm/-sUn/-yUz or nothing (`or true`), then -ymUs
318 (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
// -lAr is deleted; optionally followed by one of -DUr, -yDU, -ysA, -ymUs
321 mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
322 unset continue_stemming_noun_suffixes
// -nUz followed by -yDU or -ysA
325 (mark_nUz (mark_yDU or mark_ysA))
// one of -sUnUz/-yUz/-sUn/-yUm is deleted; optionally followed by -ymUs
327 ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
// -DUr is deleted; optionally followed by one of -sUnUz/-lAr/-yUm/-sUn/-yUz
// (or nothing) and then -ymUs
329 (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
333 // stems noun suffix chains ending with -ki
// The routine is recursive: several alternatives call
// stem_suffix_chain_before_ki again after deleting a suffix.
334 define stem_suffix_chain_before_ki as (
// -DA is deleted, then optionally one of the following chains
338 (mark_DA] delete try([
// -lAr is deleted, then optionally a further chain before -ki
339 (mark_lAr] delete try(stem_suffix_chain_before_ki))
// a possessive is deleted, then optionally -lAr plus a further chain
341 (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
// -nUn is deleted, then optionally one of the following chains
345 (mark_nUn] delete try([
// a possessive or -sU is deleted, then optionally -lAr plus a further chain
348 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
350 (stem_suffix_chain_before_ki)
// -sU is deleted, then optionally -lAr plus a further chain
356 ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
358 (stem_suffix_chain_before_ki)
// Stems noun suffixes (nominal inflections).
// Each alternative marks a suffix as a slice, deletes it, and then optionally
// continues with suffixes that may precede it, delegating chains that end in
// -ki to stem_suffix_chain_before_ki.
363 define stem_noun_suffixes as (
// -lAr, then optionally a chain before -ki
364 ([mark_lAr] delete try(stem_suffix_chain_before_ki))
// a possessive or -sU, then optionally -lAr plus a chain before -ki
370 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
// -lAr followed by a mandatory chain before -ki
372 ([mark_lAr] delete stem_suffix_chain_before_ki)
// -ndA or -nA, followed by one of the alternatives below
376 ([(mark_ndA or mark_nA)
380 (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
382 (stem_suffix_chain_before_ki)
// -ndAn or -nU, followed by -sU (with optional -lAr chain) or by -lArI
386 ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
// -DAn, then optionally one of the alternatives below
388 ( [mark_DAn] delete try ([
390 (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
392 (mark_lAr] delete try(stem_suffix_chain_before_ki))
394 (stem_suffix_chain_before_ki)
// -nUn or -ylA, followed by one of the alternatives below
398 ([mark_nUn or mark_ylA] delete
400 ([mark_lAr] delete stem_suffix_chain_before_ki)
402 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
404 stem_suffix_chain_before_ki
410 (stem_suffix_chain_before_ki)
// -DA, -yU or -yA, then optionally a possessive (itself optionally preceded
// by -lAr) or -lAr, followed by a chain before -ki
412 ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
// a possessive or -sU, then optionally -lAr plus a chain before -ki
414 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
// Last consonant check and conversion, applied after stemming.
417 define post_process_last_consonants as (
426 // after stemming, if the word ends with 'd' or 'g', most probably the last U was overstemmed
427 // (as in 'kedim' -> 'ked')
428 // Turkish words don't usually end with 'd' or 'g'
429 // some very well known words are ignored (like 'ad', 'soyad')
430 // appends U to stems ending with d or g; decides which vowel to add
431 // based on the last vowel in the stem
432 define append_U_to_stems_ending_with_d_or_g as (
// each alternative looks (inside test, so the cursor is restored) for the
// nearest vowel, checks which pair it belongs to, and inserts the matching U vowel
// nearest vowel is 'a' or {i'}: insert {i'}
434 (test((goto vowel) 'a' or '{i'}') <+ '{i'}')
// nearest vowel is 'e' or 'i': insert 'i'
436 (test((goto vowel) 'e' or 'i') <+ 'i')
// nearest vowel is 'o' or 'u': insert 'u'
438 (test((goto vowel) 'o' or 'u') <+ 'u')
// nearest vowel is {o"} or {u"}: insert {u"}
440 (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
445 // Tests whether the word has more than one syllable.
446 // In Turkish each vowel indicates a distinct syllable,
// so the check succeeds when at least two vowels can be passed over.
447 define more_than_one_syllable_word as (
// test restores the cursor afterwards, so the check does not consume input
448 test (atleast 2 (gopast vowel))
// Tests whether the current string is one of the reserved words 'ad' or 'soyad'.
// Each check finds the literal, stores its length in strlen, and compares
// strlen with limit. Both checks are wrapped in test, so the cursor is restored.
// NOTE(review): this relies on limit being the length of the whole string at
// this point - confirm against the Snowball manual.
451 define is_reserved_word as (
// 'ad' occurs in the string and limit equals 2
452 test(gopast 'ad' ($strlen = 2) ($strlen == limit))
// 'soyad' occurs in the string and limit equals 5
454 test(gopast 'soyad' ($strlen = 5) ($strlen == limit))
458 not(is_reserved_word)
460 do append_U_to_stems_ending_with_d_or_g
461 do post_process_last_consonants
467 (more_than_one_syllable_word)
470 do stem_nominal_verb_suffixes
471 continue_stemming_noun_suffixes
472 do stem_noun_suffixes