xapian-core/languages/turkish.sbl

   1 // Alias: tr
   2
   3 /* Stemmer for Turkish
   4         * author: Evren (Kapusuz) Çilden
   5         * email: evren.kapusuz at gmail.com
   6         * version: 1.0 (15.01.2007)
   7
   8
   9         * stems nominal verb suffixes
  10         * stems nominal inflections
  11         * more than one syllable word check
  12         * (y,n,s,U) context check
  13         * vowel harmony check
  14         * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
  15
  16         * The stemming algorithm is based on the paper "An Affix Stripping
  17         * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
  18         * Eşref Adalı (Proceedings of the IASTED International Conference
  19         * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
  20         * Innsbruck, Austria)
  21
  22         * Turkish is an agglutinative language and has a very rich morphological
  23         * structure. In Turkish, you can form many different words from a single stem
  24         * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
  25         * "You had been the doctor of him". The stem of the word is "doktor" and it
  26         * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
  27         * the append order of suffixes can be clearly described as FSMs.
  28         * The paper referenced above defines some FSMs for right to left
  29         * morphological analysis. I generated a method for constructing snowball
  30         * expressions from right to left FSMs for stemming suffixes.
  31 */
  32
  33 routines (
  34         append_U_to_stems_ending_with_d_or_g // appends a vowel to stems ending in d or g, to undo some overstemming
  35         check_vowel_harmony     // tests vowel harmony for suffixes
  36         is_reserved_word        // tests whether current string is a reserved word ('ad','soyad')
  37         mark_cAsInA             // nominal verb suffix
  38         mark_DA                 // noun suffix
  39         mark_DAn                // noun suffix
  40         mark_DUr                // nominal verb suffix
  41         mark_ki                 // noun suffix
  42         mark_lAr                // noun suffix, nominal verb suffix
  43         mark_lArI               // noun suffix
  44         mark_nA                 // noun suffix
  45         mark_ncA                // noun suffix
  46         mark_ndA                // noun suffix
  47         mark_ndAn               // noun suffix
  48         mark_nU                 // noun suffix
  49         mark_nUn                // noun suffix
  50         mark_nUz                // nominal verb suffix
  51         mark_sU                 // noun suffix
  52         mark_sUn                // nominal verb suffix
  53         mark_sUnUz              // nominal verb suffix
  54         mark_possessives        // possessive suffixes -(U)m, -(U)n, -(U)mUz, -(U)nUz
  55         mark_yA                 // noun suffix
  56         mark_ylA                // noun suffix
  57         mark_yU                 // noun suffix
  58         mark_yUm                // nominal verb suffix
  59         mark_yUz                // nominal verb suffix
  60         mark_yDU                // nominal verb suffix
  61         mark_yken               // nominal verb suffix
  62         mark_ymUs_              // nominal verb suffix
  63         mark_ysA                // nominal verb suffix
  64
  65         mark_suffix_with_optional_y_consonant   // handles the optional 'y' written before a suffix
  66         mark_suffix_with_optional_U_vowel       // handles the optional U vowel written before a suffix
  67         mark_suffix_with_optional_n_consonant   // handles the optional 'n' written before a suffix
  68         mark_suffix_with_optional_s_consonant   // handles the optional 's' written before a suffix
  69
  70         more_than_one_syllable_word             // tests that the word contains at least two vowels
  71
  72         post_process_last_consonants            // rewrites a final b, c, d, g-breve as p, c-cedilla, t, k
  73         postlude                                // fix-ups run by stem after the suffix stripping
  74
  75         stem_nominal_verb_suffixes              // strips nominal verb suffixes
  76         stem_noun_suffixes                      // strips noun suffixes
  77         stem_suffix_chain_before_ki             // strips noun suffix chains ending with -ki
  78 )
  79
  80 /* Special characters in Unicode Latin-1 and Latin Extended-A */
  81 stringdef c.    hex 'E7'        // LATIN SMALL LETTER C WITH CEDILLA
  82 stringdef g~    hex '011F'      // LATIN SMALL LETTER G WITH BREVE
  83 stringdef i'    hex '0131'      // LATIN SMALL LETTER DOTLESS I
  84 stringdef o"    hex 'F6'        // LATIN SMALL LETTER O WITH DIAERESIS
  85 stringdef s.    hex '015F'      // LATIN SMALL LETTER S WITH CEDILLA
  86 stringdef u"    hex 'FC'        // LATIN SMALL LETTER U WITH DIAERESIS
  87
  88 stringescapes   { }             // {name} inside a string literal stands for the stringdef of that name
  89
  90 integers        ( strlen )      // scratch value compared against the word length in is_reserved_word
  91
  92 booleans        ( continue_stemming_noun_suffixes )   // set/unset in stem_nominal_verb_suffixes, tested in stem
  93
  94 groupings       ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
  95
  96 define vowel    'ae{i'}io{o"}u{u"}'     // the eight vowels recognised by this stemmer
  97 define U        '{i'}iu{u"}'            // the four vowels that "U" stands for in suffix names such as -sU
  98
  99 // the vowel groupings below are used by check_vowel_harmony: each one lists the
 100 define vowel1   'a{i'}ou'               // vowels accepted before a suffix vowel 'a'
 101 define vowel2   'ei{o"}{u"}'            // vowels accepted before a suffix vowel 'e'
 102 define vowel3   'a{i'}'                 // vowels accepted before a suffix vowel dotless i
 103 define vowel4   'ei'                    // vowels accepted before a suffix vowel 'i'
 104 define vowel5   'ou'                    // vowels accepted before a suffix vowel 'o' or 'u'
 105 define vowel6   '{o"}{u"}'              // vowels accepted before a suffix vowel o-diaeresis or u-diaeresis
 106
 107 externals       ( stem )                // stem is the only routine declared external
 108
 109 backwardmode (
 110         // checks vowel harmony for a possible suffix:
 111         // succeeds only when the candidate suffix obeys vowel harmony with the vowels before it
 112         // this rule was added to prevent overstemming
 113         define check_vowel_harmony as (
 114                 test           // look-ahead only: the cursor is put back once the check has succeeded
 115                 (
 116                         (goto vowel)   // move back to the nearest vowel; fails if there is none
 117                         (
 118                                 ('a' goto vowel1) or       // that vowel is 'a': a vowel1 member must be found further back
 119                                 ('e' goto vowel2) or       // 'e' pairs with vowel2
 120                                 ('{i'}' goto vowel3) or    // dotless i pairs with vowel3
 121                                 ('i' goto vowel4) or       // 'i' pairs with vowel4
 122                                 ('o' goto vowel5) or       // 'o' pairs with vowel5
 123                                 ('{o"}' goto vowel6) or    // o-diaeresis pairs with vowel6
 124                                 ('u' goto vowel5) or       // 'u' pairs with vowel5
 125                                 ('{u"}' goto vowel6)       // u-diaeresis pairs with vowel6
 126                         )
 127                 )
 128         )
 129
 130         // if the suffix is preceded by a vowel followed by n, advance over the n so that it is deleted with the suffix
 131         // if the suffix is preceded by a non-vowel followed by n, fail (nothing is deleted)
 132         // if the suffix is not preceded by n, only the suffix is deleted (a vowel is still required one character further back)
 133         // assumption: slice beginning is set correctly
 134         define mark_suffix_with_optional_n_consonant as (
 135                 ('n' (test vowel))                    // 'n' with a vowel before it: step over the 'n'
 136                 or
 137                 ((not(test 'n')) test(next vowel))    // no 'n': succeed without moving if the character one further back is a vowel
 138
 139         )
 140
 141         // if the suffix is preceded by a vowel followed by s, advance over the s so that it is deleted with the suffix
 142         // if the suffix is preceded by a non-vowel followed by s, fail (nothing is deleted)
 143         // if the suffix is not preceded by s, only the suffix is deleted (a vowel is still required one character further back)
 144         // assumption: slice beginning is set correctly
 145         define mark_suffix_with_optional_s_consonant as (
 146                 ('s' (test vowel))                    // 's' with a vowel before it: step over the 's'
 147                 or
 148                 ((not(test 's')) test(next vowel))    // no 's': succeed without moving if the character one further back is a vowel
 149         )
 150
 151         // if the suffix is preceded by a vowel followed by y, advance over the y so that it is deleted with the suffix
 152         // if the suffix is preceded by a non-vowel followed by y, fail (nothing is deleted)
 153         // if the suffix is not preceded by y, only the suffix is deleted (a vowel is still required one character further back)
 154         // assumption: slice beginning is set correctly
 155         define mark_suffix_with_optional_y_consonant as (
 156                 ('y' (test vowel))                    // 'y' with a vowel before it: step over the 'y'
 157                 or
 158                 ((not(test 'y')) test(next vowel))    // no 'y': succeed without moving if the character one further back is a vowel
 159         )
 160
 161         define mark_suffix_with_optional_U_vowel as (   // vowel counterpart of the optional-consonant routines
 162                 (U (test non-vowel))                        // a U vowel with a non-vowel before it: step over the vowel
 163                 or
 164                 ((not(test U)) test(next non-vowel))        // no U vowel: succeed without moving if the character one further back is a non-vowel
 165
 166         )
 167
 168         define mark_possessives as (   // -(U)m, -(U)n, -(U)mUz, -(U)nUz; check_vowel_harmony is not called here
 169                 among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
 170                        'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
 171                 (mark_suffix_with_optional_U_vowel)   // then the optional U vowel in front of the ending
 172         )
 173
 174         define mark_sU as (   // -(s)U
 175                 check_vowel_harmony
 176                 U                                         // one of the four U vowels
 177                 (mark_suffix_with_optional_s_consonant)   // then the optional 's' in front of it
 178         )
 179
 180         define mark_lArI as (   // -lArI; only these two spellings, check_vowel_harmony is not called
 181                 among ('leri' 'lar{i'}')
 182         )
 183
 184         define mark_yU as (   // -(y)U
 185                 check_vowel_harmony
 186                 U                                         // one of the four U vowels
 187                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 188         )
 189
 190         define mark_nU as (   // -nU: 'n' plus one of the four U vowels
 191                 check_vowel_harmony
 192                 among ('n{i'}' 'ni' 'nu' 'n{u"}')
 193         )
 194
 195         define mark_nUn as (   // -(n)Un
 196                 check_vowel_harmony
 197                 among ('{i'}n' 'in' 'un' '{u"}n')
 198                 (mark_suffix_with_optional_n_consonant)   // then the optional 'n' in front of it
 199         )
 200
 201         define mark_yA as (   // -(y)A
 202                 check_vowel_harmony
 203                 among('a' 'e')
 204                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 205         )
 206
 207         define mark_nA as (   // -nA: 'na' or 'ne'
 208                 check_vowel_harmony
 209                 among('na' 'ne')
 210         )
 211
 212         define mark_DA as (   // -DA: D is 'd' or 't', A is 'a' or 'e'
 213                 check_vowel_harmony
 214                 among('da' 'de' 'ta' 'te')
 215         )
 216
 217         define mark_ndA as (   // -ndA: 'nda' or 'nde'
 218                 check_vowel_harmony
 219                 among('nda' 'nde')
 220         )
 221
 222         define mark_DAn as (   // -DAn: D is 'd' or 't', A is 'a' or 'e'
 223                 check_vowel_harmony
 224                 among('dan' 'den' 'tan' 'ten')
 225         )
 226
 227         define mark_ndAn as (   // -ndAn: 'ndan' or 'nden'
 228                 check_vowel_harmony
 229                 among('ndan' 'nden')
 230         )
 231
 232         define mark_ylA as (   // -(y)lA
 233                 check_vowel_harmony
 234                 among('la' 'le')
 235                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 236         )
 237
 238         define mark_ki as (   // -ki: a single fixed spelling, check_vowel_harmony is not called
 239                 'ki'
 240         )
 241
 242         define mark_ncA as (   // -(n)cA
 243                 check_vowel_harmony
 244                 among('ca' 'ce')
 245                 (mark_suffix_with_optional_n_consonant)   // then the optional 'n' in front of it
 246         )
 247
 248         define mark_yUm as (   // -(y)Um
 249                 check_vowel_harmony
 250                 among ('{i'}m' 'im' 'um' '{u"}m')
 251                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 252         )
 253
 254         define mark_sUn as (   // -sUn: 's', one of the four U vowels, 'n'
 255                 check_vowel_harmony
 256                 among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
 257         )
 258
 259         define mark_yUz as (   // -(y)Uz
 260                 check_vowel_harmony
 261                 among ('{i'}z' 'iz' 'uz' '{u"}z')
 262                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 263         )
 264
 265         define mark_sUnUz as (   // -sUnUz; both vowels agree in every listed form, check_vowel_harmony is not called
 266                 among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
 267         )
 268
 269         define mark_lAr as (   // -lAr: 'ler' or 'lar'
 270                 check_vowel_harmony
 271                 among ('ler' 'lar')
 272         )
 273
 274         define mark_nUz as (   // -nUz: 'n', one of the four U vowels, 'z'
 275                 check_vowel_harmony
 276                 among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
 277         )
 278
 279         define mark_DUr as (   // -DUr: D is 't' or 'd', followed by a U vowel and 'r'
 280                 check_vowel_harmony
 281                 among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
 282         )
 283
 284         define mark_cAsInA as (   // -cAsInA; only these two spellings, check_vowel_harmony is not called
 285                 among ('cas{i'}na' 'cesine')
 286         )
 287
 288         define mark_yDU as (   // -(y)DU, with or without a final m, n or k
 289                 check_vowel_harmony
 290                 among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'   // forms ending in 'm'
 291                         't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'   // forms ending in 'n'
 292                         't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'   // forms ending in 'k'
 293                         't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')           // bare forms
 294                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 295         )
 296
 297         // does not fully obey vowel harmony
 298         define mark_ysA as (   // -(y)sA, with or without a final m, n or k; check_vowel_harmony is not called
 299                 among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
 300                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 301         )
 302
 303         define mark_ymUs_ as (   // -(y)mUs, where the final letter is s-cedilla
 304                 check_vowel_harmony
 305                 among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
 306                 (mark_suffix_with_optional_y_consonant)   // then the optional 'y' in front of it
 307         )
 308
 309         define mark_yken as (   // -(y)ken: fixed spelling 'ken', check_vowel_harmony is not called
 310                 'ken' (mark_suffix_with_optional_y_consonant)
 311         )
 312
 313         define stem_nominal_verb_suffixes as (   // removes nominal verb suffixes from the end of the word
 314                 [                                // open the slice at the current (end) position
 315                         set continue_stemming_noun_suffixes   // default: stem may go on to stem_noun_suffixes
 316                         (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)   // a single one of these four suffixes
 317                         or
 318                         (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)   // -cAsInA, an optional further suffix, then -ymUs
 319                         or
 320                         (
 321                                 mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))   // -lAr is deleted first, then one more suffix is marked if present
 322                                 unset continue_stemming_noun_suffixes   // this is the only place the flag is cleared
 323                         )
 324                         or
 325                         (mark_nUz (mark_yDU or mark_ysA))   // -nUz together with -yDU or -ysA
 326                         or
 327                         ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))   // one of these is deleted first, then -ymUs is marked if present
 328                         or
 329                         (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))   // -DUr is deleted first, then an optional suffix plus -ymUs
 330                 ]delete   // close and delete whatever slice the matching alternative left marked
 331         )
 332
 333         // stems noun suffix chains ending with -ki
 334         define stem_suffix_chain_before_ki as (   // recursive: calls itself for chains that contain more than one -ki
 335                 [
 336                         mark_ki   // the chain must end with -ki
 337                         (
 338                                 (mark_DA] delete try([   // -DA before -ki: both are deleted together
 339                                         (mark_lAr] delete try(stem_suffix_chain_before_ki))
 340                                         or
 341                                         (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 342
 343                                 ))
 344                                 or
 345                                 (mark_nUn] delete try([   // -nUn before -ki: both are deleted together
 346                                         (mark_lArI] delete)
 347                                         or
 348                                         ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 349                                         or
 350                                         (stem_suffix_chain_before_ki)
 351                                 ))
 352                                 or
 353                                 (mark_ndA (   // -ndA before -ki: one of the continuations below is required (no try here)
 354                                         (mark_lArI] delete)
 355                                         or
 356                                         ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
 357                                         or
 358                                         (stem_suffix_chain_before_ki)
 359                                 ))
 360                         )
 361         )
 362
 363         define stem_noun_suffixes as (   // tries each noun-suffix pattern in turn; the first alternative that succeeds wins
 364                 ([mark_lAr] delete try(stem_suffix_chain_before_ki))   // -lAr, then optionally a chain ending in -ki
 365                 or
 366                 ([mark_ncA] delete   // -ncA, then optionally one of the continuations below
 367                         try(
 368                                 ([mark_lArI] delete)
 369                                 or
 370                                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 371                                 or
 372                                 ([mark_lAr] delete stem_suffix_chain_before_ki)
 373                         )
 374                 )
 375                 or
 376                 ([(mark_ndA or mark_nA)   // -ndA or -nA: one of the continuations below is required (no try here)
 377                         (
 378                                 (mark_lArI] delete)
 379                                 or
 380                                 (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 381                                 or
 382                                 (stem_suffix_chain_before_ki)
 383                         )
 384                 )
 385                 or
 386                 ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))   // NOTE(review): the (mark_lArI) path has no "] delete", so it succeeds without removing anything - confirm this is intended
 387                 or
 388                 ( [mark_DAn] delete try ([   // -DAn, then optionally one of the continuations below
 389                         (
 390                                 (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 391                                 or
 392                                 (mark_lAr] delete try(stem_suffix_chain_before_ki))
 393                                 or
 394                                 (stem_suffix_chain_before_ki)
 395                         ))
 396                 )
 397                 or
 398                 ([mark_nUn or mark_ylA] delete   // -nUn or -ylA, then optionally one of the continuations below
 399                         try(
 400                                 ([mark_lAr] delete stem_suffix_chain_before_ki)
 401                                 or
 402                                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 403                                 or
 404                                 stem_suffix_chain_before_ki
 405                         )
 406                 )
 407                 or
 408                 ([mark_lArI] delete)   // -lArI on its own
 409                 or
 410                 (stem_suffix_chain_before_ki)   // a chain ending in -ki with nothing after it
 411                 or
 412                 ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))   // -DA, -yU or -yA, then optionally possessive and/or -lAr followed by a ki-chain
 413                 or
 414                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))   // a possessive or -sU, then optionally -lAr followed by a ki-chain
 415         )
 416
 417         define post_process_last_consonants as (   // replaces a final b, c, d or g-breve; fails if the word ends in anything else
 418                 [substring] among (
 419                         'b' (<- 'p')         // b -> p
 420                         'c' (<- '{c.}')      // c -> c-cedilla
 421                         'd' (<- 't')         // d -> t
 422                         '{g~}' (<- 'k')      // g-breve -> k
 423                 )
 424         )
 425
 426         // after stemming, if the word ends with 'd' or 'g', most probably the last U has been overstemmed
 427         // as in 'kedim' -> 'ked'
 428         // Turkish words don't usually end with 'd' or 'g'
 429         // some very well known words are ignored (like 'ad', 'soyad')
 430         // appends U to stems ending with d or g, and decides which vowel to add
 431         // based on the last vowel in the stem
 432         define append_U_to_stems_ending_with_d_or_g as (
 433                 test('d' or 'g')                                   // applies only when 'd' or 'g' comes right before the cursor
 434                 (test((goto vowel) 'a' or '{i'}') <+ '{i'}')       // nearest vowel is 'a' or dotless i: insert dotless i
 435                 or
 436                 (test((goto vowel) 'e' or 'i') <+ 'i')             // nearest vowel is 'e' or 'i': insert 'i'
 437                 or
 438                 (test((goto vowel) 'o' or 'u') <+ 'u')             // nearest vowel is 'o' or 'u': insert 'u'
 439                 or
 440                 (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')    // nearest vowel is o-diaeresis or u-diaeresis: insert u-diaeresis
 441         )
 442
 443 )
 444
 445 // Tests whether the word has more than one syllable
 446 // In Turkish each vowel indicates a distinct syllable
 447 define more_than_one_syllable_word as (
 448         test (atleast 2 (gopast vowel))   // needs at least two vowels; test leaves the cursor where it was
 449 )
 450
 451 define is_reserved_word as (   // succeeds when the current string is exactly 'ad' or 'soyad'
 452         test(gopast 'ad' ($strlen = 2) ($strlen == limit))      // contains 'ad' and limit equals 2
 453         or
 454         test(gopast 'soyad' ($strlen = 5) ($strlen == limit))   // contains 'soyad' and limit equals 5
 455 )
 456
 457 define postlude as (   // final fix-ups on the stem
 458         not(is_reserved_word)   // 'ad' and 'soyad' are left untouched: postlude fails here for them
 459         backwards (
 460                 do append_U_to_stems_ending_with_d_or_g   // "do": a failure of the routine is ignored
 461                 do post_process_last_consonants           // runs after the append, so it sees the word as modified above
 462
 463         )
 464 )
 465
 466 define stem as (   // external entry point
 467         (more_than_one_syllable_word)   // words with fewer than two vowels are not stemmed
 468         (
 469                 backwards (
 470                         do stem_nominal_verb_suffixes
 471                         continue_stemming_noun_suffixes   // boolean test; the flag is unset in the -lAr branch of stem_nominal_verb_suffixes
 472                         do stem_noun_suffixes
 473                 )
 474
 475         postlude   // NOTE(review): when the flag test above fails, the enclosing sequence fails and postlude is not reached - confirm this is intended
 476         )
 477 )
 478
 479