xapian-core/languages/turkish.sbl

   1 // Alias: tr
   2
   3 /* Stemmer for Turkish
   4         * author: Evren (Kapusuz) Çilden
   5         * email: evren.kapusuz at gmail.com
   6         * version: 1.0 (15.01.2007)
   7
   8
   9         * stems nominal verb suffixes
  10         * stems nominal inflections
  11         * more than one syllable word check
  12         * (y,n,s,U) context check
  13         * vowel harmony check
  14         * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
  15
  16         * The stemming algorithm is based on the paper "An Affix Stripping
  17         * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
  18         * Eşref Adalı (Proceedings of the IASTED International Conference
  19         * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
  20         * Innsbruck, Austria).
  21
  22         * Turkish is an agglutinative language and has a very rich morphological
  23         * structure. In Turkish, you can form many different words from a single stem
  24         * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
  25         * "You had been the doctor of him". The stem of the word is "doktor" and it
  26         * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
  27         * the append order of suffixes can be clearly described as FSMs.
  28         * The paper referenced above defines some FSMs for right to left
  29         * morphological analysis. I generated a method for constructing snowball
  30         * expressions from right to left FSMs for stemming suffixes.
  31 */
  32
  33 routines (   // in the mark_* names, capitals are placeholders: A = a/e, U = ı/i/u/ü, D = d/t
  34         append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
  35         check_vowel_harmony     // tests vowel harmony for suffixes
  36         is_reserved_word        // tests whether current string is a reserved word ('ad', 'soyad')
  37         mark_cAsInA             // nominal verb suffix
  38         mark_DA                 // noun suffix
  39         mark_DAn                // noun suffix
  40         mark_DUr                // nominal verb suffix
  41         mark_ki                 // noun suffix
  42         mark_lAr                // noun suffix, nominal verb suffix
  43         mark_lArI               // noun suffix
  44         mark_nA                 // noun suffix
  45         mark_ncA                // noun suffix
  46         mark_ndA                // noun suffix
  47         mark_ndAn               // noun suffix
  48         mark_nU                 // noun suffix
  49         mark_nUn                // noun suffix
  50         mark_nUz                // nominal verb suffix
  51         mark_sU                 // noun suffix
  52         mark_sUn                // nominal verb suffix
  53         mark_sUnUz              // nominal verb suffix
  54         mark_possessives        // possessive suffixes -(U)m, -(U)n, -(U)mUz, -(U)nUz
  55         mark_yA                 // noun suffix
  56         mark_ylA                // noun suffix
  57         mark_yU                 // noun suffix
  58         mark_yUm                // nominal verb suffix
  59         mark_yUz                // nominal verb suffix
  60         mark_yDU                // nominal verb suffix
  61         mark_yken               // nominal verb suffix
  62         mark_ymUs_              // nominal verb suffix
  63         mark_ysA                // nominal verb suffix
  64
  65         mark_suffix_with_optional_y_consonant   // context check for the optional 'y' in front of a suffix
  66         mark_suffix_with_optional_U_vowel       // context check for the optional U vowel in front of a suffix
  67         mark_suffix_with_optional_n_consonant   // context check for the optional 'n' in front of a suffix
  68         mark_suffix_with_optional_s_consonant   // context check for the optional 's' in front of a suffix
  69
  70         more_than_one_syllable_word             // tests that the word contains at least two vowels
  71
  72         post_process_last_consonants            // rewrites a final b, c, d, ğ as p, ç, t, k
  73         postlude                                // fix-ups applied after suffix removal
  74
  75         stem_nominal_verb_suffixes              // removes nominal verb suffix chains
  76         stem_noun_suffixes                      // removes noun suffix chains
  77         stem_suffix_chain_before_ki             // removes noun suffix chains ending with -ki
  78 )
  79
  80 stringescapes   { }   // inside string literals, {name} expands to the stringdef called name
  81
  82 /* Special characters in Unicode Latin-1 and Latin Extended-A */
  83 stringdef c,    '{U+00E7}'      // LATIN SMALL LETTER C WITH CEDILLA
  84 stringdef g~    '{U+011F}'      // LATIN SMALL LETTER G WITH BREVE
  85 stringdef i'    '{U+0131}'      // LATIN SMALL LETTER DOTLESS I
  86 stringdef o"    '{U+00F6}'      // LATIN SMALL LETTER O WITH DIAERESIS
  87 stringdef s,    '{U+015F}'      // LATIN SMALL LETTER S WITH CEDILLA
  88 stringdef u"    '{U+00FC}'      // LATIN SMALL LETTER U WITH DIAERESIS
  89
  90 booleans        ( continue_stemming_noun_suffixes )   // set/unset in stem_nominal_verb_suffixes, tested in stem
  91
  92 groupings       ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
  93
  94 define vowel    'ae{i'}io{o"}u{u"}'     // all eight vowels
  95 define U        '{i'}iu{u"}'            // the four vowels written as U in the suffix names
  96
  97 // the vowel groupings below are used by check_vowel_harmony
  98 define vowel1   'a{i'}ou'               // vowels that may come before a suffix vowel 'a'
  99 define vowel2   'ei{o"}{u"}'            // vowels that may come before a suffix vowel 'e'
 100 define vowel3   'a{i'}'                 // vowels that may come before a suffix vowel '{i'}' (dotless i)
 101 define vowel4   'ei'                    // vowels that may come before a suffix vowel 'i'
 102 define vowel5   'ou'                    // vowels that may come before a suffix vowel 'o' or 'u'
 103 define vowel6   '{o"}{u"}'              // vowels that may come before a suffix vowel '{o"}' or '{u"}'
 104
 105 externals       ( stem )                // stem is the only routine exported to callers
 106
 107 backwardmode (
 108         // Checks vowel harmony for a candidate suffix at the end of the word.
 109         // Succeeds only when the last vowel before the cursor is compatible with an earlier vowel;
 110         // this rule was added to prevent overstemming.
 111         define check_vowel_harmony as (
 112                 test                   // the cursor is restored afterwards, so nothing is consumed
 113                 (
 114                         (goto vowel)   // scan backwards to the last vowel; fails if there is none
 115                         (              // consume that vowel, then scan further back for a vowel of its group
 116                                 ('a' goto vowel1) or
 117                                 ('e' goto vowel2) or
 118                                 ('{i'}' goto vowel3) or
 119                                 ('i' goto vowel4) or
 120                                 ('o' goto vowel5) or
 121                                 ('{o"}' goto vowel6) or
 122                                 ('u' goto vowel5) or
 123                                 ('{u"}' goto vowel6)
 124                         )              // NOTE(review): goto skips any characters, so a matching vowel further back succeeds even if a different vowel lies in between
 125                 )
 126         )
 127
 128         // if the suffix is preceded by 'n' and a vowel comes before that 'n', consume the 'n' (it goes with the suffix)
 129         // if the suffix is preceded by 'n' and no vowel comes before that 'n', fail
 130         // if the suffix is not preceded by 'n', succeed without moving, provided the character one further back is a vowel
 131         // assumption: slice beginning is set correctly
 132         define mark_suffix_with_optional_n_consonant as (
 133                 ('n' (test vowel))                      // consume 'n'; the vowel before it is only tested
 134                 or
 135                 ((not(test 'n')) test(next vowel))      // both parts are tests: the cursor does not move
 136
 137         )
 138
 139         // if the suffix is preceded by 's' and a vowel comes before that 's', consume the 's' (it goes with the suffix)
 140         // if the suffix is preceded by 's' and no vowel comes before that 's', fail
 141         // if the suffix is not preceded by 's', succeed without moving, provided the character one further back is a vowel
 142         // assumption: slice beginning is set correctly
 143         define mark_suffix_with_optional_s_consonant as (
 144                 ('s' (test vowel))                      // consume 's'; the vowel before it is only tested
 145                 or
 146                 ((not(test 's')) test(next vowel))      // both parts are tests: the cursor does not move
 147         )
 148
 149         // if the suffix is preceded by 'y' and a vowel comes before that 'y', consume the 'y' (it goes with the suffix)
 150         // if the suffix is preceded by 'y' and no vowel comes before that 'y', fail
 151         // if the suffix is not preceded by 'y', succeed without moving, provided the character one further back is a vowel
 152         // assumption: slice beginning is set correctly
 153         define mark_suffix_with_optional_y_consonant as (
 154                 ('y' (test vowel))                      // consume 'y'; the vowel before it is only tested
 155                 or
 156                 ((not(test 'y')) test(next vowel))      // both parts are tests: the cursor does not move
 157         )
 158
 159         define mark_suffix_with_optional_U_vowel as (   // vowel counterpart of the optional-consonant checks
 160                 (U (test non-vowel))                    // consume a U vowel that follows a non-vowel
 161                 or
 162                 ((not(test U)) test(next non-vowel))    // no U here: succeed without moving if the character one further back is a non-vowel
 163
 164         )
 165
 166         define mark_possessives as (   // matches -(U)m, -(U)n, -(U)mUz, -(U)nUz; no vowel harmony check is made here
 167                 among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
 168                        'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
 169                 (mark_suffix_with_optional_U_vowel)   // then the optional U vowel in front of the suffix
 170         )
 171
 172         define mark_sU as (   // matches -(s)U
 173                 check_vowel_harmony   // checked before anything is consumed
 174                 U
 175                 (mark_suffix_with_optional_s_consonant)
 176         )
 177
 178         define mark_lArI as (   // matches -leri / -lar{i'}; no separate vowel harmony check
 179                 among ('leri' 'lar{i'}')
 180         )
 181
 182         define mark_yU as (   // matches -(y)U
 183                 check_vowel_harmony   // checked before anything is consumed
 184                 U
 185                 (mark_suffix_with_optional_y_consonant)
 186         )
 187
 188         define mark_nU as (   // matches -nU
 189                 check_vowel_harmony   // checked before anything is consumed
 190                 among ('n{i'}' 'ni' 'nu' 'n{u"}')
 191         )
 192
 193         define mark_nUn as (   // matches -(n)Un
 194                 check_vowel_harmony   // checked before anything is consumed
 195                 among ('{i'}n' 'in' 'un' '{u"}n')
 196                 (mark_suffix_with_optional_n_consonant)
 197         )
 198
 199         define mark_yA as (   // matches -(y)A
 200                 check_vowel_harmony   // checked before anything is consumed
 201                 among('a' 'e')
 202                 (mark_suffix_with_optional_y_consonant)
 203         )
 204
 205         define mark_nA as (   // matches -nA
 206                 check_vowel_harmony   // checked before anything is consumed
 207                 among('na' 'ne')
 208         )
 209
 210         define mark_DA as (   // matches -DA
 211                 check_vowel_harmony   // checked before anything is consumed
 212                 among('da' 'de' 'ta' 'te')
 213         )
 214
 215         define mark_ndA as (   // matches -ndA
 216                 check_vowel_harmony   // checked before anything is consumed
 217                 among('nda' 'nde')
 218         )
 219
 220         define mark_DAn as (   // matches -DAn
 221                 check_vowel_harmony   // checked before anything is consumed
 222                 among('dan' 'den' 'tan' 'ten')
 223         )
 224
 225         define mark_ndAn as (   // matches -ndAn
 226                 check_vowel_harmony   // checked before anything is consumed
 227                 among('ndan' 'nden')
 228         )
 229
 230         define mark_ylA as (   // matches -(y)lA
 231                 check_vowel_harmony   // checked before anything is consumed
 232                 among('la' 'le')
 233                 (mark_suffix_with_optional_y_consonant)
 234         )
 235
 236         define mark_ki as (   // matches the literal suffix -ki; no vowel harmony or context check
 237                 'ki'
 238         )
 239
 240         define mark_ncA as (   // matches -(n)cA
 241                 check_vowel_harmony   // checked before anything is consumed
 242                 among('ca' 'ce')
 243                 (mark_suffix_with_optional_n_consonant)
 244         )
 245
 246         define mark_yUm as (   // matches -(y)Um
 247                 check_vowel_harmony   // checked before anything is consumed
 248                 among ('{i'}m' 'im' 'um' '{u"}m')
 249                 (mark_suffix_with_optional_y_consonant)
 250         )
 251
 252         define mark_sUn as (   // matches -sUn
 253                 check_vowel_harmony   // checked before anything is consumed
 254                 among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
 255         )
 256
 257         define mark_yUz as (   // matches -(y)Uz
 258                 check_vowel_harmony   // checked before anything is consumed
 259                 among ('{i'}z' 'iz' 'uz' '{u"}z')
 260                 (mark_suffix_with_optional_y_consonant)
 261         )
 262
 263         define mark_sUnUz as (   // matches -sUnUz; only forms whose two vowels agree are listed, no check against the stem
 264                 among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
 265         )
 266
 267         define mark_lAr as (   // matches -lAr
 268                 check_vowel_harmony   // checked before anything is consumed
 269                 among ('ler' 'lar')
 270         )
 271
 272         define mark_nUz as (   // matches -nUz
 273                 check_vowel_harmony   // checked before anything is consumed
 274                 among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
 275         )
 276
 277         define mark_DUr as (   // matches -DUr
 278                 check_vowel_harmony   // checked before anything is consumed
 279                 among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
 280         )
 281
 282         define mark_cAsInA as (   // matches -cas{i'}na / -cesine; no separate vowel harmony check
 283                 among ('cas{i'}na' 'cesine')
 284         )
 285
 286         define mark_yDU as (   // matches -(y)DU, alone or followed by m, n or k
 287                 check_vowel_harmony   // checked before anything is consumed
 288                 among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
 289                         't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
 290                         't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
 291                         't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
 292                 (mark_suffix_with_optional_y_consonant)
 293         )
 294
 295         // matches -(y)sA, alone or followed by m, n or k; does not fully obey vowel harmony, so no harmony check is made
 296         define mark_ysA as (
 297                 among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
 298                 (mark_suffix_with_optional_y_consonant)
 299         )
 300
 301         define mark_ymUs_ as (   // matches -(y)mU{s,}
 302                 check_vowel_harmony   // checked before anything is consumed
 303                 among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}')
 304                 (mark_suffix_with_optional_y_consonant)
 305         )
 306
 307         define mark_yken as (   // matches -(y)ken; no vowel harmony check
 308                 'ken' (mark_suffix_with_optional_y_consonant)
 309         )
 310
 311         define stem_nominal_verb_suffixes as (   // removes a nominal verb suffix chain; alternatives are tried in order, first success wins
 312                 [   // open the slice at the cursor
 313                         set continue_stemming_noun_suffixes   // always set first; only the -lAr alternative below clears it
 314                         (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
 315                         or
 316                         (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)   // 'or true' makes the middle suffix optional
 317                         or
 318                         (
 319                                 mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))   // remove -lAr, then reopen the slice and try one more suffix
 320                                 unset continue_stemming_noun_suffixes
 321                         )
 322                         or
 323                         (mark_nUz (mark_yDU or mark_ysA))
 324                         or
 325                         ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))   // remove the personal suffix, then try -(y)mU{s,}
 326                         or
 327                         (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))   // remove -DUr, then try a further chain
 328                 ]delete   // close the slice at the cursor and delete it; empty if a try([ ...) above matched nothing
 329         )
 330
 331         // stems noun suffix chains ending with -ki; -ki must be preceded by -DA, -(n)Un or -ndA, otherwise the routine fails
 332         define stem_suffix_chain_before_ki as (
 333                 [   // open the slice at the cursor
 334                         mark_ki
 335                         (
 336                                 (mark_DA] delete try([   // -DA -ki: remove both, then try the continuations below
 337                                         (mark_lAr] delete try(stem_suffix_chain_before_ki))
 338                                         or
 339                                         (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 340
 341                                 ))
 342                                 or
 343                                 (mark_nUn] delete try([   // -(n)Un -ki: remove both, then try the continuations below
 344                                         (mark_lArI] delete)
 345                                         or
 346                                         ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 347                                         or
 348                                         (stem_suffix_chain_before_ki)
 349                                 ))
 350                                 or
 351                                 (mark_ndA (   // -ndA -ki: one of the continuations below is required, there is no try here
 352                                         (mark_lArI] delete)
 353                                         or
 354                                         ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
 355                                         or
 356                                         (stem_suffix_chain_before_ki)   // NOTE(review): the recursive call reopens the slice, so -ndA -ki itself does not appear to be deleted on this path
 357                                 ))
 358                         )
 359         )
 360
 361         define stem_noun_suffixes as (   // removes a noun suffix chain; alternatives are tried in order, first success wins
 362                 ([mark_lAr] delete try(stem_suffix_chain_before_ki))   // -lAr, then optionally a -ki chain before it
 363                 or
 364                 ([mark_ncA] delete   // -(n)cA, then optionally one of the continuations below
 365                         try(
 366                                 ([mark_lArI] delete)
 367                                 or
 368                                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 369                                 or
 370                                 ([mark_lAr] delete stem_suffix_chain_before_ki)
 371                         )
 372                 )
 373                 or
 374                 ([(mark_ndA or mark_nA)   // -ndA / -nA: one of the continuations below is required
 375                         (
 376                                 (mark_lArI] delete)
 377                                 or
 378                                 (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 379                                 or
 380                                 (stem_suffix_chain_before_ki)
 381                         )
 382                 )
 383                 or
 384                 ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))   // NOTE(review): the mark_lArI alternative has no ] delete, so it succeeds without removing anything; confirm this is intended
 385                 or
 386                 ( [mark_DAn] delete try ([   // -DAn, then optionally one of the continuations below
 387                         (
 388                                 (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 389                                 or
 390                                 (mark_lAr] delete try(stem_suffix_chain_before_ki))
 391                                 or
 392                                 (stem_suffix_chain_before_ki)
 393                         ))
 394                 )
 395                 or
 396                 ([mark_nUn or mark_ylA] delete   // -(n)Un / -(y)lA, then optionally one of the continuations below
 397                         try(
 398                                 ([mark_lAr] delete stem_suffix_chain_before_ki)
 399                                 or
 400                                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
 401                                 or
 402                                 stem_suffix_chain_before_ki
 403                         )
 404                 )
 405                 or
 406                 ([mark_lArI] delete)   // -lArI on its own
 407                 or
 408                 (stem_suffix_chain_before_ki)   // a -ki chain on its own
 409                 or
 410                 ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))   // -DA / -(y)U / -(y)A, then optionally a possessive and/or -lAr followed by a -ki chain
 411                 or
 412                 ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))   // a possessive or -(s)U, then optionally -lAr followed by a -ki chain
 413         )
 414
 415         define post_process_last_consonants as (   // if the text before the cursor ends in b, c, d or {g~}, replace that letter; fails otherwise
 416                 [substring] among (                // substring finds the matching entry and the slice is set around it
 417                         'b' (<- 'p')
 418                         'c' (<- '{c,}')
 419                         'd' (<- 't')
 420                         '{g~}' (<- 'k')
 421                 )
 422         )
 423
 424         // After stemming, a word that ends with 'd' or 'g' has most probably had its last U overstemmed,
 425         // as in 'kedim' -> 'ked'.
 426         // Turkish words don't usually end with 'd' or 'g'.
 427         // Some very well known words are excluded by the caller (like 'ad', 'soyad').
 428         // Appends a U vowel to stems ending with 'd' or 'g'; the vowel to add is chosen
 429         // based on the last vowel in the stem.
 430         define append_U_to_stems_ending_with_d_or_g as (
 431                 test('d' or 'g')   // only a test: the cursor stays where it was
 432                 (test((goto vowel) 'a' or '{i'}') <+ '{i'}')   // last vowel a/{i'}: insert {i'} at the cursor
 433                 or
 434                 (test((goto vowel) 'e' or 'i') <+ 'i')         // last vowel e/i: insert i
 435                 or
 436                 (test((goto vowel) 'o' or 'u') <+ 'u')         // last vowel o/u: insert u
 437                 or
 438                 (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')   // last vowel {o"}/{u"}: insert {u"}
 439         )
 440
 441         define is_reserved_word as (   // succeeds only if the text before the cursor is exactly 'ad' or 'soyad'
 442                 'ad' try 'soy' atlimit  // match 'ad', optionally 'soy' before it, and then require the start of the word
 443         )
 444 )
 445
 446 // Tests whether the word has more than one syllable.
 447 // In Turkish each vowel indicates a distinct syllable, so it is enough to count vowels.
 448 define more_than_one_syllable_word as (
 449         test (atleast 2 (gopast vowel))   // at least two vowels must be found; test restores the cursor
 450 )
 451
 452 define postlude as (   // fix-ups on the end of the stemmed word
 453         backwards (
 454                 not(is_reserved_word)   // 'ad' and 'soyad' are left as they are; postlude fails for them
 455                 do append_U_to_stems_ending_with_d_or_g   // do: always succeeds, even if the routine does not apply
 456                 do post_process_last_consonants
 457
 458         )
 459 )
 460
 461 define stem as (   // exported entry point
 462         (more_than_one_syllable_word)   // words with fewer than two vowels are not stemmed at all
 463         (
 464                 backwards (
 465                         do stem_nominal_verb_suffixes
 466                         continue_stemming_noun_suffixes   // tests the flag: if it was unset (the -lAr branch of stem_nominal_verb_suffixes), this block fails and both stem_noun_suffixes and postlude are skipped
 467                         do stem_noun_suffixes
 468                 )
 469
 470         postlude
 471         )
 472 )