4 * - Assem Chelli, < assem [dot] ch [at] gmail >
5 * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
11 /* Arabic letters and diacritics, each bound to its Unicode code point (hex) */
13 stringdef o hex '621' // Hamza
14 stringdef ao hex '623' // Alef with Hamza above
15 stringdef ao_ hex '625' // Alef with Hamza below
16 stringdef a~ hex '622' // Alef with Madda above
17 stringdef wo hex '624' // Waw with Hamza above
18 stringdef yo hex '626' // Yeh with Hamza above
21 stringdef a hex '627' // Alef
22 stringdef a_ hex '649' // Alef Maksura
23 stringdef b hex '628' // Beh
24 stringdef t_ hex '629' // Teh Marbuta
25 stringdef t hex '62a' // Teh
26 stringdef th hex '62b' // Theh
27 stringdef j hex '62c' // Jeem
28 stringdef h hex '62d' // Hah
29 stringdef x hex '62e' // Khah
30 stringdef d hex '62f' // Dal
31 stringdef dz hex '630' // Thal
32 stringdef r hex '631' // Reh
33 stringdef z hex '632' // Zain
34 stringdef s hex '633' // Seen
35 stringdef sh hex '634' // Sheen
36 stringdef c hex '635' // Sad
37 stringdef dh hex '636' // Dad
38 stringdef tt hex '637' // Tah
39 stringdef zh hex '638' // Zah
40 stringdef i hex '639' // Ain
41 stringdef gh hex '63a' // Ghain
42 stringdef f hex '641' // Feh
43 stringdef q hex '642' // Qaf
44 stringdef k hex '643' // Kaf
45 stringdef l hex '644' // Lam
46 stringdef m hex '645' // Meem
47 stringdef n hex '646' // Noon
48 stringdef e hex '647' // Heh
49 stringdef w hex '648' // Waw
50 stringdef y hex '64a' // Yeh
53 stringdef aan hex '64b' // Fathatan (diacritic)
54 stringdef uun hex '64c' // Dammatan (diacritic)
55 stringdef iin hex '64d' // Kasratan (diacritic)
56 stringdef aa hex '64e' // Fatha (diacritic)
57 stringdef uu hex '64f' // Damma (diacritic)
58 stringdef ii hex '650' // Kasra (diacritic)
59 stringdef oo hex '652' // Sukun (diacritic)
60 stringdef ~ hex '651' // Shadda (diacritic)
62 // Arabic-Indic digits (U+0660..U+0669) and the Arabic numeric signs
63 stringdef 0 hex '0660' // Arabic-Indic digit zero
64 stringdef 1 hex '0661' // Arabic-Indic digit one
65 stringdef 2 hex '0662' // Arabic-Indic digit two
66 stringdef 3 hex '0663' // Arabic-Indic digit three
67 stringdef 4 hex '0664' // Arabic-Indic digit four
68 stringdef 5 hex '0665' // Arabic-Indic digit five
69 stringdef 6 hex '0666' // Arabic-Indic digit six
70 stringdef 7 hex '0667' // Arabic-Indic digit seven
71 stringdef 8 hex '0668' // Arabic-Indic digit eight
72 stringdef 9 hex '0669' // Arabic-Indic digit nine
73 stringdef % hex '066a' // Arabic percent sign
74 stringdef . hex '066b' // Arabic decimal separator
75 stringdef ' hex '066c' // Arabic thousands separator
78 stringdef _ hex '640' // Kasheeda (Tatweel), the elongation character
81 stringdef , hex '060C' // Arabic comma
82 stringdef ; hex '061B' // Arabic semicolon
83 stringdef ? hex '061F' // Arabic question mark
86 stringdef o1 hex 'fe80' // HAMZA, isolated form
87 stringdef ao1 hex 'fe83' // ALEF_HAMZA_ABOVE, isolated form
88 stringdef ao2 hex 'fe84' // ALEF_HAMZA_ABOVE, final form
89 stringdef ao_1 hex 'fe87' // ALEF_HAMZA_BELOW, isolated form
90 stringdef ao_2 hex 'fe88' // ALEF_HAMZA_BELOW, final form
91 stringdef yo1 hex 'fe8b' // YEH_HAMZA, initial form
92 stringdef yo2 hex 'fe8c' // YEH_HAMZA, medial form
93 stringdef yo3 hex 'fe89' // YEH_HAMZA, isolated form
94 stringdef yo4 hex 'fe8a' // YEH_HAMZA, final form
95 stringdef a~1 hex 'fe81' // ALEF_MADDA, isolated form
96 stringdef a~2 hex 'fe82' // ALEF_MADDA, final form
97 stringdef wo1 hex 'fe85' // WAW_HAMZA, isolated form
98 stringdef wo2 hex 'fe86' // WAW_HAMZA, final form
99 stringdef a1 hex 'fe8d' // ALEF, isolated form
100 stringdef a2 hex 'fe8e' // ALEF, final form
101 stringdef b1 hex 'fe8f' // BEH, isolated form (note: BEH is numbered isolated/final/initial/medial)
102 stringdef b2 hex 'fe90' // BEH, final form
103 stringdef b3 hex 'fe91' // BEH, initial form
104 stringdef b4 hex 'fe92' // BEH, medial form
105 stringdef t_1 hex 'fe93' // TEH_MARBUTA, isolated form
106 stringdef t_2 hex 'fe94' // TEH_MARBUTA, final form
107 stringdef t1 hex 'fe97' // TEH, initial form
108 stringdef t2 hex 'fe98' // TEH, medial form
109 stringdef t3 hex 'fe95' // TEH, isolated form
110 stringdef t4 hex 'fe96' // TEH, final form
111 stringdef th1 hex 'fe9b' // THEH, initial form
112 stringdef th2 hex 'fe9c' // THEH, medial form
113 stringdef th3 hex 'fe9a' // THEH, final form (note: 3/4 are final/isolated here, the reverse of the other letters)
114 stringdef th4 hex 'fe99' // THEH, isolated form
115 stringdef j1 hex 'fe9f' // JEEM, initial form
116 stringdef j2 hex 'fea0' // JEEM, medial form
117 stringdef j3 hex 'fe9d' // JEEM, isolated form
118 stringdef j4 hex 'fe9e' // JEEM, final form
119 stringdef h1 hex 'fea3' // HAH, initial form
120 stringdef h2 hex 'fea4' // HAH, medial form
121 stringdef h3 hex 'fea1' // HAH, isolated form
122 stringdef h4 hex 'fea2' // HAH, final form
123 stringdef x1 hex 'fea7' // KHAH, initial form
124 stringdef x2 hex 'fea8' // KHAH, medial form
125 stringdef x3 hex 'fea5' // KHAH, isolated form
126 stringdef x4 hex 'fea6' // KHAH, final form
127 stringdef d1 hex 'fea9' // DAL, isolated form
128 stringdef d2 hex 'feaa' // DAL, final form
129 stringdef dz1 hex 'feab' // THAL, isolated form
130 stringdef dz2 hex 'feac' // THAL, final form
131 stringdef r1 hex 'fead' // REH, isolated form
132 stringdef r2 hex 'feae' // REH, final form
133 stringdef z1 hex 'feaf' // ZAIN, isolated form
134 stringdef z2 hex 'feb0' // ZAIN, final form
135 stringdef s1 hex 'feb3' // SEEN, initial form
136 stringdef s2 hex 'feb4' // SEEN, medial form
137 stringdef s3 hex 'feb1' // SEEN, isolated form
138 stringdef s4 hex 'feb2' // SEEN, final form
139 stringdef sh1 hex 'feb7' // SHEEN, initial form
140 stringdef sh2 hex 'feb8' // SHEEN, medial form
141 stringdef sh3 hex 'feb5' // SHEEN, isolated form
142 stringdef sh4 hex 'feb6' // SHEEN, final form
143 stringdef c1 hex 'febb' // SAD, initial form
144 stringdef c2 hex 'febc' // SAD, medial form
145 stringdef c3 hex 'feb9' // SAD, isolated form
146 stringdef c4 hex 'feba' // SAD, final form
147 stringdef dh1 hex 'febf' // DAD, initial form
148 stringdef dh2 hex 'fec0' // DAD, medial form
149 stringdef dh3 hex 'febd' // DAD, isolated form
150 stringdef dh4 hex 'febe' // DAD, final form
151 stringdef tt1 hex 'fec3' // TAH, initial form
152 stringdef tt2 hex 'fec4' // TAH, medial form
153 stringdef tt3 hex 'fec1' // TAH, isolated form
154 stringdef tt4 hex 'fec2' // TAH, final form
155 stringdef zh1 hex 'fec7' // ZAH, initial form
156 stringdef zh2 hex 'fec8' // ZAH, medial form
157 stringdef zh3 hex 'fec5' // ZAH, isolated form
158 stringdef zh4 hex 'fec6' // ZAH, final form
159 stringdef i1 hex 'fecb' // AIN, initial form
160 stringdef i2 hex 'fecc' // AIN, medial form
161 stringdef i3 hex 'fec9' // AIN, isolated form
162 stringdef i4 hex 'feca' // AIN, final form
163 stringdef gh1 hex 'fecf' // GHAIN, initial form
164 stringdef gh2 hex 'fed0' // GHAIN, medial form
165 stringdef gh3 hex 'fecd' // GHAIN, isolated form
166 stringdef gh4 hex 'fece' // GHAIN, final form
167 stringdef f1 hex 'fed3' // FEH, initial form
168 stringdef f2 hex 'fed4' // FEH, medial form
169 stringdef f3 hex 'fed1' // FEH, isolated form
170 stringdef f4 hex 'fed2' // FEH, final form
171 stringdef q1 hex 'fed7' // QAF, initial form
172 stringdef q2 hex 'fed8' // QAF, medial form
173 stringdef q3 hex 'fed5' // QAF, isolated form
174 stringdef q4 hex 'fed6' // QAF, final form
175 stringdef k1 hex 'fedb' // KAF, initial form
176 stringdef k2 hex 'fedc' // KAF, medial form
177 stringdef k3 hex 'fed9' // KAF, isolated form
178 stringdef k4 hex 'feda' // KAF, final form
179 stringdef l1 hex 'fedf' // LAM, initial form
180 stringdef l2 hex 'fee0' // LAM, medial form
181 stringdef l3 hex 'fedd' // LAM, isolated form
182 stringdef l4 hex 'fede' // LAM, final form
183 stringdef m1 hex 'fee3' // MEEM, initial form
184 stringdef m2 hex 'fee4' // MEEM, medial form
185 stringdef m3 hex 'fee1' // MEEM, isolated form
186 stringdef m4 hex 'fee2' // MEEM, final form
187 stringdef n1 hex 'fee7' // NOON, initial form
188 stringdef n2 hex 'fee8' // NOON, medial form
189 stringdef n3 hex 'fee5' // NOON, isolated form
190 stringdef n4 hex 'fee6' // NOON, final form
191 stringdef e1 hex 'feeb' // HEH, initial form
192 stringdef e2 hex 'feec' // HEH, medial form
193 stringdef e3 hex 'fee9' // HEH, isolated form
194 stringdef e4 hex 'feea' // HEH, final form
195 stringdef w1 hex 'feed' // WAW, isolated form
196 stringdef w2 hex 'feee' // WAW, final form
197 stringdef a_1 hex 'feef' // ALEF_MAKSURA, isolated form
198 stringdef a_2 hex 'fef0' // ALEF_MAKSURA, final form
199 stringdef y1 hex 'fef3' // YEH, initial form
200 stringdef y2 hex 'fef4' // YEH, medial form
201 stringdef y3 hex 'fef1' // YEH, isolated form
202 stringdef y4 hex 'fef2' // YEH, final form
204 // Lam-Alef ligatures (single code points that stand for the two letters lam + alef)
205 stringdef la hex 'fefb' // LAM_ALEF, isolated form
206 stringdef la2 hex 'fefc' // LAM_ALEF, final form
207 stringdef lao hex 'fef7' // LAM_ALEF_HAMZA_ABOVE, isolated form
208 stringdef lao2 hex 'fef8' // LAM_ALEF_HAMZA_ABOVE, final form
209 stringdef lao_ hex 'fef9' // LAM_ALEF_HAMZA_BELOW, isolated form
210 stringdef lao_2 hex 'fefa' // LAM_ALEF_HAMZA_BELOW, final form
211 stringdef la~ hex 'fef5' // LAM_ALEF_MADDA_ABOVE, isolated form
212 stringdef la~2 hex 'fef6' // LAM_ALEF_MADDA_ABOVE, final form
234 Suffix_All_alef_maqsura
259 define Normalize_pre as ( // pre-stemming cleanup: delete marks and punctuation, fold presentation forms to base letters
263 '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization (tanween, short vowels, sukun, shadda)
264 '{_}' ( delete ) // strip kasheeda (tatweel)
267 '.' ',' ';' ':' '?' '!' '/' '*' '%' '\' '"' ( delete) // general (ASCII) punctuation
268 '{,}' '{;}' '{?}' ( delete ) // Arabic-specific comma, semicolon and question mark
270 // Hindu–Arabic numerals
281 '{%}' '{.}' '{'}' ( delete ) // strip the Arabic percent sign, decimal separator and thousands separator
284 '{o1}' ( <- '{o}' ) // HAMZA
285 '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
286 '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
287 '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
288 '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
289 '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
290 '{a1}' '{a2}' ( <- '{a}' ) // ALEF
291 '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
292 '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
293 '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
294 '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
295 '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
296 '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
297 '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
298 '{d1}' '{d2}' ( <- '{d}' ) // DAL
299 '{dz1}''{dz2}' ( <- '{dz}' ) // THAL
300 '{r1}' '{r2}'( <- '{r}' ) // REH
301 '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
302 '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
303 '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
304 '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
305 '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
306 '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
307 '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
308 '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
309 '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
310 '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
311 '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
312 '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
313 '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
314 '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
315 '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
316 '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
317 '{w1}' '{w2}' ( <- '{w}' ) // WAW
318 '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
319 '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
321 // Lam-Alef ligatures: each ligature code point is expanded into lam followed by the matching alef
322 '{la}' '{la2}' (<- '{l}{a}') // LAM_ALEF
323 '{lao}' '{lao2}' (<- '{l}{ao}') // LAM_ALEF_HAMZA_ABOVE
324 '{lao_}' '{lao_2}' (<- '{l}{ao_}') // LAM_ALEF_HAMZA_BELOW
325 '{la~}' '{la~2}' (<- '{l}{a~}') // LAM_ALEF_MADDA_ABOVE
334 define Normalize_post as ( // post-stemming normalization of the hamza-carrying alef forms
337 // normalize last hamza: the alef forms below become a bare hamza
340 '{ao}''{ao_}' '{a~}' ( <- '{o}')
349 // normalize the other hamzas: the same alef forms become a plain alef
351 '{ao}''{ao_}' '{a~}' ( <- '{a}')
365 '{b}{a}{l}' '{k}{a}{l}' ($word_len > 4 set is_noun unset is_verb set is_defined)
366 '{l}{l}' '{a}{l}' ($word_len > 3 set is_noun unset is_verb set is_defined)
372 define Prefix_Step1 as ( // when word_len > 3, a two-letter sequence led by alef-hamza-above is replaced by a single letter
375 '{ao}{ao}' ($word_len > 3 <- '{ao}' ) // doubled alef-hamza-above -> single one
376 '{ao}{a~}' ($word_len > 3 <- '{a~}' ) // keeps the alef madda
377 '{ao}{wo}' ($word_len > 3 <- '{ao}' ) // waw-hamza is dropped, alef-hamza-above is kept
378 '{ao}{a}' ($word_len > 3 <- '{a}' ) // keeps the plain alef
379 '{ao}{ao_}' ($word_len > 3 <- '{ao_}' ) // keeps the alef-hamza-below
380 // '{ao}' ($word_len > 3 delete) // disabled: rare case
384 define Prefix_Step2 as ( // deletes a single feh or waw when word_len > 3
389 '{f}' ($word_len > 3 delete) // feh
390 '{w}' ($word_len > 3 delete) // waw
394 define Prefix_Step3a_Noun as ( // the word is a noun and is defined: delete the matched letters
397 '{b}{a}{l}' '{k}{a}{l}' ($word_len > 5 delete) // beh/kaf + alef + lam; needs word_len > 5
398 '{l}{l}' '{a}{l}' ($word_len > 4 delete) // lam + lam or alef + lam; needs word_len > 4
402 define Prefix_Step3b_Noun as ( // the word is probably a noun and defined
404 not '{b}{a}' // exception: beh + alef must not match here
406 '{b}' ($word_len > 3 delete) // a single beh is deleted
407 // '{k}' '{l}' ($word_len > 3 delete) // disabled (BUG: causes confusion)
408 '{b}{b}' ($word_len > 3 <- '{b}' ) // doubled beh -> single beh
409 '{k}{k}' ($word_len > 3 <- '{k}' ) // doubled kaf -> single kaf
414 define Prefix_Step3_Verb as ( // when word_len > 4, seen followed by yeh/teh/noon/alef-hamza-above loses the seen
417 //'{s}' ($word_len > 4 delete)// disabled (BUG: causes confusion)
418 '{s}{y}' ($word_len > 4 <- '{y}' ) // seen + yeh -> yeh
419 '{s}{t}' ($word_len > 4 <- '{t}') // seen + teh -> teh
420 '{s}{n}' ($word_len > 4 <- '{n}') // seen + noon -> noon
421 '{s}{ao}' ($word_len > 4 <- '{ao}') // seen + alef-hamza-above -> alef-hamza-above
425 define Prefix_Step4_Verb as ( // rewrites yeh/noon/teh + seen + teh as alef + seen + teh and flags the word as a verb
428 '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($word_len > 4 set is_verb unset is_noun <- '{a}{s}{t}' ) // only when word_len > 4
435 define Suffix_Noun_Step1a as ( // deletes the listed letter groups; the minimum word_len grows with the group length
438 '{y}' '{k}' '{e}' ($word_len >= 4 delete) // one letter: word_len >= 4
439 '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($word_len >= 5 delete) // two letters: word_len >= 5
440 '{k}{m}{a}' '{e}{m}{a}' ($word_len >= 6 delete) // three letters: word_len >= 6
443 define Suffix_Noun_Step1b as ( // deletes a noon when word_len > 5
446 '{n}' ($word_len > 5 delete) // strict '>' here, i.e. at least 6
450 define Suffix_Noun_Step2a as ( // deletes an alef, yeh or waw when word_len > 4
453 '{a}' '{y}' '{w}' ($word_len > 4 delete) // strict '>' here, i.e. at least 5
457 define Suffix_Noun_Step2b as ( // deletes alef + teh when word_len >= 5
460 '{a}{t}' ($word_len >= 5 delete) // alef + teh
464 define Suffix_Noun_Step2c1 as ( // deletes a teh when word_len >= 4
467 '{t}' ($word_len >= 4 delete) // teh
470 define Suffix_Noun_Step2c2 as ( // feminine teh marbuta (t_)
473 '{t_}' ($word_len >= 4 delete) // deleted when word_len >= 4
476 define Suffix_Noun_Step3 as ( // ya' nisbiya
479 '{y}' ($word_len >= 3 delete) // yeh is deleted when word_len >= 3
483 define Suffix_Verb_Step1 as ( // deletes the listed letter groups; the minimum word_len grows with the group length
486 '{e}' '{k}' ($word_len >= 4 delete) // one letter: word_len >= 4
487 '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($word_len >= 5 delete) // two letters: word_len >= 5
488 '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($word_len >= 6 delete) // three letters: word_len >= 6
491 define Suffix_Verb_Step2a as ( // deletes the listed letter groups, each guarded by its own word_len threshold
494 '{t}' ($word_len >= 4 delete) // teh
495 '{a}' '{n}' '{y}' ($word_len >= 4 delete) // alef, noon or yeh
496 '{n}{a}' '{t}{a}' '{t}{n}' ($word_len >= 5 delete)// past
497 '{a}{n}' '{w}{n}' '{y}{n}' ($word_len > 5 delete) // present (note: strict '> 5', unlike the '>= 5' of the past row)
498 '{t}{m}{a}' ($word_len >= 6 delete) // teh + meem + alef
502 define Suffix_Verb_Step2b as ( // deletes waw + alef or teh + meem when word_len >= 5
505 '{w}{a}' '{t}{m}' ($word_len >= 5 delete) // two-letter groups
510 define Suffix_Verb_Step2c as ( // deletes a waw (word_len >= 4) or teh + meem + waw (word_len >= 6)
513 '{w}' ($word_len >= 4 delete) // waw
514 '{t}{m}{w}' ($word_len >= 6 delete) // teh + meem + waw
518 define Suffix_All_alef_maqsura as ( // rewrites alef maksura as yeh; no word_len or noun/verb condition is applied
521 '{a_}' ( <- '{y}' ) // alef maksura -> yeh (regarded as a spelling error)
522 // '{a_}' ( delete ) // disabled alternative: if noun > 3
523 // '{a_}' ( <- '{a}') // disabled alternative: if verb
529 // set initial values
534 // guess type and properties
537 // normalization pre-stemming
549 (atleast 1 Suffix_Verb_Step1)
550 ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
552 or Suffix_Verb_Step2b
553 or Suffix_Verb_Step2a
563 or (not is_defined Suffix_Noun_Step1a (
565 or Suffix_Noun_Step2b
566 or Suffix_Noun_Step2c1
568 or (Suffix_Noun_Step1b (
570 or Suffix_Noun_Step2b
571 or Suffix_Noun_Step2c1))
572 or (not is_defined Suffix_Noun_Step2a)
573 or (Suffix_Noun_Step2b)
580 // Suffixes for alef maqsura
581 or Suffix_All_alef_maqsura
590 or (is_noun Prefix_Step3b_Noun)
591 or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
595 // normalization post-stemming