Bumping gaia.json for 2 gaia revision(s) a=gaia-bump
[gecko.git] / intl / lwbrk / rulebrk.c
blobf46b220fbcddd3a792fec4ab0f8b295cd43d04e3
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 #define TH_UNICODE
6 #include <stdlib.h>
7 #include <assert.h>
8 #include "th_char.h"
/* ASCII-only helpers: Latin letter test and blank (space or tab) test. */
#define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
#define th_isspace(c) ((c)==' '||(c)=='\t')

/*
 * Thai character type array
 *
 * Each entry of _TwbType is a bit set built from the flags below.  The table
 * has 0x100-0xa0 (96) entries and is indexed through th_zcode() (declared in
 * th_char.h, not shown here).
 * NOTE(review): presumably th_zcode() maps a Thai character to its offset
 * from 0xa0 in a TIS-620 style layout -- confirm against th_char.h.
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * bit definition
 *
 * NOTE(review): the mnemonics are not spelled out in this file.  Judging from
 * the table, VR* / VL* look like vowel classes, C* consonant classes and
 * M / MT marks -- confirm before relying on these readings.
 */
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004

/* Always combined with one of the VR* bits in the table. */
#define VRA 0x0008

#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040

#define VC 0x0080

#define CC 0x0100
#define CS 0x0200

/* Consonant modifiers, always combined with CC, CS or VC in the table.
 * The "prohibit the unlikely" rule refuses a break between two consonants
 * when the left one has CHE or the right one has CHB. */
#define C2 0x0400
#define CHB 0x0800
#define CHE 0x1000

/* Always combined with M in the table. */
#define MT 0x2000
/* (disabled) #define me 0x2000 */

#define M 0x4000

/* Not part of A: a character typed only T makes TrbWordBreakPos return
 * early instead of applying the rules. */
#define T 0x8000

/* Composite masks. */
#define VL (VLA|VLO|VLI)
#define VR (VRS|VRE|VRX)
#define NE (VL|VRS)       /* no break right after a character of this type */
#define NB (VR|M)         /* no break right before a character of this type */
#define V (VL|VR)
#define CX (CC|CS)
#define C (CX|VC)
#define A (C|V|M)         /* every type the break rules operate on */

/* Type bits of character c. */
#define twbtype(c) (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
/* Not used in the visible part of this file. */
#define RETURN(b) return (b)
71 /////////////////////////////////////////////////
74 int TrbWordBreakPos(const th_char *pstr, int left,
75 const th_char *rstr, int right)
76 /* const ThBreakIterator *it, const th_char **p)*/
79 //int left, right;
80 //const th_char *s = *p;
82 const th_char *lstr = pstr + left;
83 th_char _c[6];
84 twb_t _t[6];
85 #define c(i) (_c[(i)+3])
86 #define t(i) (_t[(i)+3])
87 int i, j;
90 //left = s - it->begin;
92 if(left < 0) return -1;
94 //right = (it->end == NULL) ? 4 : it->begin - s;
96 if(right < 1) return -1;
99 // get c(0), t(0)
101 c(0) = rstr[0]; /* may be '\0' */
102 if(!th_isthai(c(0))) return -1;
103 t(0) = twbtype(c(0));
104 if(!(t(0) & A)) return -1;
107 // get c(-1), t(-1)
109 if(left >= 1) {
110 c(-1) = lstr[-1];
111 if(!th_isthai(c(-1))) return 0;
112 t(-1) = twbtype(c(-1));
113 if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
114 } else { c(-1) = 0; t(-1) = 0; }
117 // get c(1..2), t(1..2)
119 for(i = 1; i <= 2; i++) {
120 if(i >= right) { c(i) = 0; t(i) = 0; }
121 else {
122 c(i) = rstr[i]; /* may be '\0'; */
123 if(!th_isthai(c(i))) right = i--;
124 else {
125 t(i) = twbtype(c(i));
126 if(!(t(i) & A)) right = i--;
131 // get c(-2..-3), t(-2..-3)
133 for(i = -2, j = -2; i >= -3 ; j--) {
134 if(j < -left) { c(i) = 0; t(i) = 0; i--; }
135 else {
136 c(i) = lstr[j];
137 if(!th_isthai(c(i))) left = 0;
138 else {
139 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
140 if(!(t(i) & A)) left = 0;
141 else {
142 if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
143 c(i+1) = c(i); t(i+1) = t(i);
144 } else i--;
151 // prohibit the unlikely
153 if((t(-1) & C) && (t(0) & C)) {
154 if((t(-1) & CHE) || (t(0) & CHB)) return -1;
157 // special case : vlao, C/ sara_a|aa, !sara_a
159 if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
160 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
163 // prohibit break
165 if(t(0) & NB) return -1;
166 if(t(-1) & NE) return -1;
170 // apply 100% rules
172 if(t(-1) & VRE) {
173 if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
174 return -1; /* usually too short syllable, part of word */
177 if(t(-2) & VRE) return -1;
179 if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
180 if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
181 if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
182 if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
183 if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
184 if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
185 if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
188 if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
189 if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
192 if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
193 if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
194 if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
197 // apply 90% rules
199 if(t(0) & VL) return 0;
200 if(t(1) & VL) return -1;
201 if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
204 //return -1;
205 // apply 80% rules
207 if(t(0) & CHE) {
208 if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
209 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
210 if(t(-1) & VC) return 0; /* VC/ CHE */
212 if(t(-1) & CHB) {
213 if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
214 if(t(0) & VC) return 0; /* CHB/ VC */
217 if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
218 if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
219 else { /* vlao, C ? C , VR */
220 if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
221 if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
222 if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
225 /* C,MT,C */
226 if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
228 return -1;
232 int TrbFollowing(const th_char *begin, int length, int offset)
234 //(ThBreakIterator *this, int offset)
237 const th_char *w = begin + offset;
238 const th_char *end = begin + length;
239 while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
241 if(w < end && *w && !th_isthai(*w)) {
242 int english = FALSE;
243 while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
244 if(th_isalpha(*w)) english = TRUE;
245 w++;
247 if(english || w == end ||
248 (!th_isthai(*w) && th_isspace(*w))) return w - begin;
250 if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
251 w++;
252 if(w < end && *w && th_isthai(*w)) {
253 int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
254 while (brk < 0) {
255 w++;
256 if(w == end || *w == 0 || !th_isthai(*w)) break;
257 brk = TrbWordBreakPos(begin, w-begin, w, end-w);
259 if (brk > 0) w += brk;
261 if(w < end && *w && !th_isthai(*w)) {
262 while(w < end && *w && !th_isthai(*w) &&
263 !th_isalpha(*w) && !th_isspace(*w)) w++;
265 return w - begin;
270 /////////////////////////////////////////////////
272 const twb_t _TwbType[0x100-0xa0] = {
273 #if 0
274 /* 80 € */ T,
275 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
276 /* 90 � */ T,
277 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
278 #endif
279 /* a0   */ 0,
280 /* a1 ¡ */ CS,
281 /* a2 ¢ */ CS | CHE,
282 /* a3 £ */ CC | CHE,
283 /* a4 ¤ */ CS | CHE,
284 /* a5 ¥ */ CC | CHE,
285 /* a6 ¦ */ CS,
286 /* a7 § */ CS | CHB,
287 /* a8 ¨ */ CS,
288 /* a9 © */ CC | CHE,
289 /* aa ª */ CS,
290 /* ab « */ CC | CHE,
291 /* ac ¬ */ CC | CHB | CHE,
292 /* ad ­ */ CS | CHB,
293 /* ae ® */ CS | CHB,
294 /* af ¯ */ CS | CHB,
295 /* b0 ° */ CS,
296 /* b1 ± */ CS | CHB | CHE,
297 /* b2 ² */ CS | CHB | CHE,
298 /* b3 ³ */ CS | CHB,
299 /* b4 ´ */ CS,
300 /* b5 µ */ CS,
301 /* b6 ¶ */ CS,
302 /* b7 · */ CS,
303 /* b8 ¸ */ CS,
304 /* b9 ¹ */ CS,
305 /* ba º */ CS,
306 /* bb » */ CS,
307 /* bc ¼ */ CC | CHE,
308 /* bd ½ */ CC | CHE,
309 /* be ¾ */ CS,
310 /* bf ¿ */ CS,
311 /* c0 À */ CS | CHE,
312 /* c1 Á */ CS,
313 /* c2 Â */ CS,
314 /* c3 Ã */ CS | C2 | CHE, /* ? add CHE */
315 /* c4 Ä */ VC | CHE,
316 /* c5 Å */ CS | C2,
317 /* c6 Æ */ VC | CHE,
318 /* c7 Ç */ VC | C2,
319 /* c8 È */ CS,
320 /* c9 É */ CS | CHB,
321 /* ca Ê */ CS | CHE,
322 /* cb Ë */ CC | CHE,
323 /* CC Ì */ CS | CHB | CHE,
324 /* cd Í */ VC,
325 /* ce Î */ CC | CHE,
326 /* cf Ï */ T,
327 /* d0 Ð */ VRE | VRA,
328 /* d1 Ñ */ VRS,
329 /* d2 Ò */ VRX | VRA,
330 /* d3 Ó */ VRE,
331 /* d4 Ô */ VRX | VRA,
332 /* d5 Õ */ VRX | VRA,
333 /* d6 Ö */ VRS,
334 /* d7 × */ VRS | VRA,
335 /* d8 Ø */ VRX,
336 /* d9 Ù */ VRX,
337 /* da Ú */ T,
338 /* db Û */ 0,
339 /* dc Ü */ 0,
340 /* dd Ý */ 0,
341 /* de Þ */ 0,
342 /* df ß */ T,
343 /* e0 à */ VLA,
344 /* e1 á */ VLO,
345 /* e2 â */ VLO,
346 /* e3 ã */ VLI,
347 /* e4 ä */ VLI,
348 /* e5 å */ VRE,
349 /* e6 æ */ M,
350 /* e7 ç */ M,
351 /* e8 è */ M | MT,
352 /* e9 é */ M | MT,
353 /* ea ê */ M | MT,
354 /* eb ë */ M | MT,
355 /* ec ì */ M,
356 /* ed í */ T,
357 /* ee î */ T,
358 /* ef ï */ T,
359 /* f0 ð */ T,
360 /* f1 ñ */ T,
361 /* f2 ò */ T,
362 /* f3 ó */ T,
363 /* f4 ô */ T,
364 /* f5 õ */ T,
365 /* f6 ö */ T,
366 /* f7 ÷ */ T,
367 /* f8 ø */ T,
368 /* f9 ù */ T,
369 /* fa ú */ T,
370 /* fb û */ T,
371 /* fc ü */ 0,
372 /* fd ý */ 0,
373 /* fe þ */ 0,
374 /* ff ’ */ 0