Bug 1855375: Basic implementation for Yelp Suggestions r=fluent-reviewers,flod,adw
[gecko.git] / intl / lwbrk / rulebrk.c
blobd7574b929f10d48b53202983ba45d521e57da496
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 #define TH_UNICODE
6 #include <stdlib.h>
7 #include <stdint.h>
8 #include <assert.h>
9 #include "th_char.h"
/*
 * ASCII-only classification helpers used by TrbFollowing.
 * Both macros evaluate their argument more than once, so pass only
 * side-effect-free expressions.
 */
#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
#define th_isspace(c) ((c) == ' ' || (c) == '\t')

/*
 * Thai character type array
 *
 * One twb_t bit mask per table slot; the table has 0x100 - 0xa0 (96) slots
 * and is indexed through twbtype() below.
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100 - 0xa0];

/*
 * Bit definitions.
 *
 * The row ranges quoted below are the _TwbType slots (labelled a0..ff in the
 * table) that carry each bit.  The short class names are inferred from the
 * macro names and the rule comments in TrbWordBreakPos --
 * NOTE(review): confirm against the original rule documentation.
 */
#define VRS 0x0001 /* rows d1, d6, d7 (presumably: vowel written right, "S" subclass) */
#define VRE 0x0002 /* rows d0, d3, e5 (presumably: vowel written right, "E" subclass) */
#define VRX 0x0004 /* rows d2, d4, d5, d8, d9 (presumably: other right vowels) */

#define VRA 0x0008 /* extra flag on rows d0, d2, d4, d5, d7 */

#define VLA 0x0010 /* row e0 */
#define VLO 0x0020 /* rows e1, e2 */
#define VLI 0x0040 /* rows e3, e4 */

#define VC 0x0080 /* rows c4, c6, c7, cd */

#define CC 0x0100 /* consonant rows tagged CC in the table */
#define CS 0x0200 /* consonant rows tagged CS in the table */

#define C2 0x0400 /* rows c3, c5, c7 */
#define CHB 0x0800 /* TrbWordBreakPos refuses a consonant/consonant break before a CHB char */
#define CHE 0x1000 /* TrbWordBreakPos refuses a consonant/consonant break after a CHE char */

#define MT 0x2000 /* rows e8..eb, always combined with M */
/* historical, disabled alias:  #define me 0x2000 */

#define M 0x4000 /* rows e6..ec */

#define T 0x8000 /* rows cf, da, df, ed..fb; not part of the A mask */

/* Composite masks. */
#define VL (VLA | VLO | VLI)
#define VR (VRS | VRE | VRX)
#define NE (VL | VRS) /* TrbWordBreakPos never breaks right after an NE char */
#define NB (VR | M)   /* TrbWordBreakPos never breaks right before an NB char */
#define V (VL | VR)
#define CX (CC | CS)
#define C (CX | VC)
#define A (C | V | M) /* every class the break rules operate on */

/* Type mask of character c; th_zcode() comes from th_char.h. */
#define twbtype(c) (_TwbType[th_zcode(c)])

#ifndef TRUE
#  define TRUE 1
#  define FALSE 0
#endif
#define RETURN(b) return (b)
70 /////////////////////////////////////////////////
73 int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr,
74 int right)
75 /* const ThBreakIterator *it, const th_char **p)*/
78 //int left, right;
79 //const th_char *s = *p;
81 const th_char* lstr = pstr + left;
82 th_char _c[6];
83 twb_t _t[6];
84 #define c(i) (_c[(i) + 3])
85 #define t(i) (_t[(i) + 3])
86 int i, j;
89 //left = s - it->begin;
91 if (left < 0) return -1;
93 //right = (it->end == NULL) ? 4 : it->begin - s;
95 if (right < 1) return -1;
98 // get c(0), t(0)
100 c(0) = rstr[0]; /* may be '\0' */
101 if (!th_isthai(c(0))) return -1;
102 t(0) = twbtype(c(0));
103 if (!(t(0) & A)) return -1;
106 // get c(-1), t(-1)
108 if (left >= 1) {
109 c(-1) = lstr[-1];
110 if (!th_isthai(c(-1))) return 0;
111 t(-1) = twbtype(c(-1));
112 if (!(t(-1) & A)) return 0; /* handle punctuation marks here */
113 } else {
114 c(-1) = 0;
115 t(-1) = 0;
119 // get c(1..2), t(1..2)
121 for (i = 1; i <= 2; i++) {
122 if (i >= right) {
123 c(i) = 0;
124 t(i) = 0;
125 } else {
126 c(i) = rstr[i]; /* may be '\0'; */
127 if (!th_isthai(c(i)))
128 right = i--;
129 else {
130 t(i) = twbtype(c(i));
131 if (!(t(i) & A)) right = i--;
136 // get c(-2..-3), t(-2..-3)
138 for (i = -2, j = -2; i >= -3; j--) {
139 if (j < -left) {
140 c(i) = 0;
141 t(i) = 0;
142 i--;
143 } else {
144 c(i) = lstr[j];
145 if (!th_isthai(c(i)))
146 left = 0;
147 else {
148 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
149 if (!(t(i) & A))
150 left = 0;
151 else {
152 if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) {
153 c(i + 1) = c(i);
154 t(i + 1) = t(i);
155 } else
156 i--;
163 // prohibit the unlikely
165 if ((t(-1) & C) && (t(0) & C)) {
166 if ((t(-1) & CHE) || (t(0) & CHB)) return -1;
169 // special case : vlao, C/ sara_a|aa, !sara_a
171 if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
172 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA))
173 return 0;
176 // prohibit break
178 if (t(0) & NB) return -1;
179 if (t(-1) & NE) return -1;
182 // apply 100% rules
184 if (t(-1) & VRE) {
185 if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
186 return -1; /* usually too short syllable, part of word */
189 if (t(-2) & VRE) return -1;
191 if ((t(0) & C) && (t(1) & (VR | MT)) &&
192 (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
193 if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
194 if (t(-1) & (V | M)) return 0; /* !C/ C, NB */
195 if (t(-2) & VRS) return 0; /* VRS, C / C, NB */
196 if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
197 if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
198 if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
201 if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
202 if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M)))
203 return 0; /* VRS, C/ !C */
205 if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
206 if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
207 if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
210 // apply 90% rules
212 if (t(0) & VL) return 0;
213 if (t(1) & VL) return -1;
214 if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING)
215 return 0;
218 //return -1;
219 // apply 80% rules
221 if (t(0) & CHE) {
222 if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
223 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
224 if (t(-1) & VC) return 0; /* VC/ CHE */
226 if (t(-1) & CHB) {
227 if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
228 if (t(0) & VC) return 0; /* CHB/ VC */
231 if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
232 if (t(-2) & VLI)
233 return 0; /* VLI,C/C,VR .*/
234 else { /* vlao, C ? C , VR */
235 if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
236 if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
237 if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
240 /* C,MT,C */
241 if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
243 return -1;
246 int TrbFollowing(const th_char* begin, int length, int offset)
248 //(ThBreakIterator *this, int offset)
251 const th_char* w = begin + offset;
252 const th_char* end = begin + length;
253 while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
255 if (w < end && *w && !th_isthai(*w)) {
256 int english = FALSE;
257 while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
258 if (th_isalpha(*w)) english = TRUE;
259 w++;
261 if (english || w == end || (!th_isthai(*w) && th_isspace(*w)))
262 return w - begin;
264 if (w == end || *w == 0 || !th_isthai(*w)) return w - begin;
265 w++;
266 if (w < end && *w && th_isthai(*w)) {
267 int brk = TrbWordBreakPos(begin, w - begin, w, end - w);
268 while (brk < 0) {
269 w++;
270 if (w == end || *w == 0 || !th_isthai(*w)) break;
271 brk = TrbWordBreakPos(begin, w - begin, w, end - w);
273 if (brk > 0) w += brk;
275 if (w < end && *w && !th_isthai(*w)) {
276 while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) &&
277 !th_isspace(*w))
278 w++;
280 return w - begin;
284 /////////////////////////////////////////////////
286 const twb_t _TwbType[0x100 - 0xa0] = {
287 #if 0
288 /* 80 € */ T,
289 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
290 /* 90  */ T,
291 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
292 #endif
293 /* a0   */ 0,
294 /* a1 ¡ */ CS,
295 /* a2 ¢ */ CS | CHE,
296 /* a3 £ */ CC | CHE,
297 /* a4 € */ CS | CHE,
298 /* a5 ¥ */ CC | CHE,
299 /* a6 Š */ CS,
300 /* a7 § */ CS | CHB,
301 /* a8 š */ CS,
302 /* a9 © */ CC | CHE,
303 /* aa ª */ CS,
304 /* ab « */ CC | CHE,
305 /* ac ¬ */ CC | CHB | CHE,
306 /* ad ­ */ CS | CHB,
307 /* ae ® */ CS | CHB,
308 /* af ¯ */ CS | CHB,
309 /* b0 ° */ CS,
310 /* b1 ± */ CS | CHB | CHE,
311 /* b2 ² */ CS | CHB | CHE,
312 /* b3 ³ */ CS | CHB,
313 /* b4 Ž */ CS,
314 /* b5 µ */ CS,
315 /* b6 ¶ */ CS,
316 /* b7 · */ CS,
317 /* b8 ž */ CS,
318 /* b9 ¹ */ CS,
319 /* ba º */ CS,
320 /* bb » */ CS,
321 /* bc Œ */ CC | CHE,
322 /* bd œ */ CC | CHE,
323 /* be Ÿ */ CS,
324 /* bf ¿ */ CS,
325 /* c0 À */ CS | CHE,
326 /* c1 Á */ CS,
327 /* c2 Â */ CS,
328 /* c3 Ã */ CS | C2 | CHE, /* ? add CHE */
329 /* c4 Ä */ VC | CHE,
330 /* c5 Å */ CS | C2,
331 /* c6 Æ */ VC | CHE,
332 /* c7 Ç */ VC | C2,
333 /* c8 È */ CS,
334 /* c9 É */ CS | CHB,
335 /* ca Ê */ CS | CHE,
336 /* cb Ë */ CC | CHE,
337 /* CC Ì */ CS | CHB | CHE,
338 /* cd Í */ VC,
339 /* ce Î */ CC | CHE,
340 /* cf Ï */ T,
341 /* d0 Ð */ VRE | VRA,
342 /* d1 Ñ */ VRS,
343 /* d2 Ò */ VRX | VRA,
344 /* d3 Ó */ VRE,
345 /* d4 Ô */ VRX | VRA,
346 /* d5 Õ */ VRX | VRA,
347 /* d6 Ö */ VRS,
348 /* d7 × */ VRS | VRA,
349 /* d8 Ø */ VRX,
350 /* d9 Ù */ VRX,
351 /* da Ú */ T,
352 /* db Û */ 0,
353 /* dc Ü */ 0,
354 /* dd Ý */ 0,
355 /* de Þ */ 0,
356 /* df ß */ T,
357 /* e0 à */ VLA,
358 /* e1 á */ VLO,
359 /* e2 â */ VLO,
360 /* e3 ã */ VLI,
361 /* e4 ä */ VLI,
362 /* e5 å */ VRE,
363 /* e6 æ */ M,
364 /* e7 ç */ M,
365 /* e8 è */ M | MT,
366 /* e9 é */ M | MT,
367 /* ea ê */ M | MT,
368 /* eb ë */ M | MT,
369 /* ec ì */ M,
370 /* ed í */ T,
371 /* ee î */ T,
372 /* ef ï */ T,
373 /* f0 ð */ T,
374 /* f1 ñ */ T,
375 /* f2 ò */ T,
376 /* f3 ó */ T,
377 /* f4 ô */ T,
378 /* f5 õ */ T,
379 /* f6 ö */ T,
380 /* f7 ÷ */ T,
381 /* f8 ø */ T,
382 /* f9 ù */ T,
383 /* fa ú */ T,
384 /* fb û */ T,
385 /* fc ü */ 0,
386 /* fd ý */ 0,
387 /* fe þ */ 0,
388 /* ff ’ */ 0};