1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Nonzero when c is an ASCII letter, upper or lower case.
 * The argument is evaluated more than once: pass a side-effect-free value. */
#define th_isalpha(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z'))
/* Nonzero when c is a blank or a horizontal tab; no other character
 * (newline included) counts as space here.
 * The argument may be evaluated twice: pass a side-effect-free value. */
#define th_isspace(c) ((c) == '\t' || (c) == ' ')
14 /////////////////////////////////////////////////
15 // Thai character type array
/* Bit-mask type holding the type flags of one character (one table entry). */
typedef unsigned short int twb_t;
19 extern const twb_t _TwbType
[0x100 - 0xa0];
/* Mask matching any of the VLA, VLO or VLI type flags. */
#define VL (VLI | VLO | VLA)
/* Mask matching any of the VRS, VRE or VRX type flags. */
#define VR (VRX | VRE | VRS)
/* Type flags of character ch: the _TwbType entry selected by th_zcode(ch).
 * NOTE(review): th_zcode is defined outside this view; the callers in this
 * file test th_isthai(ch) before using this macro -- confirm that the check
 * is what keeps the index inside the table. */
#define twbtype(ch) (_TwbType[th_zcode(ch)])
/* Expands to a plain `return (value)`; the caller supplies the semicolon. */
#define RETURN(value) return (value)
70 /////////////////////////////////////////////////
73 int TrbWordBreakPos(const th_char
* pstr
, int left
, const th_char
* rstr
,
75 /* const ThBreakIterator *it, const th_char **p)*/
79 //const th_char *s = *p;
81 const th_char
* lstr
= pstr
+ left
;
84 #define c(i) (_c[(i) + 3])
85 #define t(i) (_t[(i) + 3])
89 //left = s - it->begin;
91 if (left
< 0) return -1;
93 //right = (it->end == NULL) ? 4 : it->begin - s;
95 if (right
< 1) return -1;
100 c(0) = rstr
[0]; /* may be '\0' */
101 if (!th_isthai(c(0))) return -1;
102 t(0) = twbtype(c(0));
103 if (!(t(0) & A
)) return -1;
110 if (!th_isthai(c(-1))) return 0;
111 t(-1) = twbtype(c(-1));
112 if (!(t(-1) & A
)) return 0; /* handle punctuation marks here */
119 // get c(1..2), t(1..2)
121 for (i
= 1; i
<= 2; i
++) {
126 c(i
) = rstr
[i
]; /* may be '\0'; */
127 if (!th_isthai(c(i
)))
130 t(i
) = twbtype(c(i
));
131 if (!(t(i
) & A
)) right
= i
--;
136 // get c(-2..-3), t(-2..-3)
138 for (i
= -2, j
= -2; i
>= -3; j
--) {
145 if (!th_isthai(c(i
)))
148 t(i
) = (twb_t
)(th_isthai(c(i
)) ? twbtype(c(i
)) : 0);
152 if ((t(i
+ 1) & MT
) && ((t(i
) & VR
) || (t(i
+ 2) & VR
))) {
163 // prohibit the unlikely
165 if ((t(-1) & C
) && (t(0) & C
)) {
166 if ((t(-1) & CHE
) || (t(0) & CHB
)) return -1;
169 // special case : vlao, C/ sara_a|aa, !sara_a
171 if ((t(-3) & (VLA
| VLO
)) && (t(-2) & C
) && (c(0) != TH_SARA_A
) &&
172 (c(-1) == TH_SARA_A
|| c(-0) == TH_SARA_AA
))
178 if (t(0) & NB
) return -1;
179 if (t(-1) & NE
) return -1;
185 if (c(-2) == TH_SARA_AA
&& c(-1) == TH_SARA_A
) return 0;
186 return -1; /* usually too short syllable, part of word */
189 if (t(-2) & VRE
) return -1;
191 if ((t(0) & C
) && (t(1) & (VR
| MT
)) &&
192 (c(2) != TH_THANTHAKHAT
)) { /*?C, NB */
193 if ((t(-1) & (VRS
| VRX
)) && c(1) == TH_SARA_I
) return -1; /* exception */
194 if (t(-1) & (V
| M
)) return 0; /* !C/ C, NB */
195 if (t(-2) & VRS
) return 0; /* VRS, C / C, NB */
196 if (!(t(0) & C2
) && c(1) == TH_SARA_I
) { /* / !C2 or /c, sara_i */
197 if (t(-2) & VRX
) return 0; /* VRX, C / C, NB ? 100%? */
198 if (t(-2) & VC
) return 0; /* VC, C / C, NB ? 100% */
201 if ((t(-1) & VRX
) && (t(0) & CC
)) return 0; /* VRX/ CC */
202 if ((t(-2) & VRS
) && (t(-1) & C
) && (t(0) & (V
| M
)))
203 return 0; /* VRS, C/ !C */
205 if ((t(0) & CX
) && (t(1) & C2
) && (c(2) != TH_THANTHAKHAT
)) {
206 if ((t(-2) & A
) && (t(-1) & CX
)) return 0; /* A, CX / CX, C2 */
207 if ((t(-2) & CX
) && (t(-1) & MT
)) return 0; /* CX, MT / CX, C2 */
212 if (t(0) & VL
) return 0;
213 if (t(1) & VL
) return -1;
214 if (c(-1) == TH_THANTHAKHAT
&& c(-2) != TH_RORUA
&& c(-2) != TH_LOLING
)
222 if ((t(-2) & VRS
) && (t(-1) & C
)) return 0; /* VRS, C/ CHE */
223 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
224 if (t(-1) & VC
) return 0; /* VC/ CHE */
227 if ((t(0) & C
) && (t(1) & VR
)) return 0; /* CHB/ CC, VR */
228 if (t(0) & VC
) return 0; /* CHB/ VC */
231 if ((t(-2) & VL
) && (t(1) & VR
)) { /* VL, C? C, VR */
233 return 0; /* VLI,C/C,VR .*/
234 else { /* vlao, C ? C , VR */
235 if (c(1) == TH_SARA_A
) return 2; /* vlao, C, C, sara_a/ */
236 if (t(-2) & VLO
) return 0; /* VLO, C/ C, !sara_a */
237 if (!(t(1) & VRA
)) return 0; /* VLA, C/ C, !vca */
241 if ((t(-2) & C
) && (t(-1) & MT
) && (t(0) & CX
)) return 1;
246 int TrbFollowing(const th_char
* begin
, int length
, int offset
)
248 //(ThBreakIterator *this, int offset)
251 const th_char
* w
= begin
+ offset
;
252 const th_char
* end
= begin
+ length
;
253 while (w
< end
&& *w
&& !th_isthai(*w
) && th_isspace(*w
)) w
++;
255 if (w
< end
&& *w
&& !th_isthai(*w
)) {
257 while (w
< end
&& *w
&& !th_isthai(*w
) && !th_isspace(*w
)) {
258 if (th_isalpha(*w
)) english
= TRUE
;
261 if (english
|| w
== end
|| (!th_isthai(*w
) && th_isspace(*w
)))
264 if (w
== end
|| *w
== 0 || !th_isthai(*w
)) return w
- begin
;
266 if (w
< end
&& *w
&& th_isthai(*w
)) {
267 int brk
= TrbWordBreakPos(begin
, w
- begin
, w
, end
- w
);
270 if (w
== end
|| *w
== 0 || !th_isthai(*w
)) break;
271 brk
= TrbWordBreakPos(begin
, w
- begin
, w
, end
- w
);
273 if (brk
> 0) w
+= brk
;
275 if (w
< end
&& *w
&& !th_isthai(*w
)) {
276 while (w
< end
&& *w
&& !th_isthai(*w
) && !th_isalpha(*w
) &&
284 /////////////////////////////////////////////////
286 const twb_t _TwbType
[0x100 - 0xa0] = {
289 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
291 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
305 /* ac ¬ */ CC
| CHB
| CHE
,
310 /* b1 ± */ CS
| CHB
| CHE
,
311 /* b2 ² */ CS
| CHB
| CHE
,
328 /* c3 Ã */ CS
| C2
| CHE
, /* ? add CHE */
337 /* CC Ì */ CS
| CHB
| CHE
,
341 /* d0 Ð */ VRE
| VRA
,
343 /* d2 Ò */ VRX
| VRA
,
345 /* d4 Ô */ VRX
| VRA
,
346 /* d5 Õ */ VRX
| VRA
,
348 /* d7 × */ VRS
| VRA
,