1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/* Non-zero when c lies in 'a'..'z' or 'A'..'Z'. The argument is expanded
 * up to four times, so do not pass an expression with side effects. */
9 #define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
/* Non-zero when c is a space or a horizontal tab; no other whitespace
 * character (e.g. newline) is matched. The argument is expanded twice. */
10 #define th_isspace(c) ((c)==' '||(c)=='\t')
14 /////////////////////////////////////////////////
15 // Thai character type array
/* Element type of the _TwbType table: a set of character-class flag bits
 * that the break rules test with '&'. */
18 typedef unsigned short twb_t
;
/* Character-class flag table with 0x100-0xa0 entries. It is read through
 * the twbtype() macro, which indexes it with th_zcode(c). */
19 extern const twb_t _TwbType
[0x100-0xa0];
/* Mask that matches any of the VLA, VLO or VLI flags. */
52 #define VL (VLA|VLO|VLI)
/* Mask that matches any of the VRS, VRE or VRX flags. */
53 #define VR (VRS|VRE|VRX)
/* Fetches the flag word for character c from _TwbType. th_zcode() is
 * defined elsewhere (presumably it maps a character code to a zero-based
 * table index -- confirm). Every use in this file is guarded by a
 * th_isthai(c) check first. */
61 #define twbtype(c) (_TwbType[th_zcode(c)])
/* Expands to a plain `return (b)`. */
67 #define RETURN(b) return (b)
71 /////////////////////////////////////////////////
/*
 * TrbWordBreakPos -- apply the rule table to the boundary that sits just
 * before rstr[0].
 *
 *   pstr, left   text preceding the boundary; lstr is set to pstr + left.
 *   rstr, right  text following the boundary and the number of characters
 *                the rules may look at there.
 *
 * Returns -1 at once when left < 0, when right < 1, when rstr[0] is not
 * accepted by th_isthai(), or when its type lacks the A flag. Otherwise the
 * rules below return -1, 0, 1 or 2. The caller in this file advances by the
 * result only when it is > 0; presumably 0 means "break here" and -1 means
 * "no break here" -- TODO confirm against the full source.
 *
 * NOTE(review): this listing is missing lines (braces, the declarations of
 * _c, _t, i and j, and several statements); read it next to the complete
 * file before changing any rule.
 */
74 int TrbWordBreakPos(const th_char
*pstr
, int left
,
75 const th_char
*rstr
, int right
)
76 /* const ThBreakIterator *it, const th_char **p)*/
80 //const th_char *s = *p;
82 const th_char
*lstr
= pstr
+ left
;
/* c(i) / t(i): character and type flags at signed offset i from the
 * boundary. The +3 bias lets offsets down to -3 index the backing arrays
 * _c and _t, which are declared on lines not shown here. */
85 #define c(i) (_c[(i)+3])
86 #define t(i) (_t[(i)+3])
90 //left = s - it->begin;
92 if(left
< 0) return -1;
94 //right = (it->end == NULL) ? 4 : it->begin - s;
96 if(right
< 1) return -1;
101 c(0) = rstr
[0]; /* may be '\0' */
102 if(!th_isthai(c(0))) return -1;
103 t(0) = twbtype(c(0));
104 if(!(t(0) & A
)) return -1;
111 if(!th_isthai(c(-1))) return 0;
112 t(-1) = twbtype(c(-1));
113 if(!(t(-1) & A
)) return 0; /* handle punctuation marks here */
114 } else { c(-1) = 0; t(-1) = 0; }
117 // get c(1..2), t(1..2)
119 for(i
= 1; i
<= 2; i
++) {
120 if(i
>= right
) { c(i
) = 0; t(i
) = 0; }
122 c(i
) = rstr
[i
]; /* may be '\0'; */
123 if(!th_isthai(c(i
))) right
= i
--;
125 t(i
) = twbtype(c(i
));
126 if(!(t(i
) & A
)) right
= i
--;
131 // get c(-2..-3), t(-2..-3)
133 for(i
= -2, j
= -2; i
>= -3 ; j
--) {
134 if(j
< -left
) { c(i
) = 0; t(i
) = 0; i
--; }
137 if(!th_isthai(c(i
))) left
= 0;
139 t(i
) = (twb_t
)(th_isthai(c(i
)) ? twbtype(c(i
)) : 0);
140 if(!(t(i
) & A
)) left
= 0;
142 if((t(i
+1) & MT
) && ((t(i
) & VR
) || (t(i
+2) & VR
))) {
143 c(i
+1) = c(i
); t(i
+1) = t(i
);
151 // prohibit the unlikely
153 if((t(-1) & C
) && (t(0) & C
)) {
154 if((t(-1) & CHE
) || (t(0) & CHB
)) return -1;
157 // special case : vlao, C/ sara_a|aa, !sara_a
/* NOTE(review): c(-0) expands to the same element as c(0), so the last
 * clause below tests the character to the right of the boundary. If c(-1)
 * was intended this is a typo -- confirm before changing. */
159 if((t(-3) & (VLA
|VLO
)) && (t(-2) & C
) && (c(0) != TH_SARA_A
) &&
160 (c(-1) == TH_SARA_A
|| c(-0) == TH_SARA_AA
)) return 0;
165 if(t(0) & NB
) return -1;
166 if(t(-1) & NE
) return -1;
173 if(c(-2) == TH_SARA_AA
&& c(-1) == TH_SARA_A
) return 0;
174 return -1; /* usually too short syllable, part of word */
177 if(t(-2) & VRE
) return -1;
179 if((t(0) & C
) && (t(1) & (VR
|MT
)) && (c(2) != TH_THANTHAKHAT
)) { /*?C, NB */
180 if((t(-1) & (VRS
|VRX
)) && c(1) == TH_SARA_I
) return -1; /* exception */
181 if(t(-1) & (V
|M
)) return 0; /* !C/ C, NB */
182 if(t(-2) & VRS
) return 0; /* VRS, C / C, NB */
183 if(!(t(0) & C2
) && c(1) == TH_SARA_I
) { /* / !C2 or /c, sara_i */
184 if(t(-2) & VRX
) return 0; /* VRX, C / C, NB ? 100%? */
185 if(t(-2) & VC
) return 0; /* VC, C / C, NB ? 100% */
188 if((t(-1) & VRX
) && (t(0) & CC
)) return 0; /* VRX/ CC */
189 if((t(-2) & VRS
) && (t(-1) & C
) && (t(0) & (V
|M
))) return 0;/* VRS, C/ !C */
192 if((t(0) & CX
) && (t(1) & C2
) && (c(2) != TH_THANTHAKHAT
)) {
193 if((t(-2) & A
) && (t(-1) & CX
)) return 0; /* A, CX / CX, C2 */
194 if((t(-2) & CX
) && (t(-1) & MT
)) return 0; /* CX, MT / CX, C2 */
199 if(t(0) & VL
) return 0;
200 if(t(1) & VL
) return -1;
201 if(c(-1) == TH_THANTHAKHAT
&& c(-2) != TH_RORUA
&& c(-2) != TH_LOLING
) return 0;
208 if((t(-2) & VRS
) && (t(-1) & C
)) return 0; /* VRS, C/ CHE */
209 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
210 if(t(-1) & VC
) return 0; /* VC/ CHE */
213 if((t(0) & C
) && (t(1) & VR
)) return 0; /* CHB/ CC, VR */
214 if(t(0) & VC
) return 0; /* CHB/ VC */
217 if((t(-2) & VL
) && (t(1) & VR
)) { /* VL, C? C, VR */
218 if(t(-2) & VLI
) return 0; /* VLI,C/C,VR .*/
219 else { /* vlao, C ? C , VR */
220 if(c(1) == TH_SARA_A
) return 2; /* vlao, C, C, sara_a/ */
221 if(t(-2) & VLO
) return 0; /* VLO, C/ C, !sara_a */
222 if(!(t(1) & VRA
)) return 0; /* VLA, C/ C, !vca */
226 if((t(-2) & C
) && (t(-1) & MT
) && (t(0) & CX
)) return 1;
/*
 * TrbFollowing -- scan forward from begin + offset and report a position
 * as an index relative to begin (the visible returns are all `w - begin`).
 *
 *   begin   start of the text buffer.
 *   length  number of characters in the buffer; end is begin + length.
 *   offset  index in the buffer where scanning starts.
 *
 * Scanning never passes `end` and also stops at a NUL character. Leading
 * characters that are non-Thai and match th_isspace() are skipped first.
 * A following non-Thai run is consumed up to the next space or Thai
 * character, noting whether it contains a letter (th_isalpha). Thai text
 * is advanced using TrbWordBreakPos(begin, w-begin, w, end-w), and w moves
 * forward only when that result is > 0.
 *
 * NOTE(review): this listing is missing lines (braces, the declaration of
 * `english`, the loop that contains the `break`, and the final return);
 * read it next to the complete file before changing the logic.
 */
232 int TrbFollowing(const th_char
*begin
, int length
, int offset
)
234 //(ThBreakIterator *this, int offset)
237 const th_char
*w
= begin
+ offset
;
238 const th_char
*end
= begin
+ length
;
239 while(w
< end
&& *w
&& !th_isthai(*w
) && th_isspace(*w
)) w
++;
241 if(w
< end
&& *w
&& !th_isthai(*w
)) {
243 while(w
< end
&& *w
&& !th_isthai(*w
) && !th_isspace(*w
)) {
244 if(th_isalpha(*w
)) english
= TRUE
;
247 if(english
|| w
== end
||
248 (!th_isthai(*w
) && th_isspace(*w
))) return w
- begin
;
250 if(w
== end
|| *w
== 0 || !th_isthai(*w
)) return w
- begin
;
252 if(w
< end
&& *w
&& th_isthai(*w
)) {
253 int brk
= TrbWordBreakPos(begin
, w
-begin
, w
, end
-w
);
256 if(w
== end
|| *w
== 0 || !th_isthai(*w
)) break;
257 brk
= TrbWordBreakPos(begin
, w
-begin
, w
, end
-w
);
259 if (brk
> 0) w
+= brk
;
261 if(w
< end
&& *w
&& !th_isthai(*w
)) {
262 while(w
< end
&& *w
&& !th_isthai(*w
) &&
263 !th_isalpha(*w
) && !th_isspace(*w
)) w
++;
270 /////////////////////////////////////////////////
272 const twb_t _TwbType
[0x100-0xa0] = {
275 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
277 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
291 /* ac ¬ */ CC
| CHB
| CHE
,
296 /* b1 ± */ CS
| CHB
| CHE
,
297 /* b2 ² */ CS
| CHB
| CHE
,
314 /* c3 Ã */ CS
| C2
| CHE
, /* ? add CHE */
323 /* CC Ì */ CS
| CHB
| CHE
,
327 /* d0 Ð */ VRE
| VRA
,
329 /* d2 Ò */ VRX
| VRA
,
331 /* d4 Ô */ VRX
| VRA
,
332 /* d5 Õ */ VRX
| VRA
,
334 /* d7 × */ VRS
| VRA
,