2 * Copyright © 2011,2012,2014 Google, Inc.
4 * This is part of HarfBuzz, a text shaping library.
6 * Permission is hereby granted, without written agreement and without
7 * license or royalty fees, to use, copy, modify, and distribute this
8 * software and its documentation for any purpose, provided that the
9 * above copyright notice and the following two paragraphs appear in
10 * all copies of this software.
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16 * DAMAGE.
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
24 * Google Author(s): Behdad Esfahbod
32 #include "hb-open-type.hh"
/* Code unit type: every element of the text is a single 8-bit value. */
37 typedef uint8_t codepoint_t
;
/* Upper bound, in code units, on the length of one encoded sequence; it
 * matches the longest (four-byte) form handled by next () below. */
38 static constexpr unsigned max_len
= 4;
/* Decodes the byte sequence that begins at `text`, looking no further than
 * `end`.  On the error path visible at the bottom, `replacement` is stored
 * through `unicode`.
 * NOTE(review): the success-path store, the return statements, the error
 * branches and the braces are not present in this extract; presumably the
 * decoded value is stored through `unicode` and the advanced pointer is
 * returned -- confirm against the full file. */
40 static const codepoint_t
*
41 next (const codepoint_t
*text
,
42 const codepoint_t
*end
,
43 hb_codepoint_t
*unicode
,
44 hb_codepoint_t replacement
)
46 /* Written to only accept well-formed sequences.
47 * Based on ideas from ICU's U8_NEXT.
48 * Generates one "replacement" for each ill-formed byte. */
/* Read the lead byte and step past it. */
50 hb_codepoint_t c
= *text
++;
/* Lead bytes 0xC2..0xDF introduce a two-byte sequence (0xC0 and 0xC1 fall
 * outside the accepted range). */
54 if (hb_in_range
<hb_codepoint_t
> (c
, 0xC2u
, 0xDFu
)) /* Two-byte */
/* One more byte must be available before `end`.  Subtracting 0x80u and
 * comparing against 0x3Fu as unsigned accepts exactly 0x80..0xBF and leaves
 * that byte's low six bits in t1. */
57 if (likely (text
< end
&&
58 (t1
= text
[0] - 0x80u
) <= 0x3Fu
))
/* Five payload bits from the lead byte, six from the trail byte. */
60 c
= ((c
&0x1Fu
)<<6) | t1
;
/* Lead bytes 0xE0..0xEF introduce a three-byte sequence. */
66 else if (hb_in_range
<hb_codepoint_t
> (c
, 0xE0u
, 0xEFu
)) /* Three-byte */
/* Two trail bytes must be available (1 < end - text), each in 0x80..0xBF. */
69 if (likely (1 < end
- text
&&
70 (t1
= text
[0] - 0x80u
) <= 0x3Fu
&&
71 (t2
= text
[1] - 0x80u
) <= 0x3Fu
))
/* Four payload bits from the lead byte, six from each trail byte. */
73 c
= ((c
&0xFu
)<<12) | (t1
<<6) | t2
;
/* Singles out results below 0x0800 (which would fit a shorter form) and the
 * surrogate range 0xD800..0xDFFF.  The statement this guards is not visible
 * in this extract -- presumably the error path. */
74 if (unlikely (c
< 0x0800u
|| hb_in_range
<hb_codepoint_t
> (c
, 0xD800u
, 0xDFFFu
)))
/* Lead bytes 0xF0..0xF4 introduce a four-byte sequence. */
81 else if (hb_in_range
<hb_codepoint_t
> (c
, 0xF0u
, 0xF4u
)) /* Four-byte */
83 unsigned int t1
, t2
, t3
;
/* Three trail bytes must be available (2 < end - text), each in 0x80..0xBF. */
84 if (likely (2 < end
- text
&&
85 (t1
= text
[0] - 0x80u
) <= 0x3Fu
&&
86 (t2
= text
[1] - 0x80u
) <= 0x3Fu
&&
87 (t3
= text
[2] - 0x80u
) <= 0x3Fu
))
/* Three payload bits from the lead byte, six from each trail byte. */
89 c
= ((c
&0x7u
)<<18) | (t1
<<12) | (t2
<<6) | t3
;
/* Singles out results outside 0x10000..0x10FFFF.  The guarded statement is
 * not visible in this extract -- presumably the error path. */
90 if (unlikely (!hb_in_range
<hb_codepoint_t
> (c
, 0x10000u
, 0x10FFFFu
)))
/* Error result: hand back the caller-supplied replacement value. */
105 *unicode
= replacement
;
/* Backward counterpart of next (): works on the sequence that ends just
 * before `text`, never looking before `start`.
 * NOTE(review): the loop body and both return statements are not present in
 * this extract; confirm the returned pointers against the full file. */
109 static const codepoint_t
*
110 prev (const codepoint_t
*text
,
111 const codepoint_t
*start
,
112 hb_codepoint_t
*unicode
,
113 hb_codepoint_t replacement
)
/* `end` keeps the incoming position; `text` steps back onto the last byte. */
115 const codepoint_t
*end
= text
--;
/* Continue while `start` has not been reached, the current byte has the
 * 10xxxxxx bit pattern ((b & 0xc0) == 0x80), and fewer than four bytes lie
 * between `text` and `end`.  The loop body is not visible here; presumably
 * it steps `text` back by one. */
116 while (start
< text
&& (*text
& 0xc0) == 0x80 && end
- text
< 4)
/* Decode forwards from the candidate start; the expected case is that the
 * decoder stops exactly at `end`. */
119 if (likely (next (text
, end
, unicode
, replacement
) == end
))
/* Otherwise report the replacement value. */
122 *unicode
= replacement
;
/* Length of the NUL-terminated text, obtained by handing the bytes to the
 * global ::strlen (the :: prefix keeps the call from resolving to this
 * same-named function). */
127 strlen (const codepoint_t
*text
)
128 { return ::strlen ((const char *) text
); }
/* Number of 8-bit units needed for `unicode`, chosen by the first threshold
 * the value falls under.
 * NOTE(review): the result for values of 0x110000 and above is on a line not
 * present in this extract. */
131 encode_len (hb_codepoint_t unicode
)
/* Below 0x80: one unit. */
133 if (unicode
< 0x0080u
) return 1;
/* Below 0x800: two units. */
134 if (unicode
< 0x0800u
) return 2;
/* Below 0x10000: three units. */
135 if (unicode
< 0x10000u
) return 3;
/* Below 0x110000: four units. */
136 if (unicode
< 0x110000u
) return 4;
/* Writes the encoded form of `unicode` through `text`, advancing `text` past
 * each byte written.
 * NOTE(review): the use of `end`, the one-byte arm, the handling of rejected
 * values and the return statement are not present in this extract. */
141 encode (codepoint_t
*text
,
142 const codepoint_t
*end
,
143 hb_codepoint_t unicode
)
/* True for the surrogate range 0xD800..0xDFFF and for anything above
 * 0x10FFFF; what is done with such values is on a line not visible here. */
145 if (unlikely (unicode
>= 0xD800u
&& (unicode
<= 0xDFFFu
|| unicode
> 0x10FFFFu
)))
/* Values below 0x80: the statement for this arm is not visible here. */
147 if (unicode
< 0x0080u
)
/* Values below 0x800 take two bytes. */
149 else if (unicode
< 0x0800u
)
/* Lead byte 0xC0 plus bits 6..10; trail byte 0x80 plus bits 0..5. */
153 *text
++ = 0xC0u
+ (0x1Fu
& (unicode
>> 6));
154 *text
++ = 0x80u
+ (0x3Fu
& (unicode
));
/* Values below 0x10000 take three bytes. */
157 else if (unicode
< 0x10000u
)
/* Lead byte 0xE0 plus bits 12..15, then two trail bytes of six bits each. */
161 *text
++ = 0xE0u
+ (0x0Fu
& (unicode
>> 12));
162 *text
++ = 0x80u
+ (0x3Fu
& (unicode
>> 6));
163 *text
++ = 0x80u
+ (0x3Fu
& (unicode
));
/* Remaining values take four bytes: lead byte 0xF0 plus bits 18..20, then
 * three trail bytes of six bits each. */
170 *text
++ = 0xF0u
+ (0x07u
& (unicode
>> 18));
171 *text
++ = 0x80u
+ (0x3Fu
& (unicode
>> 12));
172 *text
++ = 0x80u
+ (0x3Fu
& (unicode
>> 6));
173 *text
++ = 0x80u
+ (0x3Fu
& (unicode
));
/* Parameterised on the code unit type, so that one implementation serves the
 * instantiations with uint16_t and OT::HBUINT16 declared further down. */
181 template <typename TCodepoint
>
/* The code unit type must be exactly two bytes wide. */
184 static_assert (sizeof (TCodepoint
) == 2, "");
185 typedef TCodepoint codepoint_t
;
/* A code point takes at most two 16-bit units (see encode_len below). */
186 static constexpr unsigned max_len
= 2;
/* Decodes one code point from the 16-bit units starting at `text`, looking no
 * further than `end`.  A valid surrogate pair is combined into one value
 * stored through `unicode`; a lonely or out-of-order surrogate stores
 * `replacement` instead.
 * NOTE(review): the non-surrogate store, pointer advances past the second
 * unit and the return statements are not present in this extract. */
188 static const codepoint_t
*
189 next (const codepoint_t
*text
,
190 const codepoint_t
*end
,
191 hb_codepoint_t
*unicode
,
192 hb_codepoint_t replacement
)
/* Read the first unit and step past it. */
194 hb_codepoint_t c
= *text
++;
/* Expected-common case: the unit lies outside 0xD800..0xDFFF.  The
 * statements this guards are not visible in this extract. */
196 if (likely (!hb_in_range
<hb_codepoint_t
> (c
, 0xD800u
, 0xDFFFu
)))
/* Requires c at or below 0xDBFF and at least one more unit before `end`. */
202 if (likely (c
<= 0xDBFFu
&& text
< end
))
204 /* High-surrogate in c */
/* Peek at the following unit without advancing. */
205 hb_codepoint_t l
= *text
;
206 if (likely (hb_in_range
<hb_codepoint_t
> (l
, 0xDC00u
, 0xDFFFu
)))
208 /* Low-surrogate in l */
/* Same value as 0x10000 + ((c - 0xD800) << 10) + (l - 0xDC00), with the
 * constant terms folded into a single subtraction. */
209 *unicode
= (c
<< 10) + l
- ((0xD800u
<< 10) - 0x10000u
+ 0xDC00u
);
215 /* Lonely / out-of-order surrogate. */
216 *unicode
= replacement
;
/* Backward counterpart of next (): decodes the code point that ends just
 * before `text`, never looking before `start`.  A valid surrogate pair is
 * combined into one value stored through `unicode`; a lonely or out-of-order
 * surrogate stores `replacement` instead.
 * NOTE(review): the non-surrogate store, the extra step back over the high
 * surrogate and the return statements are not present in this extract. */
220 static const codepoint_t
*
221 prev (const codepoint_t
*text
,
222 const codepoint_t
*start
,
223 hb_codepoint_t
*unicode
,
224 hb_codepoint_t replacement
)
/* Step back one unit and read it. */
226 hb_codepoint_t c
= *--text
;
/* Expected-common case: the unit lies outside 0xD800..0xDFFF.  The
 * statements this guards are not visible in this extract. */
228 if (likely (!hb_in_range
<hb_codepoint_t
> (c
, 0xD800u
, 0xDFFFu
)))
/* Requires c at or above 0xDC00 and at least one unit between `start` and
 * `text`, so that text[-1] may be read. */
234 if (likely (c
>= 0xDC00u
&& start
< text
))
236 /* Low-surrogate in c */
/* Peek at the preceding unit without moving `text`. */
237 hb_codepoint_t h
= text
[-1];
238 if (likely (hb_in_range
<hb_codepoint_t
> (h
, 0xD800u
, 0xDBFFu
)))
240 /* High-surrogate in h */
/* Same value as 0x10000 + ((h - 0xD800) << 10) + (c - 0xDC00), with the
 * constant terms folded into a single subtraction. */
241 *unicode
= (h
<< 10) + c
- ((0xD800u
<< 10) - 0x10000u
+ 0xDC00u
);
247 /* Lonely / out-of-order surrogate. */
248 *unicode
= replacement
;
/* Takes a pointer to the first 16-bit unit of the text.
 * NOTE(review): only the name and parameter list survive in this extract; the
 * return type and body are not visible. */
254 strlen (const codepoint_t
*text
)
/* Number of 16-bit units needed for `unicode`: one below 0x10000, two for
 * everything else. */
262 encode_len (hb_codepoint_t unicode
)
264 return unicode
< 0x10000 ? 1 : 2;
/* Writes the encoded form of `unicode` through `text`; the two-unit form is
 * only written when at least two units of room remain before `end`.
 * NOTE(review): the handling of rejected values, the single-unit arm and the
 * return statement are not present in this extract. */
268 encode (codepoint_t
*text
,
269 const codepoint_t
*end
,
270 hb_codepoint_t unicode
)
/* True for the surrogate range 0xD800..0xDFFF and for anything above
 * 0x10FFFF; what is done with such values is on a line not visible here. */
272 if (unlikely (unicode
>= 0xD800u
&& (unicode
<= 0xDFFFu
|| unicode
> 0x10FFFFu
)))
/* Values below 0x10000: the statement for this arm is not visible here. */
274 if (unicode
< 0x10000u
)
/* Larger values need two units, so check the remaining room first. */
276 else if (end
- text
>= 2)
/* First unit: 0xD800 plus the bits above the low ten; second unit: 0xDC00
 * plus the low ten bits.
 * NOTE(review): lines between the room check and these stores are missing
 * from this extract; presumably `unicode` is reduced by 0x10000 there --
 * confirm against the full file. */
279 *text
++ = 0xD800u
+ (unicode
>> 10);
280 *text
++ = 0xDC00u
+ (unicode
& 0x03FFu
);
/* Instantiation over plain uint16_t code units. */
286 typedef hb_utf16_xe_t
<uint16_t> hb_utf16_t
;
/* Instantiation over OT::HBUINT16 code units.
 * NOTE(review): the "_be" suffix suggests big-endian storage; confirm against
 * the definition of OT::HBUINT16. */
287 typedef hb_utf16_xe_t
<OT::HBUINT16
> hb_utf16_be_t
;
/* Parameterised on the code unit type and on whether values are validated;
 * validation is on unless the second argument is given as false. */
290 template <typename TCodepoint
, bool validate
=true>
/* The code unit type must be exactly four bytes wide. */
293 static_assert (sizeof (TCodepoint
) == 4, "");
294 typedef TCodepoint codepoint_t
;
/* Every code point occupies exactly one unit. */
295 static constexpr unsigned max_len
= 1;
/* Reads the unit at `text` into `*unicode` and steps past it.  With `validate`
 * set, out-of-range values are swapped for `replacement`.  `end` is tagged
 * HB_UNUSED.
 * NOTE(review): the return statement is not present in this extract. */
297 static const TCodepoint
*
298 next (const TCodepoint
*text
,
299 const TCodepoint
*end HB_UNUSED
,
300 hb_codepoint_t
*unicode
,
301 hb_codepoint_t replacement
)
/* Store the unit as-is and keep a copy in c for the check below. */
303 hb_codepoint_t c
= *unicode
= *text
++;
/* Only when validating: the surrogate range 0xD800..0xDFFF and anything
 * above 0x10FFFF are replaced. */
304 if (validate
&& unlikely (c
>= 0xD800u
&& (c
<= 0xDFFFu
|| c
> 0x10FFFFu
)))
305 *unicode
= replacement
;
/* Steps `text` back one unit and reads it into `*unicode`.  With `validate`
 * set, out-of-range values are swapped for `replacement`.  `start` is tagged
 * HB_UNUSED.
 * NOTE(review): the return statement is not present in this extract. */
309 static const TCodepoint
*
310 prev (const TCodepoint
*text
,
311 const TCodepoint
*start HB_UNUSED
,
312 hb_codepoint_t
*unicode
,
313 hb_codepoint_t replacement
)
/* Store the unit as-is and keep a copy in c for the check below. */
315 hb_codepoint_t c
= *unicode
= *--text
;
/* Only when validating: the surrogate range 0xD800..0xDFFF and anything
 * above 0x10FFFF are replaced. */
316 if (validate
&& unlikely (c
>= 0xD800u
&& (c
<= 0xDFFFu
|| c
> 0x10FFFFu
)))
317 *unicode
= replacement
;
/* Takes a pointer to the first 32-bit unit of the text.
 * NOTE(review): only the name and parameter list survive in this extract; the
 * return type and body are not visible. */
322 strlen (const TCodepoint
*text
)
/* The parameter is tagged HB_UNUSED.
 * NOTE(review): the body is not visible in this extract; presumably the
 * result is a constant, since max_len is 1 for this type -- confirm. */
330 encode_len (hb_codepoint_t unicode HB_UNUSED
)
/* Encoder for 32-bit units; `end` is tagged HB_UNUSED.
 * NOTE(review): only the validation guard survives in this extract; the store
 * through `text` and the return statement are not visible. */
336 encode (codepoint_t
*text
,
337 const codepoint_t
*end HB_UNUSED
,
338 hb_codepoint_t unicode
)
/* Only when validating: true for the surrogate range 0xD800..0xDFFF and for
 * anything above 0x10FFFF; what is done with such values is not visible. */
340 if (validate
&& unlikely (unicode
>= 0xD800u
&& (unicode
<= 0xDFFFu
|| unicode
> 0x10FFFFu
)))
/* uint32_t code units with the default validate=true. */
347 typedef hb_utf32_xe_t
<uint32_t> hb_utf32_t
;
/* uint32_t code units with validation switched off, so the surrogate and
 * upper-bound checks guarded by `validate` are skipped. */
348 typedef hb_utf32_xe_t
<uint32_t, false> hb_utf32_novalidate_t
;
/* Code unit type: every element of the text is a single 8-bit value. */
353 typedef uint8_t codepoint_t
;
/* Every code point occupies exactly one unit. */
354 static constexpr unsigned max_len
= 1;
/* Forward decoder for single 8-bit units.  Both `end` and `replacement` are
 * tagged HB_UNUSED.
 * NOTE(review): the body is not present in this extract. */
356 static const codepoint_t
*
357 next (const codepoint_t
*text
,
358 const codepoint_t
*end HB_UNUSED
,
359 hb_codepoint_t
*unicode
,
360 hb_codepoint_t replacement HB_UNUSED
)
/* Backward decoder for single 8-bit units.  Both `start` and `replacement`
 * are tagged HB_UNUSED.
 * NOTE(review): the body is not present in this extract. */
366 static const codepoint_t
*
367 prev (const codepoint_t
*text
,
368 const codepoint_t
*start HB_UNUSED
,
369 hb_codepoint_t
*unicode
,
370 hb_codepoint_t replacement HB_UNUSED
)
/* Takes a pointer to the first 8-bit unit of the text.
 * NOTE(review): only the name and parameter list survive in this extract; the
 * return type and body are not visible. */
377 strlen (const codepoint_t
*text
)
/* The parameter is tagged HB_UNUSED.
 * NOTE(review): the body is not visible in this extract; presumably the
 * result is a constant, since max_len is 1 for this type -- confirm. */
385 encode_len (hb_codepoint_t unicode HB_UNUSED
)
/* Encoder for single 8-bit units; `end` is tagged HB_UNUSED.
 * NOTE(review): only the range guard survives in this extract; the store
 * through `text` and the return statement are not visible. */
391 encode (codepoint_t
*text
,
392 const codepoint_t
*end HB_UNUSED
,
393 hb_codepoint_t unicode
)
/* Values of 0x100 and above cannot be held in one 8-bit unit; what is done
 * with them is on a line not visible here. */
395 if (unlikely (unicode
>= 0x0100u
))
/* Code unit type: every element of the text is a single 8-bit value. */
405 typedef uint8_t codepoint_t
;
/* Every code point occupies exactly one unit. */
406 static constexpr unsigned max_len
= 1;
/* Forward decoder for single 8-bit units in which only values below 0x80 are
 * accepted; anything else yields `replacement`.  `end` is tagged HB_UNUSED.
 * NOTE(review): the statement that first stores into `*unicode` and the
 * return statement are not present in this extract. */
408 static const codepoint_t
*
409 next (const codepoint_t
*text
,
410 const codepoint_t
*end HB_UNUSED
,
411 hb_codepoint_t
*unicode
,
412 hb_codepoint_t replacement
)
/* A stored value of 0x80 or above is swapped for the replacement. */
415 if (*unicode
>= 0x0080u
)
416 *unicode
= replacement
;
/* Backward decoder for single 8-bit units in which only values below 0x80 are
 * accepted; anything else yields `replacement`.  `start` is tagged HB_UNUSED.
 * NOTE(review): the statement that first stores into `*unicode` and the
 * return statement are not present in this extract. */
420 static const codepoint_t
*
421 prev (const codepoint_t
*text
,
422 const codepoint_t
*start HB_UNUSED
,
423 hb_codepoint_t
*unicode
,
424 hb_codepoint_t replacement
)
/* A stored value of 0x80 or above is swapped for the replacement. */
427 if (*unicode
>= 0x0080u
)
428 *unicode
= replacement
;
/* Takes a pointer to the first 8-bit unit of the text.
 * NOTE(review): only the name and parameter list survive in this extract; the
 * return type and body are not visible. */
433 strlen (const codepoint_t
*text
)
/* The parameter is tagged HB_UNUSED.
 * NOTE(review): the body is not visible in this extract; presumably the
 * result is a constant, since max_len is 1 for this type -- confirm. */
441 encode_len (hb_codepoint_t unicode HB_UNUSED
)
/* Encoder for single 8-bit units; `end` is tagged HB_UNUSED.
 * NOTE(review): only the range guard survives in this extract; the store
 * through `text` and the return statement are not visible. */
447 encode (codepoint_t
*text
,
448 const codepoint_t
*end HB_UNUSED
,
449 hb_codepoint_t unicode
)
/* Values of 0x80 and above are singled out, mirroring the limit applied by
 * next () and prev (); what is done with them is not visible here. */
451 if (unlikely (unicode
>= 0x0080u
))
/* Moves `start` across whole encoded sequences, forwards through
 * utf_t::next () or backwards through utf_t::prev (), and yields a pointer
 * into the same text.
 * NOTE(review): the remaining parameters, the loop constructs that choose
 * direction and count, and the return statement are not present in this
 * extract -- confirm against the full file. */
458 template <typename utf_t
>
459 static inline const typename
utf_t::codepoint_t
*
460 hb_utf_offset_to_pointer (const typename
utf_t::codepoint_t
*start
,
/* Presumably the scratch output handed to next ()/prev (); the argument line
 * that would pass it is not visible here. */
463 hb_codepoint_t unicode
;
/* Forward step: the end bound given to the decoder is start + max_len, so it
 * cannot read more than one maximal-length sequence. */
466 start
= utf_t::next (start
,
467 start
+ utf_t::max_len
,
469 HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT
);
/* Backward step: the lower bound given to the decoder is start - max_len. */
472 start
= utf_t::prev (start
,
473 start
- utf_t::max_len
,
475 HB_BUFFER_REPLACEMENT_CODEPOINT_DEFAULT
);
481 #endif /* HB_UTF_HH */