2 * Functions related to UTF encoding.
4 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
5 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
6 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d)
8 * Documentation: https://dlang.org/phobos/dmd_root_utf.html
9 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d
14 @nogc nothrow pure @safe:
16 /// The Unicode code space is the range of code points [0x000000,0x10FFFF]
17 /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF]
18 bool utf_isValidDchar(dchar c
)
// Classifies c with two range tests: below the surrogate block, or above it
// and no greater than 0x10FFFF.
// NOTE(review): the value returned after each test is assumed to follow the
// doc comment (valid for both ranges, invalid otherwise) - confirm.
20 // TODO: Whether non-char code points should be rejected is pending review.
21 // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDchar
22 // See also https://issues.dlang.org/show_bug.cgi?id=1357
// First test: everything under 0xD800 lies below the surrogate range.
23 if (c
< 0xD800) // Almost all characters in a typical document.
// Second test: strictly above the last surrogate (0xDFFF) and within the
// top of the code space (0x10FFFF).
25 if (c
> 0xDFFF && c
<= 0x10FFFF)
30 /*******************************
31 * Returns true if c is a Unicode alpha character.
32 * Use table from C99 Appendix D.
34 bool isUniAlpha(dchar c
)
// Looks c up in ALPHA_TABLE. Each table entry is a pair that the code below
// treats as an inclusive range: [i][0] is the lower bound, [i][1] the upper.
36 static immutable wchar[2][] ALPHA_TABLE
=
// high starts at the index of the last table entry.
285 size_t high
= ALPHA_TABLE
.length
- 1;
286 // Shortcut search if c is out of range
// If c is below the first entry's lower bound or above the last entry's
// upper bound, low starts at high + 1; otherwise it starts at 0.
// NOTE(review): presumably low > high makes the search loop exit at once - confirm.
287 size_t low
= (c
< ALPHA_TABLE
[0][0] || ALPHA_TABLE
[high
][1] < c
) ? high
+ 1 : 0;
// Midpoint written as low + (high - low) / 2 rather than (low + high) / 2,
// so the intermediate sum cannot overflow.
291 const size_t mid
= low
+ ((high
- low
) >> 1);
// c is below entry mid's range.
292 if (c
< ALPHA_TABLE
[mid
][0])
// c is above entry mid's range.
294 else if (ALPHA_TABLE
[mid
][1] < c
)
// Neither comparison held, so c must lie inside entry mid's inclusive range.
298 assert(ALPHA_TABLE
[mid
][0] <= c
&& c
<= ALPHA_TABLE
[mid
][1]);
306 * Returns the code length of c in code units.
308 int utf_codeLengthChar(dchar c
)
321 int utf_codeLengthWchar(dchar c
)
// UTF-16 length of c in code units: 1 when c fits in 16 bits (c <= 0xFFFF),
// otherwise 2.
323 return c
<= 0xFFFF ?
1 : 2;
327 * Returns the code length of c in code units for the encoding.
328 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
330 int utf_codeLength(int sz
, dchar c
)
// Forwards to the per-encoding length helpers.
// NOTE(review): presumably the first call handles sz == 1 (utf8) and the
// second sz == 2 (utf16); the result for sz == 4 is not shown next to these
// calls - confirm.
// UTF-8 length.
333 return utf_codeLengthChar(c
);
// UTF-16 length.
335 return utf_codeLengthWchar(c
);
340 void utf_encodeChar(char* s
, dchar c
) @system
// Writes the UTF-8 encoding of c through s. s is a raw pointer, so the caller
// must guarantee room for every code unit stored (up to 4 in the branches below).
// c is required to pass utf_isValidDchar.
343 assert(utf_isValidDchar(c
));
// Two code units: 110xxxxx 10xxxxxx (code points up to 0x07FF).
// NOTE(review): the branch preceding this "else if" is presumably the
// single-unit case - confirm.
348 else if (c
<= 0x07FF)
// Lead unit: marker 0xC0 plus the bits above the low 6.
350 s
[0] = cast(char)(0xC0 |
(c
>> 6));
// Trailing unit: marker 0x80 plus the low 6 bits.
351 s
[1] = cast(char)(0x80 |
(c
& 0x3F));
// Three code units: 1110xxxx 10xxxxxx 10xxxxxx (code points up to 0xFFFF).
353 else if (c
<= 0xFFFF)
// Lead unit: marker 0xE0 plus the bits above the low 12.
355 s
[0] = cast(char)(0xE0 |
(c
>> 12));
// Middle unit: bits 6..11.
356 s
[1] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
// Last unit: bits 0..5.
357 s
[2] = cast(char)(0x80 |
(c
& 0x3F));
// Four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (code points up to 0x10FFFF).
359 else if (c
<= 0x10FFFF)
// Lead unit: marker 0xF0 plus the bits above the low 18.
361 s
[0] = cast(char)(0xF0 |
(c
>> 18));
// Second unit: bits 12..17.
362 s
[1] = cast(char)(0x80 |
((c
>> 12) & 0x3F));
// Third unit: bits 6..11.
363 s
[2] = cast(char)(0x80 |
((c
>> 6) & 0x3F));
// Last unit: bits 0..5.
364 s
[3] = cast(char)(0x80 |
(c
& 0x3F));
370 void utf_encodeWchar(wchar* s
, dchar c
) @system
// Writes the UTF-16 encoding of c through s. s is a raw pointer, so the caller
// must guarantee room for the code units stored (two in the stores below).
// c is required to pass utf_isValidDchar.
373 assert(utf_isValidDchar(c
));
// Surrogate pair: after subtracting 0x10000, the upper 10 bits are added to
// the high-surrogate base 0xD800.
// NOTE(review): presumably this path is taken only when c > 0xFFFF - confirm.
380 s
[0] = cast(wchar)((((c
- 0x010000) >> 10) & 0x03FF) + 0xD800);
// The lower 10 bits are added to the low-surrogate base 0xDC00.
381 s
[1] = cast(wchar)(((c
- 0x010000) & 0x03FF) + 0xDC00);
385 void utf_encode(int sz
, void* s
, dchar c
) @system
// Encodes c into the untyped buffer s, reinterpreting s as char*, wchar* or
// dchar* depending on the path taken.
// NOTE(review): presumably sz selects the path with 1 = utf8, 2 = utf16,
// 4 = utf32, as for utf_codeLength - confirm.
// UTF-8: delegate to utf_encodeChar.
388 utf_encodeChar(cast(char*)s
, c
);
// UTF-16: delegate to utf_encodeWchar.
390 utf_encodeWchar(cast(wchar*)s
, c
);
// UTF-32: the code point is stored directly as a single dchar.
394 *(cast(dchar*)s
) = c
;
398 /********************************************
399 * Checks whether a Unicode code point is a bidirectional
402 bool isBidiControl(dchar c
)
404 // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3.
// U+202A..U+202E: the embedding/override controls LRE, RLE, PDF, LRO, RLO.
410 case '\u202A': .. case '\u202E':
// U+2066..U+2069: the isolate controls LRI, RLI, FSI, PDI.
411 case '\u2066': .. case '\u2069':
418 /********************************************
419 * Decode a UTF-8 sequence as a single UTF-32 code point.
422 * ridx = starting index in s[], updated to reflect number of code units decoded
423 * rresult = set to character decoded
425 * null on success, otherwise error message string
427 string
utf_decodeChar(const(char)[] s
, ref size_t ridx
, out dchar rresult
)
// The return value is one of the UTF8_DECODE_* strings declared below:
// UTF8_DECODE_OK (null) on success, otherwise a message naming the failure.
429 // UTF-8 decoding errors
430 static immutable string UTF8_DECODE_OK
= null; // no error
431 static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE
= "Outside Unicode code space";
432 static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE
= "Truncated UTF-8 sequence";
433 static immutable string UTF8_DECODE_OVERLONG
= "Overlong UTF-8 sequence";
434 static immutable string UTF8_DECODE_INVALID_TRAILER
= "Invalid trailing code unit";
435 static immutable string UTF8_DECODE_INVALID_CODE_POINT
= "Invalid code point decoded";
437 /* The following encodings are valid, except for the 5 and 6 byte
441 * 1110xxxx 10xxxxxx 10xxxxxx
442 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
443 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
444 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// Lookup table indexed by the first code unit (see UTF8_STRIDE[u] below),
// giving the expected sequence length in code units.
// NOTE(review): 0xFF entries appear to mark code units that cannot start a
// sequence - confirm.
446 static immutable ubyte[256] UTF8_STRIDE
=
466 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
467 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
468 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
469 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
470 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
471 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
472 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
473 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
// Final row of the 256-entry table: first code units 0xF8..0xFB map to 5,
// 0xFC..0xFD to 6, and 0xFE..0xFF to 0xFF.
484 5,5,5,5, 6,6,0xFF,0xFF
491 // Pre-stage results for ASCII and error cases
493 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
494 // Get expected sequence length
495 const size_t n
= UTF8_STRIDE
[u
];
500 return UTF8_DECODE_OK
;
507 // 5- or 6-byte sequence
508 return UTF8_DECODE_OUTSIDE_CODE_SPACE
;
// All n code units of the sequence must be present in s.
510 if (s
.length
< i
+ n
) // source too short
511 return UTF8_DECODE_TRUNCATED_SEQUENCE
;
512 // Pick off 7 - n low bits from first code unit
// The mask is 0x1F for n = 2, 0x0F for n = 3 and 0x07 for n = 4.
513 dchar c
= u
& ((1 << (7 - n
)) - 1);
514 /* The following combinations are overlong, and illegal:
515 * 1100000x (10xxxxxx)
516 * 11100000 100xxxxx (10xxxxxx)
517 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
518 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
519 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
// Advances i to the second code unit; the overlong test below inspects it.
521 const char u2
= s
[++i
];
522 // overlong combination
// First code units 0xC0/0xC1 are always overlong. For the longer forms, a
// first unit with no payload bits plus a second unit whose top payload bits
// are zero means the value would have fit in a shorter encoding.
523 if ((u
& 0xFE) == 0xC0 ||
(u
== 0xE0 && (u2
& 0xE0) == 0x80) ||
(u
== 0xF0 && (u2
& 0xF0) == 0x80) ||
(u
== 0xF8 && (u2
& 0xF8) == 0x80) ||
(u
== 0xFC && (u2
& 0xFC) == 0x80))
524 return UTF8_DECODE_OVERLONG
;
525 // Decode remaining bits
// i was already advanced by one above, so m = n + i - 1 is n code units past
// the index i held before that increment; the loop stops when i reaches m.
526 for (const m
= n
+ i
- 1; i
!= m
; ++i
)
529 if ((u3
& 0xC0) != 0x80) // trailing bytes are 10xxxxxx
530 return UTF8_DECODE_INVALID_TRAILER
;
// Shift the accumulated value left and append this unit's 6 payload bits.
531 c
= (c
<< 6) |
(u3
& 0x3F);
// The assembled value must still pass utf_isValidDchar.
533 if (!utf_isValidDchar(c
))
534 return UTF8_DECODE_INVALID_CODE_POINT
;
537 return UTF8_DECODE_OK
;
540 /********************************************
541 * Decode a UTF-16 sequence as a single UTF-32 code point.
543 * s = UTF-16 sequence
544 * ridx = starting index in s[], updated to reflect number of code units decoded
545 * rresult = set to character decoded
547 * null on success, otherwise error message string
549 string
utf_decodeWchar(const(wchar)[] s
, ref size_t ridx
, out dchar rresult
)
// The return value is one of the UTF16_DECODE_* strings declared below:
// UTF16_DECODE_OK (null) on success, otherwise a message naming the failure.
551 // UTF-16 decoding errors
552 static immutable string UTF16_DECODE_OK
= null; // no error
553 static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE
= "Truncated UTF-16 sequence";
554 static immutable string UTF16_DECODE_INVALID_SURROGATE
= "Invalid low surrogate";
555 static immutable string UTF16_DECODE_UNPAIRED_SURROGATE
= "Unpaired surrogate";
556 static immutable string UTF16_DECODE_INVALID_CODE_POINT
= "Invalid code point decoded";
561 // Pre-stage results for single wchar and error cases
562 dchar u
= rresult
= s
[i
];
563 if (u
< 0xD800) // Single wchar codepoint
564 return UTF16_DECODE_OK
;
// A high surrogate (0xD800..0xDBFF) must be followed by a low surrogate.
565 if (0xD800 <= u
&& u
<= 0xDBFF) // Surrogate pair
// The second code unit of the pair must exist in s.
567 if (s
.length
<= i
+ 1)
568 return UTF16_DECODE_TRUNCATED_SEQUENCE
;
// The second unit must lie in the low-surrogate range [0xDC00, 0xDFFF].
// Both bounds have to test u2: u is already known to be <= 0xDBFF in this
// branch, so comparing u against 0xDFFF could never reject anything and
// would let values 0xE000..0xFFFF through as a "low surrogate".
570 if (u2
< 0xDC00 ||
0xDFFF < u2
)
571 return UTF16_DECODE_INVALID_SURROGATE
;
// Combine the pair. 0xD7C0 is 0xD800 - (0x10000 >> 10), which folds the
// +0x10000 offset of supplementary code points into the high-surrogate term.
572 u
= ((u
- 0xD7C0) << 10) + (u2
- 0xDC00);
// A low surrogate with no preceding high surrogate is an error.
575 else if (0xDC00 <= u
&& u
<= 0xDFFF)
576 return UTF16_DECODE_UNPAIRED_SURROGATE
;
// The final value must still pass utf_isValidDchar.
577 if (!utf_isValidDchar(u
))
578 return UTF16_DECODE_INVALID_CODE_POINT
;
580 return UTF16_DECODE_OK
;