d: Merge upstream dmd, druntime 4c18eed967, phobos d945686a4.
[official-gcc.git] / gcc / d / dmd / root / utf.d
blobd7ba17f8a0b1db6b109aca8d95204b823ec46919
1 /**
2 * Functions related to UTF encoding.
4 * Copyright: Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
5 * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright)
6 * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d)
8 * Documentation: https://dlang.org/phobos/dmd_root_utf.html
9 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d
12 module dmd.root.utf;
14 @nogc nothrow pure @safe:
/**
 * Tests whether a code point lies inside the Unicode code space.
 *
 * The code space is the range [0x000000, 0x10FFFF] with the UTF-16
 * surrogate range [0xD800, 0xDFFF] removed.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` is at most 0x10FFFF and is not a surrogate, `false` otherwise
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357
    const bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    return c <= 0x10FFFF && !isSurrogate;
}
/*******************************
 * Tests whether a code point is a Unicode alphabetic character.
 * The set of accepted characters is the table from C99 Appendix D.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` falls inside one of the table's inclusive ranges
 */
bool isUniAlpha(dchar c)
{
    // Sorted, non-overlapping inclusive ranges [first, last].
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Anything below the first range or above the last one cannot match.
    if (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[$ - 1][1] < c)
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t mid = lo + (hi - lo) / 2;
        const entry = ALPHA_TABLE[mid];
        if (c < entry[0])
            hi = mid;           // c sorts before this range
        else if (entry[1] < c)
            lo = mid + 1;       // c sorts after this range
        else
            return true;        // entry[0] <= c <= entry[1]
    }
    return false;
}
/**
 * Returns the code length of c in UTF-8 code units.
 *
 * Params:
 *      c = code point; values above 0x10FFFF hit `assert(false)`
 * Returns:
 *      1, 2, 3 or 4
 */
int utf_codeLengthChar(dchar c)
{
    // Nothing above 0x10FFFF has a length; treat it as a programming error.
    if (c > 0x10FFFF)
        assert(false);

    return c <= 0x7F   ? 1
         : c <= 0x7FF  ? 2
         : c <= 0xFFFF ? 3
         : 4;
}
/**
 * Returns the code length of c in UTF-16 code units.
 *
 * Params:
 *      c = code point
 * Returns:
 *      1 if `c` is at most 0xFFFF, otherwise 2
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
/**
 * Returns the code length of c in code units for the encoding.
 *
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point
 * Returns:
 *      the result of `utf_codeLengthChar` for `sz == 1`, of
 *      `utf_codeLengthWchar` for `sz == 2`, and 1 otherwise
 *      (where `sz` is asserted to be 4)
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
        case 1:
            return utf_codeLengthChar(c);
        case 2:
            return utf_codeLengthWchar(c);
        default:
            // Only UTF-32 remains; one code unit per code point.
            assert(sz == 4);
            return 1;
    }
}
/**
 * Encodes a code point as UTF-8 into the buffer at `s`.
 *
 * Writes 1 to 4 code units starting at `s[0]`, depending on the magnitude
 * of `c`. No terminator is written.
 *
 * Params:
 *      s = destination buffer; asserted non-null
 *      c = code point to encode; asserted valid via `utf_isValidDchar`
 */
void utf_encodeChar(char* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        s[0] = cast(char)c;
        return;
    }
    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = cast(char)(0x80 | (c & 0x3F));
        return;
    }
    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[2] = cast(char)(0x80 | (c & 0x3F));
        return;
    }
    if (c > 0x10FFFF)
        assert(0);

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    s[0] = cast(char)(0xF0 | (c >> 18));
    s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    s[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    s[3] = cast(char)(0x80 | (c & 0x3F));
}
/**
 * Encodes a code point as UTF-16 into the buffer at `s`.
 *
 * Writes one code unit when `c` is at most 0xFFFF, otherwise a
 * high/low surrogate pair in `s[0]` and `s[1]`.
 *
 * Params:
 *      s = destination buffer; asserted non-null
 *      c = code point to encode; asserted valid via `utf_isValidDchar`
 */
void utf_encodeWchar(wchar* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0xFFFF)
    {
        s[0] = cast(wchar)c;
        return;
    }

    // Split the offset above 0x10000 into two 10-bit halves.
    const offset = c - 0x010000;
    s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800);
    s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);
}
/**
 * Encodes a code point into the buffer at `s` using the encoding selected by `sz`.
 *
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16; any other value is asserted to be 4 (utf32)
 *      s = destination buffer, reinterpreted as `char*`, `wchar*` or `dchar*` according to `sz`
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c) @system
{
    switch (sz)
    {
        case 1:
            utf_encodeChar(cast(char*)s, c);
            break;
        case 2:
            utf_encodeWchar(cast(wchar*)s, c);
            break;
        default:
            // UTF-32: the code point is stored as a single unit.
            assert(sz == 4);
            *(cast(dchar*)s) = c;
            break;
    }
}
/********************************************
 * Checks whether a Unicode code point is a bidirectional
 * control character.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` for U+061C, U+200E, U+200F, U+202A..U+202E and
 *      U+2066..U+2069; `false` for everything else
 */
bool isBidiControl(dchar c)
{
    // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3.
    const bool isMark = c == '\u061C' || c == '\u200E' || c == '\u200F';
    const bool inRange202x = '\u202A' <= c && c <= '\u202E';
    const bool inRange206x = '\u2066' <= c && c <= '\u2069';
    return isMark || inRange202x || inRange206x;
}
/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced past the whole sequence and `rresult`
 * holds the decoded code point. On any error `ridx` has been advanced by
 * exactly one and `rresult` holds the first code unit.
 *
 * Params:
 *      s = UTF-8 sequence; asserted non-null
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *      0xxxxxxx
     *      110xxxxx 10xxxxxx
     *      1110xxxx 10xxxxxx 10xxxxxx
     *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *
     * Sequence length indexed by the first code unit. 0xFF marks code
     * units that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF).
     */
    static immutable ubyte[256] UTF8_STRIDE =
    [
        // 0x00 .. 0x7F
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,

        // 0x80 .. 0xBF
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        // 0xC0 .. 0xDF
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,

        // 0xE0 .. 0xEF
        3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,

        // 0xF0 .. 0xFF
        4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    const size_t start = ridx++;

    const char lead = s[start];
    // Pre-stage results for ASCII and error cases
    rresult = lead;

    // Get expected sequence length
    const size_t len = UTF8_STRIDE[lead];
    if (len == 1)
        return UTF8_DECODE_OK;                  // ASCII
    if (len < 2 || len > 4)
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;  // 5- or 6-byte sequence, or a 0xFF table entry

    const size_t stop = start + len;            // one past the last code unit of the sequence
    if (s.length < stop)                        // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;

    // Pick off 7 - len low bits from first code unit
    dchar c = lead & ((1 << (7 - len)) - 1);

    /* The following combinations are overlong, and illegal:
     *      1100000x (10xxxxxx)
     *      11100000 100xxxxx (10xxxxxx)
     *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    const char second = s[start + 1];
    const bool overlong2 = (lead & 0xFE) == 0xC0;
    const bool overlong3 = lead == 0xE0 && (second & 0xE0) == 0x80;
    const bool overlong4 = lead == 0xF0 && (second & 0xF0) == 0x80;
    const bool overlong5 = lead == 0xF8 && (second & 0xF8) == 0x80;
    const bool overlong6 = lead == 0xFC && (second & 0xFC) == 0x80;
    if (overlong2 || overlong3 || overlong4 || overlong5 || overlong6)
        return UTF8_DECODE_OVERLONG;

    // Decode remaining bits, six per trailing code unit
    foreach (k; start + 1 .. stop)
    {
        const char trail = s[k];
        if ((trail & 0xC0) != 0x80)             // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        c = (c << 6) | (trail & 0x3F);
    }

    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;

    ridx = stop;
    rresult = c;
    return UTF8_DECODE_OK;
}
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 *
 * On success `ridx` is advanced by one code unit, or by two for a
 * surrogate pair, and `rresult` holds the decoded code point. On any
 * error `ridx` has been advanced by exactly one and `rresult` holds the
 * first code unit.
 *
 * Params:
 *      s = UTF-16 sequence; asserted non-null
 *      ridx = starting index in s[], updated to reflect number of code units decoded
 *      rresult = set to character decoded
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        wchar u2 = s[i + 1];
        // The second unit must be a low surrogate, i.e. within [0xDC00, 0xDFFF].
        // The upper bound has to be tested on u2: u is the high surrogate and is
        // already known to be <= 0xDBFF, so testing u there could never fail.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the +0x10000 bias into the shift.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF)
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}