C2x scanf %wN, %wfN support
[glibc.git] / wcsmbs / c8rtomb.c
blob931684ea11c75cd06e0f13919bc8578f01c01753
1 /* UTF-8 to multibyte conversion.
2 Copyright (C) 2022-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <errno.h>
20 #include <uchar.h>
21 #include <wchar.h>
24 /* This is the private state used if PS is NULL. */
25 static mbstate_t state;
27 size_t
28 c8rtomb (char *s, char8_t c8, mbstate_t *ps)
30 /* This implementation depends on the converter invoked by wcrtomb not
31 needing to retain state in either the top most bit of ps->__count or
32 in ps->__value between invocations. This implementation uses the
33 top most bit of ps->__count to indicate that trailing code units are
34 expected and uses ps->__value to store previously seen code units. */
36 wchar_t wc;
38 if (ps == NULL)
39 ps = &state;
41 if (s == NULL)
43 /* if 's' is a null pointer, behave as if u8'\0' was passed as 'c8'. If
44 this occurs for an incomplete code unit sequence, then an error will
45 be reported below. */
46 c8 = u8""[0];
49 if (! (ps->__count & 0x80000000))
51 /* Initial state. */
52 if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5)
54 /* An invalid lead code unit. */
55 __set_errno (EILSEQ);
56 return -1;
58 if (c8 >= 0xC2)
60 /* A valid lead code unit. */
61 ps->__count |= 0x80000000;
62 ps->__value.__wchb[0] = c8;
63 ps->__value.__wchb[3] = 1;
64 return 0;
66 /* A single byte (ASCII) code unit. */
67 wc = c8;
69 else
71 char8_t cu1 = ps->__value.__wchb[0];
72 if (ps->__value.__wchb[3] == 1)
74 /* A single lead code unit was previously seen. */
75 if ((c8 < 0x80 || c8 > 0xBF)
76 || (cu1 == 0xE0 && c8 < 0xA0)
77 || (cu1 == 0xED && c8 > 0x9F)
78 || (cu1 == 0xF0 && c8 < 0x90)
79 || (cu1 == 0xF4 && c8 > 0x8F))
81 /* An invalid second code unit. */
82 __set_errno (EILSEQ);
83 return -1;
85 if (cu1 >= 0xE0)
87 /* A three or four code unit sequence. */
88 ps->__value.__wchb[1] = c8;
89 ++ps->__value.__wchb[3];
90 return 0;
92 wc = ((cu1 & 0x1F) << 6)
93 + (c8 & 0x3F);
95 else
97 char8_t cu2 = ps->__value.__wchb[1];
98 /* A three or four byte code unit sequence. */
99 if (c8 < 0x80 || c8 > 0xBF)
101 /* An invalid third or fourth code unit. */
102 __set_errno (EILSEQ);
103 return -1;
105 if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0)
107 /* A four code unit sequence. */
108 ps->__value.__wchb[2] = c8;
109 ++ps->__value.__wchb[3];
110 return 0;
112 if (cu1 < 0xF0)
114 wc = ((cu1 & 0x0F) << 12)
115 + ((cu2 & 0x3F) << 6)
116 + (c8 & 0x3F);
118 else
120 char8_t cu3 = ps->__value.__wchb[2];
121 wc = ((cu1 & 0x07) << 18)
122 + ((cu2 & 0x3F) << 12)
123 + ((cu3 & 0x3F) << 6)
124 + (c8 & 0x3F);
127 ps->__count &= 0x7fffffff;
128 ps->__value.__wch = 0;
131 return wcrtomb (s, wc, ps);