1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_STRINGS --
9 -- Copyright (C) 2010-2023, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 package body Ada
.Strings
.UTF_Encoding
.Wide_Wide_Strings
is
39 -- Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
43 Input_Scheme
: Encoding_Scheme
) return Wide_Wide_String
46 if Input_Scheme
= UTF_8
then
49 return Decode
(To_UTF_16
(Item
, Input_Scheme
));
53 -- Decode UTF-8 input to Wide_Wide_String
55 function Decode
(Item
: UTF_8_String
) return Wide_Wide_String
is
56 Result
: Wide_Wide_String
(1 .. Item
'Length);
57 -- Result string (worst case is same length as input)
60 -- Length of result stored so far
63 -- Input string pointer
68 procedure Get_Continuation
;
69 -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
70 -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
71 -- is incremented. Raises exception if continuation byte does not exist
74 ----------------------
75 -- Get_Continuation --
76 ----------------------
78 procedure Get_Continuation
is
80 if Iptr
> Item
'Last then
81 Raise_Encoding_Error
(Iptr
- 1);
84 C
:= To_Unsigned_8
(Item
(Iptr
));
87 if C
not in 2#
10_000000#
.. 2#
10_111111#
then
88 Raise_Encoding_Error
(Iptr
- 1);
90 R
:= Shift_Left
(R
, 6) or Unsigned_32
(C
and 2#
00_111111#
);
95 -- Start of processing for Decode
103 and then Item
(Iptr
.. Iptr
+ 2) = BOM_8
109 elsif Item
'Length >= 2
110 and then (Item
(Iptr
.. Iptr
+ 1) = BOM_16BE
112 Item
(Iptr
.. Iptr
+ 1) = BOM_16LE
)
114 Raise_Encoding_Error
(Iptr
);
117 -- Loop through input characters
119 while Iptr
<= Item
'Last loop
120 C
:= To_Unsigned_8
(Item
(Iptr
));
123 -- Codes in the range 16#00# - 16#7F# are represented as
127 R
:= Unsigned_32
(C
);
129 -- No initial code can be of the form 10xxxxxx. Such codes are used
130 -- only for continuations.
132 elsif C
<= 2#
10_111111#
then
133 Raise_Encoding_Error
(Iptr
- 1);
135 -- Codes in the range 16#80# - 16#7FF# are represented as
138 elsif C
<= 2#
110_11111#
then
139 R
:= Unsigned_32
(C
and 2#
000_11111#
);
142 -- Codes in the range 16#800# - 16#FFFF# are represented as
143 -- 1110yyyy 10yyyyxx 10xxxxxx
145 elsif C
<= 2#
1110_1111#
then
146 R
:= Unsigned_32
(C
and 2#
0000_1111#
);
150 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
151 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
153 elsif C
<= 2#
11110_111#
then
154 R
:= Unsigned_32
(C
and 2#
00000_111#
);
159 -- Any other code is an error
162 Raise_Encoding_Error
(Iptr
- 1);
166 Result
(Len
) := Wide_Wide_Character
'Val (R
);
169 return Result
(1 .. Len
);
172 -- Decode UTF-16 input to Wide_Wide_String
174 function Decode
(Item
: UTF_16_Wide_String
) return Wide_Wide_String
is
175 Result
: Wide_Wide_String
(1 .. Item
'Length);
176 -- Result cannot be longer than the input string
182 -- Pointer to next element in Item
188 -- Skip UTF-16 BOM at start
192 if Iptr
<= Item
'Last and then Item
(Iptr
) = BOM_16
(1) then
196 -- Loop through input characters
198 while Iptr
<= Item
'Last loop
199 C
:= To_Unsigned_16
(Item
(Iptr
));
202 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
203 -- represent their own value.
205 if C
<= 16#D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
207 Result
(Len
) := Wide_Wide_Character
'Val (C
);
209 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
210 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
211 -- The first surrogate provides 10 high order bits of the result.
213 elsif C
<= 16#DBFF#
then
214 R
:= Shift_Left
((Unsigned_32
(C
) - 16#D800#
), 10);
216 -- Error if at end of string
218 if Iptr
> Item
'Last then
219 Raise_Encoding_Error
(Iptr
- 1);
221 -- Otherwise next character must be valid low order surrogate
222 -- which provides the low 10 order bits of the result.
225 C
:= To_Unsigned_16
(Item
(Iptr
));
228 if C
not in 16#DC00#
.. 16#DFFF#
then
229 Raise_Encoding_Error
(Iptr
- 1);
232 R
:= R
or (Unsigned_32
(C
) mod 2 ** 10);
234 -- The final adjustment is to add 16#01_0000 to get the
235 -- result back in the required 21 bit range.
237 R
:= R
+ 16#
01_0000#
;
239 Result
(Len
) := Wide_Wide_Character
'Val (R
);
243 -- Remaining codes are invalid
246 Raise_Encoding_Error
(Iptr
- 1);
250 return Result
(1 .. Len
);
257 -- Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
260 (Item
: Wide_Wide_String
;
261 Output_Scheme
: Encoding_Scheme
;
262 Output_BOM
: Boolean := False) return UTF_String
265 if Output_Scheme
= UTF_8
then
266 return Encode
(Item
, Output_BOM
);
268 return From_UTF_16
(Encode
(Item
), Output_Scheme
, Output_BOM
);
272 -- Encode Wide_Wide_String in UTF-8
275 (Item
: Wide_Wide_String
;
276 Output_BOM
: Boolean := False) return UTF_8_String
278 Result
: String (1 .. 4 * Item
'Length + 3);
279 -- Worst case is four bytes per input byte + space for BOM
282 -- Number of output codes stored in Result
285 -- Single input character
287 procedure Store
(C
: Unsigned_32
);
288 pragma Inline
(Store
);
289 -- Store one output code (input is in range 0 .. 255)
295 procedure Store
(C
: Unsigned_32
) is
298 Result
(Len
) := Character'Val (C
);
301 -- Start of processing for Encode
304 -- Output BOM if required
307 Result
(1 .. 3) := BOM_8
;
313 -- Loop through characters of input
315 for Iptr
in Item
'Range loop
316 C
:= To_Unsigned_32
(Item
(Iptr
));
318 -- Codes in the range 16#00#..16#7F# are represented as
324 -- Codes in the range 16#80#..16#7FF# are represented as
327 elsif C
<= 16#
7FF#
then
328 Store
(2#
110_00000#
or Shift_Right
(C
, 6));
329 Store
(2#
10_000000#
or (C
and 2#
00_111111#
));
331 -- Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
333 -- 1110yyyy 10yyyyxx 10xxxxxx
335 elsif C
<= 16#D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
336 Store
(2#
1110_0000#
or Shift_Right
(C
, 12));
337 Store
(2#
10_000000#
or
338 Shift_Right
(C
and 2#
111111_000000#
, 6));
339 Store
(2#
10_000000#
or (C
and 2#
00_111111#
));
341 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
342 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
344 elsif C
in 16#
1_0000#
.. 16#
10_FFFF#
then
345 Store
(2#
11110_000#
or
346 Shift_Right
(C
, 18));
347 Store
(2#
10_000000#
or
348 Shift_Right
(C
and 2#
111111_000000_000000#
, 12));
349 Store
(2#
10_000000#
or
350 Shift_Right
(C
and 2#
111111_000000#
, 6));
351 Store
(2#
10_000000#
or
352 (C
and 2#
00_111111#
));
354 -- All other codes are invalid
357 Raise_Encoding_Error
(Iptr
);
361 return Result
(1 .. Len
);
364 -- Encode Wide_Wide_String in UTF-16
367 (Item
: Wide_Wide_String
;
368 Output_BOM
: Boolean := False) return UTF_16_Wide_String
370 Result
: UTF_16_Wide_String
(1 .. 2 * Item
'Length + 1);
371 -- Worst case is each input character generates two output characters
372 -- plus one for possible BOM.
375 -- Length of output string
380 -- Output BOM if needed
383 Result
(1) := BOM_16
(1);
389 -- Loop through input characters encoding them
391 for Iptr
in Item
'Range loop
392 C
:= To_Unsigned_32
(Item
(Iptr
));
394 -- Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
395 -- are output unchanged
397 if C
<= 16#
00_D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
399 Result
(Len
) := Wide_Character'Val (C
);
401 -- Codes in the range 16#01_0000#..16#10_FFFF# are output using two
402 -- surrogate characters. First 16#1_0000# is subtracted from the code
403 -- point to give a 20-bit value. This is then split into two separate
404 -- 10-bit values each of which is represented as a surrogate with the
405 -- most significant half placed in the first surrogate. The ranges of
406 -- values used for the two surrogates are 16#D800#-16#DBFF# for the
407 -- first, most significant surrogate and 16#DC00#-16#DFFF# for the
408 -- second, least significant surrogate.
410 elsif C
in 16#
1_0000#
.. 16#
10_FFFF#
then
414 Result
(Len
) := Wide_Character'Val (16#D800#
+ C
/ 2 ** 10);
417 Result
(Len
) := Wide_Character'Val (16#DC00#
+ C
mod 2 ** 10);
419 -- All other codes are invalid
422 Raise_Encoding_Error
(Iptr
);
426 return Result
(1 .. Len
);
429 end Ada
.Strings
.UTF_Encoding
.Wide_Wide_Strings
;