1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_ENCODING --
9 -- Copyright (C) 2010, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
34 package body Ada
.Strings
.UTF_Encoding
.Wide_Wide_Encoding
is
41 -- Version to decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
45 Input_Scheme
: Encoding_Scheme
) return Wide_Wide_String
48 if Input_Scheme
= UTF_8
then
51 return Decode
(To_UTF_16
(Item
, Input_Scheme
));
55 -- Decode UTF-8 input to Wide_Wide_String
57 function Decode
(Item
: UTF_8_String
) return Wide_Wide_String
is
58 Result
: Wide_Wide_String
(1 .. Item
'Length);
59 -- Result string (worst case is same length as input)
62 -- Length of result stored so far
65 -- Input string pointer
70 procedure Get_Continuation
;
71 -- Reads a continuation byte of the form 10xxxxxx, shifts R left
72 -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
73 -- return Ptr is incremented. Raises exceptioon if continuation
74 -- byte does not exist or is invalid.
76 ----------------------
77 -- Get_Continuation --
78 ----------------------
80 procedure Get_Continuation
is
82 if Iptr
> Item
'Last then
83 Raise_Encoding_Error
(Iptr
- 1);
86 C
:= To_Unsigned_8
(Item
(Iptr
));
89 if C
not in 2#
10_000000#
.. 2#
10_111111#
then
90 Raise_Encoding_Error
(Iptr
- 1);
92 R
:= Shift_Left
(R
, 6) or Unsigned_32
(C
and 2#
00_111111#
);
97 -- Start of processing for Decode
105 and then Item
(Iptr
.. Iptr
+ 2) = BOM_8
111 elsif Item
'Length >= 2
112 and then (Item
(Iptr
.. Iptr
+ 1) = BOM_16BE
114 Item
(Iptr
.. Iptr
+ 1) = BOM_16LE
)
116 Raise_Encoding_Error
(Iptr
);
119 -- Loop through input characters
121 while Iptr
<= Item
'Last loop
122 C
:= To_Unsigned_8
(Item
(Iptr
));
125 -- Codes in the range 16#00# - 16#7F# are represented as
129 R
:= Unsigned_32
(C
);
131 -- No initial code can be of the form 10xxxxxx. Such codes are used
132 -- only for continuations.
134 elsif C
<= 2#
10_111111#
then
135 Raise_Encoding_Error
(Iptr
- 1);
137 -- Codes in the range 16#80# - 16#7FF# are represented as
140 elsif C
<= 2#
110_11111#
then
141 R
:= Unsigned_32
(C
and 2#
000_11111#
);
144 -- Codes in the range 16#800# - 16#FFFF# are represented as
145 -- 1110yyyy 10yyyyxx 10xxxxxx
147 elsif C
<= 2#
1110_1111#
then
148 R
:= Unsigned_32
(C
and 2#
0000_1111#
);
152 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
153 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
155 elsif C
<= 2#
11110_111#
then
156 R
:= Unsigned_32
(C
and 2#
00000_111#
);
161 -- Any other code is an error
164 Raise_Encoding_Error
(Iptr
- 1);
168 Result
(Len
) := Wide_Wide_Character
'Val (R
);
171 return Result
(1 .. Len
);
174 -- Decode UTF-16 input to Wide_Wide_String
176 function Decode
(Item
: UTF_16_Wide_String
) return Wide_Wide_String
is
177 Result
: Wide_Wide_String
(1 .. Item
'Length);
178 -- Result cannot be longer than the input string
184 -- Pointer to next element in Item
190 -- Skip UTF-16 BOM at start
194 if Iptr
<= Item
'Last and then Item
(Iptr
) = BOM_16
(1) then
198 -- Loop through input characters
200 while Iptr
<= Item
'Last loop
201 C
:= To_Unsigned_16
(Item
(Iptr
));
204 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
205 -- represent their own value.
207 if C
<= 16#D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
209 Result
(Len
) := Wide_Wide_Character
'Val (C
);
211 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
212 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
213 -- The first surrogate provides 10 high order bits of the result.
215 elsif C
<= 16#DBFF#
then
216 R
:= Shift_Left
((Unsigned_32
(C
) - 16#D800#
), 10);
218 -- Error if at end of string
220 if Iptr
> Item
'Last then
221 Raise_Encoding_Error
(Iptr
- 1);
223 -- Otherwise next character must be valid low order surrogate
224 -- which provides the low 10 order bits of the result.
227 C
:= To_Unsigned_16
(Item
(Iptr
));
230 if C
not in 16#DC00#
.. 16#DFFF#
then
231 Raise_Encoding_Error
(Iptr
- 1);
234 R
:= R
or (Unsigned_32
(C
) mod 2 ** 10);
236 -- The final adjustment is to add 16#01_0000 to get the
237 -- result back in the required 21 bit range.
239 R
:= R
+ 16#
01_0000#
;
241 Result
(Len
) := Wide_Wide_Character
'Val (R
);
245 -- Remaining codes are invalid
248 Raise_Encoding_Error
(Iptr
- 1);
252 return Result
(1 .. Len
);
259 -- Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
262 (Item
: Wide_Wide_String
;
263 Output_Scheme
: Encoding_Scheme
;
264 Output_BOM
: Boolean := False) return UTF_String
267 if Output_Scheme
= UTF_8
then
268 return Encode
(Item
, Output_BOM
);
270 return From_UTF_16
(Encode
(Item
), Output_Scheme
, Output_BOM
);
274 -- Encode Wide_Wide_String in UTF-8
277 (Item
: Wide_Wide_String
;
278 Output_BOM
: Boolean := False) return UTF_8_String
280 Result
: String (1 .. 4 * Item
'Length + 3);
281 -- Worst case is four bytes per input byte + space for BOM
284 -- Number of output codes stored in Result
287 -- Single input character
289 procedure Store
(C
: Unsigned_32
);
290 pragma Inline
(Store
);
291 -- Store one output code (input is in range 0 .. 255)
297 procedure Store
(C
: Unsigned_32
) is
300 Result
(Len
) := Character'Val (C
);
303 -- Start of processing for Encode
306 -- Output BOM if required
309 Result
(1 .. 3) := BOM_8
;
315 -- Loop through characters of input
317 for Iptr
in Item
'Range loop
318 C
:= To_Unsigned_32
(Item
(Iptr
));
320 -- Codes in the range 16#00#..16#7F# are represented as
326 -- Codes in the range 16#80#..16#7FF# are represented as
329 elsif C
<= 16#
7FF#
then
330 Store
(2#
110_00000#
or Shift_Right
(C
, 6));
331 Store
(2#
10_000000#
or (C
and 2#
00_111111#
));
333 -- Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
335 -- 1110yyyy 10yyyyxx 10xxxxxx
337 elsif C
<= 16#D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
338 Store
(2#
1110_0000#
or Shift_Right
(C
, 12));
339 Store
(2#
10_000000#
or
340 Shift_Right
(C
and 2#
111111_000000#
, 6));
341 Store
(2#
10_000000#
or (C
and 2#
00_111111#
));
343 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
344 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
346 elsif C
in 16#
1_0000#
.. 16#
10_FFFF#
then
347 Store
(2#
11110_000#
or
348 Shift_Right
(C
, 18));
349 Store
(2#
10_000000#
or
350 Shift_Right
(C
and 2#
111111_000000_000000#
, 12));
351 Store
(2#
10_000000#
or
352 Shift_Right
(C
and 2#
111111_000000#
, 6));
353 Store
(2#
10_000000#
or
354 (C
and 2#
00_111111#
));
356 -- All other codes are invalid
359 Raise_Encoding_Error
(Iptr
);
363 return Result
(1 .. Len
);
366 -- Encode Wide_Wide_String in UTF-16
369 (Item
: Wide_Wide_String
;
370 Output_BOM
: Boolean := False) return UTF_16_Wide_String
372 Result
: Wide_String (1 .. 2 * Item
'Length + 1);
373 -- Worst case is each input character generates two output characters
374 -- plus one for possible BOM.
377 -- Length of output string
382 -- Output BOM if needed
385 Result
(1) := BOM_16
(1);
391 -- Loop through input characters encoding them
393 for Iptr
in Item
'Range loop
394 C
:= To_Unsigned_32
(Item
(Iptr
));
396 -- Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
397 -- are output unchanged
399 if C
<= 16#
00_D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
401 Result
(Len
) := Wide_Character'Val (C
);
403 -- Codes in the range 16#01_0000#..16#10_FFFF# are output using two
404 -- surrogate characters. First 16#1_0000# is subtracted from the code
405 -- point to give a 20-bit value. This is then split into two separate
406 -- 10-bit values each of which is represented as a surrogate with the
407 -- most significant half placed in the first surrogate. The ranges of
408 -- values used for the two surrogates are 16#D800#-16#DBFF# for the
409 -- first, most significant surrogate and 16#DC00#-16#DFFF# for the
410 -- second, least significant surrogate.
412 elsif C
in 16#
1_0000#
.. 16#
10_FFFF#
then
416 Result
(Len
) := Wide_Character'Val (16#D800#
+ C
/ 2 ** 10);
419 Result
(Len
) := Wide_Character'Val (16#DC00#
+ C
mod 2 ** 10);
421 -- All other codes are invalid
424 Raise_Encoding_Error
(Iptr
);
428 return Result
(1 .. Len
);
431 end Ada
.Strings
.UTF_Encoding
.Wide_Wide_Encoding
;