1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_STRINGS --
9 -- Copyright (C) 2010, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 package body Ada
.Strings
.UTF_Encoding
.Wide_Strings
is
39 -- Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String
43 Input_Scheme
: Encoding_Scheme
) return Wide_String
46 if Input_Scheme
= UTF_8
then
49 return Decode
(To_UTF_16
(Item
, Input_Scheme
));
53 -- Decode UTF-8 input to Wide_String
55 function Decode
(Item
: UTF_8_String
) return Wide_String is
56 Result
: Wide_String (1 .. Item
'Length);
57 -- Result string (worst case is same length as input)
60 -- Length of result stored so far
68 procedure Get_Continuation
;
69 -- Reads a continuation byte of the form 10xxxxxx, shifts R left
70 -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
71 -- return Ptr is incremented. Raises exception if continuation
72 -- byte does not exist or is invalid.
74 ----------------------
75 -- Get_Continuation --
76 ----------------------
78 procedure Get_Continuation
is
80 if Iptr
> Item
'Last then
81 Raise_Encoding_Error
(Iptr
- 1);
84 C
:= To_Unsigned_8
(Item
(Iptr
));
87 if C
not in 2#
10_000000#
.. 2#
10_111111#
then
88 Raise_Encoding_Error
(Iptr
- 1);
90 R
:= Shift_Left
(R
, 6) or Unsigned_16
(C
and 2#
00_111111#
);
95 -- Start of processing for Decode
103 and then Item
(Iptr
.. Iptr
+ 2) = BOM_8
109 elsif Item
'Length >= 2
110 and then (Item
(Iptr
.. Iptr
+ 1) = BOM_16BE
112 Item
(Iptr
.. Iptr
+ 1) = BOM_16LE
)
114 Raise_Encoding_Error
(Iptr
);
117 while Iptr
<= Item
'Last loop
118 C
:= To_Unsigned_8
(Item
(Iptr
));
121 -- Codes in the range 16#00# - 16#7F# are represented as
125 R
:= Unsigned_16
(C
);
127 -- No initial code can be of the form 10xxxxxx. Such codes are used
128 -- only for continuations.
130 elsif C
<= 2#
10_111111#
then
131 Raise_Encoding_Error
(Iptr
- 1);
133 -- Codes in the range 16#80# - 16#7FF# are represented as
136 elsif C
<= 2#
110_11111#
then
137 R
:= Unsigned_16
(C
and 2#
000_11111#
);
140 -- Codes in the range 16#800# - 16#FFFF# are represented as
141 -- 1110yyyy 10yyyyxx 10xxxxxx
143 elsif C
<= 2#
1110_1111#
then
144 R
:= Unsigned_16
(C
and 2#
0000_1111#
);
148 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
149 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
151 -- Such codes are out of range for Wide_String output
154 Raise_Encoding_Error
(Iptr
- 1);
158 Result
(Len
) := Wide_Character'Val (R
);
161 return Result
(1 .. Len
);
164 -- Decode UTF-16 input to Wide_String
166 function Decode
(Item
: UTF_16_Wide_String
) return Wide_String is
167 Result
: Wide_String (1 .. Item
'Length);
168 -- Result is same length as input (possibly minus 1 if BOM present)
174 -- Index of next Item element
179 -- Skip UTF-16 BOM at start
183 if Item
'Length > 0 and then Item
(Iptr
) = BOM_16
(1) then
187 -- Loop through input characters
189 while Iptr
<= Item
'Last loop
190 C
:= To_Unsigned_16
(Item
(Iptr
));
193 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
194 -- represent their own value.
196 if C
<= 16#D7FF#
or else C
in 16#E000#
.. 16#FFFD#
then
198 Result
(Len
) := Wide_Character'Val (C
);
200 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
201 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
202 -- Such codes are out of range for 16-bit output.
204 -- The case of input in the range 16#DC00#..16#DFFF# must never
205 -- occur, since it means we have a second surrogate character with
206 -- no corresponding first surrogate.
208 -- Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since
209 -- they conflict with codes used for BOM values.
211 -- Thus all remaining codes are invalid
214 Raise_Encoding_Error
(Iptr
- 1);
218 return Result
(1 .. Len
);
225 -- Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE
229 Output_Scheme
: Encoding_Scheme
;
230 Output_BOM
: Boolean := False) return UTF_String
235 if Output_Scheme
= UTF_8
then
236 return Encode
(Item
, Output_BOM
);
238 -- Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
241 return From_UTF_16
(UTF_16_Wide_String
'(Encode (Item)),
242 Output_Scheme, Output_BOM);
246 -- Encode Wide_String in UTF-8
250 Output_BOM : Boolean := False) return UTF_8_String
252 Result : UTF_8_String (1 .. 3 * Item'Length + 3);
253 -- Worst case is three bytes per input byte + space for BOM
256 -- Number of output codes stored in Result
259 -- Single input character
261 procedure Store (C : Unsigned_16);
262 pragma Inline (Store);
263 -- Store one output code, C is in the range 0 .. 255
269 procedure Store (C : Unsigned_16) is
272 Result (Len) := Character'Val (C);
275 -- Start of processing for UTF8_Encode
278 -- Output BOM if required
281 Result (1 .. 3) := BOM_8;
287 -- Loop through characters of input
289 for J in Item'Range loop
290 C := To_Unsigned_16 (Item (J));
292 -- Codes in the range 16#00# - 16#7F# are represented as
298 -- Codes in the range 16#80# - 16#7FF# are represented as
301 elsif C <= 16#7FF# then
302 Store (2#110_00000# or Shift_Right (C, 6));
303 Store (2#10_000000# or (C and 2#00_111111#));
305 -- Codes in the range 16#800# - 16#FFFF# are represented as
306 -- 1110yyyy 10yyyyxx 10xxxxxx
309 Store (2#1110_0000# or Shift_Right (C, 12));
310 Store (2#10_000000# or
311 Shift_Right (C and 2#111111_000000#, 6));
312 Store (2#10_000000# or (C and 2#00_111111#));
316 return Result (1 .. Len);
319 -- Encode Wide_String in UTF-16
323 Output_BOM : Boolean := False) return UTF_16_Wide_String
325 Result : UTF_16_Wide_String
326 (1 .. Item'Length + Boolean'Pos (Output_BOM));
327 -- Output is same length as input + possible BOM
330 -- Length of output string
335 -- Output BOM if required
338 Result (1) := BOM_16 (1);
344 -- Loop through input characters encoding them
346 for Iptr in Item'Range loop
347 C := To_Unsigned_16 (Item (Iptr));
349 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are
352 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
354 Result (Len) := Wide_Character'Val (C);
356 -- Codes in the range 16#D800#..16#DFFF# should never appear in the
357 -- input, since no valid Unicode characters are in this range (which
358 -- would conflict with the UTF-16 surrogate encodings). Similarly
359 -- codes in the range 16#FFFE#..16#FFFF conflict with BOM codes.
360 -- Thus all remaining codes are illegal.
363 Raise_Encoding_Error (Iptr);
370 end Ada.Strings.UTF_Encoding.Wide_Strings;