[gcc/]
[official-gcc.git] / gcc / ada / a-suewst.adb
blobc0855d3e0d7214a05b2dffc0ab694dd747da85c6
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_STRINGS --
6 -- --
7 -- B o d y --
8 -- --
9 -- Copyright (C) 2010-2012, Free Software Foundation, Inc. --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
17 -- --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
21 -- --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
26 -- --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
29 -- --
30 ------------------------------------------------------------------------------
32 package body Ada.Strings.UTF_Encoding.Wide_Strings is
33 use Interfaces;
35 ------------
36 -- Decode --
37 ------------
39 -- Decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String
41 function Decode
42 (Item : UTF_String;
43 Input_Scheme : Encoding_Scheme) return Wide_String
45 begin
46 if Input_Scheme = UTF_8 then
47 return Decode (Item);
48 else
49 return Decode (To_UTF_16 (Item, Input_Scheme));
50 end if;
51 end Decode;
53 -- Decode UTF-8 input to Wide_String
55 function Decode (Item : UTF_8_String) return Wide_String is
56 Result : Wide_String (1 .. Item'Length);
57 -- Result string (worst case is same length as input)
59 Len : Natural := 0;
60 -- Length of result stored so far
62 Iptr : Natural;
63 -- Input Item pointer
65 C : Unsigned_8;
66 R : Unsigned_16;
68 procedure Get_Continuation;
69 -- Reads a continuation byte of the form 10xxxxxx, shifts R left by 6
70 -- bits, and or's in the xxxxxx to the low order 6 bits. On return Ptr
71 -- is incremented. Raises exception if continuation byte does not exist
72 -- or is invalid.
74 ----------------------
75 -- Get_Continuation --
76 ----------------------
78 procedure Get_Continuation is
79 begin
80 if Iptr > Item'Last then
81 Raise_Encoding_Error (Iptr - 1);
83 else
84 C := To_Unsigned_8 (Item (Iptr));
85 Iptr := Iptr + 1;
87 if C not in 2#10_000000# .. 2#10_111111# then
88 Raise_Encoding_Error (Iptr - 1);
89 else
90 R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
91 end if;
92 end if;
93 end Get_Continuation;
95 -- Start of processing for Decode
97 begin
98 Iptr := Item'First;
100 -- Skip BOM at start
102 if Item'Length >= 3
103 and then Item (Iptr .. Iptr + 2) = BOM_8
104 then
105 Iptr := Iptr + 3;
107 -- Error if bad BOM
109 elsif Item'Length >= 2
110 and then (Item (Iptr .. Iptr + 1) = BOM_16BE
111 or else
112 Item (Iptr .. Iptr + 1) = BOM_16LE)
113 then
114 Raise_Encoding_Error (Iptr);
115 end if;
117 while Iptr <= Item'Last loop
118 C := To_Unsigned_8 (Item (Iptr));
119 Iptr := Iptr + 1;
121 -- Codes in the range 16#00# - 16#7F# are represented as
122 -- 0xxxxxxx
124 if C <= 16#7F# then
125 R := Unsigned_16 (C);
127 -- No initial code can be of the form 10xxxxxx. Such codes are used
128 -- only for continuations.
130 elsif C <= 2#10_111111# then
131 Raise_Encoding_Error (Iptr - 1);
133 -- Codes in the range 16#80# - 16#7FF# are represented as
134 -- 110yyyxx 10xxxxxx
136 elsif C <= 2#110_11111# then
137 R := Unsigned_16 (C and 2#000_11111#);
138 Get_Continuation;
140 -- Codes in the range 16#800# - 16#FFFF# are represented as
141 -- 1110yyyy 10yyyyxx 10xxxxxx
143 elsif C <= 2#1110_1111# then
144 R := Unsigned_16 (C and 2#0000_1111#);
145 Get_Continuation;
146 Get_Continuation;
148 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
149 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
151 -- Such codes are out of range for Wide_String output
153 else
154 Raise_Encoding_Error (Iptr - 1);
155 end if;
157 Len := Len + 1;
158 Result (Len) := Wide_Character'Val (R);
159 end loop;
161 return Result (1 .. Len);
162 end Decode;
164 -- Decode UTF-16 input to Wide_String
166 function Decode (Item : UTF_16_Wide_String) return Wide_String is
167 Result : Wide_String (1 .. Item'Length);
168 -- Result is same length as input (possibly minus 1 if BOM present)
170 Len : Natural := 0;
171 -- Length of result
173 Iptr : Natural;
174 -- Index of next Item element
176 C : Unsigned_16;
178 begin
179 -- Skip UTF-16 BOM at start
181 Iptr := Item'First;
183 if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
184 Iptr := Iptr + 1;
185 end if;
187 -- Loop through input characters
189 while Iptr <= Item'Last loop
190 C := To_Unsigned_16 (Item (Iptr));
191 Iptr := Iptr + 1;
193 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
194 -- represent their own value.
196 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
197 Len := Len + 1;
198 Result (Len) := Wide_Character'Val (C);
200 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
201 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
202 -- Such codes are out of range for 16-bit output.
204 -- The case of input in the range 16#DC00#..16#DFFF# must never
205 -- occur, since it means we have a second surrogate character with
206 -- no corresponding first surrogate.
208 -- Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since
209 -- they conflict with codes used for BOM values.
211 -- Thus all remaining codes are invalid
213 else
214 Raise_Encoding_Error (Iptr - 1);
215 end if;
216 end loop;
218 return Result (1 .. Len);
219 end Decode;
221 ------------
222 -- Encode --
223 ------------
225 -- Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE
227 function Encode
228 (Item : Wide_String;
229 Output_Scheme : Encoding_Scheme;
230 Output_BOM : Boolean := False) return UTF_String
232 begin
233 -- Case of UTF_8
235 if Output_Scheme = UTF_8 then
236 return Encode (Item, Output_BOM);
238 -- Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
240 else
241 return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
242 Output_Scheme, Output_BOM);
243 end if;
244 end Encode;
246 -- Encode Wide_String in UTF-8
248 function Encode
249 (Item : Wide_String;
250 Output_BOM : Boolean := False) return UTF_8_String
252 Result : UTF_8_String (1 .. 3 * Item'Length + 3);
253 -- Worst case is three bytes per input byte + space for BOM
255 Len : Natural;
256 -- Number of output codes stored in Result
258 C : Unsigned_16;
259 -- Single input character
261 procedure Store (C : Unsigned_16);
262 pragma Inline (Store);
263 -- Store one output code, C is in the range 0 .. 255
265 -----------
266 -- Store --
267 -----------
269 procedure Store (C : Unsigned_16) is
270 begin
271 Len := Len + 1;
272 Result (Len) := Character'Val (C);
273 end Store;
275 -- Start of processing for UTF8_Encode
277 begin
278 -- Output BOM if required
280 if Output_BOM then
281 Result (1 .. 3) := BOM_8;
282 Len := 3;
283 else
284 Len := 0;
285 end if;
287 -- Loop through characters of input
289 for J in Item'Range loop
290 C := To_Unsigned_16 (Item (J));
292 -- Codes in the range 16#00# - 16#7F# are represented as
293 -- 0xxxxxxx
295 if C <= 16#7F# then
296 Store (C);
298 -- Codes in the range 16#80# - 16#7FF# are represented as
299 -- 110yyyxx 10xxxxxx
301 elsif C <= 16#7FF# then
302 Store (2#110_00000# or Shift_Right (C, 6));
303 Store (2#10_000000# or (C and 2#00_111111#));
305 -- Codes in the range 16#800# - 16#FFFF# are represented as
306 -- 1110yyyy 10yyyyxx 10xxxxxx
308 else
309 Store (2#1110_0000# or Shift_Right (C, 12));
310 Store (2#10_000000# or
311 Shift_Right (C and 2#111111_000000#, 6));
312 Store (2#10_000000# or (C and 2#00_111111#));
313 end if;
314 end loop;
316 return Result (1 .. Len);
317 end Encode;
319 -- Encode Wide_String in UTF-16
321 function Encode
322 (Item : Wide_String;
323 Output_BOM : Boolean := False) return UTF_16_Wide_String
325 Result : UTF_16_Wide_String
326 (1 .. Item'Length + Boolean'Pos (Output_BOM));
327 -- Output is same length as input + possible BOM
329 Len : Integer;
330 -- Length of output string
332 C : Unsigned_16;
334 begin
335 -- Output BOM if required
337 if Output_BOM then
338 Result (1) := BOM_16 (1);
339 Len := 1;
340 else
341 Len := 0;
342 end if;
344 -- Loop through input characters encoding them
346 for Iptr in Item'Range loop
347 C := To_Unsigned_16 (Item (Iptr));
349 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are
350 -- output unchanged.
352 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
353 Len := Len + 1;
354 Result (Len) := Wide_Character'Val (C);
356 -- Codes in the range 16#D800#..16#DFFF# should never appear in the
357 -- input, since no valid Unicode characters are in this range (which
358 -- would conflict with the UTF-16 surrogate encodings). Similarly
359 -- codes in the range 16#FFFE#..16#FFFF conflict with BOM codes.
360 -- Thus all remaining codes are illegal.
362 else
363 Raise_Encoding_Error (Iptr);
364 end if;
365 end loop;
367 return Result;
368 end Encode;
370 end Ada.Strings.UTF_Encoding.Wide_Strings;