2010-07-27 Paolo Carlini <paolo.carlini@oracle.com>
[official-gcc/alias-decl.git] / gcc / ada / a-suezen.adb
blob972fbf061e8b4b8f84c0f22f97b14c76a03a8050
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_WIDE_ENCODING --
6 -- --
7 -- B o d y --
8 -- --
9 -- Copyright (C) 2010, Free Software Foundation, Inc. --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
21 -- --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
28 -- --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
31 -- --
32 ------------------------------------------------------------------------------
34 package body Ada.Strings.UTF_Encoding.Wide_Wide_Encoding is
35 use Interfaces;
37 ------------
38 -- Decode --
39 ------------
41 -- Version to decode UTF-8/UTF-16BE/UTF-16LE input to Wide_Wide_String
43 function Decode
44 (Item : UTF_String;
45 Input_Scheme : Encoding_Scheme) return Wide_Wide_String
47 begin
48 if Input_Scheme = UTF_8 then
49 return Decode (Item);
50 else
51 return Decode (To_UTF_16 (Item, Input_Scheme));
52 end if;
53 end Decode;
55 -- Decode UTF-8 input to Wide_Wide_String
57 function Decode (Item : UTF_8_String) return Wide_Wide_String is
58 Result : Wide_Wide_String (1 .. Item'Length);
59 -- Result string (worst case is same length as input)
61 Len : Natural := 0;
62 -- Length of result stored so far
64 Iptr : Natural;
65 -- Input string pointer
67 C : Unsigned_8;
68 R : Unsigned_32;
70 procedure Get_Continuation;
71 -- Reads a continuation byte of the form 10xxxxxx, shifts R left
72 -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
73 -- return Ptr is incremented. Raises exceptioon if continuation
74 -- byte does not exist or is invalid.
76 ----------------------
77 -- Get_Continuation --
78 ----------------------
80 procedure Get_Continuation is
81 begin
82 if Iptr > Item'Last then
83 Raise_Encoding_Error (Iptr - 1);
85 else
86 C := To_Unsigned_8 (Item (Iptr));
87 Iptr := Iptr + 1;
89 if C not in 2#10_000000# .. 2#10_111111# then
90 Raise_Encoding_Error (Iptr - 1);
91 else
92 R := Shift_Left (R, 6) or Unsigned_32 (C and 2#00_111111#);
93 end if;
94 end if;
95 end Get_Continuation;
97 -- Start of processing for Decode
99 begin
100 Iptr := Item'First;
102 -- Skip BOM at start
104 if Item'Length >= 3
105 and then Item (Iptr .. Iptr + 2) = BOM_8
106 then
107 Iptr := Iptr + 3;
109 -- Error if bad BOM
111 elsif Item'Length >= 2
112 and then (Item (Iptr .. Iptr + 1) = BOM_16BE
113 or else
114 Item (Iptr .. Iptr + 1) = BOM_16LE)
115 then
116 Raise_Encoding_Error (Iptr);
117 end if;
119 -- Loop through input characters
121 while Iptr <= Item'Last loop
122 C := To_Unsigned_8 (Item (Iptr));
123 Iptr := Iptr + 1;
125 -- Codes in the range 16#00# - 16#7F# are represented as
126 -- 0xxxxxxx
128 if C <= 16#7F# then
129 R := Unsigned_32 (C);
131 -- No initial code can be of the form 10xxxxxx. Such codes are used
132 -- only for continuations.
134 elsif C <= 2#10_111111# then
135 Raise_Encoding_Error (Iptr - 1);
137 -- Codes in the range 16#80# - 16#7FF# are represented as
138 -- 110yyyxx 10xxxxxx
140 elsif C <= 2#110_11111# then
141 R := Unsigned_32 (C and 2#000_11111#);
142 Get_Continuation;
144 -- Codes in the range 16#800# - 16#FFFF# are represented as
145 -- 1110yyyy 10yyyyxx 10xxxxxx
147 elsif C <= 2#1110_1111# then
148 R := Unsigned_32 (C and 2#0000_1111#);
149 Get_Continuation;
150 Get_Continuation;
152 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
153 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
155 elsif C <= 2#11110_111# then
156 R := Unsigned_32 (C and 2#00000_111#);
157 Get_Continuation;
158 Get_Continuation;
159 Get_Continuation;
161 -- Any other code is an error
163 else
164 Raise_Encoding_Error (Iptr - 1);
165 end if;
167 Len := Len + 1;
168 Result (Len) := Wide_Wide_Character'Val (R);
169 end loop;
171 return Result (1 .. Len);
172 end Decode;
174 -- Decode UTF-16 input to Wide_Wide_String
176 function Decode (Item : UTF_16_Wide_String) return Wide_Wide_String is
177 Result : Wide_Wide_String (1 .. Item'Length);
178 -- Result cannot be longer than the input string
180 Len : Natural := 0;
181 -- Length of result
183 Iptr : Natural;
184 -- Pointer to next element in Item
186 C : Unsigned_16;
187 R : Unsigned_32;
189 begin
190 -- Skip UTF-16 BOM at start
192 Iptr := Item'First;
194 if Iptr <= Item'Last and then Item (Iptr) = BOM_16 (1) then
195 Iptr := Iptr + 1;
196 end if;
198 -- Loop through input characters
200 while Iptr <= Item'Last loop
201 C := To_Unsigned_16 (Item (Iptr));
202 Iptr := Iptr + 1;
204 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
205 -- represent their own value.
207 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
208 Len := Len + 1;
209 Result (Len) := Wide_Wide_Character'Val (C);
211 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
212 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
213 -- The first surrogate provides 10 high order bits of the result.
215 elsif C <= 16#DBFF# then
216 R := Shift_Left ((Unsigned_32 (C) - 16#D800#), 10);
218 -- Error if at end of string
220 if Iptr > Item'Last then
221 Raise_Encoding_Error (Iptr - 1);
223 -- Otherwise next character must be valid low order surrogate
224 -- which provides the low 10 order bits of the result.
226 else
227 C := To_Unsigned_16 (Item (Iptr));
228 Iptr := Iptr + 1;
230 if C not in 16#DC00# .. 16#DFFF# then
231 Raise_Encoding_Error (Iptr - 1);
233 else
234 R := R or (Unsigned_32 (C) mod 2 ** 10);
236 -- The final adjustment is to add 16#01_0000 to get the
237 -- result back in the required 21 bit range.
239 R := R + 16#01_0000#;
240 Len := Len + 1;
241 Result (Len) := Wide_Wide_Character'Val (R);
242 end if;
243 end if;
245 -- Remaining codes are invalid
247 else
248 Raise_Encoding_Error (Iptr - 1);
249 end if;
250 end loop;
252 return Result (1 .. Len);
253 end Decode;
255 ------------
256 -- Encode --
257 ------------
259 -- Encode Wide_Wide_String in UTF-8, UTF-16BE or UTF-16LE
261 function Encode
262 (Item : Wide_Wide_String;
263 Output_Scheme : Encoding_Scheme;
264 Output_BOM : Boolean := False) return UTF_String
266 begin
267 if Output_Scheme = UTF_8 then
268 return Encode (Item, Output_BOM);
269 else
270 return From_UTF_16 (Encode (Item), Output_Scheme, Output_BOM);
271 end if;
272 end Encode;
274 -- Encode Wide_Wide_String in UTF-8
276 function Encode
277 (Item : Wide_Wide_String;
278 Output_BOM : Boolean := False) return UTF_8_String
280 Result : String (1 .. 4 * Item'Length + 3);
281 -- Worst case is four bytes per input byte + space for BOM
283 Len : Natural;
284 -- Number of output codes stored in Result
286 C : Unsigned_32;
287 -- Single input character
289 procedure Store (C : Unsigned_32);
290 pragma Inline (Store);
291 -- Store one output code (input is in range 0 .. 255)
293 -----------
294 -- Store --
295 -----------
297 procedure Store (C : Unsigned_32) is
298 begin
299 Len := Len + 1;
300 Result (Len) := Character'Val (C);
301 end Store;
303 -- Start of processing for Encode
305 begin
306 -- Output BOM if required
308 if Output_BOM then
309 Result (1 .. 3) := BOM_8;
310 Len := 3;
311 else
312 Len := 0;
313 end if;
315 -- Loop through characters of input
317 for Iptr in Item'Range loop
318 C := To_Unsigned_32 (Item (Iptr));
320 -- Codes in the range 16#00#..16#7F# are represented as
321 -- 0xxxxxxx
323 if C <= 16#7F# then
324 Store (C);
326 -- Codes in the range 16#80#..16#7FF# are represented as
327 -- 110yyyxx 10xxxxxx
329 elsif C <= 16#7FF# then
330 Store (2#110_00000# or Shift_Right (C, 6));
331 Store (2#10_000000# or (C and 2#00_111111#));
333 -- Codes in the range 16#800#..16#D7FF# or 16#E000#..16#FFFD# are
334 -- represented as
335 -- 1110yyyy 10yyyyxx 10xxxxxx
337 elsif C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
338 Store (2#1110_0000# or Shift_Right (C, 12));
339 Store (2#10_000000# or
340 Shift_Right (C and 2#111111_000000#, 6));
341 Store (2#10_000000# or (C and 2#00_111111#));
343 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
344 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
346 elsif C in 16#1_0000# .. 16#10_FFFF# then
347 Store (2#11110_000# or
348 Shift_Right (C, 18));
349 Store (2#10_000000# or
350 Shift_Right (C and 2#111111_000000_000000#, 12));
351 Store (2#10_000000# or
352 Shift_Right (C and 2#111111_000000#, 6));
353 Store (2#10_000000# or
354 (C and 2#00_111111#));
356 -- All other codes are invalid
358 else
359 Raise_Encoding_Error (Iptr);
360 end if;
361 end loop;
363 return Result (1 .. Len);
364 end Encode;
366 -- Encode Wide_Wide_String in UTF-16
368 function Encode
369 (Item : Wide_Wide_String;
370 Output_BOM : Boolean := False) return UTF_16_Wide_String
372 Result : Wide_String (1 .. 2 * Item'Length + 1);
373 -- Worst case is each input character generates two output characters
374 -- plus one for possible BOM.
376 Len : Integer;
377 -- Length of output string
379 C : Unsigned_32;
381 begin
382 -- Output BOM if needed
384 if Output_BOM then
385 Result (1) := BOM_16 (1);
386 Len := 1;
387 else
388 Len := 0;
389 end if;
391 -- Loop through input characters encoding them
393 for Iptr in Item'Range loop
394 C := To_Unsigned_32 (Item (Iptr));
396 -- Codes in the range 16#00_0000#..16#00_D7FF# or 16#E000#..16#FFFD#
397 -- are output unchanged
399 if C <= 16#00_D7FF# or else C in 16#E000# .. 16#FFFD# then
400 Len := Len + 1;
401 Result (Len) := Wide_Character'Val (C);
403 -- Codes in the range 16#01_0000#..16#10_FFFF# are output using two
404 -- surrogate characters. First 16#1_0000# is subtracted from the code
405 -- point to give a 20-bit value. This is then split into two separate
406 -- 10-bit values each of which is represented as a surrogate with the
407 -- most significant half placed in the first surrogate. The ranges of
408 -- values used for the two surrogates are 16#D800#-16#DBFF# for the
409 -- first, most significant surrogate and 16#DC00#-16#DFFF# for the
410 -- second, least significant surrogate.
412 elsif C in 16#1_0000# .. 16#10_FFFF# then
413 C := C - 16#1_0000#;
415 Len := Len + 1;
416 Result (Len) := Wide_Character'Val (16#D800# + C / 2 ** 10);
418 Len := Len + 1;
419 Result (Len) := Wide_Character'Val (16#DC00# + C mod 2 ** 10);
421 -- All other codes are invalid
423 else
424 Raise_Encoding_Error (Iptr);
425 end if;
426 end loop;
428 return Result (1 .. Len);
429 end Encode;
431 end Ada.Strings.UTF_Encoding.Wide_Wide_Encoding;