Imported GNU Classpath 0.90
[official-gcc.git] / gcc / ada / s-wchcnv.adb
blobecbcb26c4bb82a8410ae1794532b26125f844416
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- S Y S T E M . W C H _ C N V --
6 -- --
7 -- B o d y --
8 -- --
9 -- Copyright (C) 1992-2006, Free Software Foundation, Inc. --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
21 -- --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
28 -- --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
31 -- --
32 ------------------------------------------------------------------------------
34 -- This package contains generic subprograms used for converting between
35 -- sequences of Character and Wide_Character. All access to wide character
36 -- sequences is isolated in this unit.
38 with Interfaces; use Interfaces;
39 with System.WCh_Con; use System.WCh_Con;
40 with System.WCh_JIS; use System.WCh_JIS;
42 package body System.WCh_Cnv is
44 -----------------------------
45 -- Char_Sequence_To_UTF_32 --
46 -----------------------------
function Char_Sequence_To_UTF_32
  (C  : Character;
   EM : WC_Encoding_Method) return UTF_32_Code
is
   --  Decodes one character code from an encoded character sequence. C is
   --  the first character of the sequence; any further characters needed
   --  are fetched, in order, by calling In_Char. EM selects the encoding.
   --  Constraint_Error is raised for a malformed sequence.

   Hex_Acc : Unsigned_32;
   --  Running value of the hex digits read so far (hex and brackets modes)

   Closing : Character;
   --  Brackets mode: the character read after each pair of hex digits,
   --  which is either the closing quote or the first digit of a new pair.

   Byte_Val : Unsigned_32;
   --  UTF-8 mode: numeric value of the byte currently being examined

   Code : Unsigned_32;
   --  UTF-8 mode: the code value assembled so far

   Trail_Cnt : Natural;
   --  UTF-8 mode: number of continuation bytes implied by the lead byte

   procedure Add_Hex_Digit (N : Character);
   --  If N is a hex digit (either case), set Hex_Acc to 16 * Hex_Acc plus
   --  the digit value. Raises Constraint_Error for any other character.

   procedure Add_Continuation;
   pragma Inline (Add_Continuation);
   --  Reads one byte with In_Char and requires it to have the form
   --  2#10xxxxxx#, raising Constraint_Error otherwise. On success, Code
   --  is shifted left six bits and the xxxxxx bits are or'ed in.

   -------------------
   -- Add_Hex_Digit --
   -------------------

   procedure Add_Hex_Digit (N : Character) is
      Digit : Unsigned_32;
   begin
      case N is
         when '0' .. '9' =>
            Digit := Unsigned_32 (Character'Pos (N) - Character'Pos ('0'));

         when 'A' .. 'F' =>
            Digit :=
              Unsigned_32 (Character'Pos (N) - Character'Pos ('A') + 10);

         when 'a' .. 'f' =>
            Digit :=
              Unsigned_32 (Character'Pos (N) - Character'Pos ('a') + 10);

         when others =>
            raise Constraint_Error;
      end case;

      Hex_Acc := Hex_Acc * 16 + Digit;
   end Add_Hex_Digit;

   ----------------------
   -- Add_Continuation --
   ----------------------

   procedure Add_Continuation is
   begin
      Byte_Val := Unsigned_32 (Character'Pos (In_Char));

      if (Byte_Val and 2#1100_0000#) /= 2#1000_0000# then
         raise Constraint_Error;
      end if;

      Code := Shift_Left (Code, 6) or (Byte_Val and 2#0011_1111#);
   end Add_Continuation;

--  Start of processing for Char_Sequence_To_UTF_32

begin
   case EM is

      --  ESC followed by exactly four hex digits; anything else is a
      --  single character standing for itself.

      when WCEM_Hex =>
         if C = ASCII.ESC then
            Hex_Acc := 0;

            for Digit_Index in 1 .. 4 loop
               Add_Hex_Digit (In_Char);
            end loop;

            return UTF_32_Code (Hex_Acc);

         else
            return Character'Pos (C);
         end if;

      --  A first character above DEL is the high byte of a two byte code

      when WCEM_Upper =>
         if C <= ASCII.DEL then
            return Character'Pos (C);
         end if;

         return Character'Pos (C) * 256 + Character'Pos (In_Char);

      --  A first character above DEL starts a two byte Shift-JIS sequence

      when WCEM_Shift_JIS =>
         if C <= ASCII.DEL then
            return Character'Pos (C);
         end if;

         return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));

      --  A first character above DEL starts a two byte EUC sequence

      when WCEM_EUC =>
         if C <= ASCII.DEL then
            return Character'Pos (C);
         end if;

         return Wide_Character'Pos (EUC_To_JIS (C, In_Char));

      when WCEM_UTF8 =>

         --  Note: for details of UTF8 encoding see RFC 3629

         --  The lead byte determines both the payload bits it contributes
         --  and how many continuation bytes must follow:

         --    0xxxxxxx  16#00_0000#   - 16#00_007F#    no continuation
         --    110xxxxx  16#00_0080#   - 16#00_07FF#    1 continuation
         --    1110xxxx  16#00_0800#   - 16#00_FFFF#    2 continuations
         --    11110xxx  16#01_0000#   - 16#10_FFFF#    3 continuations
         --    111110xx  16#0020_0000# - 16#03FF_FFFF#  4 continuations
         --    1111110x  16#0400_0000# - 16#7FFF_FFFF#  5 continuations

         Byte_Val := Unsigned_32 (Character'Pos (C));

         if (Byte_Val and 2#1000_0000#) = 0 then
            return Character'Pos (C);

         elsif (Byte_Val and 2#1110_0000#) = 2#1100_0000# then
            Code      := Byte_Val and 2#0001_1111#;
            Trail_Cnt := 1;

         elsif (Byte_Val and 2#1111_0000#) = 2#1110_0000# then
            Code      := Byte_Val and 2#0000_1111#;
            Trail_Cnt := 2;

         elsif (Byte_Val and 2#1111_1000#) = 2#1111_0000# then
            Code      := Byte_Val and 2#0000_0111#;
            Trail_Cnt := 3;

         elsif (Byte_Val and 2#1111_1100#) = 2#1111_1000# then
            Code      := Byte_Val and 2#0000_0011#;
            Trail_Cnt := 4;

         elsif (Byte_Val and 2#1111_1110#) = 2#1111_1100# then
            Code      := Byte_Val and 2#0000_0001#;
            Trail_Cnt := 5;

         --  A lead byte of the form 10xxxxxx, 11111110 or 11111111

         else
            raise Constraint_Error;
         end if;

         for Trail_Index in 1 .. Trail_Cnt loop
            Add_Continuation;
         end loop;

         return UTF_32_Code (Code);

      when WCEM_Brackets =>

         --  Anything not starting with a left bracket stands for itself

         if C /= '[' then
            return Character'Pos (C);
         end if;

         --  The bracket must be followed by a quote and two hex digits

         if In_Char /= '"' then
            raise Constraint_Error;
         end if;

         Hex_Acc := 0;
         Add_Hex_Digit (In_Char);
         Add_Hex_Digit (In_Char);

         --  Up to three more pairs of hex digits may follow. After each
         --  pair the next character is either the closing quote or the
         --  first digit of a further pair.

         Closing := In_Char;

         for Extra_Pair in 1 .. 3 loop
            exit when Closing = '"';

            Add_Hex_Digit (Closing);
            Add_Hex_Digit (In_Char);

            --  With all eight digits present the value may be too large
            --  for UTF_32_Code, which is checked before reading further.

            if Extra_Pair = 3
              and then Hex_Acc > Unsigned_32 (UTF_32_Code'Last)
            then
               raise Constraint_Error;
            end if;

            Closing := In_Char;
         end loop;

         --  After the fourth pair nothing but the closing quote is legal

         if Closing /= '"' then
            raise Constraint_Error;
         end if;

         if In_Char /= ']' then
            raise Constraint_Error;
         end if;

         return UTF_32_Code (Hex_Acc);

   end case;
end Char_Sequence_To_UTF_32;
264 --------------------------------
265 -- Char_Sequence_To_Wide_Char --
266 --------------------------------
function Char_Sequence_To_Wide_Char
  (C  : Character;
   EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
is
   --  Decodes one character sequence, as Char_Sequence_To_UTF_32 does
   --  using the same In_Char, and returns it as a Wide_Character.
   --  Constraint_Error is raised if the decoded code exceeds 16#FFFF#.

   function Decode is new Char_Sequence_To_UTF_32 (In_Char);

   Decoded : constant UTF_32_Code := Decode (C, EM);

begin
   if Decoded <= 16#FFFF# then
      return Wide_Character'Val (Decoded);
   else
      raise Constraint_Error;
   end if;
end Char_Sequence_To_Wide_Char;
284 -----------------------------
285 -- UTF_32_To_Char_Sequence --
286 -----------------------------
procedure UTF_32_To_Char_Sequence
  (Val : UTF_32_Code;
   EM  : System.WCh_Con.WC_Encoding_Method)
is
   --  Encodes the code Val using encoding method EM, emitting the
   --  resulting characters one at a time, in order, through Out_Char.
   --  Constraint_Error is raised if Val cannot be represented in EM.

   Hex_Digit : constant array (UTF_32_Code range 0 .. 15) of Character :=
                 "0123456789ABCDEF";

   Byte_1, Byte_2 : Character;
   --  The two output bytes produced by the JIS conversion routines

   Code : constant Unsigned_32 := Unsigned_32 (Val);
   --  Val as a modular value, for the shifts and masks of UTF-8 output

   procedure Put_Hex (Divisor : UTF_32_Code);
   --  Outputs the hex digit of Val selected by Divisor, which is the
   --  power of 16 giving the weight of the digit wanted.

   procedure Put_UTF8 (Marker : Unsigned_32; Trailing : Positive);
   --  Outputs a multi-byte UTF-8 sequence for Code: a lead byte formed
   --  from Marker or'ed with the top bits of Code, then Trailing bytes
   --  of the form 2#10xxxxxx#, most significant six bit group first.

   -------------
   -- Put_Hex --
   -------------

   procedure Put_Hex (Divisor : UTF_32_Code) is
   begin
      Out_Char (Hex_Digit ((Val / Divisor) mod 16));
   end Put_Hex;

   --------------
   -- Put_UTF8 --
   --------------

   procedure Put_UTF8 (Marker : Unsigned_32; Trailing : Positive) is
   begin
      Out_Char (Character'Val (Marker or Shift_Right (Code, 6 * Trailing)));

      for Group in reverse 0 .. Trailing - 1 loop
         Out_Char
           (Character'Val
              (2#1000_0000#
                 or (Shift_Right (Code, 6 * Group) and 2#0011_1111#)));
      end loop;
   end Put_UTF8;

--  Start of processing for UTF_32_To_Char_Sequence

begin
   case EM is

      --  Codes below 256 are output directly; larger codes up to
      --  16#FFFF# are output as ESC followed by four hex digits.

      when WCEM_Hex =>
         if Val < 256 then
            Out_Char (Character'Val (Val));

         elsif Val > 16#FFFF# then
            raise Constraint_Error;

         else
            Out_Char (ASCII.ESC);
            Put_Hex (16 ** 3);
            Put_Hex (16 ** 2);
            Put_Hex (16);
            Put_Hex (1);
         end if;

      --  Codes below 128 are output directly; codes in the range
      --  16#8000# .. 16#FFFF# are output as two bytes, high byte first.

      when WCEM_Upper =>
         if Val < 128 then
            Out_Char (Character'Val (Val));

         elsif Val in 16#8000# .. 16#FFFF# then
            Out_Char (Character'Val (Val / 256));
            Out_Char (Character'Val (Val mod 256));

         else
            raise Constraint_Error;
         end if;

      when WCEM_Shift_JIS =>
         if Val < 128 then
            Out_Char (Character'Val (Val));

         elsif Val > 16#FFFF# then
            raise Constraint_Error;

         else
            JIS_To_Shift_JIS (Wide_Character'Val (Val), Byte_1, Byte_2);
            Out_Char (Byte_1);
            Out_Char (Byte_2);
         end if;

      when WCEM_EUC =>
         if Val < 128 then
            Out_Char (Character'Val (Val));

         elsif Val > 16#FFFF# then
            raise Constraint_Error;

         else
            JIS_To_EUC (Wide_Character'Val (Val), Byte_1, Byte_2);
            Out_Char (Byte_1);
            Out_Char (Byte_2);
         end if;

      when WCEM_UTF8 =>

         --  Note: for details of UTF8 encoding see RFC 3629

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         if Code <= 16#00_007F# then
            Out_Char (Character'Val (Code));

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         elsif Code <= 16#00_07FF# then
            Put_UTF8 (Marker => 2#1100_0000#, Trailing => 1);

         --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx

         elsif Code <= 16#00_FFFF# then
            Put_UTF8 (Marker => 2#1110_0000#, Trailing => 2);

         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

         elsif Code <= 16#10_FFFF# then
            Put_UTF8 (Marker => 2#1111_0000#, Trailing => 3);

         --  Up to 16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
         --                       10xxxxxx 10xxxxxx

         elsif Code <= 16#03FF_FFFF# then
            Put_UTF8 (Marker => 2#1111_1000#, Trailing => 4);

         --  Up to 16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
         --                       10xxxxxx 10xxxxxx 10xxxxxx

         elsif Code <= 16#7FFF_FFFF# then
            Put_UTF8 (Marker => 2#1111_1100#, Trailing => 5);

         else
            raise Constraint_Error;
         end if;

      when WCEM_Brackets =>

         --  Values in the range 0-255 are output directly. Note that this
         --  includes the left bracket (16#5B#), which will cause confusion
         --  if the resulting string is later read back using brackets
         --  encoding.

         --  One possibility would be to always output [ as ["5B"], but in
         --  practice this is undesirable. Normal use of Wide_Text_IO for
         --  output (much more common than input) really does want to be
         --  able to say something like

         --     Put_Line ("Start of output [first run]");

         --  and have it come out as intended, rather than contaminated by
         --  a ["5B"] sequence in place of the left bracket.

         if Val < 256 then
            Out_Char (Character'Val (Val));

         --  Otherwise use brackets notation for values greater than 255,
         --  with four, six or eight hex digits depending on magnitude.

         else
            Out_Char ('[');
            Out_Char ('"');

            --  Seventh and eighth digits, needed only above 16#00FF_FFFF#

            if Val > 16#00FF_FFFF# then
               if Val > 16#7FFF_FFFF# then
                  raise Constraint_Error;
               end if;

               Put_Hex (16 ** 7);
               Put_Hex (16 ** 6);
            end if;

            --  Fifth and sixth digits, needed only above 16#FFFF#

            if Val > 16#FFFF# then
               Put_Hex (16 ** 5);
               Put_Hex (16 ** 4);
            end if;

            --  The low order four digits are always present

            Put_Hex (16 ** 3);
            Put_Hex (16 ** 2);
            Put_Hex (16);
            Put_Hex (1);

            Out_Char ('"');
            Out_Char (']');
         end if;
   end case;
end UTF_32_To_Char_Sequence;
463 --------------------------------
464 -- Wide_Char_To_Char_Sequence --
465 --------------------------------
procedure Wide_Char_To_Char_Sequence
  (WC : Wide_Character;
   EM : System.WCh_Con.WC_Encoding_Method)
is
   --  Encodes WC using encoding method EM by passing its position value
   --  to an instance of UTF_32_To_Char_Sequence that shares Out_Char.

   procedure Encode is new UTF_32_To_Char_Sequence (Out_Char);

   Code : constant UTF_32_Code := Wide_Character'Pos (WC);

begin
   Encode (Code, EM);
end Wide_Char_To_Char_Sequence;
476 end System.WCh_Cnv;