2016-10-26 François Dumont <fdumont@gcc.gnu.org>
[official-gcc.git] / gcc / ada / s-wchcnv.adb
blob345af8f57d7043510f00706f0fd59833fd35cc7d
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- S Y S T E M . W C H _ C N V --
6 -- --
7 -- B o d y --
8 -- --
9 -- Copyright (C) 1992-2013, Free Software Foundation, Inc. --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
17 -- --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
21 -- --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
26 -- --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
29 -- --
30 ------------------------------------------------------------------------------
32 pragma Compiler_Unit_Warning;
34 with Interfaces; use Interfaces;
35 with System.WCh_Con; use System.WCh_Con;
36 with System.WCh_JIS; use System.WCh_JIS;
38 package body System.WCh_Cnv is
40 -----------------------------
41 -- Char_Sequence_To_UTF_32 --
42 -----------------------------
function Char_Sequence_To_UTF_32
  (C  : Character;
   EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code
is
   --  Decode one character whose first byte is C, using encoding method EM,
   --  and return its code. Any further bytes of the sequence are obtained
   --  by calling In_Char, which is not declared in this body (the
   --  instantiation in Char_Sequence_To_Wide_Char passes it as the generic
   --  actual). Constraint_Error is raised by this body for a malformed
   --  sequence: a non-hex character where a hex digit is required, a bad
   --  UTF-8 lead or continuation byte, a missing quote or right bracket in
   --  brackets notation, or an eight digit brackets value that exceeds
   --  UTF_32_Code'Last.

   B1 : Unsigned_32;
   --  Accumulates hex digits (WCEM_Hex and WCEM_Brackets)

   C1 : Character;
   --  Look-ahead character used while parsing brackets notation

   U  : Unsigned_32;
   --  Byte currently being examined in UTF-8 mode

   W  : Unsigned_32;
   --  Accumulates the payload bits of a UTF-8 sequence

   procedure Get_Hex (N : Character);
   --  If N is a hex character, then set B1 to 16 * B1 + character N.
   --  Raise Constraint_Error if character N is not a hex character.

   procedure Get_UTF_Byte;
   pragma Inline (Get_UTF_Byte);
   --  Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
   --  Reads a byte, and raises CE if the first two bits are not 10.
   --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.

   -------------
   -- Get_Hex --
   -------------

   procedure Get_Hex (N : Character) is
      B2 : constant Unsigned_32 := Character'Pos (N);
   begin
      if B2 in Character'Pos ('0') .. Character'Pos ('9') then
         B1 := B1 * 16 + B2 - Character'Pos ('0');
      elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then
         B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10);
      elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then
         B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10);
      else
         raise Constraint_Error;
      end if;
   end Get_Hex;

   ------------------
   -- Get_UTF_Byte --
   ------------------

   procedure Get_UTF_Byte is
   begin
      U := Unsigned_32 (Character'Pos (In_Char));

      if (U and 2#11000000#) /= 2#10_000000# then
         raise Constraint_Error;
      end if;

      W := Shift_Left (W, 6) or (U and 2#00111111#);
   end Get_UTF_Byte;

--  Start of processing for Char_Sequence_To_UTF_32

begin
   case EM is

      --  ESC followed by exactly four hex digits; any other first
      --  character stands for itself.

      when WCEM_Hex =>
         if C /= ASCII.ESC then
            return Character'Pos (C);

         else
            B1 := 0;
            Get_Hex (In_Char);
            Get_Hex (In_Char);
            Get_Hex (In_Char);
            Get_Hex (In_Char);

            return UTF_32_Code (B1);
         end if;

      --  A first character above DEL is the high order byte of a two
      --  byte code; the low order byte is read next.

      when WCEM_Upper =>
         if C > ASCII.DEL then
            return 256 * Character'Pos (C) + Character'Pos (In_Char);
         else
            return Character'Pos (C);
         end if;

      --  A first character above DEL starts a two byte sequence that is
      --  handed to Shift_JIS_To_JIS together with the next byte.

      when WCEM_Shift_JIS =>
         if C > ASCII.DEL then
            return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));
         else
            return Character'Pos (C);
         end if;

      --  A first character above DEL starts a two byte sequence that is
      --  handed to EUC_To_JIS together with the next byte.

      when WCEM_EUC =>
         if C > ASCII.DEL then
            return Wide_Character'Pos (EUC_To_JIS (C, In_Char));
         else
            return Character'Pos (C);
         end if;

      when WCEM_UTF8 =>

         --  Note: for details of UTF8 encoding see RFC 3629

         U := Unsigned_32 (Character'Pos (C));

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         if (U and 2#10000000#) = 2#00000000# then
            return Character'Pos (C);

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         elsif (U and 2#11100000#) = 2#110_00000# then
            W := U and 2#00011111#;
            Get_UTF_Byte;
            return UTF_32_Code (W);

         --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx

         elsif (U and 2#11110000#) = 2#1110_0000# then
            W := U and 2#00001111#;
            Get_UTF_Byte;
            Get_UTF_Byte;
            return UTF_32_Code (W);

         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

         elsif (U and 2#11111000#) = 2#11110_000# then
            W := U and 2#00000111#;

            for K in 1 .. 3 loop
               Get_UTF_Byte;
            end loop;

            return UTF_32_Code (W);

         --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
         --                               10xxxxxx 10xxxxxx

         elsif (U and 2#11111100#) = 2#111110_00# then
            W := U and 2#00000011#;

            for K in 1 .. 4 loop
               Get_UTF_Byte;
            end loop;

            return UTF_32_Code (W);

         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
         --                               10xxxxxx 10xxxxxx 10xxxxxx

         elsif (U and 2#11111110#) = 2#1111110_0# then
            W := U and 2#00000001#;

            for K in 1 .. 5 loop
               Get_UTF_Byte;
            end loop;

            return UTF_32_Code (W);

         --  Any other lead byte (10xxxxxx or 1111111x) is not valid

         else
            raise Constraint_Error;
         end if;

      --  Brackets notation is ["xx"], ["xxxx"], ["xxxxxx"] or
      --  ["xxxxxxxx"] where each x is a hex digit. Any first character
      --  other than a left bracket stands for itself.

      when WCEM_Brackets =>
         if C /= '[' then
            return Character'Pos (C);
         end if;

         if In_Char /= '"' then
            raise Constraint_Error;
         end if;

         --  At least two hex digits are required

         B1 := 0;
         Get_Hex (In_Char);
         Get_Hex (In_Char);

         --  Then read further pairs of digits until the closing quote. C1
         --  holds the character that is either that quote or the first
         --  digit of the next pair.

         C1 := In_Char;

         if C1 /= '"' then
            Get_Hex (C1);
            Get_Hex (In_Char);

            C1 := In_Char;

            if C1 /= '"' then
               Get_Hex (C1);
               Get_Hex (In_Char);

               C1 := In_Char;

               if C1 /= '"' then
                  Get_Hex (C1);
                  Get_Hex (In_Char);

                  --  Only the eight digit form can exceed the range of
                  --  UTF_32_Code, so the check is made only here.

                  if B1 > Unsigned_32 (UTF_32_Code'Last) then
                     raise Constraint_Error;
                  end if;

                  --  After eight digits the closing quote is mandatory

                  if In_Char /= '"' then
                     raise Constraint_Error;
                  end if;
               end if;
            end if;
         end if;

         if In_Char /= ']' then
            raise Constraint_Error;
         end if;

         return UTF_32_Code (B1);

   end case;
end Char_Sequence_To_UTF_32;
252 --------------------------------
253 -- Char_Sequence_To_Wide_Char --
254 --------------------------------
function Char_Sequence_To_Wide_Char
  (C  : Character;
   EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
is
   --  Decode one character whose first byte is C, using encoding method EM,
   --  and return it as a Wide_Character. The decoding itself is done by an
   --  instance of Char_Sequence_To_UTF_32 that reads further bytes through
   --  In_Char. Constraint_Error is raised if the decoded code is greater
   --  than 16#FFFF#.

   function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char);

   U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM);
   --  Decoded code, not yet checked against the 16-bit limit

begin
   if U > 16#FFFF# then
      raise Constraint_Error;
   else
      return Wide_Character'Val (U);
   end if;
end Char_Sequence_To_Wide_Char;
272 -----------------------------
273 -- UTF_32_To_Char_Sequence --
274 -----------------------------
procedure UTF_32_To_Char_Sequence
  (Val : UTF_32_Code;
   EM  : System.WCh_Con.WC_Encoding_Method)
is
   --  Output the encoding of Val under encoding method EM, one character
   --  at a time, by successive calls to Out_Char, which is not declared in
   --  this body (the instantiation in Wide_Char_To_Char_Sequence passes it
   --  as the generic actual). Constraint_Error is raised if Val is not a
   --  valid UTF_32_Code, or if it has no representation in the requested
   --  encoding: a value above 16#FFFF# for WCEM_Hex, WCEM_Upper,
   --  WCEM_Shift_JIS and WCEM_EUC, and additionally a value in the range
   --  16#80# .. 16#7FFF# for WCEM_Upper.

   Hexc : constant array (UTF_32_Code range 0 .. 15) of Character :=
            "0123456789ABCDEF";
   --  Maps a four bit value to the corresponding upper case hex digit

   C1, C2 : Character;
   --  The two bytes produced by JIS_To_Shift_JIS and JIS_To_EUC

   U : Unsigned_32;
   --  Val as an unsigned value, for the bit operations of UTF-8 mode

begin
   --  Raise CE for invalid UTF_32_Code

   if not Val'Valid then
      raise Constraint_Error;
   end if;

   --  Processing depends on encoding mode

   case EM is

      --  Values up to 255 are output as is, values up to 16#FFFF# as ESC
      --  followed by four hex digits.

      when WCEM_Hex =>
         if Val < 256 then
            Out_Char (Character'Val (Val));
         elsif Val <= 16#FFFF# then
            Out_Char (ASCII.ESC);
            Out_Char (Hexc (Val / (16**3)));
            Out_Char (Hexc ((Val / (16**2)) mod 16));
            Out_Char (Hexc ((Val / 16) mod 16));
            Out_Char (Hexc (Val mod 16));
         else
            raise Constraint_Error;
         end if;

      --  Values below 128 are output as is. Two byte codes are output
      --  high order byte first, and only for 16#8000# .. 16#FFFF#, i.e.
      --  when the high order byte has its top bit set.

      when WCEM_Upper =>
         if Val < 128 then
            Out_Char (Character'Val (Val));
         elsif Val < 16#8000# or else Val > 16#FFFF# then
            raise Constraint_Error;
         else
            Out_Char (Character'Val (Val / 256));
            Out_Char (Character'Val (Val mod 256));
         end if;

      when WCEM_Shift_JIS =>
         if Val < 128 then
            Out_Char (Character'Val (Val));
         elsif Val <= 16#FFFF# then
            JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2);
            Out_Char (C1);
            Out_Char (C2);
         else
            raise Constraint_Error;
         end if;

      when WCEM_EUC =>
         if Val < 128 then
            Out_Char (Character'Val (Val));
         elsif Val <= 16#FFFF# then
            JIS_To_EUC (Wide_Character'Val (Val), C1, C2);
            Out_Char (C1);
            Out_Char (C2);
         else
            raise Constraint_Error;
         end if;

      when WCEM_UTF8 =>

         --  Note: for details of UTF8 encoding see RFC 3629

         U := Unsigned_32 (Val);

         --  16#00_0000#-16#00_007F#: 0xxxxxxx

         if U <= 16#00_007F# then
            Out_Char (Character'Val (U));

         --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx

         elsif U <= 16#00_07FF# then
            Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));

         --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx

         elsif U <= 16#00_FFFF# then
            Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));

         --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

         elsif U <= 16#10_FFFF# then
            Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));

         --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
         --                               10xxxxxx 10xxxxxx

         elsif U <= 16#03FF_FFFF# then
            Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));

         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
         --                               10xxxxxx 10xxxxxx 10xxxxxx

         elsif U <= 16#7FFF_FFFF# then
            Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
                                                      and 2#00111111#)));
            Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));

         else
            raise Constraint_Error;
         end if;

      when WCEM_Brackets =>

         --  Values in the range 0-255 are directly output. Note that there
         --  is some issue with [ (16#5B#) since this will cause confusion
         --  if the resulting string is interpreted using brackets encoding.

         --  One possibility would be to always output [ as ["5B"] but in
         --  practice this is undesirable, since for example normal use of
         --  Wide_Text_IO for output (much more common than input), really
         --  does want to be able to say something like

         --     Put_Line ("Start of output [first run]");

         --  and have it come out as intended, rather than contaminated by
         --  a ["5B"] sequence in place of the left bracket.

         if Val < 256 then
            Out_Char (Character'Val (Val));

         --  Otherwise use brackets notation for values greater than 255

         else
            Out_Char ('[');
            Out_Char ('"');

            --  Four, six or eight hex digits are output, depending on the
            --  magnitude of Val, most significant digit first.

            if Val > 16#FFFF# then
               if Val > 16#00FF_FFFF# then
                  Out_Char (Hexc (Val / 16 ** 7));
                  Out_Char (Hexc ((Val / 16 ** 6) mod 16));
               end if;

               Out_Char (Hexc ((Val / 16 ** 5) mod 16));
               Out_Char (Hexc ((Val / 16 ** 4) mod 16));
            end if;

            Out_Char (Hexc ((Val / 16 ** 3) mod 16));
            Out_Char (Hexc ((Val / 16 ** 2) mod 16));
            Out_Char (Hexc ((Val / 16) mod 16));
            Out_Char (Hexc (Val mod 16));

            Out_Char ('"');
            Out_Char (']');
         end if;
   end case;
end UTF_32_To_Char_Sequence;
455 --------------------------------
456 -- Wide_Char_To_Char_Sequence --
457 --------------------------------
procedure Wide_Char_To_Char_Sequence
  (WC : Wide_Character;
   EM : System.WCh_Con.WC_Encoding_Method)
is
   --  Output the encoding of WC under encoding method EM. The work is done
   --  by an instance of UTF_32_To_Char_Sequence that emits its characters
   --  through Out_Char; WC is passed to it as its position number.

   procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char);

begin
   UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM);
end Wide_Char_To_Char_Sequence;
468 end System.WCh_Cnv;