1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- S Y S T E M . W C H _ C N V --
9 -- Copyright (C) 1992-2016, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 pragma Compiler_Unit_Warning
;
34 with Interfaces
; use Interfaces
;
35 with System
.WCh_Con
; use System
.WCh_Con
;
36 with System
.WCh_JIS
; use System
.WCh_JIS
;
38 package body System
.WCh_Cnv
is
40 -----------------------------
41 -- Char_Sequence_To_UTF_32 --
42 -----------------------------
44 function Char_Sequence_To_UTF_32
46 EM
: System
.WCh_Con
.WC_Encoding_Method
) return UTF_32_Code
53 procedure Get_Hex
(N
: Character);
54 -- If N is a hex character, then set B1 to 16 * B1 + character N.
55 -- Raise Constraint_Error if character N is not a hex character.
57 procedure Get_UTF_Byte
;
58 pragma Inline
(Get_UTF_Byte
);
59 -- Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
60 -- Reads a byte, and raises CE if the first two bits are not 10.
61 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
67 procedure Get_Hex
(N
: Character) is
68 B2
: constant Unsigned_32
:= Character'Pos (N
);
70 if B2
in Character'Pos ('0') .. Character'Pos ('9') then
71 B1
:= B1
* 16 + B2
- Character'Pos ('0');
72 elsif B2
in Character'Pos ('A') .. Character'Pos ('F') then
73 B1
:= B1
* 16 + B2
- (Character'Pos ('A') - 10);
74 elsif B2
in Character'Pos ('a') .. Character'Pos ('f') then
75 B1
:= B1
* 16 + B2
- (Character'Pos ('a') - 10);
77 raise Constraint_Error
;
85 procedure Get_UTF_Byte
is
87 U
:= Unsigned_32
(Character'Pos (In_Char
));
89 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
90 raise Constraint_Error
;
93 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
96 -- Start of processing for Char_Sequence_To_UTF_32
101 if C
/= ASCII
.ESC
then
102 return Character'Pos (C
);
111 return UTF_32_Code
(B1
);
115 if C
> ASCII
.DEL
then
116 return 256 * Character'Pos (C
) + Character'Pos (In_Char
);
118 return Character'Pos (C
);
121 when WCEM_Shift_JIS
=>
122 if C
> ASCII
.DEL
then
123 return Wide_Character'Pos (Shift_JIS_To_JIS
(C
, In_Char
));
125 return Character'Pos (C
);
129 if C
> ASCII
.DEL
then
130 return Wide_Character'Pos (EUC_To_JIS
(C
, In_Char
));
132 return Character'Pos (C
);
137 -- Note: for details of UTF8 encoding see RFC 3629
139 U
:= Unsigned_32
(Character'Pos (C
));
141 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
143 if (U
and 2#
10000000#
) = 2#
00000000#
then
144 return Character'Pos (C
);
146 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
148 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
149 W
:= U
and 2#
00011111#
;
151 return UTF_32_Code
(W
);
153 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
155 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
156 W
:= U
and 2#
00001111#
;
159 return UTF_32_Code
(W
);
161 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
163 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
164 W
:= U
and 2#
00000111#
;
170 return UTF_32_Code
(W
);
172 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
175 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
176 W
:= U
and 2#
00000011#
;
182 return UTF_32_Code
(W
);
184 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
185 -- 10xxxxxx 10xxxxxx 10xxxxxx
187 elsif (U
and 2#
11111110#
) = 2#
1111110_0#
then
188 W
:= U
and 2#
00000001#
;
194 return UTF_32_Code
(W
);
197 raise Constraint_Error
;
200 when WCEM_Brackets
=>
202 return Character'Pos (C
);
205 if In_Char
/= '"' then
206 raise Constraint_Error
;
231 if B1
> Unsigned_32
(UTF_32_Code
'Last) then
232 raise Constraint_Error
;
235 if In_Char
/= '"' then
236 raise Constraint_Error
;
242 if In_Char
/= ']' then
243 raise Constraint_Error
;
246 return UTF_32_Code
(B1
);
248 end Char_Sequence_To_UTF_32
;
250 --------------------------------
251 -- Char_Sequence_To_Wide_Char --
252 --------------------------------
254 function Char_Sequence_To_Wide_Char
256 EM
: System
.WCh_Con
.WC_Encoding_Method
) return Wide_Character
258 function Char_Sequence_To_UTF
is new Char_Sequence_To_UTF_32
(In_Char
);
260 U
: constant UTF_32_Code
:= Char_Sequence_To_UTF
(C
, EM
);
264 raise Constraint_Error
;
266 return Wide_Character'Val (U
);
268 end Char_Sequence_To_Wide_Char
;
270 -----------------------------
271 -- UTF_32_To_Char_Sequence --
272 -----------------------------
274 procedure UTF_32_To_Char_Sequence
276 EM
: System
.WCh_Con
.WC_Encoding_Method
)
278 Hexc
: constant array (UTF_32_Code
range 0 .. 15) of Character :=
285 -- Raise CE for invalid UTF_32_Code
287 if not Val
'Valid then
288 raise Constraint_Error
;
291 -- Processing depends on encoding mode
296 Out_Char
(Character'Val (Val
));
297 elsif Val
<= 16#FFFF#
then
298 Out_Char
(ASCII
.ESC
);
299 Out_Char
(Hexc
(Val
/ (16**3)));
300 Out_Char
(Hexc
((Val
/ (16**2)) mod 16));
301 Out_Char
(Hexc
((Val
/ 16) mod 16));
302 Out_Char
(Hexc
(Val
mod 16));
304 raise Constraint_Error
;
309 Out_Char
(Character'Val (Val
));
310 elsif Val
< 16#
8000#
or else Val
> 16#FFFF#
then
311 raise Constraint_Error
;
313 Out_Char
(Character'Val (Val
/ 256));
314 Out_Char
(Character'Val (Val
mod 256));
317 when WCEM_Shift_JIS
=>
319 Out_Char
(Character'Val (Val
));
320 elsif Val
<= 16#FFFF#
then
321 JIS_To_Shift_JIS
(Wide_Character'Val (Val
), C1
, C2
);
325 raise Constraint_Error
;
330 Out_Char
(Character'Val (Val
));
331 elsif Val
<= 16#FFFF#
then
332 JIS_To_EUC
(Wide_Character'Val (Val
), C1
, C2
);
336 raise Constraint_Error
;
341 -- Note: for details of UTF8 encoding see RFC 3629
343 U
:= Unsigned_32
(Val
);
345 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
347 if U
<= 16#
00_007F#
then
348 Out_Char
(Character'Val (U
));
350 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
352 elsif U
<= 16#
00_07FF#
then
353 Out_Char
(Character'Val (2#
11000000#
or Shift_Right
(U
, 6)));
354 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
356 -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
358 elsif U
<= 16#
00_FFFF#
then
359 Out_Char
(Character'Val (2#
11100000#
or Shift_Right
(U
, 12)));
360 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
362 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
364 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
366 elsif U
<= 16#
10_FFFF#
then
367 Out_Char
(Character'Val (2#
11110000#
or Shift_Right
(U
, 18)));
368 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
370 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
372 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
374 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
377 elsif U
<= 16#
03FF_FFFF#
then
378 Out_Char
(Character'Val (2#
11111000#
or Shift_Right
(U
, 24)));
379 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
381 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
383 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
385 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
387 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
388 -- 10xxxxxx 10xxxxxx 10xxxxxx
390 elsif U
<= 16#
7FFF_FFFF#
then
391 Out_Char
(Character'Val (2#
11111100#
or Shift_Right
(U
, 30)));
392 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 24)
394 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
396 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
398 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
400 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
403 raise Constraint_Error
;
406 when WCEM_Brackets
=>
408 -- Values in the range 0-255 are directly output. Note that there
409 -- is an issue with [ (16#5B#) since this will cause confusion
410 -- if the resulting string is interpreted using brackets encoding.
412 -- One possibility would be to always output [ as ["5B"] but in
413 -- practice this is undesirable, since for example normal use of
414 -- Wide_Text_IO for output (much more common than input), really
415 -- does want to be able to say something like
417 -- Put_Line ("Start of output [first run]");
419 -- and have it come out as intended, rather than contaminated by
420 -- a ["5B"] sequence in place of the left bracket.
423 Out_Char
(Character'Val (Val
));
425 -- Otherwise use brackets notation for vales greater than 255
431 if Val
> 16#FFFF#
then
432 if Val
> 16#
00FF_FFFF#
then
433 Out_Char
(Hexc
(Val
/ 16 ** 7));
434 Out_Char
(Hexc
((Val
/ 16 ** 6) mod 16));
437 Out_Char
(Hexc
((Val
/ 16 ** 5) mod 16));
438 Out_Char
(Hexc
((Val
/ 16 ** 4) mod 16));
441 Out_Char
(Hexc
((Val
/ 16 ** 3) mod 16));
442 Out_Char
(Hexc
((Val
/ 16 ** 2) mod 16));
443 Out_Char
(Hexc
((Val
/ 16) mod 16));
444 Out_Char
(Hexc
(Val
mod 16));
450 end UTF_32_To_Char_Sequence
;
452 --------------------------------
453 -- Wide_Char_To_Char_Sequence --
454 --------------------------------
456 procedure Wide_Char_To_Char_Sequence
457 (WC
: Wide_Character;
458 EM
: System
.WCh_Con
.WC_Encoding_Method
)
460 procedure UTF_To_Char_Sequence
is new UTF_32_To_Char_Sequence
(Out_Char
);
462 UTF_To_Char_Sequence
(Wide_Character'Pos (WC
), EM
);
463 end Wide_Char_To_Char_Sequence
;