1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- S Y S T E M . W C H _ C N V --
9 -- Copyright (C) 1992-2013, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 pragma Compiler_Unit_Warning
;
34 with Interfaces
; use Interfaces
;
35 with System
.WCh_Con
; use System
.WCh_Con
;
36 with System
.WCh_JIS
; use System
.WCh_JIS
;
38 package body System
.WCh_Cnv
is
40 -----------------------------
41 -- Char_Sequence_To_UTF_32 --
42 -----------------------------
-- Char_Sequence_To_UTF_32 decodes one character sequence and returns the
-- result as a UTF_32_Code. C is the first character of the sequence (it is
-- the value tested in every branch below), further characters are obtained
-- by calling In_Char, and EM selects the encoding method. Constraint_Error
-- is raised by the checks shown below when the input is malformed.
-- NOTE(review): this listing is incomplete. The parameter line for C, the
-- declarations of B1, U and W, the case statement header and several
-- branch headers are not visible here, so the notes added below are
-- limited to what the visible statements show. Confirm against the
-- complete unit.
44 function Char_Sequence_To_UTF_32
46 EM
: System
.WCh_Con
.WC_Encoding_Method
) return UTF_32_Code
53 procedure Get_Hex
(N
: Character);
54 -- If N is a hex character, then set B1 to 16 * B1 + character N.
55 -- Raise Constraint_Error if character N is not a hex character.
57 procedure Get_UTF_Byte
;
58 pragma Inline
(Get_UTF_Byte
);
59 -- Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
60 -- Reads a byte, and raises CE if the first two bits are not 10.
61 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
-- Body of Get_Hex. B2 holds the position value of N as an Unsigned_32.
-- The three range tests accept decimal digits, upper case A to F and
-- lower case a to f, and each one folds the digit value into B1.
67 procedure Get_Hex
(N
: Character) is
68 B2
: constant Unsigned_32
:= Character'Pos (N
);
70 if B2
in Character'Pos ('0') .. Character'Pos ('9') then
71 B1
:= B1
* 16 + B2
- Character'Pos ('0');
72 elsif B2
in Character'Pos ('A') .. Character'Pos ('F') then
73 B1
:= B1
* 16 + B2
- (Character'Pos ('A') - 10);
74 elsif B2
in Character'Pos ('a') .. Character'Pos ('f') then
75 B1
:= B1
* 16 + B2
- (Character'Pos ('a') - 10);
-- NOTE(review): presumably the else branch for a non-hex character; the
-- line introducing it is not visible in this listing.
77 raise Constraint_Error
;
-- Body of Get_UTF_Byte. U receives the code of the next character
-- returned by In_Char.
85 procedure Get_UTF_Byte
is
87 U
:= Unsigned_32
(Character'Pos (In_Char
));
-- Reject the byte unless its top two bits are 10
89 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
90 raise Constraint_Error
;
-- Make room in W for six more bits and merge in the low six bits of U
93 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
96 -- Start of processing for Char_Sequence_To_UTF_32
-- A first character other than ESC is returned as its own code.
-- NOTE(review): presumably the branch for the ESC plus hex digits
-- encoding; its branch header is not visible in this listing.
102 if C
/= ASCII
.ESC
then
103 return Character'Pos (C
);
-- The result is the value held in B1 (B1 is updated by Get_Hex; the
-- calls that fill it are not visible in this listing).
112 return UTF_32_Code
(B1
);
-- A first character above DEL is combined with the next input character:
-- the result is 256 times the code of C plus the code of that character.
-- Otherwise C is returned as its own code.
116 if C
> ASCII
.DEL
then
117 return 256 * Character'Pos (C
) + Character'Pos (In_Char
);
119 return Character'Pos (C
);
-- Shift-JIS: a first character above DEL is converted together with the
-- next input character by Shift_JIS_To_JIS; otherwise C is returned as
-- its own code.
122 when WCEM_Shift_JIS
=>
123 if C
> ASCII
.DEL
then
124 return Wide_Character'Pos (Shift_JIS_To_JIS
(C
, In_Char
));
126 return Character'Pos (C
);
-- Same shape as the previous branch, using EUC_To_JIS for the conversion
130 if C
> ASCII
.DEL
then
131 return Wide_Character'Pos (EUC_To_JIS
(C
, In_Char
));
133 return Character'Pos (C
);
138 -- Note: for details of UTF8 encoding see RFC 3629
-- U is the code of the first byte; the tests below classify it by its
-- leading bits and keep the payload bits of the first byte in W.
-- NOTE(review): the statements between each assignment to W and the
-- following return are not visible in this listing; presumably they are
-- the Get_UTF_Byte calls for the continuation bytes.
140 U
:= Unsigned_32
(Character'Pos (C
));
142 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
144 if (U
and 2#
10000000#
) = 2#
00000000#
then
145 return Character'Pos (C
);
147 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
149 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
150 W
:= U
and 2#
00011111#
;
152 return UTF_32_Code
(W
);
154 -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
156 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
157 W
:= U
and 2#
00001111#
;
160 return UTF_32_Code
(W
);
162 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
164 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
165 W
:= U
and 2#
00000111#
;
171 return UTF_32_Code
(W
);
173 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
176 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
177 W
:= U
and 2#
00000011#
;
183 return UTF_32_Code
(W
);
185 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
186 -- 10xxxxxx 10xxxxxx 10xxxxxx
188 elsif (U
and 2#
11111110#
) = 2#
1111110_0#
then
189 W
:= U
and 2#
00000001#
;
195 return UTF_32_Code
(W
);
-- NOTE(review): presumably reached when the first byte matches none of
-- the patterns above; the line introducing this branch is not visible.
198 raise Constraint_Error
;
201 when WCEM_Brackets
=>
-- NOTE(review): the test guarding this direct return of C is not visible
-- in this listing.
203 return Character'Pos (C
);
-- The next input character must be a double quote
206 if In_Char
/= '"' then
207 raise Constraint_Error
;
-- The accumulated value B1 must not exceed the last UTF_32_Code value
232 if B1
> Unsigned_32
(UTF_32_Code
'Last) then
233 raise Constraint_Error
;
-- A double quote is required at this point
236 if In_Char
/= '"' then
237 raise Constraint_Error
;
-- A right bracket is required at this point
243 if In_Char
/= ']' then
244 raise Constraint_Error
;
247 return UTF_32_Code
(B1
);
250 end Char_Sequence_To_UTF_32
;
252 --------------------------------
253 -- Char_Sequence_To_Wide_Char --
254 --------------------------------
-- Char_Sequence_To_Wide_Char decodes a character sequence to a
-- Wide_Character. It instantiates Char_Sequence_To_UTF_32 with In_Char,
-- decodes (C, EM) into the code U, and returns the Wide_Character whose
-- position is U.
-- NOTE(review): the parameter line for C and the lines around the raise
-- are not visible in this listing.
256 function Char_Sequence_To_Wide_Char
258 EM
: System
.WCh_Con
.WC_Encoding_Method
) return Wide_Character
-- Local instance of the UTF-32 decoder reading through the same In_Char
260 function Char_Sequence_To_UTF
is new Char_Sequence_To_UTF_32
(In_Char
);
262 U
: constant UTF_32_Code
:= Char_Sequence_To_UTF
(C
, EM
);
-- NOTE(review): the condition guarding this raise is not visible here;
-- presumably it rejects a value of U outside the Wide_Character range.
-- Confirm against the complete unit.
266 raise Constraint_Error
;
268 return Wide_Character'Val (U
);
270 end Char_Sequence_To_Wide_Char
;
272 -----------------------------
273 -- UTF_32_To_Char_Sequence --
274 -----------------------------
-- UTF_32_To_Char_Sequence writes the encoding of the code Val, one
-- character at a time, by calling Out_Char. EM selects the encoding
-- method. Constraint_Error is raised if Val is not valid, and by the
-- range checks shown in the individual branches.
-- NOTE(review): this listing is incomplete. The parameter line for Val,
-- the declarations of U, C1 and C2, the aggregate that initializes Hexc,
-- the case statement header and several branch headers are not visible
-- here. Confirm against the complete unit.
276 procedure UTF_32_To_Char_Sequence
278 EM
: System
.WCh_Con
.WC_Encoding_Method
)
-- Hexc maps a value in 0 .. 15 to a character; it is indexed below with
-- successive 4-bit groups of Val (presumably yielding hex digits; the
-- initial value is not visible in this listing).
280 Hexc
: constant array (UTF_32_Code
range 0 .. 15) of Character :=
287 -- Raise CE for invalid UTF_32_Code
289 if not Val
'Valid then
290 raise Constraint_Error
;
293 -- Processing depends on encoding mode
-- NOTE(review): the test guarding this direct output of Val is not
-- visible in this listing.
299 Out_Char
(Character'Val (Val
));
-- Values up to 16#FFFF# are written as ESC followed by four characters
-- taken from Hexc, most significant 4-bit group first.
300 elsif Val
<= 16#FFFF#
then
301 Out_Char
(ASCII
.ESC
);
302 Out_Char
(Hexc
(Val
/ (16**3)));
303 Out_Char
(Hexc
((Val
/ (16**2)) mod 16));
304 Out_Char
(Hexc
((Val
/ 16) mod 16));
305 Out_Char
(Hexc
(Val
mod 16));
-- NOTE(review): presumably the else branch for larger values; the line
-- introducing it is not visible.
307 raise Constraint_Error
;
-- NOTE(review): the test guarding this direct output of Val is not
-- visible in this listing.
312 Out_Char
(Character'Val (Val
));
-- Values not handled by the preceding branch are rejected when they are
-- below 16#8000# or above 16#FFFF#.
313 elsif Val
< 16#
8000#
or else Val
> 16#FFFF#
then
314 raise Constraint_Error
;
-- Remaining values are written as two characters: Val / 256 followed by
-- Val mod 256.
316 Out_Char
(Character'Val (Val
/ 256));
317 Out_Char
(Character'Val (Val
mod 256));
320 when WCEM_Shift_JIS
=>
-- NOTE(review): the test guarding this direct output is not visible
322 Out_Char
(Character'Val (Val
));
-- Values up to 16#FFFF# are converted by JIS_To_Shift_JIS into the pair
-- C1, C2 (the statements that output them are not visible here).
323 elsif Val
<= 16#FFFF#
then
324 JIS_To_Shift_JIS
(Wide_Character'Val (Val
), C1
, C2
);
328 raise Constraint_Error
;
-- NOTE(review): the test guarding this direct output is not visible
333 Out_Char
(Character'Val (Val
));
-- Values up to 16#FFFF# are converted by JIS_To_EUC into the pair C1, C2
-- (the statements that output them are not visible here).
334 elsif Val
<= 16#FFFF#
then
335 JIS_To_EUC
(Wide_Character'Val (Val
), C1
, C2
);
339 raise Constraint_Error
;
344 -- Note: for details of UTF8 encoding see RFC 3629
-- U is Val viewed as an Unsigned_32 so that shifts and masks can be used.
-- NOTE(review): the continuation-byte statements that shift U are cut
-- short in this listing; only their first part is visible.
346 U
:= Unsigned_32
(Val
);
348 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
350 if U
<= 16#
00_007F#
then
351 Out_Char
(Character'Val (U
));
353 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
355 elsif U
<= 16#
00_07FF#
then
356 Out_Char
(Character'Val (2#
11000000#
or Shift_Right
(U
, 6)));
357 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
359 -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
361 elsif U
<= 16#
00_FFFF#
then
362 Out_Char
(Character'Val (2#
11100000#
or Shift_Right
(U
, 12)));
363 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
365 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
367 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
369 elsif U
<= 16#
10_FFFF#
then
370 Out_Char
(Character'Val (2#
11110000#
or Shift_Right
(U
, 18)));
371 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
373 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
375 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
377 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
-- 10xxxxxx 10xxxxxx
380 elsif U
<= 16#
03FF_FFFF#
then
381 Out_Char
(Character'Val (2#
11111000#
or Shift_Right
(U
, 24)));
382 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
384 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
386 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
388 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
390 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
391 -- 10xxxxxx 10xxxxxx 10xxxxxx
393 elsif U
<= 16#
7FFF_FFFF#
then
394 Out_Char
(Character'Val (2#
11111100#
or Shift_Right
(U
, 30)));
395 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 24)
397 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
399 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
401 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
403 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
-- NOTE(review): presumably the else branch for values above
-- 16#7FFF_FFFF#; the line introducing it is not visible.
406 raise Constraint_Error
;
409 when WCEM_Brackets
=>
411 -- Values in the range 0-255 are directly output. Note that there
412 -- is some issue with [ (16#5B#) since this will cause confusion
413 -- if the resulting string is interpreted using brackets encoding.
415 -- One possibility would be to always output [ as ["5B"] but in
416 -- practice this is undesirable, since for example normal use of
417 -- Wide_Text_IO for output (much more common than input), really
418 -- does want to be able to say something like
420 -- Put_Line ("Start of output [first run]");
422 -- and have it come out as intended, rather than contaminated by
423 -- a ["5B"] sequence in place of the left bracket.
426 Out_Char
(Character'Val (Val
));
428 -- Otherwise use brackets notation for values greater than 255
-- Characters from Hexc are emitted most significant 4-bit group first.
-- The groups for 16 ** 7 and 16 ** 6 follow the test against
-- 16#00FF_FFFF#, which itself follows the test against 16#FFFF#.
-- NOTE(review): the lines that close these tests, and the statements
-- that output the surrounding bracket and quote characters, are not
-- visible in this listing.
434 if Val
> 16#FFFF#
then
435 if Val
> 16#
00FF_FFFF#
then
436 Out_Char
(Hexc
(Val
/ 16 ** 7));
437 Out_Char
(Hexc
((Val
/ 16 ** 6) mod 16));
440 Out_Char
(Hexc
((Val
/ 16 ** 5) mod 16));
441 Out_Char
(Hexc
((Val
/ 16 ** 4) mod 16));
444 Out_Char
(Hexc
((Val
/ 16 ** 3) mod 16));
445 Out_Char
(Hexc
((Val
/ 16 ** 2) mod 16));
446 Out_Char
(Hexc
((Val
/ 16) mod 16));
447 Out_Char
(Hexc
(Val
mod 16));
453 end UTF_32_To_Char_Sequence
;
455 --------------------------------
456 -- Wide_Char_To_Char_Sequence --
457 --------------------------------
-- Wide_Char_To_Char_Sequence writes the encoding of the wide character WC
-- under encoding method EM. It instantiates UTF_32_To_Char_Sequence with
-- Out_Char and calls that instance with the position value of WC, so all
-- output and all Constraint_Error checks come from that procedure.
459 procedure Wide_Char_To_Char_Sequence
460 (WC
: Wide_Character;
461 EM
: System
.WCh_Con
.WC_Encoding_Method
)
-- Local instance of the UTF-32 encoder writing through the same Out_Char
463 procedure UTF_To_Char_Sequence
is new UTF_32_To_Char_Sequence
(Out_Char
);
465 UTF_To_Char_Sequence
(Wide_Character'Pos (WC
), EM
);
466 end Wide_Char_To_Char_Sequence
;