1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- S Y S T E M . W C H _ C N V --
9 -- Copyright (C) 1992-2006, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
34 -- This package contains generic subprograms used for converting between
35 -- sequences of Character and Wide_Character. All access to wide character
36 -- sequences is isolated in this unit.
38 with Interfaces
; use Interfaces
;
39 with System
.WCh_Con
; use System
.WCh_Con
;
40 with System
.WCh_JIS
; use System
.WCh_JIS
;
42 package body System
.WCh_Cnv
is
44 -----------------------------
45 -- Char_Sequence_To_UTF_32 --
46 -----------------------------
48 function Char_Sequence_To_UTF_32
50 EM
: WC_Encoding_Method
) return UTF_32_Code
57 procedure Get_Hex
(N
: Character);
58 -- If N is a hex character, then set B1 to 16 * B1 + the hex digit value of N.
59 -- Raise Constraint_Error if character N is not a hex character.
61 procedure Get_UTF_Byte
;
62 pragma Inline
(Get_UTF_Byte
);
63 -- Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
64 -- Reads a byte, and raises CE if the first two bits are not 10.
65 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
71 procedure Get_Hex
(N
: Character) is
72 B2
: constant Unsigned_32
:= Character'Pos (N
);
74 if B2
in Character'Pos ('0') .. Character'Pos ('9') then
75 B1
:= B1
* 16 + B2
- Character'Pos ('0');
76 elsif B2
in Character'Pos ('A') .. Character'Pos ('F') then
77 B1
:= B1
* 16 + B2
- (Character'Pos ('A') - 10);
78 elsif B2
in Character'Pos ('a') .. Character'Pos ('f') then
79 B1
:= B1
* 16 + B2
- (Character'Pos ('a') - 10);
81 raise Constraint_Error
;
89 procedure Get_UTF_Byte
is
91 U
:= Unsigned_32
(Character'Pos (In_Char
));
93 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
94 raise Constraint_Error
;
97 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
100 -- Start of processing for Char_Sequence_To_UTF_32
106 if C
/= ASCII
.ESC
then
107 return Character'Pos (C
);
116 return UTF_32_Code
(B1
);
120 if C
> ASCII
.DEL
then
121 return 256 * Character'Pos (C
) + Character'Pos (In_Char
);
123 return Character'Pos (C
);
126 when WCEM_Shift_JIS
=>
127 if C
> ASCII
.DEL
then
128 return Wide_Character'Pos (Shift_JIS_To_JIS
(C
, In_Char
));
130 return Character'Pos (C
);
134 if C
> ASCII
.DEL
then
135 return Wide_Character'Pos (EUC_To_JIS
(C
, In_Char
));
137 return Character'Pos (C
);
142 -- Note: for details of UTF8 encoding see RFC 3629
144 U
:= Unsigned_32
(Character'Pos (C
));
146 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
148 if (U
and 2#
10000000#
) = 2#
00000000#
then
149 return Character'Pos (C
);
151 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
153 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
154 W
:= Shift_Left
(U
and 2#
00011111#
, 6);
155 U
:= Unsigned_32
(Character'Pos (In_Char
));
157 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
158 raise Constraint_Error
;
161 W
:= W
or (U
and 2#
00111111#
);
163 return UTF_32_Code
(W
);
165 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
167 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
168 W
:= U
and 2#
00001111#
;
171 return UTF_32_Code
(W
);
173 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
175 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
176 W
:= U
and 2#
00000111#
;
182 return UTF_32_Code
(W
);
184 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
187 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
188 W
:= U
and 2#
00000011#
;
194 return UTF_32_Code
(W
);
196 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
197 -- 10xxxxxx 10xxxxxx 10xxxxxx
199 elsif (U
and 2#
11111110#
) = 2#
1111110_0#
then
200 W
:= U
and 2#
00000001#
;
206 return UTF_32_Code
(W
);
209 raise Constraint_Error
;
212 when WCEM_Brackets
=>
215 return Character'Pos (C
);
218 if In_Char
/= '"' then
219 raise Constraint_Error
;
244 if B1
> Unsigned_32
(UTF_32_Code
'Last) then
245 raise Constraint_Error
;
248 if In_Char
/= '"' then
249 raise Constraint_Error
;
255 if In_Char
/= ']' then
256 raise Constraint_Error
;
259 return UTF_32_Code
(B1
);
262 end Char_Sequence_To_UTF_32
;
264 --------------------------------
265 -- Char_Sequence_To_Wide_Char --
266 --------------------------------
268 function Char_Sequence_To_Wide_Char
270 EM
: System
.WCh_Con
.WC_Encoding_Method
) return Wide_Character
272 function Char_Sequence_To_UTF
is new Char_Sequence_To_UTF_32
(In_Char
);
274 U
: constant UTF_32_Code
:= Char_Sequence_To_UTF
(C
, EM
);
278 raise Constraint_Error
;
280 return Wide_Character'Val (U
);
282 end Char_Sequence_To_Wide_Char
;
284 -----------------------------
285 -- UTF_32_To_Char_Sequence --
286 -----------------------------
288 procedure UTF_32_To_Char_Sequence
290 EM
: System
.WCh_Con
.WC_Encoding_Method
)
292 Hexc
: constant array (UTF_32_Code
range 0 .. 15) of Character :=
303 Out_Char
(Character'Val (Val
));
304 elsif Val
<= 16#FFFF#
then
305 Out_Char
(ASCII
.ESC
);
306 Out_Char
(Hexc
(Val
/ (16**3)));
307 Out_Char
(Hexc
((Val
/ (16**2)) mod 16));
308 Out_Char
(Hexc
((Val
/ 16) mod 16));
309 Out_Char
(Hexc
(Val
mod 16));
311 raise Constraint_Error
;
316 Out_Char
(Character'Val (Val
));
317 elsif Val
< 16#
8000#
or else Val
> 16#FFFF#
then
318 raise Constraint_Error
;
320 Out_Char
(Character'Val (Val
/ 256));
321 Out_Char
(Character'Val (Val
mod 256));
324 when WCEM_Shift_JIS
=>
326 Out_Char
(Character'Val (Val
));
327 elsif Val
<= 16#FFFF#
then
328 JIS_To_Shift_JIS
(Wide_Character'Val (Val
), C1
, C2
);
332 raise Constraint_Error
;
337 Out_Char
(Character'Val (Val
));
338 elsif Val
<= 16#FFFF#
then
339 JIS_To_EUC
(Wide_Character'Val (Val
), C1
, C2
);
343 raise Constraint_Error
;
348 -- Note: for details of UTF8 encoding see RFC 3629
350 U
:= Unsigned_32
(Val
);
352 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
354 if U
<= 16#
00_007F#
then
355 Out_Char
(Character'Val (U
));
357 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
359 elsif U
<= 16#
00_07FF#
then
360 Out_Char
(Character'Val (2#
11000000#
or Shift_Right
(U
, 6)));
361 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
363 -- 16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
365 elsif U
<= 16#
00_FFFF#
then
366 Out_Char
(Character'Val (2#
11100000#
or Shift_Right
(U
, 12)));
367 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
369 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
371 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
373 elsif U
<= 16#
10_FFFF#
then
374 Out_Char
(Character'Val (2#
11110000#
or Shift_Right
(U
, 18)));
375 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
377 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
379 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
381 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
384 elsif U
<= 16#
03FF_FFFF#
then
385 Out_Char
(Character'Val (2#
11111000#
or Shift_Right
(U
, 24)));
386 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
388 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
390 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
392 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
394 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
395 -- 10xxxxxx 10xxxxxx 10xxxxxx
397 elsif U
<= 16#
7FFF_FFFF#
then
398 Out_Char
(Character'Val (2#
11111100#
or Shift_Right
(U
, 30)));
399 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 24)
401 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 18)
403 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 12)
405 Out_Char
(Character'Val (2#
10000000#
or (Shift_Right
(U
, 6)
407 Out_Char
(Character'Val (2#
10000000#
or (U
and 2#
00111111#
)));
410 raise Constraint_Error
;
413 when WCEM_Brackets
=>
415 -- Values in the range 0-255 are directly output. Note that there
416 -- is some issue with [ (16#5B#) since this will cause confusion
417 -- if the resulting string is interpreted using brackets encoding.
419 -- One possibility would be to always output [ as ["5B"] but in
420 -- practice this is undesirable, since for example normal use of
421 -- Wide_Text_IO for output (much more common than input), really
422 -- does want to be able to say something like
424 -- Put_Line ("Start of output [first run]");
426 -- and have it come out as intended, rather than contaminated by
427 -- a ["5B"] sequence in place of the left bracket.
430 Out_Char
(Character'Val (Val
));
432 -- Otherwise use brackets notation for values greater than 255
438 if Val
> 16#FFFF#
then
439 if Val
> 16#
00FF_FFFF#
then
440 if Val
> 16#
7FFF_FFFF#
then
441 raise Constraint_Error
;
444 Out_Char
(Hexc
(Val
/ 16 ** 7));
445 Out_Char
(Hexc
((Val
/ 16 ** 6) mod 16));
448 Out_Char
(Hexc
((Val
/ 16 ** 5) mod 16));
449 Out_Char
(Hexc
((Val
/ 16 ** 4) mod 16));
452 Out_Char
(Hexc
((Val
/ 16 ** 3) mod 16));
453 Out_Char
(Hexc
((Val
/ 16 ** 2) mod 16));
454 Out_Char
(Hexc
((Val
/ 16) mod 16));
455 Out_Char
(Hexc
(Val
mod 16));
461 end UTF_32_To_Char_Sequence
;
463 --------------------------------
464 -- Wide_Char_To_Char_Sequence --
465 --------------------------------
467 procedure Wide_Char_To_Char_Sequence
468 (WC
: Wide_Character;
469 EM
: System
.WCh_Con
.WC_Encoding_Method
)
471 procedure UTF_To_Char_Sequence
is new UTF_32_To_Char_Sequence
(Out_Char
);
473 UTF_To_Char_Sequence
(Wide_Character'Pos (WC
), EM
);
474 end Wide_Char_To_Char_Sequence
;