1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2008, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
34 -- This package provides a utility routine for converting from an encoded
35 -- string to a corresponding Wide_String or Wide_Wide_String value.
37 with Interfaces
; use Interfaces
;
39 with System
.WCh_Cnv
; use System
.WCh_Cnv
;
40 with System
.WCh_Con
; use System
.WCh_Con
;
42 package body GNAT
.Decode_String
is
44 -----------------------
45 -- Local Subprograms --
46 -----------------------
49 pragma No_Return
(Bad
);
50 -- Raise error for bad encoding
53 pragma No_Return
(Past_End
);
54 -- Raise error for off end of string
62 raise Constraint_Error
with
63 "bad encoding or character out of range";
66 ---------------------------
67 -- Decode_Wide_Character --
68 ---------------------------
70 procedure Decode_Wide_Character
73 Result
: out Wide_Character)
75 Char
: Wide_Wide_Character
;
77 Decode_Wide_Wide_Character
(Input
, Ptr
, Char
);
79 if Wide_Wide_Character
'Pos (Char
) > 16#FFFF#
then
82 Result
:= Wide_Character'Val (Wide_Wide_Character
'Pos (Char
));
84 end Decode_Wide_Character
;
86 ------------------------
87 -- Decode_Wide_String --
88 ------------------------
90 function Decode_Wide_String
(S
: String) return Wide_String is
91 Result
: Wide_String (1 .. S
'Length);
94 Decode_Wide_String
(S
, Result
, Length
);
95 return Result
(1 .. Length
);
96 end Decode_Wide_String
;
98 procedure Decode_Wide_String
100 Result
: out Wide_String;
101 Length
: out Natural)
108 while Ptr
<= S
'Last loop
109 if Length
>= Result
'Last then
113 Length
:= Length
+ 1;
114 Decode_Wide_Character
(S
, Ptr
, Result
(Length
));
116 end Decode_Wide_String
;
118 --------------------------------
119 -- Decode_Wide_Wide_Character --
120 --------------------------------
122 procedure Decode_Wide_Wide_Character
124 Ptr
: in out Natural;
125 Result
: out Wide_Wide_Character
)
129 function In_Char
return Character;
130 pragma Inline
(In_Char
);
131 -- Function to get one input character
137 function In_Char
return Character is
139 if Ptr
<= Input
'Last then
141 return Input
(Ptr
- 1);
147 -- Start of processing for Decode_Wide_Wide_Character
152 -- Special fast processing for UTF-8 case
154 if Encoding_Method
= WCEM_UTF8
then
159 procedure Get_UTF_Byte
;
160 pragma Inline
(Get_UTF_Byte
);
161 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
162 -- Reads a byte, and raises CE if the first two bits are not 10.
163 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
169 procedure Get_UTF_Byte
is
171 U
:= Unsigned_32
(Character'Pos (In_Char
));
173 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
177 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
180 -- Start of processing for UTF8 case
183 -- Note: for details of UTF8 encoding see RFC 3629
185 U
:= Unsigned_32
(Character'Pos (C
));
187 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
189 if (U
and 2#
10000000#
) = 2#
00000000#
then
190 Result
:= Wide_Wide_Character
'Val (Character'Pos (C
));
192 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
194 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
195 W
:= U
and 2#
00011111#
;
197 Result
:= Wide_Wide_Character
'Val (W
);
199 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
201 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
202 W
:= U
and 2#
00001111#
;
205 Result
:= Wide_Wide_Character
'Val (W
);
207 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
209 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
210 W
:= U
and 2#
00000111#
;
216 Result
:= Wide_Wide_Character
'Val (W
);
218 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
221 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
222 W
:= U
and 2#
00000011#
;
228 Result
:= Wide_Wide_Character
'Val (W
);
230 -- All other cases are invalid, note that this includes:
232 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
233 -- 10xxxxxx 10xxxxxx 10xxxxxx
235 -- since Wide_Wide_Character does not include code values
236 -- greater than 16#03FF_FFFF#.
243 -- All encoding functions other than UTF-8
247 function Char_Sequence_To_UTF
is
248 new Char_Sequence_To_UTF_32
(In_Char
);
251 -- For brackets, must test for specific case of [ not followed by
252 -- quotation, where we must not call Char_Sequence_To_UTF, but
253 -- instead just return the bracket unchanged.
255 if Encoding_Method
= WCEM_Brackets
257 and then (Ptr
> Input
'Last or else Input
(Ptr
) /= '"')
261 -- All other cases including [" with Brackets
265 Wide_Wide_Character
'Val
266 (Char_Sequence_To_UTF
(C
, Encoding_Method
));
270 end Decode_Wide_Wide_Character
;
272 -----------------------------
273 -- Decode_Wide_Wide_String --
274 -----------------------------
276 function Decode_Wide_Wide_String
(S
: String) return Wide_Wide_String
is
277 Result
: Wide_Wide_String
(1 .. S
'Length);
280 Decode_Wide_Wide_String
(S
, Result
, Length
);
281 return Result
(1 .. Length
);
282 end Decode_Wide_Wide_String
;
284 procedure Decode_Wide_Wide_String
286 Result
: out Wide_Wide_String
;
287 Length
: out Natural)
294 while Ptr
<= S
'Last loop
295 if Length
>= Result
'Last then
299 Length
:= Length
+ 1;
300 Decode_Wide_Wide_Character
(S
, Ptr
, Result
(Length
));
302 end Decode_Wide_Wide_String
;
304 -------------------------
305 -- Next_Wide_Character --
306 -------------------------
308 procedure Next_Wide_Character
(Input
: String; Ptr
: in out Natural) is
310 if Ptr
< Input
'First then
314 -- Special efficient encoding for UTF-8 case
316 if Encoding_Method
= WCEM_UTF8
then
321 pragma Inline
(Getc
);
322 -- Gets the character at Input (Ptr) and returns code in U as
323 -- Unsigned_32 value. On return Ptr is bumped past the character.
325 procedure Skip_UTF_Byte
;
326 pragma Inline
(Skip_UTF_Byte
);
327 -- Skips past one encoded byte which must be 2#10xxxxxx#
335 if Ptr
> Input
'Last then
338 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
347 procedure Skip_UTF_Byte
is
351 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
356 -- Start of processing for UTF-8 case
359 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
363 if (U
and 2#
10000000#
) = 2#
00000000#
then
366 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
368 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
371 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
373 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
377 -- Any other code is invalid, note that this includes:
379 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
381 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
384 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
385 -- 10xxxxxx 10xxxxxx 10xxxxxx
387 -- since Wide_Character does not allow codes > 16#FFFF#
398 Discard
: Wide_Character;
400 Decode_Wide_Character
(Input
, Ptr
, Discard
);
403 end Next_Wide_Character
;
405 ------------------------------
406 -- Next_Wide_Wide_Character --
407 ------------------------------
409 procedure Next_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
411 -- Special efficient encoding for UTF-8 case
413 if Encoding_Method
= WCEM_UTF8
then
418 pragma Inline
(Getc
);
419 -- Gets the character at Input (Ptr) and returns code in U as
420 -- Unsigned_32 value. On return Ptr is bumped past the character.
422 procedure Skip_UTF_Byte
;
423 pragma Inline
(Skip_UTF_Byte
);
424 -- Skips past one encoded byte which must be 2#10xxxxxx#
432 if Ptr
> Input
'Last then
435 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
444 procedure Skip_UTF_Byte
is
448 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
453 -- Start of processing for UTF-8 case
456 if Ptr
< Input
'First then
460 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
464 if (U
and 2#
10000000#
) = 2#
00000000#
then
467 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
469 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
472 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
474 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
478 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
480 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
485 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
488 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
493 -- Any other code is invalid, note that this includes:
495 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
496 -- 10xxxxxx 10xxxxxx 10xxxxxx
498 -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
509 Discard
: Wide_Wide_Character
;
511 Decode_Wide_Wide_Character
(Input
, Ptr
, Discard
);
514 end Next_Wide_Wide_Character
;
520 procedure Past_End
is
522 raise Constraint_Error
with "past end of string";
525 -------------------------
526 -- Prev_Wide_Character --
527 -------------------------
529 procedure Prev_Wide_Character
(Input
: String; Ptr
: in out Natural) is
531 if Ptr
> Input
'Last + 1 then
535 -- Special efficient encoding for UTF-8 case
537 if Encoding_Method
= WCEM_UTF8
then
542 pragma Inline
(Getc
);
543 -- Gets the character at Input (Ptr - 1) and returns code in U as
544 -- Unsigned_32 value. On return Ptr is decremented by one.
546 procedure Skip_UTF_Byte
;
547 pragma Inline
(Skip_UTF_Byte
);
548 -- Checks that U is 2#10xxxxxx# and then calls Get
556 if Ptr
<= Input
'First then
560 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
568 procedure Skip_UTF_Byte
is
570 if (U
and 2#
11000000#
) = 2#
10_000000#
then
577 -- Start of processing for UTF-8 case
580 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
584 if (U
and 2#
10000000#
) = 2#
00000000#
then
587 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
592 if (U
and 2#
11100000#
) = 2#
110_00000#
then
595 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
600 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
603 -- Any other code is invalid, note that this includes:
605 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
608 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
612 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
616 -- since Wide_Character does not allow codes > 16#FFFF#
625 -- Special efficient encoding for brackets case
627 elsif Encoding_Method
= WCEM_Brackets
then
633 -- See if we have "] at end positions
635 if Ptr
> Input
'First + 1
636 and then Input
(Ptr
- 1) = ']'
637 and then Input
(Ptr
- 2) = '"'
641 -- Loop back looking for [" at start
643 while P
>= Ptr
- 10 loop
644 if P
<= Input
'First + 1 then
647 elsif Input
(P
- 1) = '"'
648 and then Input
(P
- 2) = '['
650 -- Found ["..."], scan forward to check it
654 Next_Wide_Character
(Input
, P
);
656 -- OK if at original pointer, else error
669 -- Falling through loop means more than 8 chars between the
670 -- enclosing brackets (or simply a missing left bracket)
674 -- Here if no bracket sequence present
677 if Ptr
= Input
'First then
685 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
686 -- go to the start of the string and skip forwards till Ptr matches.
689 Non_UTF_Brackets
: declare
690 Discard
: Wide_Character;
703 Decode_Wide_Character
(Input
, PtrS
, Discard
);
709 elsif PtrS
> Ptr
then
715 when Constraint_Error
=>
717 end Non_UTF_Brackets
;
719 end Prev_Wide_Character
;
721 ------------------------------
722 -- Prev_Wide_Wide_Character --
723 ------------------------------
725 procedure Prev_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
727 if Ptr
> Input
'Last + 1 then
731 -- Special efficient encoding for UTF-8 case
733 if Encoding_Method
= WCEM_UTF8
then
738 pragma Inline
(Getc
);
739 -- Gets the character at Input (Ptr - 1) and returns code in U as
740 -- Unsigned_32 value. On return Ptr is decremented by one.
742 procedure Skip_UTF_Byte
;
743 pragma Inline
(Skip_UTF_Byte
);
744 -- Checks that U is 2#10xxxxxx# and then calls Get
752 if Ptr
<= Input
'First then
756 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
764 procedure Skip_UTF_Byte
is
766 if (U
and 2#
11000000#
) = 2#
10_000000#
then
773 -- Start of processing for UTF-8 case
776 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
780 if (U
and 2#
10000000#
) = 2#
00000000#
then
783 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
788 if (U
and 2#
11100000#
) = 2#
110_00000#
then
791 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
796 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
799 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
805 if (U
and 2#
11111000#
) = 2#
11110_000#
then
808 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
815 if (U
and 2#
11111100#
) = 2#
111110_00#
then
818 -- Any other code is invalid, note that this includes:
820 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
824 -- since Wide_Wide_Character does not allow codes
825 -- greater than 16#03FF_FFFF#
836 -- Special efficient encoding for brackets case
838 elsif Encoding_Method
= WCEM_Brackets
then
844 -- See if we have "] at end positions
846 if Ptr
> Input
'First + 1
847 and then Input
(Ptr
- 1) = ']'
848 and then Input
(Ptr
- 2) = '"'
852 -- Loop back looking for [" at start
854 while P
>= Ptr
- 10 loop
855 if P
<= Input
'First + 1 then
858 elsif Input
(P
- 1) = '"'
859 and then Input
(P
- 2) = '['
861 -- Found ["..."], scan forward to check it
865 Next_Wide_Wide_Character
(Input
, P
);
867 -- OK if at original pointer, else error
880 -- Falling through loop means more than 8 chars between the
881 -- enclosing brackets (or simply a missing left bracket)
885 -- Here if no bracket sequence present
888 if Ptr
= Input
'First then
896 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
897 -- go to the start of the string and skip forwards till Ptr matches.
900 Non_UTF8_Brackets
: declare
901 Discard
: Wide_Wide_Character
;
914 Decode_Wide_Wide_Character
(Input
, PtrS
, Discard
);
920 elsif PtrS
> Ptr
then
926 when Constraint_Error
=>
928 end Non_UTF8_Brackets
;
930 end Prev_Wide_Wide_Character
;
932 --------------------------
933 -- Validate_Wide_String --
934 --------------------------
936 function Validate_Wide_String
(S
: String) return Boolean is
941 while Ptr
<= S
'Last loop
942 Next_Wide_Character
(S
, Ptr
);
948 when Constraint_Error
=>
950 end Validate_Wide_String
;
952 -------------------------------
953 -- Validate_Wide_Wide_String --
954 -------------------------------
956 function Validate_Wide_Wide_String
(S
: String) return Boolean is
961 while Ptr
<= S
'Last loop
962 Next_Wide_Wide_Character
(S
, Ptr
);
968 when Constraint_Error
=>
970 end Validate_Wide_Wide_String
;
972 end GNAT
.Decode_String
;