1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2010, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces
; use Interfaces
;
37 with System
.WCh_Cnv
; use System
.WCh_Cnv
;
38 with System
.WCh_Con
; use System
.WCh_Con
;
40 package body GNAT
.Decode_String
is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
47 pragma No_Return
(Bad
);
48 -- Raise error for bad encoding
51 pragma No_Return
(Past_End
);
52 -- Raise error for off end of string
60 raise Constraint_Error
with
61 "bad encoding or character out of range";
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
71 Result
: out Wide_Character)
73 Char
: Wide_Wide_Character
;
75 Decode_Wide_Wide_Character
(Input
, Ptr
, Char
);
77 if Wide_Wide_Character
'Pos (Char
) > 16#FFFF#
then
80 Result
:= Wide_Character'Val (Wide_Wide_Character
'Pos (Char
));
82 end Decode_Wide_Character
;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String
(S
: String) return Wide_String is
89 Result
: Wide_String (1 .. S
'Length);
92 Decode_Wide_String
(S
, Result
, Length
);
93 return Result
(1 .. Length
);
94 end Decode_Wide_String
;
96 procedure Decode_Wide_String
98 Result
: out Wide_String;
106 while Ptr
<= S
'Last loop
107 if Length
>= Result
'Last then
111 Length
:= Length
+ 1;
112 Decode_Wide_Character
(S
, Ptr
, Result
(Length
));
114 end Decode_Wide_String
;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
122 Ptr
: in out Natural;
123 Result
: out Wide_Wide_Character
)
127 function In_Char
return Character;
128 pragma Inline
(In_Char
);
129 -- Function to get one input character
135 function In_Char
return Character is
137 if Ptr
<= Input
'Last then
139 return Input
(Ptr
- 1);
145 -- Start of processing for Decode_Wide_Wide_Character
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method
= WCEM_UTF8
then
157 procedure Get_UTF_Byte
;
158 pragma Inline
(Get_UTF_Byte
);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
167 procedure Get_UTF_Byte
is
169 U
:= Unsigned_32
(Character'Pos (In_Char
));
171 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
175 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
178 -- Start of processing for UTF8 case
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U
:= Unsigned_32
(Character'Pos (C
));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U
and 2#
10000000#
) = 2#
00000000#
then
188 Result
:= Wide_Wide_Character
'Val (Character'Pos (C
));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
193 W
:= U
and 2#
00011111#
;
195 Result
:= Wide_Wide_Character
'Val (W
);
197 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
199 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
200 W
:= U
and 2#
00001111#
;
203 Result
:= Wide_Wide_Character
'Val (W
);
205 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
207 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
208 W
:= U
and 2#
00000111#
;
214 Result
:= Wide_Wide_Character
'Val (W
);
216 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
219 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
220 W
:= U
and 2#
00000011#
;
226 Result
:= Wide_Wide_Character
'Val (W
);
228 -- All other cases are invalid, note that this includes:
230 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
231 -- 10xxxxxx 10xxxxxx 10xxxxxx
233 -- since Wide_Wide_Character does not include code values
234 -- greater than 16#03FF_FFFF#.
241 -- All encoding functions other than UTF-8
245 function Char_Sequence_To_UTF
is
246 new Char_Sequence_To_UTF_32
(In_Char
);
249 -- For brackets, must test for specific case of [ not followed by
250 -- quotation, where we must not call Char_Sequence_To_UTF, but
251 -- instead just return the bracket unchanged.
253 if Encoding_Method
= WCEM_Brackets
255 and then (Ptr
> Input
'Last or else Input
(Ptr
) /= '"')
259 -- All other cases including [" with Brackets
263 Wide_Wide_Character
'Val
264 (Char_Sequence_To_UTF
(C
, Encoding_Method
));
268 end Decode_Wide_Wide_Character
;
270 -----------------------------
271 -- Decode_Wide_Wide_String --
272 -----------------------------
274 function Decode_Wide_Wide_String
(S
: String) return Wide_Wide_String
is
275 Result
: Wide_Wide_String
(1 .. S
'Length);
278 Decode_Wide_Wide_String
(S
, Result
, Length
);
279 return Result
(1 .. Length
);
280 end Decode_Wide_Wide_String
;
282 procedure Decode_Wide_Wide_String
284 Result
: out Wide_Wide_String
;
285 Length
: out Natural)
292 while Ptr
<= S
'Last loop
293 if Length
>= Result
'Last then
297 Length
:= Length
+ 1;
298 Decode_Wide_Wide_Character
(S
, Ptr
, Result
(Length
));
300 end Decode_Wide_Wide_String
;
302 -------------------------
303 -- Next_Wide_Character --
304 -------------------------
306 procedure Next_Wide_Character
(Input
: String; Ptr
: in out Natural) is
308 if Ptr
< Input
'First then
312 -- Special efficient encoding for UTF-8 case
314 if Encoding_Method
= WCEM_UTF8
then
319 pragma Inline
(Getc
);
320 -- Gets the character at Input (Ptr) and returns code in U as
321 -- Unsigned_32 value. On return Ptr is bumped past the character.
323 procedure Skip_UTF_Byte
;
324 pragma Inline
(Skip_UTF_Byte
);
325 -- Skips past one encoded byte which must be 2#10xxxxxx#
333 if Ptr
> Input
'Last then
336 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
345 procedure Skip_UTF_Byte
is
349 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
354 -- Start of processing for UTF-8 case
357 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
361 if (U
and 2#
10000000#
) = 2#
00000000#
then
364 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
366 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
369 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
371 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
375 -- Any other code is invalid, note that this includes:
377 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
379 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
382 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
383 -- 10xxxxxx 10xxxxxx 10xxxxxx
385 -- since Wide_Character does not allow codes > 16#FFFF#
396 Discard
: Wide_Character;
398 Decode_Wide_Character
(Input
, Ptr
, Discard
);
401 end Next_Wide_Character
;
403 ------------------------------
404 -- Next_Wide_Wide_Character --
405 ------------------------------
407 procedure Next_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
409 -- Special efficient encoding for UTF-8 case
411 if Encoding_Method
= WCEM_UTF8
then
416 pragma Inline
(Getc
);
417 -- Gets the character at Input (Ptr) and returns code in U as
418 -- Unsigned_32 value. On return Ptr is bumped past the character.
420 procedure Skip_UTF_Byte
;
421 pragma Inline
(Skip_UTF_Byte
);
422 -- Skips past one encoded byte which must be 2#10xxxxxx#
430 if Ptr
> Input
'Last then
433 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
442 procedure Skip_UTF_Byte
is
446 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
451 -- Start of processing for UTF-8 case
454 if Ptr
< Input
'First then
458 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
462 if (U
and 2#
10000000#
) = 2#
00000000#
then
465 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
467 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
470 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
472 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
476 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
478 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
483 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
486 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
491 -- Any other code is invalid, note that this includes:
493 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
494 -- 10xxxxxx 10xxxxxx 10xxxxxx
496 -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
507 Discard
: Wide_Wide_Character
;
509 Decode_Wide_Wide_Character
(Input
, Ptr
, Discard
);
512 end Next_Wide_Wide_Character
;
518 procedure Past_End
is
520 raise Constraint_Error
with "past end of string";
523 -------------------------
524 -- Prev_Wide_Character --
525 -------------------------
527 procedure Prev_Wide_Character
(Input
: String; Ptr
: in out Natural) is
529 if Ptr
> Input
'Last + 1 then
533 -- Special efficient encoding for UTF-8 case
535 if Encoding_Method
= WCEM_UTF8
then
540 pragma Inline
(Getc
);
541 -- Gets the character at Input (Ptr - 1) and returns code in U as
542 -- Unsigned_32 value. On return Ptr is decremented by one.
544 procedure Skip_UTF_Byte
;
545 pragma Inline
(Skip_UTF_Byte
);
546 -- Checks that U is 2#10xxxxxx# and then calls Get
554 if Ptr
<= Input
'First then
558 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
566 procedure Skip_UTF_Byte
is
568 if (U
and 2#
11000000#
) = 2#
10_000000#
then
575 -- Start of processing for UTF-8 case
578 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
582 if (U
and 2#
10000000#
) = 2#
00000000#
then
585 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
590 if (U
and 2#
11100000#
) = 2#
110_00000#
then
593 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
598 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
601 -- Any other code is invalid, note that this includes:
603 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
606 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
610 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
614 -- since Wide_Character does not allow codes > 16#FFFF#
623 -- Special efficient encoding for brackets case
625 elsif Encoding_Method
= WCEM_Brackets
then
631 -- See if we have "] at end positions
633 if Ptr
> Input
'First + 1
634 and then Input
(Ptr
- 1) = ']'
635 and then Input
(Ptr
- 2) = '"'
639 -- Loop back looking for [" at start
641 while P
>= Ptr
- 10 loop
642 if P
<= Input
'First + 1 then
645 elsif Input
(P
- 1) = '"'
646 and then Input
(P
- 2) = '['
648 -- Found ["..."], scan forward to check it
652 Next_Wide_Character
(Input
, P
);
654 -- OK if at original pointer, else error
667 -- Falling through loop means more than 8 chars between the
668 -- enclosing brackets (or simply a missing left bracket)
672 -- Here if no bracket sequence present
675 if Ptr
= Input
'First then
683 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
684 -- go to the start of the string and skip forwards till Ptr matches.
687 Non_UTF_Brackets
: declare
688 Discard
: Wide_Character;
701 Decode_Wide_Character
(Input
, PtrS
, Discard
);
707 elsif PtrS
> Ptr
then
713 when Constraint_Error
=>
715 end Non_UTF_Brackets
;
717 end Prev_Wide_Character
;
719 ------------------------------
720 -- Prev_Wide_Wide_Character --
721 ------------------------------
723 procedure Prev_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
725 if Ptr
> Input
'Last + 1 then
729 -- Special efficient encoding for UTF-8 case
731 if Encoding_Method
= WCEM_UTF8
then
736 pragma Inline
(Getc
);
737 -- Gets the character at Input (Ptr - 1) and returns code in U as
738 -- Unsigned_32 value. On return Ptr is decremented by one.
740 procedure Skip_UTF_Byte
;
741 pragma Inline
(Skip_UTF_Byte
);
742 -- Checks that U is 2#10xxxxxx# and then calls Get
750 if Ptr
<= Input
'First then
754 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
762 procedure Skip_UTF_Byte
is
764 if (U
and 2#
11000000#
) = 2#
10_000000#
then
771 -- Start of processing for UTF-8 case
774 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
778 if (U
and 2#
10000000#
) = 2#
00000000#
then
781 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
786 if (U
and 2#
11100000#
) = 2#
110_00000#
then
789 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
794 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
797 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
803 if (U
and 2#
11111000#
) = 2#
11110_000#
then
806 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
813 if (U
and 2#
11111100#
) = 2#
111110_00#
then
816 -- Any other code is invalid, note that this includes:
818 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
822 -- since Wide_Wide_Character does not allow codes
823 -- greater than 16#03FF_FFFF#
834 -- Special efficient encoding for brackets case
836 elsif Encoding_Method
= WCEM_Brackets
then
842 -- See if we have "] at end positions
844 if Ptr
> Input
'First + 1
845 and then Input
(Ptr
- 1) = ']'
846 and then Input
(Ptr
- 2) = '"'
850 -- Loop back looking for [" at start
852 while P
>= Ptr
- 10 loop
853 if P
<= Input
'First + 1 then
856 elsif Input
(P
- 1) = '"'
857 and then Input
(P
- 2) = '['
859 -- Found ["..."], scan forward to check it
863 Next_Wide_Wide_Character
(Input
, P
);
865 -- OK if at original pointer, else error
878 -- Falling through loop means more than 8 chars between the
879 -- enclosing brackets (or simply a missing left bracket)
883 -- Here if no bracket sequence present
886 if Ptr
= Input
'First then
894 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
895 -- go to the start of the string and skip forwards till Ptr matches.
898 Non_UTF8_Brackets
: declare
899 Discard
: Wide_Wide_Character
;
912 Decode_Wide_Wide_Character
(Input
, PtrS
, Discard
);
918 elsif PtrS
> Ptr
then
924 when Constraint_Error
=>
926 end Non_UTF8_Brackets
;
928 end Prev_Wide_Wide_Character
;
930 --------------------------
931 -- Validate_Wide_String --
932 --------------------------
934 function Validate_Wide_String
(S
: String) return Boolean is
939 while Ptr
<= S
'Last loop
940 Next_Wide_Character
(S
, Ptr
);
946 when Constraint_Error
=>
948 end Validate_Wide_String
;
950 -------------------------------
951 -- Validate_Wide_Wide_String --
952 -------------------------------
954 function Validate_Wide_Wide_String
(S
: String) return Boolean is
959 while Ptr
<= S
'Last loop
960 Next_Wide_Wide_Character
(S
, Ptr
);
966 when Constraint_Error
=>
968 end Validate_Wide_Wide_String
;
970 end GNAT
.Decode_String
;