1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2014, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces
; use Interfaces
;
37 with System
.WCh_Cnv
; use System
.WCh_Cnv
;
38 with System
.WCh_Con
; use System
.WCh_Con
;
40 package body GNAT
.Decode_String
is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
47 pragma No_Return
(Bad
);
48 -- Raise error for bad encoding
51 pragma No_Return
(Past_End
);
52 -- Raise error for off end of string
60 raise Constraint_Error
with
61 "bad encoding or character out of range";
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
71 Result
: out Wide_Character)
73 Char
: Wide_Wide_Character
;
75 Decode_Wide_Wide_Character
(Input
, Ptr
, Char
);
77 if Wide_Wide_Character
'Pos (Char
) > 16#FFFF#
then
80 Result
:= Wide_Character'Val (Wide_Wide_Character
'Pos (Char
));
82 end Decode_Wide_Character
;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String
(S
: String) return Wide_String is
89 Result
: Wide_String (1 .. S
'Length);
92 Decode_Wide_String
(S
, Result
, Length
);
93 return Result
(1 .. Length
);
94 end Decode_Wide_String
;
96 procedure Decode_Wide_String
98 Result
: out Wide_String;
106 while Ptr
<= S
'Last loop
107 if Length
>= Result
'Last then
111 Length
:= Length
+ 1;
112 Decode_Wide_Character
(S
, Ptr
, Result
(Length
));
114 end Decode_Wide_String
;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
122 Ptr
: in out Natural;
123 Result
: out Wide_Wide_Character
)
127 function In_Char
return Character;
128 pragma Inline
(In_Char
);
129 -- Function to get one input character
135 function In_Char
return Character is
137 if Ptr
<= Input
'Last then
139 return Input
(Ptr
- 1);
145 -- Start of processing for Decode_Wide_Wide_Character
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method
= WCEM_UTF8
then
157 procedure Get_UTF_Byte
;
158 pragma Inline
(Get_UTF_Byte
);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
167 procedure Get_UTF_Byte
is
169 U
:= Unsigned_32
(Character'Pos (In_Char
));
171 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
175 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
178 -- Start of processing for UTF8 case
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U
:= Unsigned_32
(Character'Pos (C
));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U
and 2#
10000000#
) = 2#
00000000#
then
188 Result
:= Wide_Wide_Character
'Val (Character'Pos (C
));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
193 W
:= U
and 2#
00011111#
;
196 if W
not in 16#
00_0080#
.. 16#
00_07FF#
then
200 Result
:= Wide_Wide_Character
'Val (W
);
202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
204 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
205 W
:= U
and 2#
00001111#
;
209 if W
not in 16#
00_0800#
.. 16#
00_FFFF#
then
213 Result
:= Wide_Wide_Character
'Val (W
);
215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
217 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
218 W
:= U
and 2#
00000111#
;
224 if W
not in 16#
01_0000#
.. 16#
10_FFFF#
then
228 Result
:= Wide_Wide_Character
'Val (W
);
230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
233 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
234 W
:= U
and 2#
00000011#
;
240 if W
not in 16#
0020_0000#
.. 16#
03FF_FFFF#
then
244 Result
:= Wide_Wide_Character
'Val (W
);
246 -- All other cases are invalid, note that this includes:
248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249 -- 10xxxxxx 10xxxxxx 10xxxxxx
251 -- since Wide_Wide_Character does not include code values
252 -- greater than 16#03FF_FFFF#.
259 -- All encoding functions other than UTF-8
263 function Char_Sequence_To_UTF
is
264 new Char_Sequence_To_UTF_32
(In_Char
);
267 -- For brackets, must test for specific case of [ not followed by
268 -- quotation, where we must not call Char_Sequence_To_UTF, but
269 -- instead just return the bracket unchanged.
271 if Encoding_Method
= WCEM_Brackets
273 and then (Ptr
> Input
'Last or else Input
(Ptr
) /= '"')
277 -- All other cases including [" with Brackets
281 Wide_Wide_Character
'Val
282 (Char_Sequence_To_UTF
(C
, Encoding_Method
));
286 end Decode_Wide_Wide_Character
;
288 -----------------------------
289 -- Decode_Wide_Wide_String --
290 -----------------------------
292 function Decode_Wide_Wide_String
(S
: String) return Wide_Wide_String
is
293 Result
: Wide_Wide_String
(1 .. S
'Length);
296 Decode_Wide_Wide_String
(S
, Result
, Length
);
297 return Result
(1 .. Length
);
298 end Decode_Wide_Wide_String
;
300 procedure Decode_Wide_Wide_String
302 Result
: out Wide_Wide_String
;
303 Length
: out Natural)
310 while Ptr
<= S
'Last loop
311 if Length
>= Result
'Last then
315 Length
:= Length
+ 1;
316 Decode_Wide_Wide_Character
(S
, Ptr
, Result
(Length
));
318 end Decode_Wide_Wide_String
;
320 -------------------------
321 -- Next_Wide_Character --
322 -------------------------
324 procedure Next_Wide_Character
(Input
: String; Ptr
: in out Natural) is
325 Discard
: Wide_Character;
327 Decode_Wide_Character
(Input
, Ptr
, Discard
);
328 end Next_Wide_Character
;
330 ------------------------------
331 -- Next_Wide_Wide_Character --
332 ------------------------------
334 procedure Next_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
335 Discard
: Wide_Wide_Character
;
337 Decode_Wide_Wide_Character
(Input
, Ptr
, Discard
);
338 end Next_Wide_Wide_Character
;
344 procedure Past_End
is
346 raise Constraint_Error
with "past end of string";
349 -------------------------
350 -- Prev_Wide_Character --
351 -------------------------
353 procedure Prev_Wide_Character
(Input
: String; Ptr
: in out Natural) is
355 if Ptr
> Input
'Last + 1 then
359 -- Special efficient encoding for UTF-8 case
361 if Encoding_Method
= WCEM_UTF8
then
366 pragma Inline
(Getc
);
367 -- Gets the character at Input (Ptr - 1) and returns code in U as
368 -- Unsigned_32 value. On return Ptr is decremented by one.
370 procedure Skip_UTF_Byte
;
371 pragma Inline
(Skip_UTF_Byte
);
372 -- Checks that U is 2#10xxxxxx# and then calls Get
380 if Ptr
<= Input
'First then
384 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
392 procedure Skip_UTF_Byte
is
394 if (U
and 2#
11000000#
) = 2#
10_000000#
then
401 -- Start of processing for UTF-8 case
404 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
408 if (U
and 2#
10000000#
) = 2#
00000000#
then
411 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
416 if (U
and 2#
11100000#
) = 2#
110_00000#
then
419 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
424 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
427 -- Any other code is invalid, note that this includes:
429 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
432 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
436 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
440 -- since Wide_Character does not allow codes > 16#FFFF#
449 -- Special efficient encoding for brackets case
451 elsif Encoding_Method
= WCEM_Brackets
then
457 -- See if we have "] at end positions
459 if Ptr
> Input
'First + 1
460 and then Input
(Ptr
- 1) = ']'
461 and then Input
(Ptr
- 2) = '"'
465 -- Loop back looking for [" at start
467 while P
>= Ptr
- 10 loop
468 if P
<= Input
'First + 1 then
471 elsif Input
(P
- 1) = '"'
472 and then Input
(P
- 2) = '['
474 -- Found ["..."], scan forward to check it
478 Next_Wide_Character
(Input
, P
);
480 -- OK if at original pointer, else error
493 -- Falling through loop means more than 8 chars between the
494 -- enclosing brackets (or simply a missing left bracket)
498 -- Here if no bracket sequence present
501 if Ptr
= Input
'First then
509 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
510 -- go to the start of the string and skip forwards till Ptr matches.
513 Non_UTF_Brackets
: declare
514 Discard
: Wide_Character;
527 Decode_Wide_Character
(Input
, PtrS
, Discard
);
533 elsif PtrS
> Ptr
then
539 when Constraint_Error
=>
541 end Non_UTF_Brackets
;
543 end Prev_Wide_Character
;
545 ------------------------------
546 -- Prev_Wide_Wide_Character --
547 ------------------------------
549 procedure Prev_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
551 if Ptr
> Input
'Last + 1 then
555 -- Special efficient encoding for UTF-8 case
557 if Encoding_Method
= WCEM_UTF8
then
562 pragma Inline
(Getc
);
563 -- Gets the character at Input (Ptr - 1) and returns code in U as
564 -- Unsigned_32 value. On return Ptr is decremented by one.
566 procedure Skip_UTF_Byte
;
567 pragma Inline
(Skip_UTF_Byte
);
568 -- Checks that U is 2#10xxxxxx# and then calls Get
576 if Ptr
<= Input
'First then
580 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
588 procedure Skip_UTF_Byte
is
590 if (U
and 2#
11000000#
) = 2#
10_000000#
then
597 -- Start of processing for UTF-8 case
600 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
604 if (U
and 2#
10000000#
) = 2#
00000000#
then
607 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
612 if (U
and 2#
11100000#
) = 2#
110_00000#
then
615 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
620 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
623 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
629 if (U
and 2#
11111000#
) = 2#
11110_000#
then
632 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
639 if (U
and 2#
11111100#
) = 2#
111110_00#
then
642 -- Any other code is invalid, note that this includes:
644 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
648 -- since Wide_Wide_Character does not allow codes
649 -- greater than 16#03FF_FFFF#
660 -- Special efficient encoding for brackets case
662 elsif Encoding_Method
= WCEM_Brackets
then
668 -- See if we have "] at end positions
670 if Ptr
> Input
'First + 1
671 and then Input
(Ptr
- 1) = ']'
672 and then Input
(Ptr
- 2) = '"'
676 -- Loop back looking for [" at start
678 while P
>= Ptr
- 10 loop
679 if P
<= Input
'First + 1 then
682 elsif Input
(P
- 1) = '"'
683 and then Input
(P
- 2) = '['
685 -- Found ["..."], scan forward to check it
689 Next_Wide_Wide_Character
(Input
, P
);
691 -- OK if at original pointer, else error
704 -- Falling through loop means more than 8 chars between the
705 -- enclosing brackets (or simply a missing left bracket)
709 -- Here if no bracket sequence present
712 if Ptr
= Input
'First then
720 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
721 -- go to the start of the string and skip forwards till Ptr matches.
724 Non_UTF8_Brackets
: declare
725 Discard
: Wide_Wide_Character
;
738 Decode_Wide_Wide_Character
(Input
, PtrS
, Discard
);
744 elsif PtrS
> Ptr
then
750 when Constraint_Error
=>
752 end Non_UTF8_Brackets
;
754 end Prev_Wide_Wide_Character
;
756 --------------------------
757 -- Validate_Wide_String --
758 --------------------------
760 function Validate_Wide_String
(S
: String) return Boolean is
765 while Ptr
<= S
'Last loop
766 Next_Wide_Character
(S
, Ptr
);
772 when Constraint_Error
=>
774 end Validate_Wide_String
;
776 -------------------------------
777 -- Validate_Wide_Wide_String --
778 -------------------------------
780 function Validate_Wide_Wide_String
(S
: String) return Boolean is
785 while Ptr
<= S
'Last loop
786 Next_Wide_Wide_Character
(S
, Ptr
);
792 when Constraint_Error
=>
794 end Validate_Wide_Wide_String
;
796 end GNAT
.Decode_String
;