1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2013, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces
; use Interfaces
;
37 with System
.WCh_Cnv
; use System
.WCh_Cnv
;
38 with System
.WCh_Con
; use System
.WCh_Con
;
40 package body GNAT
.Decode_String
is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
47 pragma No_Return
(Bad
);
48 -- Raise error for bad encoding
51 pragma No_Return
(Past_End
);
52 -- Raise error for off end of string
60 raise Constraint_Error
with
61 "bad encoding or character out of range";
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
71 Result
: out Wide_Character)
73 Char
: Wide_Wide_Character
;
75 Decode_Wide_Wide_Character
(Input
, Ptr
, Char
);
77 if Wide_Wide_Character
'Pos (Char
) > 16#FFFF#
then
80 Result
:= Wide_Character'Val (Wide_Wide_Character
'Pos (Char
));
82 end Decode_Wide_Character
;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String
(S
: String) return Wide_String is
89 Result
: Wide_String (1 .. S
'Length);
92 Decode_Wide_String
(S
, Result
, Length
);
93 return Result
(1 .. Length
);
94 end Decode_Wide_String
;
96 procedure Decode_Wide_String
98 Result
: out Wide_String;
106 while Ptr
<= S
'Last loop
107 if Length
>= Result
'Last then
111 Length
:= Length
+ 1;
112 Decode_Wide_Character
(S
, Ptr
, Result
(Length
));
114 end Decode_Wide_String
;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
122 Ptr
: in out Natural;
123 Result
: out Wide_Wide_Character
)
127 function In_Char
return Character;
128 pragma Inline
(In_Char
);
129 -- Function to get one input character
135 function In_Char
return Character is
137 if Ptr
<= Input
'Last then
139 return Input
(Ptr
- 1);
145 -- Start of processing for Decode_Wide_Wide_Character
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method
= WCEM_UTF8
then
157 procedure Get_UTF_Byte
;
158 pragma Inline
(Get_UTF_Byte
);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
167 procedure Get_UTF_Byte
is
169 U
:= Unsigned_32
(Character'Pos (In_Char
));
171 if (U
and 2#
11000000#
) /= 2#
10_000000#
then
175 W
:= Shift_Left
(W
, 6) or (U
and 2#
00111111#
);
178 -- Start of processing for UTF8 case
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U
:= Unsigned_32
(Character'Pos (C
));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U
and 2#
10000000#
) = 2#
00000000#
then
188 Result
:= Wide_Wide_Character
'Val (Character'Pos (C
));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U
and 2#
11100000#
) = 2#
110_00000#
then
193 W
:= U
and 2#
00011111#
;
196 if W
not in 16#
00_0080#
.. 16#
00_07FF#
then
200 Result
:= Wide_Wide_Character
'Val (W
);
202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
204 elsif (U
and 2#
11110000#
) = 2#
1110_0000#
then
205 W
:= U
and 2#
00001111#
;
209 if W
not in 16#
00_0800#
.. 16#
00_FFFF#
then
213 Result
:= Wide_Wide_Character
'Val (W
);
215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
217 elsif (U
and 2#
11111000#
) = 2#
11110_000#
then
218 W
:= U
and 2#
00000111#
;
224 if W
not in 16#
01_0000#
.. 16#
10_FFFF#
then
228 Result
:= Wide_Wide_Character
'Val (W
);
230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
233 elsif (U
and 2#
11111100#
) = 2#
111110_00#
then
234 W
:= U
and 2#
00000011#
;
240 if W
not in 16#
0020_0000#
.. 16#
03FF_FFFF#
then
244 Result
:= Wide_Wide_Character
'Val (W
);
246 -- All other cases are invalid, note that this includes:
248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249 -- 10xxxxxx 10xxxxxx 10xxxxxx
251 -- since Wide_Wide_Character does not include code values
252 -- greater than 16#03FF_FFFF#.
259 -- All encoding functions other than UTF-8
263 function Char_Sequence_To_UTF
is
264 new Char_Sequence_To_UTF_32
(In_Char
);
267 -- For brackets, must test for specific case of [ not followed by
268 -- quotation, where we must not call Char_Sequence_To_UTF, but
269 -- instead just return the bracket unchanged.
271 if Encoding_Method
= WCEM_Brackets
273 and then (Ptr
> Input
'Last or else Input
(Ptr
) /= '"')
277 -- All other cases including [" with Brackets
281 Wide_Wide_Character
'Val
282 (Char_Sequence_To_UTF
(C
, Encoding_Method
));
286 end Decode_Wide_Wide_Character
;
288 -----------------------------
289 -- Decode_Wide_Wide_String --
290 -----------------------------
292 function Decode_Wide_Wide_String
(S
: String) return Wide_Wide_String
is
293 Result
: Wide_Wide_String
(1 .. S
'Length);
296 Decode_Wide_Wide_String
(S
, Result
, Length
);
297 return Result
(1 .. Length
);
298 end Decode_Wide_Wide_String
;
300 procedure Decode_Wide_Wide_String
302 Result
: out Wide_Wide_String
;
303 Length
: out Natural)
310 while Ptr
<= S
'Last loop
311 if Length
>= Result
'Last then
315 Length
:= Length
+ 1;
316 Decode_Wide_Wide_Character
(S
, Ptr
, Result
(Length
));
318 end Decode_Wide_Wide_String
;
320 -------------------------
321 -- Next_Wide_Character --
322 -------------------------
324 procedure Next_Wide_Character
(Input
: String; Ptr
: in out Natural) is
325 Discard
: Wide_Character;
326 pragma Unreferenced
(Discard
);
328 Decode_Wide_Character
(Input
, Ptr
, Discard
);
329 end Next_Wide_Character
;
331 ------------------------------
332 -- Next_Wide_Wide_Character --
333 ------------------------------
335 procedure Next_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
336 Discard
: Wide_Wide_Character
;
337 pragma Unreferenced
(Discard
);
339 Decode_Wide_Wide_Character
(Input
, Ptr
, Discard
);
340 end Next_Wide_Wide_Character
;
346 procedure Past_End
is
348 raise Constraint_Error
with "past end of string";
351 -------------------------
352 -- Prev_Wide_Character --
353 -------------------------
355 procedure Prev_Wide_Character
(Input
: String; Ptr
: in out Natural) is
357 if Ptr
> Input
'Last + 1 then
361 -- Special efficient encoding for UTF-8 case
363 if Encoding_Method
= WCEM_UTF8
then
368 pragma Inline
(Getc
);
369 -- Gets the character at Input (Ptr - 1) and returns code in U as
370 -- Unsigned_32 value. On return Ptr is decremented by one.
372 procedure Skip_UTF_Byte
;
373 pragma Inline
(Skip_UTF_Byte
);
374 -- Checks that U is 2#10xxxxxx# and then calls Get
382 if Ptr
<= Input
'First then
386 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
394 procedure Skip_UTF_Byte
is
396 if (U
and 2#
11000000#
) = 2#
10_000000#
then
403 -- Start of processing for UTF-8 case
406 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
410 if (U
and 2#
10000000#
) = 2#
00000000#
then
413 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
418 if (U
and 2#
11100000#
) = 2#
110_00000#
then
421 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
426 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
429 -- Any other code is invalid, note that this includes:
431 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
434 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
438 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
442 -- since Wide_Character does not allow codes > 16#FFFF#
451 -- Special efficient encoding for brackets case
453 elsif Encoding_Method
= WCEM_Brackets
then
459 -- See if we have "] at end positions
461 if Ptr
> Input
'First + 1
462 and then Input
(Ptr
- 1) = ']'
463 and then Input
(Ptr
- 2) = '"'
467 -- Loop back looking for [" at start
469 while P
>= Ptr
- 10 loop
470 if P
<= Input
'First + 1 then
473 elsif Input
(P
- 1) = '"'
474 and then Input
(P
- 2) = '['
476 -- Found ["..."], scan forward to check it
480 Next_Wide_Character
(Input
, P
);
482 -- OK if at original pointer, else error
495 -- Falling through loop means more than 8 chars between the
496 -- enclosing brackets (or simply a missing left bracket)
500 -- Here if no bracket sequence present
503 if Ptr
= Input
'First then
511 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
512 -- go to the start of the string and skip forwards till Ptr matches.
515 Non_UTF_Brackets
: declare
516 Discard
: Wide_Character;
529 Decode_Wide_Character
(Input
, PtrS
, Discard
);
535 elsif PtrS
> Ptr
then
541 when Constraint_Error
=>
543 end Non_UTF_Brackets
;
545 end Prev_Wide_Character
;
547 ------------------------------
548 -- Prev_Wide_Wide_Character --
549 ------------------------------
551 procedure Prev_Wide_Wide_Character
(Input
: String; Ptr
: in out Natural) is
553 if Ptr
> Input
'Last + 1 then
557 -- Special efficient encoding for UTF-8 case
559 if Encoding_Method
= WCEM_UTF8
then
564 pragma Inline
(Getc
);
565 -- Gets the character at Input (Ptr - 1) and returns code in U as
566 -- Unsigned_32 value. On return Ptr is decremented by one.
568 procedure Skip_UTF_Byte
;
569 pragma Inline
(Skip_UTF_Byte
);
570 -- Checks that U is 2#10xxxxxx# and then calls Get
578 if Ptr
<= Input
'First then
582 U
:= Unsigned_32
(Character'Pos (Input
(Ptr
)));
590 procedure Skip_UTF_Byte
is
592 if (U
and 2#
11000000#
) = 2#
10_000000#
then
599 -- Start of processing for UTF-8 case
602 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
606 if (U
and 2#
10000000#
) = 2#
00000000#
then
609 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
614 if (U
and 2#
11100000#
) = 2#
110_00000#
then
617 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
622 if (U
and 2#
11110000#
) = 2#
1110_0000#
then
625 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
631 if (U
and 2#
11111000#
) = 2#
11110_000#
then
634 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
641 if (U
and 2#
11111100#
) = 2#
111110_00#
then
644 -- Any other code is invalid, note that this includes:
646 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
650 -- since Wide_Wide_Character does not allow codes
651 -- greater than 16#03FF_FFFF#
662 -- Special efficient encoding for brackets case
664 elsif Encoding_Method
= WCEM_Brackets
then
670 -- See if we have "] at end positions
672 if Ptr
> Input
'First + 1
673 and then Input
(Ptr
- 1) = ']'
674 and then Input
(Ptr
- 2) = '"'
678 -- Loop back looking for [" at start
680 while P
>= Ptr
- 10 loop
681 if P
<= Input
'First + 1 then
684 elsif Input
(P
- 1) = '"'
685 and then Input
(P
- 2) = '['
687 -- Found ["..."], scan forward to check it
691 Next_Wide_Wide_Character
(Input
, P
);
693 -- OK if at original pointer, else error
706 -- Falling through loop means more than 8 chars between the
707 -- enclosing brackets (or simply a missing left bracket)
711 -- Here if no bracket sequence present
714 if Ptr
= Input
'First then
722 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
723 -- go to the start of the string and skip forwards till Ptr matches.
726 Non_UTF8_Brackets
: declare
727 Discard
: Wide_Wide_Character
;
740 Decode_Wide_Wide_Character
(Input
, PtrS
, Discard
);
746 elsif PtrS
> Ptr
then
752 when Constraint_Error
=>
754 end Non_UTF8_Brackets
;
756 end Prev_Wide_Wide_Character
;
758 --------------------------
759 -- Validate_Wide_String --
760 --------------------------
762 function Validate_Wide_String
(S
: String) return Boolean is
767 while Ptr
<= S
'Last loop
768 Next_Wide_Character
(S
, Ptr
);
774 when Constraint_Error
=>
776 end Validate_Wide_String
;
778 -------------------------------
779 -- Validate_Wide_Wide_String --
780 -------------------------------
782 function Validate_Wide_Wide_String
(S
: String) return Boolean is
787 while Ptr
<= S
'Last loop
788 Next_Wide_Wide_Character
(S
, Ptr
);
794 when Constraint_Error
=>
796 end Validate_Wide_Wide_String
;
798 end GNAT
.Decode_String
;