[gcc]
[official-gcc.git] / gcc / ada / g-decstr.adb
blobab8d06c2b7fa327e676c2ac5d895f4cb2bec0895
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- G N A T . D E C O D E _ S T R I N G --
6 -- --
7 -- S p e c --
8 -- --
9 -- Copyright (C) 2007-2014, AdaCore --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
17 -- --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
21 -- --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
26 -- --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
29 -- --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces; use Interfaces;
37 with System.WCh_Cnv; use System.WCh_Cnv;
38 with System.WCh_Con; use System.WCh_Con;
40 package body GNAT.Decode_String is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
46 procedure Bad;
47 pragma No_Return (Bad);
48 -- Raise error for bad encoding
50 procedure Past_End;
51 pragma No_Return (Past_End);
52 -- Raise error for off end of string
54 ---------
55 -- Bad --
56 ---------
58 procedure Bad is
59 begin
60 raise Constraint_Error with
61 "bad encoding or character out of range";
62 end Bad;
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
69 (Input : String;
70 Ptr : in out Natural;
71 Result : out Wide_Character)
73 Char : Wide_Wide_Character;
74 begin
75 Decode_Wide_Wide_Character (Input, Ptr, Char);
77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
78 Bad;
79 else
80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
81 end if;
82 end Decode_Wide_Character;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String (S : String) return Wide_String is
89 Result : Wide_String (1 .. S'Length);
90 Length : Natural;
91 begin
92 Decode_Wide_String (S, Result, Length);
93 return Result (1 .. Length);
94 end Decode_Wide_String;
96 procedure Decode_Wide_String
97 (S : String;
98 Result : out Wide_String;
99 Length : out Natural)
101 Ptr : Natural;
103 begin
104 Ptr := S'First;
105 Length := 0;
106 while Ptr <= S'Last loop
107 if Length >= Result'Last then
108 Past_End;
109 end if;
111 Length := Length + 1;
112 Decode_Wide_Character (S, Ptr, Result (Length));
113 end loop;
114 end Decode_Wide_String;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
121 (Input : String;
122 Ptr : in out Natural;
123 Result : out Wide_Wide_Character)
125 C : Character;
127 function In_Char return Character;
128 pragma Inline (In_Char);
129 -- Function to get one input character
131 -------------
132 -- In_Char --
133 -------------
135 function In_Char return Character is
136 begin
137 if Ptr <= Input'Last then
138 Ptr := Ptr + 1;
139 return Input (Ptr - 1);
140 else
141 Past_End;
142 end if;
143 end In_Char;
145 -- Start of processing for Decode_Wide_Wide_Character
147 begin
148 C := In_Char;
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method = WCEM_UTF8 then
153 UTF8 : declare
154 U : Unsigned_32;
155 W : Unsigned_32;
157 procedure Get_UTF_Byte;
158 pragma Inline (Get_UTF_Byte);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
163 ------------------
164 -- Get_UTF_Byte --
165 ------------------
167 procedure Get_UTF_Byte is
168 begin
169 U := Unsigned_32 (Character'Pos (In_Char));
171 if (U and 2#11000000#) /= 2#10_000000# then
172 Bad;
173 end if;
175 W := Shift_Left (W, 6) or (U and 2#00111111#);
176 end Get_UTF_Byte;
178 -- Start of processing for UTF8 case
180 begin
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U := Unsigned_32 (Character'Pos (C));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U and 2#10000000#) = 2#00000000# then
188 Result := Wide_Wide_Character'Val (Character'Pos (C));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U and 2#11100000#) = 2#110_00000# then
193 W := U and 2#00011111#;
194 Get_UTF_Byte;
196 if W not in 16#00_0080# .. 16#00_07FF# then
197 Bad;
198 end if;
200 Result := Wide_Wide_Character'Val (W);
202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
204 elsif (U and 2#11110000#) = 2#1110_0000# then
205 W := U and 2#00001111#;
206 Get_UTF_Byte;
207 Get_UTF_Byte;
209 if W not in 16#00_0800# .. 16#00_FFFF# then
210 Bad;
211 end if;
213 Result := Wide_Wide_Character'Val (W);
215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
217 elsif (U and 2#11111000#) = 2#11110_000# then
218 W := U and 2#00000111#;
220 for K in 1 .. 3 loop
221 Get_UTF_Byte;
222 end loop;
224 if W not in 16#01_0000# .. 16#10_FFFF# then
225 Bad;
226 end if;
228 Result := Wide_Wide_Character'Val (W);
230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
231 -- 10xxxxxx 10xxxxxx
233 elsif (U and 2#11111100#) = 2#111110_00# then
234 W := U and 2#00000011#;
236 for K in 1 .. 4 loop
237 Get_UTF_Byte;
238 end loop;
240 if W not in 16#0020_0000# .. 16#03FF_FFFF# then
241 Bad;
242 end if;
244 Result := Wide_Wide_Character'Val (W);
246 -- All other cases are invalid, note that this includes:
248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249 -- 10xxxxxx 10xxxxxx 10xxxxxx
251 -- since Wide_Wide_Character does not include code values
252 -- greater than 16#03FF_FFFF#.
254 else
255 Bad;
256 end if;
257 end UTF8;
259 -- All encoding functions other than UTF-8
261 else
262 Non_UTF8 : declare
263 function Char_Sequence_To_UTF is
264 new Char_Sequence_To_UTF_32 (In_Char);
266 begin
267 -- For brackets, must test for specific case of [ not followed by
268 -- quotation, where we must not call Char_Sequence_To_UTF, but
269 -- instead just return the bracket unchanged.
271 if Encoding_Method = WCEM_Brackets
272 and then C = '['
273 and then (Ptr > Input'Last or else Input (Ptr) /= '"')
274 then
275 Result := '[';
277 -- All other cases including [" with Brackets
279 else
280 Result :=
281 Wide_Wide_Character'Val
282 (Char_Sequence_To_UTF (C, Encoding_Method));
283 end if;
284 end Non_UTF8;
285 end if;
286 end Decode_Wide_Wide_Character;
288 -----------------------------
289 -- Decode_Wide_Wide_String --
290 -----------------------------
292 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
293 Result : Wide_Wide_String (1 .. S'Length);
294 Length : Natural;
295 begin
296 Decode_Wide_Wide_String (S, Result, Length);
297 return Result (1 .. Length);
298 end Decode_Wide_Wide_String;
300 procedure Decode_Wide_Wide_String
301 (S : String;
302 Result : out Wide_Wide_String;
303 Length : out Natural)
305 Ptr : Natural;
307 begin
308 Ptr := S'First;
309 Length := 0;
310 while Ptr <= S'Last loop
311 if Length >= Result'Last then
312 Past_End;
313 end if;
315 Length := Length + 1;
316 Decode_Wide_Wide_Character (S, Ptr, Result (Length));
317 end loop;
318 end Decode_Wide_Wide_String;
320 -------------------------
321 -- Next_Wide_Character --
322 -------------------------
324 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
325 Discard : Wide_Character;
326 begin
327 Decode_Wide_Character (Input, Ptr, Discard);
328 end Next_Wide_Character;
330 ------------------------------
331 -- Next_Wide_Wide_Character --
332 ------------------------------
334 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
335 Discard : Wide_Wide_Character;
336 begin
337 Decode_Wide_Wide_Character (Input, Ptr, Discard);
338 end Next_Wide_Wide_Character;
340 --------------
341 -- Past_End --
342 --------------
344 procedure Past_End is
345 begin
346 raise Constraint_Error with "past end of string";
347 end Past_End;
349 -------------------------
350 -- Prev_Wide_Character --
351 -------------------------
353 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
354 begin
355 if Ptr > Input'Last + 1 then
356 Past_End;
357 end if;
359 -- Special efficient encoding for UTF-8 case
361 if Encoding_Method = WCEM_UTF8 then
362 UTF8 : declare
363 U : Unsigned_32;
365 procedure Getc;
366 pragma Inline (Getc);
367 -- Gets the character at Input (Ptr - 1) and returns code in U as
368 -- Unsigned_32 value. On return Ptr is decremented by one.
370 procedure Skip_UTF_Byte;
371 pragma Inline (Skip_UTF_Byte);
372 -- Checks that U is 2#10xxxxxx# and then calls Get
374 ----------
375 -- Getc --
376 ----------
378 procedure Getc is
379 begin
380 if Ptr <= Input'First then
381 Past_End;
382 else
383 Ptr := Ptr - 1;
384 U := Unsigned_32 (Character'Pos (Input (Ptr)));
385 end if;
386 end Getc;
388 -------------------
389 -- Skip_UTF_Byte --
390 -------------------
392 procedure Skip_UTF_Byte is
393 begin
394 if (U and 2#11000000#) = 2#10_000000# then
395 Getc;
396 else
397 Bad;
398 end if;
399 end Skip_UTF_Byte;
401 -- Start of processing for UTF-8 case
403 begin
404 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
406 Getc;
408 if (U and 2#10000000#) = 2#00000000# then
409 return;
411 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
413 else
414 Skip_UTF_Byte;
416 if (U and 2#11100000#) = 2#110_00000# then
417 return;
419 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
421 else
422 Skip_UTF_Byte;
424 if (U and 2#11110000#) = 2#1110_0000# then
425 return;
427 -- Any other code is invalid, note that this includes:
429 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
430 -- 10xxxxxx
432 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
433 -- 10xxxxxx 10xxxxxx
434 -- 10xxxxxx
436 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
437 -- 10xxxxxx 10xxxxxx
438 -- 10xxxxxx 10xxxxxx
440 -- since Wide_Character does not allow codes > 16#FFFF#
442 else
443 Bad;
444 end if;
445 end if;
446 end if;
447 end UTF8;
449 -- Special efficient encoding for brackets case
451 elsif Encoding_Method = WCEM_Brackets then
452 Brackets : declare
453 P : Natural;
454 S : Natural;
456 begin
457 -- See if we have "] at end positions
459 if Ptr > Input'First + 1
460 and then Input (Ptr - 1) = ']'
461 and then Input (Ptr - 2) = '"'
462 then
463 P := Ptr - 2;
465 -- Loop back looking for [" at start
467 while P >= Ptr - 10 loop
468 if P <= Input'First + 1 then
469 Bad;
471 elsif Input (P - 1) = '"'
472 and then Input (P - 2) = '['
473 then
474 -- Found ["..."], scan forward to check it
476 S := P - 2;
477 P := S;
478 Next_Wide_Character (Input, P);
480 -- OK if at original pointer, else error
482 if P = Ptr then
483 Ptr := S;
484 return;
485 else
486 Bad;
487 end if;
488 end if;
490 P := P - 1;
491 end loop;
493 -- Falling through loop means more than 8 chars between the
494 -- enclosing brackets (or simply a missing left bracket)
496 Bad;
498 -- Here if no bracket sequence present
500 else
501 if Ptr = Input'First then
502 Past_End;
503 else
504 Ptr := Ptr - 1;
505 end if;
506 end if;
507 end Brackets;
509 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
510 -- go to the start of the string and skip forwards till Ptr matches.
512 else
513 Non_UTF_Brackets : declare
514 Discard : Wide_Character;
515 PtrS : Natural;
516 PtrP : Natural;
518 begin
519 PtrS := Input'First;
521 if Ptr <= PtrS then
522 Past_End;
523 end if;
525 loop
526 PtrP := PtrS;
527 Decode_Wide_Character (Input, PtrS, Discard);
529 if PtrS = Ptr then
530 Ptr := PtrP;
531 return;
533 elsif PtrS > Ptr then
534 Bad;
535 end if;
536 end loop;
538 exception
539 when Constraint_Error =>
540 Bad;
541 end Non_UTF_Brackets;
542 end if;
543 end Prev_Wide_Character;
545 ------------------------------
546 -- Prev_Wide_Wide_Character --
547 ------------------------------
549 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
550 begin
551 if Ptr > Input'Last + 1 then
552 Past_End;
553 end if;
555 -- Special efficient encoding for UTF-8 case
557 if Encoding_Method = WCEM_UTF8 then
558 UTF8 : declare
559 U : Unsigned_32;
561 procedure Getc;
562 pragma Inline (Getc);
563 -- Gets the character at Input (Ptr - 1) and returns code in U as
564 -- Unsigned_32 value. On return Ptr is decremented by one.
566 procedure Skip_UTF_Byte;
567 pragma Inline (Skip_UTF_Byte);
568 -- Checks that U is 2#10xxxxxx# and then calls Get
570 ----------
571 -- Getc --
572 ----------
574 procedure Getc is
575 begin
576 if Ptr <= Input'First then
577 Past_End;
578 else
579 Ptr := Ptr - 1;
580 U := Unsigned_32 (Character'Pos (Input (Ptr)));
581 end if;
582 end Getc;
584 -------------------
585 -- Skip_UTF_Byte --
586 -------------------
588 procedure Skip_UTF_Byte is
589 begin
590 if (U and 2#11000000#) = 2#10_000000# then
591 Getc;
592 else
593 Bad;
594 end if;
595 end Skip_UTF_Byte;
597 -- Start of processing for UTF-8 case
599 begin
600 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
602 Getc;
604 if (U and 2#10000000#) = 2#00000000# then
605 return;
607 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
609 else
610 Skip_UTF_Byte;
612 if (U and 2#11100000#) = 2#110_00000# then
613 return;
615 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
617 else
618 Skip_UTF_Byte;
620 if (U and 2#11110000#) = 2#1110_0000# then
621 return;
623 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
624 -- 10xxxxxx
626 else
627 Skip_UTF_Byte;
629 if (U and 2#11111000#) = 2#11110_000# then
630 return;
632 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
633 -- 10xxxxxx 10xxxxxx
634 -- 10xxxxxx
636 else
637 Skip_UTF_Byte;
639 if (U and 2#11111100#) = 2#111110_00# then
640 return;
642 -- Any other code is invalid, note that this includes:
644 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
645 -- 10xxxxxx 10xxxxxx
646 -- 10xxxxxx 10xxxxxx
648 -- since Wide_Wide_Character does not allow codes
649 -- greater than 16#03FF_FFFF#
651 else
652 Bad;
653 end if;
654 end if;
655 end if;
656 end if;
657 end if;
658 end UTF8;
660 -- Special efficient encoding for brackets case
662 elsif Encoding_Method = WCEM_Brackets then
663 Brackets : declare
664 P : Natural;
665 S : Natural;
667 begin
668 -- See if we have "] at end positions
670 if Ptr > Input'First + 1
671 and then Input (Ptr - 1) = ']'
672 and then Input (Ptr - 2) = '"'
673 then
674 P := Ptr - 2;
676 -- Loop back looking for [" at start
678 while P >= Ptr - 10 loop
679 if P <= Input'First + 1 then
680 Bad;
682 elsif Input (P - 1) = '"'
683 and then Input (P - 2) = '['
684 then
685 -- Found ["..."], scan forward to check it
687 S := P - 2;
688 P := S;
689 Next_Wide_Wide_Character (Input, P);
691 -- OK if at original pointer, else error
693 if P = Ptr then
694 Ptr := S;
695 return;
696 else
697 Bad;
698 end if;
699 end if;
701 P := P - 1;
702 end loop;
704 -- Falling through loop means more than 8 chars between the
705 -- enclosing brackets (or simply a missing left bracket)
707 Bad;
709 -- Here if no bracket sequence present
711 else
712 if Ptr = Input'First then
713 Past_End;
714 else
715 Ptr := Ptr - 1;
716 end if;
717 end if;
718 end Brackets;
720 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
721 -- go to the start of the string and skip forwards till Ptr matches.
723 else
724 Non_UTF8_Brackets : declare
725 Discard : Wide_Wide_Character;
726 PtrS : Natural;
727 PtrP : Natural;
729 begin
730 PtrS := Input'First;
732 if Ptr <= PtrS then
733 Past_End;
734 end if;
736 loop
737 PtrP := PtrS;
738 Decode_Wide_Wide_Character (Input, PtrS, Discard);
740 if PtrS = Ptr then
741 Ptr := PtrP;
742 return;
744 elsif PtrS > Ptr then
745 Bad;
746 end if;
747 end loop;
749 exception
750 when Constraint_Error =>
751 Bad;
752 end Non_UTF8_Brackets;
753 end if;
754 end Prev_Wide_Wide_Character;
756 --------------------------
757 -- Validate_Wide_String --
758 --------------------------
760 function Validate_Wide_String (S : String) return Boolean is
761 Ptr : Natural;
763 begin
764 Ptr := S'First;
765 while Ptr <= S'Last loop
766 Next_Wide_Character (S, Ptr);
767 end loop;
769 return True;
771 exception
772 when Constraint_Error =>
773 return False;
774 end Validate_Wide_String;
776 -------------------------------
777 -- Validate_Wide_Wide_String --
778 -------------------------------
780 function Validate_Wide_Wide_String (S : String) return Boolean is
781 Ptr : Natural;
783 begin
784 Ptr := S'First;
785 while Ptr <= S'Last loop
786 Next_Wide_Wide_Character (S, Ptr);
787 end loop;
789 return True;
791 exception
792 when Constraint_Error =>
793 return False;
794 end Validate_Wide_Wide_String;
796 end GNAT.Decode_String;