2013-11-13 Jan-Benedict Glaw <jbglaw@lug-owl.de>
[official-gcc.git] / gcc / ada / g-decstr.adb
blob255e78a2614d81e65c391096851dbc43c814955b
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT RUN-TIME COMPONENTS --
4 -- --
5 -- G N A T . D E C O D E _ S T R I N G --
6 -- --
7 -- S p e c --
8 -- --
9 -- Copyright (C) 2007-2013, AdaCore --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
17 -- --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
21 -- --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
26 -- --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
29 -- --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces; use Interfaces;
37 with System.WCh_Cnv; use System.WCh_Cnv;
38 with System.WCh_Con; use System.WCh_Con;
40 package body GNAT.Decode_String is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
46 procedure Bad;
47 pragma No_Return (Bad);
48 -- Raise error for bad encoding
50 procedure Past_End;
51 pragma No_Return (Past_End);
52 -- Raise error for off end of string
54 ---------
55 -- Bad --
56 ---------
58 procedure Bad is
59 begin
60 raise Constraint_Error with
61 "bad encoding or character out of range";
62 end Bad;
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
69 (Input : String;
70 Ptr : in out Natural;
71 Result : out Wide_Character)
73 Char : Wide_Wide_Character;
74 begin
75 Decode_Wide_Wide_Character (Input, Ptr, Char);
77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
78 Bad;
79 else
80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
81 end if;
82 end Decode_Wide_Character;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String (S : String) return Wide_String is
89 Result : Wide_String (1 .. S'Length);
90 Length : Natural;
91 begin
92 Decode_Wide_String (S, Result, Length);
93 return Result (1 .. Length);
94 end Decode_Wide_String;
96 procedure Decode_Wide_String
97 (S : String;
98 Result : out Wide_String;
99 Length : out Natural)
101 Ptr : Natural;
103 begin
104 Ptr := S'First;
105 Length := 0;
106 while Ptr <= S'Last loop
107 if Length >= Result'Last then
108 Past_End;
109 end if;
111 Length := Length + 1;
112 Decode_Wide_Character (S, Ptr, Result (Length));
113 end loop;
114 end Decode_Wide_String;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
121 (Input : String;
122 Ptr : in out Natural;
123 Result : out Wide_Wide_Character)
125 C : Character;
127 function In_Char return Character;
128 pragma Inline (In_Char);
129 -- Function to get one input character
131 -------------
132 -- In_Char --
133 -------------
135 function In_Char return Character is
136 begin
137 if Ptr <= Input'Last then
138 Ptr := Ptr + 1;
139 return Input (Ptr - 1);
140 else
141 Past_End;
142 end if;
143 end In_Char;
145 -- Start of processing for Decode_Wide_Wide_Character
147 begin
148 C := In_Char;
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method = WCEM_UTF8 then
153 UTF8 : declare
154 U : Unsigned_32;
155 W : Unsigned_32;
157 procedure Get_UTF_Byte;
158 pragma Inline (Get_UTF_Byte);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
163 ------------------
164 -- Get_UTF_Byte --
165 ------------------
167 procedure Get_UTF_Byte is
168 begin
169 U := Unsigned_32 (Character'Pos (In_Char));
171 if (U and 2#11000000#) /= 2#10_000000# then
172 Bad;
173 end if;
175 W := Shift_Left (W, 6) or (U and 2#00111111#);
176 end Get_UTF_Byte;
178 -- Start of processing for UTF8 case
180 begin
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U := Unsigned_32 (Character'Pos (C));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U and 2#10000000#) = 2#00000000# then
188 Result := Wide_Wide_Character'Val (Character'Pos (C));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U and 2#11100000#) = 2#110_00000# then
193 W := U and 2#00011111#;
194 Get_UTF_Byte;
196 if W not in 16#00_0080# .. 16#00_07FF# then
197 Bad;
198 end if;
200 Result := Wide_Wide_Character'Val (W);
202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
204 elsif (U and 2#11110000#) = 2#1110_0000# then
205 W := U and 2#00001111#;
206 Get_UTF_Byte;
207 Get_UTF_Byte;
209 if W not in 16#00_0800# .. 16#00_FFFF# then
210 Bad;
211 end if;
213 Result := Wide_Wide_Character'Val (W);
215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
217 elsif (U and 2#11111000#) = 2#11110_000# then
218 W := U and 2#00000111#;
220 for K in 1 .. 3 loop
221 Get_UTF_Byte;
222 end loop;
224 if W not in 16#01_0000# .. 16#10_FFFF# then
225 Bad;
226 end if;
228 Result := Wide_Wide_Character'Val (W);
230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
231 -- 10xxxxxx 10xxxxxx
233 elsif (U and 2#11111100#) = 2#111110_00# then
234 W := U and 2#00000011#;
236 for K in 1 .. 4 loop
237 Get_UTF_Byte;
238 end loop;
240 if W not in 16#0020_0000# .. 16#03FF_FFFF# then
241 Bad;
242 end if;
244 Result := Wide_Wide_Character'Val (W);
246 -- All other cases are invalid, note that this includes:
248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
249 -- 10xxxxxx 10xxxxxx 10xxxxxx
251 -- since Wide_Wide_Character does not include code values
252 -- greater than 16#03FF_FFFF#.
254 else
255 Bad;
256 end if;
257 end UTF8;
259 -- All encoding functions other than UTF-8
261 else
262 Non_UTF8 : declare
263 function Char_Sequence_To_UTF is
264 new Char_Sequence_To_UTF_32 (In_Char);
266 begin
267 -- For brackets, must test for specific case of [ not followed by
268 -- quotation, where we must not call Char_Sequence_To_UTF, but
269 -- instead just return the bracket unchanged.
271 if Encoding_Method = WCEM_Brackets
272 and then C = '['
273 and then (Ptr > Input'Last or else Input (Ptr) /= '"')
274 then
275 Result := '[';
277 -- All other cases including [" with Brackets
279 else
280 Result :=
281 Wide_Wide_Character'Val
282 (Char_Sequence_To_UTF (C, Encoding_Method));
283 end if;
284 end Non_UTF8;
285 end if;
286 end Decode_Wide_Wide_Character;
288 -----------------------------
289 -- Decode_Wide_Wide_String --
290 -----------------------------
292 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
293 Result : Wide_Wide_String (1 .. S'Length);
294 Length : Natural;
295 begin
296 Decode_Wide_Wide_String (S, Result, Length);
297 return Result (1 .. Length);
298 end Decode_Wide_Wide_String;
300 procedure Decode_Wide_Wide_String
301 (S : String;
302 Result : out Wide_Wide_String;
303 Length : out Natural)
305 Ptr : Natural;
307 begin
308 Ptr := S'First;
309 Length := 0;
310 while Ptr <= S'Last loop
311 if Length >= Result'Last then
312 Past_End;
313 end if;
315 Length := Length + 1;
316 Decode_Wide_Wide_Character (S, Ptr, Result (Length));
317 end loop;
318 end Decode_Wide_Wide_String;
320 -------------------------
321 -- Next_Wide_Character --
322 -------------------------
324 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
325 Discard : Wide_Character;
326 pragma Unreferenced (Discard);
327 begin
328 Decode_Wide_Character (Input, Ptr, Discard);
329 end Next_Wide_Character;
331 ------------------------------
332 -- Next_Wide_Wide_Character --
333 ------------------------------
335 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
336 Discard : Wide_Wide_Character;
337 pragma Unreferenced (Discard);
338 begin
339 Decode_Wide_Wide_Character (Input, Ptr, Discard);
340 end Next_Wide_Wide_Character;
342 --------------
343 -- Past_End --
344 --------------
346 procedure Past_End is
347 begin
348 raise Constraint_Error with "past end of string";
349 end Past_End;
351 -------------------------
352 -- Prev_Wide_Character --
353 -------------------------
355 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
356 begin
357 if Ptr > Input'Last + 1 then
358 Past_End;
359 end if;
361 -- Special efficient encoding for UTF-8 case
363 if Encoding_Method = WCEM_UTF8 then
364 UTF8 : declare
365 U : Unsigned_32;
367 procedure Getc;
368 pragma Inline (Getc);
369 -- Gets the character at Input (Ptr - 1) and returns code in U as
370 -- Unsigned_32 value. On return Ptr is decremented by one.
372 procedure Skip_UTF_Byte;
373 pragma Inline (Skip_UTF_Byte);
374 -- Checks that U is 2#10xxxxxx# and then calls Get
376 ----------
377 -- Getc --
378 ----------
380 procedure Getc is
381 begin
382 if Ptr <= Input'First then
383 Past_End;
384 else
385 Ptr := Ptr - 1;
386 U := Unsigned_32 (Character'Pos (Input (Ptr)));
387 end if;
388 end Getc;
390 -------------------
391 -- Skip_UTF_Byte --
392 -------------------
394 procedure Skip_UTF_Byte is
395 begin
396 if (U and 2#11000000#) = 2#10_000000# then
397 Getc;
398 else
399 Bad;
400 end if;
401 end Skip_UTF_Byte;
403 -- Start of processing for UTF-8 case
405 begin
406 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
408 Getc;
410 if (U and 2#10000000#) = 2#00000000# then
411 return;
413 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
415 else
416 Skip_UTF_Byte;
418 if (U and 2#11100000#) = 2#110_00000# then
419 return;
421 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
423 else
424 Skip_UTF_Byte;
426 if (U and 2#11110000#) = 2#1110_0000# then
427 return;
429 -- Any other code is invalid, note that this includes:
431 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
432 -- 10xxxxxx
434 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
435 -- 10xxxxxx 10xxxxxx
436 -- 10xxxxxx
438 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
439 -- 10xxxxxx 10xxxxxx
440 -- 10xxxxxx 10xxxxxx
442 -- since Wide_Character does not allow codes > 16#FFFF#
444 else
445 Bad;
446 end if;
447 end if;
448 end if;
449 end UTF8;
451 -- Special efficient encoding for brackets case
453 elsif Encoding_Method = WCEM_Brackets then
454 Brackets : declare
455 P : Natural;
456 S : Natural;
458 begin
459 -- See if we have "] at end positions
461 if Ptr > Input'First + 1
462 and then Input (Ptr - 1) = ']'
463 and then Input (Ptr - 2) = '"'
464 then
465 P := Ptr - 2;
467 -- Loop back looking for [" at start
469 while P >= Ptr - 10 loop
470 if P <= Input'First + 1 then
471 Bad;
473 elsif Input (P - 1) = '"'
474 and then Input (P - 2) = '['
475 then
476 -- Found ["..."], scan forward to check it
478 S := P - 2;
479 P := S;
480 Next_Wide_Character (Input, P);
482 -- OK if at original pointer, else error
484 if P = Ptr then
485 Ptr := S;
486 return;
487 else
488 Bad;
489 end if;
490 end if;
492 P := P - 1;
493 end loop;
495 -- Falling through loop means more than 8 chars between the
496 -- enclosing brackets (or simply a missing left bracket)
498 Bad;
500 -- Here if no bracket sequence present
502 else
503 if Ptr = Input'First then
504 Past_End;
505 else
506 Ptr := Ptr - 1;
507 end if;
508 end if;
509 end Brackets;
511 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
512 -- go to the start of the string and skip forwards till Ptr matches.
514 else
515 Non_UTF_Brackets : declare
516 Discard : Wide_Character;
517 PtrS : Natural;
518 PtrP : Natural;
520 begin
521 PtrS := Input'First;
523 if Ptr <= PtrS then
524 Past_End;
525 end if;
527 loop
528 PtrP := PtrS;
529 Decode_Wide_Character (Input, PtrS, Discard);
531 if PtrS = Ptr then
532 Ptr := PtrP;
533 return;
535 elsif PtrS > Ptr then
536 Bad;
537 end if;
538 end loop;
540 exception
541 when Constraint_Error =>
542 Bad;
543 end Non_UTF_Brackets;
544 end if;
545 end Prev_Wide_Character;
547 ------------------------------
548 -- Prev_Wide_Wide_Character --
549 ------------------------------
551 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
552 begin
553 if Ptr > Input'Last + 1 then
554 Past_End;
555 end if;
557 -- Special efficient encoding for UTF-8 case
559 if Encoding_Method = WCEM_UTF8 then
560 UTF8 : declare
561 U : Unsigned_32;
563 procedure Getc;
564 pragma Inline (Getc);
565 -- Gets the character at Input (Ptr - 1) and returns code in U as
566 -- Unsigned_32 value. On return Ptr is decremented by one.
568 procedure Skip_UTF_Byte;
569 pragma Inline (Skip_UTF_Byte);
570 -- Checks that U is 2#10xxxxxx# and then calls Get
572 ----------
573 -- Getc --
574 ----------
576 procedure Getc is
577 begin
578 if Ptr <= Input'First then
579 Past_End;
580 else
581 Ptr := Ptr - 1;
582 U := Unsigned_32 (Character'Pos (Input (Ptr)));
583 end if;
584 end Getc;
586 -------------------
587 -- Skip_UTF_Byte --
588 -------------------
590 procedure Skip_UTF_Byte is
591 begin
592 if (U and 2#11000000#) = 2#10_000000# then
593 Getc;
594 else
595 Bad;
596 end if;
597 end Skip_UTF_Byte;
599 -- Start of processing for UTF-8 case
601 begin
602 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
604 Getc;
606 if (U and 2#10000000#) = 2#00000000# then
607 return;
609 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
611 else
612 Skip_UTF_Byte;
614 if (U and 2#11100000#) = 2#110_00000# then
615 return;
617 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
619 else
620 Skip_UTF_Byte;
622 if (U and 2#11110000#) = 2#1110_0000# then
623 return;
625 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
626 -- 10xxxxxx
628 else
629 Skip_UTF_Byte;
631 if (U and 2#11111000#) = 2#11110_000# then
632 return;
634 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
635 -- 10xxxxxx 10xxxxxx
636 -- 10xxxxxx
638 else
639 Skip_UTF_Byte;
641 if (U and 2#11111100#) = 2#111110_00# then
642 return;
644 -- Any other code is invalid, note that this includes:
646 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
647 -- 10xxxxxx 10xxxxxx
648 -- 10xxxxxx 10xxxxxx
650 -- since Wide_Wide_Character does not allow codes
651 -- greater than 16#03FF_FFFF#
653 else
654 Bad;
655 end if;
656 end if;
657 end if;
658 end if;
659 end if;
660 end UTF8;
662 -- Special efficient encoding for brackets case
664 elsif Encoding_Method = WCEM_Brackets then
665 Brackets : declare
666 P : Natural;
667 S : Natural;
669 begin
670 -- See if we have "] at end positions
672 if Ptr > Input'First + 1
673 and then Input (Ptr - 1) = ']'
674 and then Input (Ptr - 2) = '"'
675 then
676 P := Ptr - 2;
678 -- Loop back looking for [" at start
680 while P >= Ptr - 10 loop
681 if P <= Input'First + 1 then
682 Bad;
684 elsif Input (P - 1) = '"'
685 and then Input (P - 2) = '['
686 then
687 -- Found ["..."], scan forward to check it
689 S := P - 2;
690 P := S;
691 Next_Wide_Wide_Character (Input, P);
693 -- OK if at original pointer, else error
695 if P = Ptr then
696 Ptr := S;
697 return;
698 else
699 Bad;
700 end if;
701 end if;
703 P := P - 1;
704 end loop;
706 -- Falling through loop means more than 8 chars between the
707 -- enclosing brackets (or simply a missing left bracket)
709 Bad;
711 -- Here if no bracket sequence present
713 else
714 if Ptr = Input'First then
715 Past_End;
716 else
717 Ptr := Ptr - 1;
718 end if;
719 end if;
720 end Brackets;
722 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
723 -- go to the start of the string and skip forwards till Ptr matches.
725 else
726 Non_UTF8_Brackets : declare
727 Discard : Wide_Wide_Character;
728 PtrS : Natural;
729 PtrP : Natural;
731 begin
732 PtrS := Input'First;
734 if Ptr <= PtrS then
735 Past_End;
736 end if;
738 loop
739 PtrP := PtrS;
740 Decode_Wide_Wide_Character (Input, PtrS, Discard);
742 if PtrS = Ptr then
743 Ptr := PtrP;
744 return;
746 elsif PtrS > Ptr then
747 Bad;
748 end if;
749 end loop;
751 exception
752 when Constraint_Error =>
753 Bad;
754 end Non_UTF8_Brackets;
755 end if;
756 end Prev_Wide_Wide_Character;
758 --------------------------
759 -- Validate_Wide_String --
760 --------------------------
762 function Validate_Wide_String (S : String) return Boolean is
763 Ptr : Natural;
765 begin
766 Ptr := S'First;
767 while Ptr <= S'Last loop
768 Next_Wide_Character (S, Ptr);
769 end loop;
771 return True;
773 exception
774 when Constraint_Error =>
775 return False;
776 end Validate_Wide_String;
778 -------------------------------
779 -- Validate_Wide_Wide_String --
780 -------------------------------
782 function Validate_Wide_Wide_String (S : String) return Boolean is
783 Ptr : Natural;
785 begin
786 Ptr := S'First;
787 while Ptr <= S'Last loop
788 Next_Wide_Wide_Character (S, Ptr);
789 end loop;
791 return True;
793 exception
794 when Constraint_Error =>
795 return False;
796 end Validate_Wide_Wide_String;
798 end GNAT.Decode_String;