gcc/ada/g-decstr.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                    G N A T . D E C O D E _ S T R I N G                   --
   6 --                                                                          --
   7 --                                 S p e c                                  --
   8 --                                                                          --
   9 --                     Copyright (C) 2007-2010, AdaCore                     --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 -- As a special exception under Section 7 of GPL version 3, you are granted --
  19 -- additional permissions described in the GCC Runtime Library Exception,   --
  20 -- version 3.1, as published by the Free Software Foundation.               --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31
  32 --  This package provides a utility routine for converting from an encoded
  33 --  string to a corresponding Wide_String or Wide_Wide_String value.
  34
  35 with Interfaces; use Interfaces;
  36
  37 with System.WCh_Cnv; use System.WCh_Cnv;
  38 with System.WCh_Con; use System.WCh_Con;
  39
  40 package body GNAT.Decode_String is
  41
  42    -----------------------
  43    -- Local Subprograms --
  44    -----------------------
  45
  46    procedure Bad;
  47    pragma No_Return (Bad);
  48    --  Raise error for bad encoding
  49
  50    procedure Past_End;
  51    pragma No_Return (Past_End);
  52    --  Raise error for off end of string
  53
  54    ---------
  55    -- Bad --
  56    ---------
  57
  58    procedure Bad is
  59    begin
  60       raise Constraint_Error with
  61         "bad encoding or character out of range";
  62    end Bad;
  63
  64    ---------------------------
  65    -- Decode_Wide_Character --
  66    ---------------------------
  67
  68    procedure Decode_Wide_Character
  69      (Input  : String;
  70       Ptr    : in out Natural;
  71       Result : out Wide_Character)
  72    is
  73       Char : Wide_Wide_Character;
  74    begin
  75       Decode_Wide_Wide_Character (Input, Ptr, Char);
  76
  77       if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
  78          Bad;
  79       else
  80          Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
  81       end if;
  82    end Decode_Wide_Character;
  83
  84    ------------------------
  85    -- Decode_Wide_String --
  86    ------------------------
  87
  88    function Decode_Wide_String (S : String) return Wide_String is
  89       Result : Wide_String (1 .. S'Length);
  90       Length : Natural;
  91    begin
  92       Decode_Wide_String (S, Result, Length);
  93       return Result (1 .. Length);
  94    end Decode_Wide_String;
  95
  96    procedure Decode_Wide_String
  97      (S      : String;
  98       Result : out Wide_String;
  99       Length : out Natural)
 100    is
 101       Ptr : Natural;
 102
 103    begin
 104       Ptr := S'First;
 105       Length := 0;
 106       while Ptr <= S'Last loop
 107          if Length >= Result'Last then
 108             Past_End;
 109          end if;
 110
 111          Length := Length + 1;
 112          Decode_Wide_Character (S, Ptr, Result (Length));
 113       end loop;
 114    end Decode_Wide_String;
 115
 116    --------------------------------
 117    -- Decode_Wide_Wide_Character --
 118    --------------------------------
 119
 120    procedure Decode_Wide_Wide_Character
 121      (Input  : String;
 122       Ptr    : in out Natural;
 123       Result : out Wide_Wide_Character)
 124    is
 125       C : Character;
 126
 127       function In_Char return Character;
 128       pragma Inline (In_Char);
 129       --  Function to get one input character
 130
 131       -------------
 132       -- In_Char --
 133       -------------
 134
 135       function In_Char return Character is
 136       begin
 137          if Ptr <= Input'Last then
 138             Ptr := Ptr + 1;
 139             return Input (Ptr - 1);
 140          else
 141             Past_End;
 142          end if;
 143       end In_Char;
 144
 145    --  Start of processing for Decode_Wide_Wide_Character
 146
 147    begin
 148       C := In_Char;
 149
 150       --  Special fast processing for UTF-8 case
 151
 152       if Encoding_Method = WCEM_UTF8 then
 153          UTF8 : declare
 154             U : Unsigned_32;
 155             W : Unsigned_32;
 156
 157             procedure Get_UTF_Byte;
 158             pragma Inline (Get_UTF_Byte);
 159             --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
 160             --  Reads a byte, and raises CE if the first two bits are not 10.
 161             --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
 162
 163             ------------------
 164             -- Get_UTF_Byte --
 165             ------------------
 166
 167             procedure Get_UTF_Byte is
 168             begin
 169                U := Unsigned_32 (Character'Pos (In_Char));
 170
 171                if (U and 2#11000000#) /= 2#10_000000# then
 172                   Bad;
 173                end if;
 174
 175                W := Shift_Left (W, 6) or (U and 2#00111111#);
 176             end Get_UTF_Byte;
 177
 178          --  Start of processing for UTF8 case
 179
 180          begin
 181             --  Note: for details of UTF8 encoding see RFC 3629
 182
 183             U := Unsigned_32 (Character'Pos (C));
 184
 185             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 186
 187             if (U and 2#10000000#) = 2#00000000# then
 188                Result := Wide_Wide_Character'Val (Character'Pos (C));
 189
 190             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 191
 192             elsif (U and 2#11100000#) = 2#110_00000# then
 193                W := U and 2#00011111#;
 194                Get_UTF_Byte;
 195                Result := Wide_Wide_Character'Val (W);
 196
 197             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 198
 199             elsif (U and 2#11110000#) = 2#1110_0000# then
 200                W := U and 2#00001111#;
 201                Get_UTF_Byte;
 202                Get_UTF_Byte;
 203                Result := Wide_Wide_Character'Val (W);
 204
 205             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 206
 207             elsif (U and 2#11111000#) = 2#11110_000# then
 208                W := U and 2#00000111#;
 209
 210                for K in 1 .. 3 loop
 211                   Get_UTF_Byte;
 212                end loop;
 213
 214                Result := Wide_Wide_Character'Val (W);
 215
 216             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 217             --                               10xxxxxx 10xxxxxx
 218
 219             elsif (U and 2#11111100#) = 2#111110_00# then
 220                W := U and 2#00000011#;
 221
 222                for K in 1 .. 4 loop
 223                   Get_UTF_Byte;
 224                end loop;
 225
 226                Result := Wide_Wide_Character'Val (W);
 227
 228             --  All other cases are invalid, note that this includes:
 229
 230             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 231             --                               10xxxxxx 10xxxxxx 10xxxxxx
 232
 233             --  since Wide_Wide_Character does not include code values
 234             --  greater than 16#03FF_FFFF#.
 235
 236             else
 237                Bad;
 238             end if;
 239          end UTF8;
 240
 241       --  All encoding functions other than UTF-8
 242
 243       else
 244          Non_UTF8 : declare
 245             function Char_Sequence_To_UTF is
 246               new Char_Sequence_To_UTF_32 (In_Char);
 247
 248          begin
 249             --  For brackets, must test for specific case of [ not followed by
 250             --  quotation, where we must not call Char_Sequence_To_UTF, but
 251             --  instead just return the bracket unchanged.
 252
 253             if Encoding_Method = WCEM_Brackets
 254               and then C = '['
 255               and then (Ptr > Input'Last or else Input (Ptr) /= '"')
 256             then
 257                Result := '[';
 258
 259             --  All other cases including [" with Brackets
 260
 261             else
 262                Result :=
 263                  Wide_Wide_Character'Val
 264                    (Char_Sequence_To_UTF (C, Encoding_Method));
 265             end if;
 266          end Non_UTF8;
 267       end if;
 268    end Decode_Wide_Wide_Character;
 269
 270    -----------------------------
 271    -- Decode_Wide_Wide_String --
 272    -----------------------------
 273
 274    function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
 275       Result : Wide_Wide_String (1 .. S'Length);
 276       Length : Natural;
 277    begin
 278       Decode_Wide_Wide_String (S, Result, Length);
 279       return Result (1 .. Length);
 280    end Decode_Wide_Wide_String;
 281
 282    procedure Decode_Wide_Wide_String
 283      (S      : String;
 284       Result : out Wide_Wide_String;
 285       Length : out Natural)
 286    is
 287       Ptr : Natural;
 288
 289    begin
 290       Ptr := S'First;
 291       Length := 0;
 292       while Ptr <= S'Last loop
 293          if Length >= Result'Last then
 294             Past_End;
 295          end if;
 296
 297          Length := Length + 1;
 298          Decode_Wide_Wide_Character (S, Ptr, Result (Length));
 299       end loop;
 300    end Decode_Wide_Wide_String;
 301
 302    -------------------------
 303    -- Next_Wide_Character --
 304    -------------------------
 305
 306    procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
 307    begin
 308       if Ptr < Input'First then
 309          Past_End;
 310       end if;
 311
 312       --  Special efficient encoding for UTF-8 case
 313
 314       if Encoding_Method = WCEM_UTF8 then
 315          UTF8 : declare
 316             U : Unsigned_32;
 317
 318             procedure Getc;
 319             pragma Inline (Getc);
 320             --  Gets the character at Input (Ptr) and returns code in U as
 321             --  Unsigned_32 value. On return Ptr is bumped past the character.
 322
 323             procedure Skip_UTF_Byte;
 324             pragma Inline (Skip_UTF_Byte);
 325             --  Skips past one encoded byte which must be 2#10xxxxxx#
 326
 327             ----------
 328             -- Getc --
 329             ----------
 330
 331             procedure Getc is
 332             begin
 333                if Ptr > Input'Last then
 334                   Past_End;
 335                else
 336                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 337                   Ptr := Ptr + 1;
 338                end if;
 339             end Getc;
 340
 341             -------------------
 342             -- Skip_UTF_Byte --
 343             -------------------
 344
 345             procedure Skip_UTF_Byte is
 346             begin
 347                Getc;
 348
 349                if (U and 2#11000000#) /= 2#10_000000# then
 350                   Bad;
 351                end if;
 352             end Skip_UTF_Byte;
 353
 354          --  Start of processing for UTF-8 case
 355
 356          begin
 357             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 358
 359             Getc;
 360
 361             if (U and 2#10000000#) = 2#00000000# then
 362                return;
 363
 364             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 365
 366             elsif (U and 2#11100000#) = 2#110_00000# then
 367                Skip_UTF_Byte;
 368
 369             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 370
 371             elsif (U and 2#11110000#) = 2#1110_0000# then
 372                Skip_UTF_Byte;
 373                Skip_UTF_Byte;
 374
 375             --  Any other code is invalid, note that this includes:
 376
 377             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 378
 379             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 380             --                               10xxxxxx 10xxxxxx
 381
 382             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 383             --                               10xxxxxx 10xxxxxx 10xxxxxx
 384
 385             --  since Wide_Character does not allow codes > 16#FFFF#
 386
 387             else
 388                Bad;
 389             end if;
 390          end UTF8;
 391
 392       --  Non-UTF-8 case
 393
 394       else
 395          declare
 396             Discard : Wide_Character;
 397          begin
 398             Decode_Wide_Character (Input, Ptr, Discard);
 399          end;
 400       end if;
 401    end Next_Wide_Character;
 402
 403    ------------------------------
 404    -- Next_Wide_Wide_Character --
 405    ------------------------------
 406
 407    procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 408    begin
 409       --  Special efficient encoding for UTF-8 case
 410
 411       if Encoding_Method = WCEM_UTF8 then
 412          UTF8 : declare
 413             U : Unsigned_32;
 414
 415             procedure Getc;
 416             pragma Inline (Getc);
 417             --  Gets the character at Input (Ptr) and returns code in U as
 418             --  Unsigned_32 value. On return Ptr is bumped past the character.
 419
 420             procedure Skip_UTF_Byte;
 421             pragma Inline (Skip_UTF_Byte);
 422             --  Skips past one encoded byte which must be 2#10xxxxxx#
 423
 424             ----------
 425             -- Getc --
 426             ----------
 427
 428             procedure Getc is
 429             begin
 430                if Ptr > Input'Last then
 431                   Past_End;
 432                else
 433                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 434                   Ptr := Ptr + 1;
 435                end if;
 436             end Getc;
 437
 438             -------------------
 439             -- Skip_UTF_Byte --
 440             -------------------
 441
 442             procedure Skip_UTF_Byte is
 443             begin
 444                Getc;
 445
 446                if (U and 2#11000000#) /= 2#10_000000# then
 447                   Bad;
 448                end if;
 449             end Skip_UTF_Byte;
 450
 451          --  Start of processing for UTF-8 case
 452
 453          begin
 454             if Ptr < Input'First then
 455                Past_End;
 456             end if;
 457
 458             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 459
 460             Getc;
 461
 462             if (U and 2#10000000#) = 2#00000000# then
 463                null;
 464
 465             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 466
 467             elsif (U and 2#11100000#) = 2#110_00000# then
 468                Skip_UTF_Byte;
 469
 470             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 471
 472             elsif (U and 2#11110000#) = 2#1110_0000# then
 473                Skip_UTF_Byte;
 474                Skip_UTF_Byte;
 475
 476             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 477
 478             elsif (U and 2#11111000#) = 2#11110_000# then
 479                for K in 1 .. 3 loop
 480                   Skip_UTF_Byte;
 481                end loop;
 482
 483             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 484             --                               10xxxxxx 10xxxxxx
 485
 486             elsif (U and 2#11111100#) = 2#111110_00# then
 487                for K in 1 .. 4 loop
 488                   Skip_UTF_Byte;
 489                end loop;
 490
 491             --  Any other code is invalid, note that this includes:
 492
 493             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 494             --                               10xxxxxx 10xxxxxx 10xxxxxx
 495
 496             --  since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
 497
 498             else
 499                Bad;
 500             end if;
 501          end UTF8;
 502
 503       --  Non-UTF-8 case
 504
 505       else
 506          declare
 507             Discard : Wide_Wide_Character;
 508          begin
 509             Decode_Wide_Wide_Character (Input, Ptr, Discard);
 510          end;
 511       end if;
 512    end Next_Wide_Wide_Character;
 513
 514    --------------
 515    -- Past_End --
 516    --------------
 517
 518    procedure Past_End is
 519    begin
 520       raise Constraint_Error with "past end of string";
 521    end Past_End;
 522
 523    -------------------------
 524    -- Prev_Wide_Character --
 525    -------------------------
 526
 527    procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
 528    begin
 529       if Ptr > Input'Last + 1 then
 530          Past_End;
 531       end if;
 532
 533       --  Special efficient encoding for UTF-8 case
 534
 535       if Encoding_Method = WCEM_UTF8 then
 536          UTF8 : declare
 537             U : Unsigned_32;
 538
 539             procedure Getc;
 540             pragma Inline (Getc);
 541             --  Gets the character at Input (Ptr - 1) and returns code in U as
 542             --  Unsigned_32 value. On return Ptr is decremented by one.
 543
 544             procedure Skip_UTF_Byte;
 545             pragma Inline (Skip_UTF_Byte);
 546             --  Checks that U is 2#10xxxxxx# and then calls Get
 547
 548             ----------
 549             -- Getc --
 550             ----------
 551
 552             procedure Getc is
 553             begin
 554                if Ptr <= Input'First then
 555                   Past_End;
 556                else
 557                   Ptr := Ptr - 1;
 558                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 559                end if;
 560             end Getc;
 561
 562             -------------------
 563             -- Skip_UTF_Byte --
 564             -------------------
 565
 566             procedure Skip_UTF_Byte is
 567             begin
 568                if (U and 2#11000000#) = 2#10_000000# then
 569                   Getc;
 570                else
 571                   Bad;
 572                end if;
 573             end Skip_UTF_Byte;
 574
 575          --  Start of processing for UTF-8 case
 576
 577          begin
 578             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 579
 580             Getc;
 581
 582             if (U and 2#10000000#) = 2#00000000# then
 583                return;
 584
 585             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 586
 587             else
 588                Skip_UTF_Byte;
 589
 590                if (U and 2#11100000#) = 2#110_00000# then
 591                   return;
 592
 593                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 594
 595                else
 596                   Skip_UTF_Byte;
 597
 598                   if (U and 2#11110000#) = 2#1110_0000# then
 599                      return;
 600
 601                      --  Any other code is invalid, note that this includes:
 602
 603                      --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 604                      --                           10xxxxxx
 605
 606                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 607                      --                               10xxxxxx 10xxxxxx
 608                      --                               10xxxxxx
 609
 610                      --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 611                      --                               10xxxxxx 10xxxxxx
 612                      --                               10xxxxxx 10xxxxxx
 613
 614                      --  since Wide_Character does not allow codes > 16#FFFF#
 615
 616                   else
 617                      Bad;
 618                   end if;
 619                end if;
 620             end if;
 621          end UTF8;
 622
 623       --  Special efficient encoding for brackets case
 624
 625       elsif Encoding_Method = WCEM_Brackets then
 626          Brackets : declare
 627             P : Natural;
 628             S : Natural;
 629
 630          begin
 631             --  See if we have "] at end positions
 632
 633             if Ptr > Input'First + 1
 634               and then Input (Ptr - 1) = ']'
 635               and then Input (Ptr - 2) = '"'
 636             then
 637                P := Ptr - 2;
 638
 639                --  Loop back looking for [" at start
 640
 641                while P >= Ptr - 10 loop
 642                   if P <= Input'First + 1 then
 643                      Bad;
 644
 645                   elsif Input (P - 1) = '"'
 646                     and then Input (P - 2) = '['
 647                   then
 648                      --  Found ["..."], scan forward to check it
 649
 650                      S := P - 2;
 651                      P := S;
 652                      Next_Wide_Character (Input, P);
 653
 654                      --  OK if at original pointer, else error
 655
 656                      if P = Ptr then
 657                         Ptr := S;
 658                         return;
 659                      else
 660                         Bad;
 661                      end if;
 662                   end if;
 663
 664                   P := P - 1;
 665                end loop;
 666
 667                --  Falling through loop means more than 8 chars between the
 668                --  enclosing brackets (or simply a missing left bracket)
 669
 670                Bad;
 671
 672             --  Here if no bracket sequence present
 673
 674             else
 675                if Ptr = Input'First then
 676                   Past_End;
 677                else
 678                   Ptr := Ptr - 1;
 679                end if;
 680             end if;
 681          end Brackets;
 682
 683       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 684       --  go to the start of the string and skip forwards till Ptr matches.
 685
 686       else
 687          Non_UTF_Brackets : declare
 688             Discard : Wide_Character;
 689             PtrS    : Natural;
 690             PtrP    : Natural;
 691
 692          begin
 693             PtrS := Input'First;
 694
 695             if Ptr <= PtrS then
 696                Past_End;
 697             end if;
 698
 699             loop
 700                PtrP := PtrS;
 701                Decode_Wide_Character (Input, PtrS, Discard);
 702
 703                if PtrS = Ptr then
 704                   Ptr := PtrP;
 705                   return;
 706
 707                elsif PtrS > Ptr then
 708                   Bad;
 709                end if;
 710             end loop;
 711
 712          exception
 713             when Constraint_Error =>
 714                Bad;
 715          end Non_UTF_Brackets;
 716       end if;
 717    end Prev_Wide_Character;
 718
 719    ------------------------------
 720    -- Prev_Wide_Wide_Character --
 721    ------------------------------
 722
 723    procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 724    begin
 725       if Ptr > Input'Last + 1 then
 726          Past_End;
 727       end if;
 728
 729       --  Special efficient encoding for UTF-8 case
 730
 731       if Encoding_Method = WCEM_UTF8 then
 732          UTF8 : declare
 733             U : Unsigned_32;
 734
 735             procedure Getc;
 736             pragma Inline (Getc);
 737             --  Gets the character at Input (Ptr - 1) and returns code in U as
 738             --  Unsigned_32 value. On return Ptr is decremented by one.
 739
 740             procedure Skip_UTF_Byte;
 741             pragma Inline (Skip_UTF_Byte);
 742             --  Checks that U is 2#10xxxxxx# and then calls Get
 743
 744             ----------
 745             -- Getc --
 746             ----------
 747
 748             procedure Getc is
 749             begin
 750                if Ptr <= Input'First then
 751                   Past_End;
 752                else
 753                   Ptr := Ptr - 1;
 754                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 755                end if;
 756             end Getc;
 757
 758             -------------------
 759             -- Skip_UTF_Byte --
 760             -------------------
 761
 762             procedure Skip_UTF_Byte is
 763             begin
 764                if (U and 2#11000000#) = 2#10_000000# then
 765                   Getc;
 766                else
 767                   Bad;
 768                end if;
 769             end Skip_UTF_Byte;
 770
 771          --  Start of processing for UTF-8 case
 772
 773          begin
 774             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 775
 776             Getc;
 777
 778             if (U and 2#10000000#) = 2#00000000# then
 779                return;
 780
 781             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 782
 783             else
 784                Skip_UTF_Byte;
 785
 786                if (U and 2#11100000#) = 2#110_00000# then
 787                   return;
 788
 789                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 790
 791                else
 792                   Skip_UTF_Byte;
 793
 794                   if (U and 2#11110000#) = 2#1110_0000# then
 795                      return;
 796
 797                   --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 798                   --                           10xxxxxx
 799
 800                   else
 801                      Skip_UTF_Byte;
 802
 803                      if (U and 2#11111000#) = 2#11110_000# then
 804                         return;
 805
 806                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 807                      --                               10xxxxxx 10xxxxxx
 808                      --                               10xxxxxx
 809
 810                      else
 811                         Skip_UTF_Byte;
 812
 813                         if (U and 2#11111100#) = 2#111110_00# then
 814                            return;
 815
 816                         --  Any other code is invalid, note that this includes:
 817
 818                         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 819                         --                               10xxxxxx 10xxxxxx
 820                         --                               10xxxxxx 10xxxxxx
 821
 822                         --  since Wide_Wide_Character does not allow codes
 823                         --  greater than 16#03FF_FFFF#
 824
 825                         else
 826                            Bad;
 827                         end if;
 828                      end if;
 829                   end if;
 830                end if;
 831             end if;
 832          end UTF8;
 833
 834       --  Special efficient encoding for brackets case
 835
 836       elsif Encoding_Method = WCEM_Brackets then
 837          Brackets : declare
 838             P : Natural;
 839             S : Natural;
 840
 841          begin
 842             --  See if we have "] at end positions
 843
 844             if Ptr > Input'First + 1
 845               and then Input (Ptr - 1) = ']'
 846               and then Input (Ptr - 2) = '"'
 847             then
 848                P := Ptr - 2;
 849
 850                --  Loop back looking for [" at start
 851
 852                while P >= Ptr - 10 loop
 853                   if P <= Input'First + 1 then
 854                      Bad;
 855
 856                   elsif Input (P - 1) = '"'
 857                     and then Input (P - 2) = '['
 858                   then
 859                      --  Found ["..."], scan forward to check it
 860
 861                      S := P - 2;
 862                      P := S;
 863                      Next_Wide_Wide_Character (Input, P);
 864
 865                      --  OK if at original pointer, else error
 866
 867                      if P = Ptr then
 868                         Ptr := S;
 869                         return;
 870                      else
 871                         Bad;
 872                      end if;
 873                   end if;
 874
 875                   P := P - 1;
 876                end loop;
 877
 878                --  Falling through loop means more than 8 chars between the
 879                --  enclosing brackets (or simply a missing left bracket)
 880
 881                Bad;
 882
 883             --  Here if no bracket sequence present
 884
 885             else
 886                if Ptr = Input'First then
 887                   Past_End;
 888                else
 889                   Ptr := Ptr - 1;
 890                end if;
 891             end if;
 892          end Brackets;
 893
 894       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 895       --  go to the start of the string and skip forwards till Ptr matches.
 896
 897       else
 898          Non_UTF8_Brackets : declare
 899             Discard : Wide_Wide_Character;
 900             PtrS    : Natural;
 901             PtrP    : Natural;
 902
 903          begin
 904             PtrS := Input'First;
 905
 906             if Ptr <= PtrS then
 907                Past_End;
 908             end if;
 909
 910             loop
 911                PtrP := PtrS;
 912                Decode_Wide_Wide_Character (Input, PtrS, Discard);
 913
 914                if PtrS = Ptr then
 915                   Ptr := PtrP;
 916                   return;
 917
 918                elsif PtrS > Ptr then
 919                   Bad;
 920                end if;
 921             end loop;
 922
 923          exception
 924             when Constraint_Error =>
 925                Bad;
 926          end Non_UTF8_Brackets;
 927       end if;
 928    end Prev_Wide_Wide_Character;
 929
 930    --------------------------
 931    -- Validate_Wide_String --
 932    --------------------------
 933
 934    function Validate_Wide_String (S : String) return Boolean is
 935       Ptr : Natural;
 936
 937    begin
 938       Ptr := S'First;
 939       while Ptr <= S'Last loop
 940          Next_Wide_Character (S, Ptr);
 941       end loop;
 942
 943       return True;
 944
 945    exception
 946       when Constraint_Error =>
 947          return False;
 948    end Validate_Wide_String;
 949
 950    -------------------------------
 951    -- Validate_Wide_Wide_String --
 952    -------------------------------
 953
 954    function Validate_Wide_Wide_String (S : String) return Boolean is
 955       Ptr : Natural;
 956
 957    begin
 958       Ptr := S'First;
 959       while Ptr <= S'Last loop
 960          Next_Wide_Wide_Character (S, Ptr);
 961       end loop;
 962
 963       return True;
 964
 965    exception
 966       when Constraint_Error =>
 967          return False;
 968    end Validate_Wide_Wide_String;
 969
 970 end GNAT.Decode_String;