gcc/ada/g-decstr.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                    G N A T . D E C O D E _ S T R I N G                   --
   6 --                                                                          --
   7 --                                 S p e c                                  --
   8 --                                                                          --
   9 --                     Copyright (C) 2007-2008, AdaCore                     --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 2,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
  17 -- for  more details.  You should have  received  a copy of the GNU General --
  18 -- Public License  distributed with GNAT;  see file COPYING.  If not, write --
  19 -- to  the  Free Software Foundation,  51  Franklin  Street,  Fifth  Floor, --
  20 -- Boston, MA 02110-1301, USA.                                              --
  21 --                                                                          --
  22 -- As a special exception,  if other files  instantiate  generics from this --
  23 -- unit, or you link  this unit with other files  to produce an executable, --
  24 -- this  unit  does not  by itself cause  the resulting  executable  to  be --
  25 -- covered  by the  GNU  General  Public  License.  This exception does not --
  26 -- however invalidate  any other reasons why  the executable file  might be --
  27 -- covered by the  GNU Public License.                                      --
  28 --                                                                          --
  29 -- GNAT was originally developed  by the GNAT team at  New York University. --
  30 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  31 --                                                                          --
  32 ------------------------------------------------------------------------------
  33
  34 --  This package provides a utility routine for converting from an encoded
  35 --  string to a corresponding Wide_String or Wide_Wide_String value.
  36
  37 with Interfaces; use Interfaces;
  38
  39 with System.WCh_Cnv; use System.WCh_Cnv;
  40 with System.WCh_Con; use System.WCh_Con;
  41
  42 package body GNAT.Decode_String is
  43
  44    -----------------------
  45    -- Local Subprograms --
  46    -----------------------
  47
  48    procedure Bad;
  49    pragma No_Return (Bad);
  50    --  Raise error for bad encoding
  51
  52    procedure Past_End;
  53    pragma No_Return (Past_End);
  54    --  Raise error for off end of string
  55
  56    ---------
  57    -- Bad --
  58    ---------
  59
  60    procedure Bad is
  61    begin
  62       raise Constraint_Error with
  63         "bad encoding or character out of range";
  64    end Bad;
  65
  66    ---------------------------
  67    -- Decode_Wide_Character --
  68    ---------------------------
  69
  70    procedure Decode_Wide_Character
  71      (Input  : String;
  72       Ptr    : in out Natural;
  73       Result : out Wide_Character)
  74    is
  75       Char : Wide_Wide_Character;
  76    begin
  77       Decode_Wide_Wide_Character (Input, Ptr, Char);
  78
  79       if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
  80          Bad;
  81       else
  82          Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
  83       end if;
  84    end Decode_Wide_Character;
  85
  86    ------------------------
  87    -- Decode_Wide_String --
  88    ------------------------
  89
  90    function Decode_Wide_String (S : String) return Wide_String is
  91       Result : Wide_String (1 .. S'Length);
  92       Length : Natural;
  93    begin
  94       Decode_Wide_String (S, Result, Length);
  95       return Result (1 .. Length);
  96    end Decode_Wide_String;
  97
  98    procedure Decode_Wide_String
  99      (S      : String;
 100       Result : out Wide_String;
 101       Length : out Natural)
 102    is
 103       Ptr : Natural;
 104
 105    begin
 106       Ptr := S'First;
 107       Length := 0;
 108       while Ptr <= S'Last loop
 109          if Length >= Result'Last then
 110             Past_End;
 111          end if;
 112
 113          Length := Length + 1;
 114          Decode_Wide_Character (S, Ptr, Result (Length));
 115       end loop;
 116    end Decode_Wide_String;
 117
 118    --------------------------------
 119    -- Decode_Wide_Wide_Character --
 120    --------------------------------
 121
 122    procedure Decode_Wide_Wide_Character
 123      (Input  : String;
 124       Ptr    : in out Natural;
 125       Result : out Wide_Wide_Character)
 126    is
 127       C : Character;
 128
 129       function In_Char return Character;
 130       pragma Inline (In_Char);
 131       --  Function to get one input character
 132
 133       -------------
 134       -- In_Char --
 135       -------------
 136
 137       function In_Char return Character is
 138       begin
 139          if Ptr <= Input'Last then
 140             Ptr := Ptr + 1;
 141             return Input (Ptr - 1);
 142          else
 143             Past_End;
 144          end if;
 145       end In_Char;
 146
 147    --  Start of processing for Decode_Wide_Wide_Character
 148
 149    begin
 150       C := In_Char;
 151
 152       --  Special fast processing for UTF-8 case
 153
 154       if Encoding_Method = WCEM_UTF8 then
 155          UTF8 : declare
 156             U : Unsigned_32;
 157             W : Unsigned_32;
 158
 159             procedure Get_UTF_Byte;
 160             pragma Inline (Get_UTF_Byte);
 161             --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
 162             --  Reads a byte, and raises CE if the first two bits are not 10.
 163             --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
 164
 165             ------------------
 166             -- Get_UTF_Byte --
 167             ------------------
 168
 169             procedure Get_UTF_Byte is
 170             begin
 171                U := Unsigned_32 (Character'Pos (In_Char));
 172
 173                if (U and 2#11000000#) /= 2#10_000000# then
 174                   Bad;
 175                end if;
 176
 177                W := Shift_Left (W, 6) or (U and 2#00111111#);
 178             end Get_UTF_Byte;
 179
 180          --  Start of processing for UTF8 case
 181
 182          begin
 183             --  Note: for details of UTF8 encoding see RFC 3629
 184
 185             U := Unsigned_32 (Character'Pos (C));
 186
 187             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 188
 189             if (U and 2#10000000#) = 2#00000000# then
 190                Result := Wide_Wide_Character'Val (Character'Pos (C));
 191
 192             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 193
 194             elsif (U and 2#11100000#) = 2#110_00000# then
 195                W := U and 2#00011111#;
 196                Get_UTF_Byte;
 197                Result := Wide_Wide_Character'Val (W);
 198
 199             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 200
 201             elsif (U and 2#11110000#) = 2#1110_0000# then
 202                W := U and 2#00001111#;
 203                Get_UTF_Byte;
 204                Get_UTF_Byte;
 205                Result := Wide_Wide_Character'Val (W);
 206
 207             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 208
 209             elsif (U and 2#11111000#) = 2#11110_000# then
 210                W := U and 2#00000111#;
 211
 212                for K in 1 .. 3 loop
 213                   Get_UTF_Byte;
 214                end loop;
 215
 216                Result := Wide_Wide_Character'Val (W);
 217
 218             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 219             --                               10xxxxxx 10xxxxxx
 220
 221             elsif (U and 2#11111100#) = 2#111110_00# then
 222                W := U and 2#00000011#;
 223
 224                for K in 1 .. 4 loop
 225                   Get_UTF_Byte;
 226                end loop;
 227
 228                Result := Wide_Wide_Character'Val (W);
 229
 230             --  All other cases are invalid, note that this includes:
 231
 232             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 233             --                               10xxxxxx 10xxxxxx 10xxxxxx
 234
 235             --  since Wide_Wide_Character does not include code values
 236             --  greater than 16#03FF_FFFF#.
 237
 238             else
 239                Bad;
 240             end if;
 241          end UTF8;
 242
 243       --  All encoding functions other than UTF-8
 244
 245       else
 246          Non_UTF8 : declare
 247             function Char_Sequence_To_UTF is
 248               new Char_Sequence_To_UTF_32 (In_Char);
 249
 250          begin
 251             --  For brackets, must test for specific case of [ not followed by
 252             --  quotation, where we must not call Char_Sequence_To_UTF, but
 253             --  instead just return the bracket unchanged.
 254
 255             if Encoding_Method = WCEM_Brackets
 256               and then C = '['
 257               and then (Ptr > Input'Last or else Input (Ptr) /= '"')
 258             then
 259                Result := '[';
 260
 261             --  All other cases including [" with Brackets
 262
 263             else
 264                Result :=
 265                  Wide_Wide_Character'Val
 266                    (Char_Sequence_To_UTF (C, Encoding_Method));
 267             end if;
 268          end Non_UTF8;
 269       end if;
 270    end Decode_Wide_Wide_Character;
 271
 272    -----------------------------
 273    -- Decode_Wide_Wide_String --
 274    -----------------------------
 275
 276    function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
 277       Result : Wide_Wide_String (1 .. S'Length);
 278       Length : Natural;
 279    begin
 280       Decode_Wide_Wide_String (S, Result, Length);
 281       return Result (1 .. Length);
 282    end Decode_Wide_Wide_String;
 283
 284    procedure Decode_Wide_Wide_String
 285      (S      : String;
 286       Result : out Wide_Wide_String;
 287       Length : out Natural)
 288    is
 289       Ptr : Natural;
 290
 291    begin
 292       Ptr := S'First;
 293       Length := 0;
 294       while Ptr <= S'Last loop
 295          if Length >= Result'Last then
 296             Past_End;
 297          end if;
 298
 299          Length := Length + 1;
 300          Decode_Wide_Wide_Character (S, Ptr, Result (Length));
 301       end loop;
 302    end Decode_Wide_Wide_String;
 303
 304    -------------------------
 305    -- Next_Wide_Character --
 306    -------------------------
 307
 308    procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
 309    begin
 310       if Ptr < Input'First then
 311          Past_End;
 312       end if;
 313
 314       --  Special efficient encoding for UTF-8 case
 315
 316       if Encoding_Method = WCEM_UTF8 then
 317          UTF8 : declare
 318             U : Unsigned_32;
 319
 320             procedure Getc;
 321             pragma Inline (Getc);
 322             --  Gets the character at Input (Ptr) and returns code in U as
 323             --  Unsigned_32 value. On return Ptr is bumped past the character.
 324
 325             procedure Skip_UTF_Byte;
 326             pragma Inline (Skip_UTF_Byte);
 327             --  Skips past one encoded byte which must be 2#10xxxxxx#
 328
 329             ----------
 330             -- Getc --
 331             ----------
 332
 333             procedure Getc is
 334             begin
 335                if Ptr > Input'Last then
 336                   Past_End;
 337                else
 338                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 339                   Ptr := Ptr + 1;
 340                end if;
 341             end Getc;
 342
 343             -------------------
 344             -- Skip_UTF_Byte --
 345             -------------------
 346
 347             procedure Skip_UTF_Byte is
 348             begin
 349                Getc;
 350
 351                if (U and 2#11000000#) /= 2#10_000000# then
 352                   Bad;
 353                end if;
 354             end Skip_UTF_Byte;
 355
 356          --  Start of processing for UTF-8 case
 357
 358          begin
 359             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 360
 361             Getc;
 362
 363             if (U and 2#10000000#) = 2#00000000# then
 364                return;
 365
 366             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 367
 368             elsif (U and 2#11100000#) = 2#110_00000# then
 369                Skip_UTF_Byte;
 370
 371             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 372
 373             elsif (U and 2#11110000#) = 2#1110_0000# then
 374                Skip_UTF_Byte;
 375                Skip_UTF_Byte;
 376
 377             --  Any other code is invalid, note that this includes:
 378
 379             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 380
 381             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 382             --                               10xxxxxx 10xxxxxx
 383
 384             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 385             --                               10xxxxxx 10xxxxxx 10xxxxxx
 386
 387             --  since Wide_Character does not allow codes > 16#FFFF#
 388
 389             else
 390                Bad;
 391             end if;
 392          end UTF8;
 393
 394       --  Non-UTF-8 case
 395
 396       else
 397          declare
 398             Discard : Wide_Character;
 399          begin
 400             Decode_Wide_Character (Input, Ptr, Discard);
 401          end;
 402       end if;
 403    end Next_Wide_Character;
 404
 405    ------------------------------
 406    -- Next_Wide_Wide_Character --
 407    ------------------------------
 408
 409    procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 410    begin
 411       --  Special efficient encoding for UTF-8 case
 412
 413       if Encoding_Method = WCEM_UTF8 then
 414          UTF8 : declare
 415             U : Unsigned_32;
 416
 417             procedure Getc;
 418             pragma Inline (Getc);
 419             --  Gets the character at Input (Ptr) and returns code in U as
 420             --  Unsigned_32 value. On return Ptr is bumped past the character.
 421
 422             procedure Skip_UTF_Byte;
 423             pragma Inline (Skip_UTF_Byte);
 424             --  Skips past one encoded byte which must be 2#10xxxxxx#
 425
 426             ----------
 427             -- Getc --
 428             ----------
 429
 430             procedure Getc is
 431             begin
 432                if Ptr > Input'Last then
 433                   Past_End;
 434                else
 435                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 436                   Ptr := Ptr + 1;
 437                end if;
 438             end Getc;
 439
 440             -------------------
 441             -- Skip_UTF_Byte --
 442             -------------------
 443
 444             procedure Skip_UTF_Byte is
 445             begin
 446                Getc;
 447
 448                if (U and 2#11000000#) /= 2#10_000000# then
 449                   Bad;
 450                end if;
 451             end Skip_UTF_Byte;
 452
 453          --  Start of processing for UTF-8 case
 454
 455          begin
 456             if Ptr < Input'First then
 457                Past_End;
 458             end if;
 459
 460             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 461
 462             Getc;
 463
 464             if (U and 2#10000000#) = 2#00000000# then
 465                null;
 466
 467             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 468
 469             elsif (U and 2#11100000#) = 2#110_00000# then
 470                Skip_UTF_Byte;
 471
 472             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 473
 474             elsif (U and 2#11110000#) = 2#1110_0000# then
 475                Skip_UTF_Byte;
 476                Skip_UTF_Byte;
 477
 478             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 479
 480             elsif (U and 2#11111000#) = 2#11110_000# then
 481                for K in 1 .. 3 loop
 482                   Skip_UTF_Byte;
 483                end loop;
 484
 485             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 486             --                               10xxxxxx 10xxxxxx
 487
 488             elsif (U and 2#11111100#) = 2#111110_00# then
 489                for K in 1 .. 4 loop
 490                   Skip_UTF_Byte;
 491                end loop;
 492
 493             --  Any other code is invalid, note that this includes:
 494
 495             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 496             --                               10xxxxxx 10xxxxxx 10xxxxxx
 497
 498             --  since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
 499
 500             else
 501                Bad;
 502             end if;
 503          end UTF8;
 504
 505       --  Non-UTF-8 case
 506
 507       else
 508          declare
 509             Discard : Wide_Wide_Character;
 510          begin
 511             Decode_Wide_Wide_Character (Input, Ptr, Discard);
 512          end;
 513       end if;
 514    end Next_Wide_Wide_Character;
 515
 516    --------------
 517    -- Past_End --
 518    --------------
 519
 520    procedure Past_End is
 521    begin
 522       raise Constraint_Error with "past end of string";
 523    end Past_End;
 524
 525    -------------------------
 526    -- Prev_Wide_Character --
 527    -------------------------
 528
 529    procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
 530    begin
 531       if Ptr > Input'Last + 1 then
 532          Past_End;
 533       end if;
 534
 535       --  Special efficient encoding for UTF-8 case
 536
 537       if Encoding_Method = WCEM_UTF8 then
 538          UTF8 : declare
 539             U : Unsigned_32;
 540
 541             procedure Getc;
 542             pragma Inline (Getc);
 543             --  Gets the character at Input (Ptr - 1) and returns code in U as
 544             --  Unsigned_32 value. On return Ptr is decremented by one.
 545
 546             procedure Skip_UTF_Byte;
 547             pragma Inline (Skip_UTF_Byte);
 548             --  Checks that U is 2#10xxxxxx# and then calls Get
 549
 550             ----------
 551             -- Getc --
 552             ----------
 553
 554             procedure Getc is
 555             begin
 556                if Ptr <= Input'First then
 557                   Past_End;
 558                else
 559                   Ptr := Ptr - 1;
 560                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 561                end if;
 562             end Getc;
 563
 564             -------------------
 565             -- Skip_UTF_Byte --
 566             -------------------
 567
 568             procedure Skip_UTF_Byte is
 569             begin
 570                if (U and 2#11000000#) = 2#10_000000# then
 571                   Getc;
 572                else
 573                   Bad;
 574                end if;
 575             end Skip_UTF_Byte;
 576
 577          --  Start of processing for UTF-8 case
 578
 579          begin
 580             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 581
 582             Getc;
 583
 584             if (U and 2#10000000#) = 2#00000000# then
 585                return;
 586
 587             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 588
 589             else
 590                Skip_UTF_Byte;
 591
 592                if (U and 2#11100000#) = 2#110_00000# then
 593                   return;
 594
 595                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 596
 597                else
 598                   Skip_UTF_Byte;
 599
 600                   if (U and 2#11110000#) = 2#1110_0000# then
 601                      return;
 602
 603                      --  Any other code is invalid, note that this includes:
 604
 605                      --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 606                      --                           10xxxxxx
 607
 608                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 609                      --                               10xxxxxx 10xxxxxx
 610                      --                               10xxxxxx
 611
 612                      --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 613                      --                               10xxxxxx 10xxxxxx
 614                      --                               10xxxxxx 10xxxxxx
 615
 616                      --  since Wide_Character does not allow codes > 16#FFFF#
 617
 618                   else
 619                      Bad;
 620                   end if;
 621                end if;
 622             end if;
 623          end UTF8;
 624
 625       --  Special efficient encoding for brackets case
 626
 627       elsif Encoding_Method = WCEM_Brackets then
 628          Brackets : declare
 629             P : Natural;
 630             S : Natural;
 631
 632          begin
 633             --  See if we have "] at end positions
 634
 635             if Ptr > Input'First + 1
 636               and then Input (Ptr - 1) = ']'
 637               and then Input (Ptr - 2) = '"'
 638             then
 639                P := Ptr - 2;
 640
 641                --  Loop back looking for [" at start
 642
 643                while P >= Ptr - 10 loop
 644                   if P <= Input'First + 1 then
 645                      Bad;
 646
 647                   elsif Input (P - 1) = '"'
 648                     and then Input (P - 2) = '['
 649                   then
 650                      --  Found ["..."], scan forward to check it
 651
 652                      S := P - 2;
 653                      P := S;
 654                      Next_Wide_Character (Input, P);
 655
 656                      --  OK if at original pointer, else error
 657
 658                      if P = Ptr then
 659                         Ptr := S;
 660                         return;
 661                      else
 662                         Bad;
 663                      end if;
 664                   end if;
 665
 666                   P := P - 1;
 667                end loop;
 668
 669                --  Falling through loop means more than 8 chars between the
 670                --  enclosing brackets (or simply a missing left bracket)
 671
 672                Bad;
 673
 674             --  Here if no bracket sequence present
 675
 676             else
 677                if Ptr = Input'First then
 678                   Past_End;
 679                else
 680                   Ptr := Ptr - 1;
 681                end if;
 682             end if;
 683          end Brackets;
 684
 685       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 686       --  go to the start of the string and skip forwards till Ptr matches.
 687
 688       else
 689          Non_UTF_Brackets : declare
 690             Discard : Wide_Character;
 691             PtrS    : Natural;
 692             PtrP    : Natural;
 693
 694          begin
 695             PtrS := Input'First;
 696
 697             if Ptr <= PtrS then
 698                Past_End;
 699             end if;
 700
 701             loop
 702                PtrP := PtrS;
 703                Decode_Wide_Character (Input, PtrS, Discard);
 704
 705                if PtrS = Ptr then
 706                   Ptr := PtrP;
 707                   return;
 708
 709                elsif PtrS > Ptr then
 710                   Bad;
 711                end if;
 712             end loop;
 713
 714          exception
 715             when Constraint_Error =>
 716                Bad;
 717          end Non_UTF_Brackets;
 718       end if;
 719    end Prev_Wide_Character;
 720
 721    ------------------------------
 722    -- Prev_Wide_Wide_Character --
 723    ------------------------------
 724
 725    procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 726    begin
 727       if Ptr > Input'Last + 1 then
 728          Past_End;
 729       end if;
 730
 731       --  Special efficient encoding for UTF-8 case
 732
 733       if Encoding_Method = WCEM_UTF8 then
 734          UTF8 : declare
 735             U : Unsigned_32;
 736
 737             procedure Getc;
 738             pragma Inline (Getc);
 739             --  Gets the character at Input (Ptr - 1) and returns code in U as
 740             --  Unsigned_32 value. On return Ptr is decremented by one.
 741
 742             procedure Skip_UTF_Byte;
 743             pragma Inline (Skip_UTF_Byte);
 744             --  Checks that U is 2#10xxxxxx# and then calls Get
 745
 746             ----------
 747             -- Getc --
 748             ----------
 749
 750             procedure Getc is
 751             begin
 752                if Ptr <= Input'First then
 753                   Past_End;
 754                else
 755                   Ptr := Ptr - 1;
 756                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 757                end if;
 758             end Getc;
 759
 760             -------------------
 761             -- Skip_UTF_Byte --
 762             -------------------
 763
 764             procedure Skip_UTF_Byte is
 765             begin
 766                if (U and 2#11000000#) = 2#10_000000# then
 767                   Getc;
 768                else
 769                   Bad;
 770                end if;
 771             end Skip_UTF_Byte;
 772
 773          --  Start of processing for UTF-8 case
 774
 775          begin
 776             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 777
 778             Getc;
 779
 780             if (U and 2#10000000#) = 2#00000000# then
 781                return;
 782
 783             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 784
 785             else
 786                Skip_UTF_Byte;
 787
 788                if (U and 2#11100000#) = 2#110_00000# then
 789                   return;
 790
 791                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 792
 793                else
 794                   Skip_UTF_Byte;
 795
 796                   if (U and 2#11110000#) = 2#1110_0000# then
 797                      return;
 798
 799                   --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 800                   --                           10xxxxxx
 801
 802                   else
 803                      Skip_UTF_Byte;
 804
 805                      if (U and 2#11111000#) = 2#11110_000# then
 806                         return;
 807
 808                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 809                      --                               10xxxxxx 10xxxxxx
 810                      --                               10xxxxxx
 811
 812                      else
 813                         Skip_UTF_Byte;
 814
 815                         if (U and 2#11111100#) = 2#111110_00# then
 816                            return;
 817
 818                         --  Any other code is invalid, note that this includes:
 819
 820                         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 821                         --                               10xxxxxx 10xxxxxx
 822                         --                               10xxxxxx 10xxxxxx
 823
 824                         --  since Wide_Wide_Character does not allow codes
 825                         --  greater than 16#03FF_FFFF#
 826
 827                         else
 828                            Bad;
 829                         end if;
 830                      end if;
 831                   end if;
 832                end if;
 833             end if;
 834          end UTF8;
 835
 836       --  Special efficient encoding for brackets case
 837
 838       elsif Encoding_Method = WCEM_Brackets then
 839          Brackets : declare
 840             P : Natural;
 841             S : Natural;
 842
 843          begin
 844             --  See if we have "] at end positions
 845
 846             if Ptr > Input'First + 1
 847               and then Input (Ptr - 1) = ']'
 848               and then Input (Ptr - 2) = '"'
 849             then
 850                P := Ptr - 2;
 851
 852                --  Loop back looking for [" at start
 853
 854                while P >= Ptr - 10 loop
 855                   if P <= Input'First + 1 then
 856                      Bad;
 857
 858                   elsif Input (P - 1) = '"'
 859                     and then Input (P - 2) = '['
 860                   then
 861                      --  Found ["..."], scan forward to check it
 862
 863                      S := P - 2;
 864                      P := S;
 865                      Next_Wide_Wide_Character (Input, P);
 866
 867                      --  OK if at original pointer, else error
 868
 869                      if P = Ptr then
 870                         Ptr := S;
 871                         return;
 872                      else
 873                         Bad;
 874                      end if;
 875                   end if;
 876
 877                   P := P - 1;
 878                end loop;
 879
 880                --  Falling through loop means more than 8 chars between the
 881                --  enclosing brackets (or simply a missing left bracket)
 882
 883                Bad;
 884
 885             --  Here if no bracket sequence present
 886
 887             else
 888                if Ptr = Input'First then
 889                   Past_End;
 890                else
 891                   Ptr := Ptr - 1;
 892                end if;
 893             end if;
 894          end Brackets;
 895
 896       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 897       --  go to the start of the string and skip forwards till Ptr matches.
 898
 899       else
 900          Non_UTF8_Brackets : declare
 901             Discard : Wide_Wide_Character;
 902             PtrS    : Natural;
 903             PtrP    : Natural;
 904
 905          begin
 906             PtrS := Input'First;
 907
 908             if Ptr <= PtrS then
 909                Past_End;
 910             end if;
 911
 912             loop
 913                PtrP := PtrS;
 914                Decode_Wide_Wide_Character (Input, PtrS, Discard);
 915
 916                if PtrS = Ptr then
 917                   Ptr := PtrP;
 918                   return;
 919
 920                elsif PtrS > Ptr then
 921                   Bad;
 922                end if;
 923             end loop;
 924
 925          exception
 926             when Constraint_Error =>
 927                Bad;
 928          end Non_UTF8_Brackets;
 929       end if;
 930    end Prev_Wide_Wide_Character;
 931
 932    --------------------------
 933    -- Validate_Wide_String --
 934    --------------------------
 935
 936    function Validate_Wide_String (S : String) return Boolean is
 937       Ptr : Natural;
 938
 939    begin
 940       Ptr := S'First;
 941       while Ptr <= S'Last loop
 942          Next_Wide_Character (S, Ptr);
 943       end loop;
 944
 945       return True;
 946
 947    exception
 948       when Constraint_Error =>
 949          return False;
 950    end Validate_Wide_String;
 951
 952    -------------------------------
 953    -- Validate_Wide_Wide_String --
 954    -------------------------------
 955
 956    function Validate_Wide_Wide_String (S : String) return Boolean is
 957       Ptr : Natural;
 958
 959    begin
 960       Ptr := S'First;
 961       while Ptr <= S'Last loop
 962          Next_Wide_Wide_Character (S, Ptr);
 963       end loop;
 964
 965       return True;
 966
 967    exception
 968       when Constraint_Error =>
 969          return False;
 970    end Validate_Wide_Wide_String;
 971
 972 end GNAT.Decode_String;