gcc/ada/g-decstr.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                    G N A T . D E C O D E _ S T R I N G                   --
   6 --                                                                          --
   7 --                                 S p e c                                  --
   8 --                                                                          --
   9 --                     Copyright (C) 2007-2014, AdaCore                     --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 -- As a special exception under Section 7 of GPL version 3, you are granted --
  19 -- additional permissions described in the GCC Runtime Library Exception,   --
  20 -- version 3.1, as published by the Free Software Foundation.               --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31
  32 --  This package provides a utility routine for converting from an encoded
  33 --  string to a corresponding Wide_String or Wide_Wide_String value.
  34
  35 with Interfaces; use Interfaces;
  36
  37 with System.WCh_Cnv; use System.WCh_Cnv;
  38 with System.WCh_Con; use System.WCh_Con;
  39
  40 package body GNAT.Decode_String is
  41
  42    -----------------------
  43    -- Local Subprograms --
  44    -----------------------
  45
  46    procedure Bad;
  47    pragma No_Return (Bad);
  48    --  Raise error for bad encoding
  49
  50    procedure Past_End;
  51    pragma No_Return (Past_End);
  52    --  Raise error for off end of string
  53
  54    ---------
  55    -- Bad --
  56    ---------
  57
  58    procedure Bad is
  59    begin
  60       raise Constraint_Error with
  61         "bad encoding or character out of range";
  62    end Bad;
  63
  64    ---------------------------
  65    -- Decode_Wide_Character --
  66    ---------------------------
  67
  68    procedure Decode_Wide_Character
  69      (Input  : String;
  70       Ptr    : in out Natural;
  71       Result : out Wide_Character)
  72    is
  73       Char : Wide_Wide_Character;
  74    begin
  75       Decode_Wide_Wide_Character (Input, Ptr, Char);
  76
  77       if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
  78          Bad;
  79       else
  80          Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
  81       end if;
  82    end Decode_Wide_Character;
  83
  84    ------------------------
  85    -- Decode_Wide_String --
  86    ------------------------
  87
  88    function Decode_Wide_String (S : String) return Wide_String is
  89       Result : Wide_String (1 .. S'Length);
  90       Length : Natural;
  91    begin
  92       Decode_Wide_String (S, Result, Length);
  93       return Result (1 .. Length);
  94    end Decode_Wide_String;
  95
  96    procedure Decode_Wide_String
  97      (S      : String;
  98       Result : out Wide_String;
  99       Length : out Natural)
 100    is
 101       Ptr : Natural;
 102
 103    begin
 104       Ptr := S'First;
 105       Length := 0;
 106       while Ptr <= S'Last loop
 107          if Length >= Result'Last then
 108             Past_End;
 109          end if;
 110
 111          Length := Length + 1;
 112          Decode_Wide_Character (S, Ptr, Result (Length));
 113       end loop;
 114    end Decode_Wide_String;
 115
 116    --------------------------------
 117    -- Decode_Wide_Wide_Character --
 118    --------------------------------
 119
 120    procedure Decode_Wide_Wide_Character
 121      (Input  : String;
 122       Ptr    : in out Natural;
 123       Result : out Wide_Wide_Character)
 124    is
 125       C : Character;
 126
 127       function In_Char return Character;
 128       pragma Inline (In_Char);
 129       --  Function to get one input character
 130
 131       -------------
 132       -- In_Char --
 133       -------------
 134
 135       function In_Char return Character is
 136       begin
 137          if Ptr <= Input'Last then
 138             Ptr := Ptr + 1;
 139             return Input (Ptr - 1);
 140          else
 141             Past_End;
 142          end if;
 143       end In_Char;
 144
 145    --  Start of processing for Decode_Wide_Wide_Character
 146
 147    begin
 148       C := In_Char;
 149
 150       --  Special fast processing for UTF-8 case
 151
 152       if Encoding_Method = WCEM_UTF8 then
 153          UTF8 : declare
 154             U : Unsigned_32;
 155             W : Unsigned_32;
 156
 157             procedure Get_UTF_Byte;
 158             pragma Inline (Get_UTF_Byte);
 159             --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
 160             --  Reads a byte, and raises CE if the first two bits are not 10.
 161             --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
 162
 163             ------------------
 164             -- Get_UTF_Byte --
 165             ------------------
 166
 167             procedure Get_UTF_Byte is
 168             begin
 169                U := Unsigned_32 (Character'Pos (In_Char));
 170
 171                if (U and 2#11000000#) /= 2#10_000000# then
 172                   Bad;
 173                end if;
 174
 175                W := Shift_Left (W, 6) or (U and 2#00111111#);
 176             end Get_UTF_Byte;
 177
 178          --  Start of processing for UTF8 case
 179
 180          begin
 181             --  Note: for details of UTF8 encoding see RFC 3629
 182
 183             U := Unsigned_32 (Character'Pos (C));
 184
 185             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 186
 187             if (U and 2#10000000#) = 2#00000000# then
 188                Result := Wide_Wide_Character'Val (Character'Pos (C));
 189
 190             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 191
 192             elsif (U and 2#11100000#) = 2#110_00000# then
 193                W := U and 2#00011111#;
 194                Get_UTF_Byte;
 195
 196                if W not in 16#00_0080# .. 16#00_07FF# then
 197                   Bad;
 198                end if;
 199
 200                Result := Wide_Wide_Character'Val (W);
 201
 202             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 203
 204             elsif (U and 2#11110000#) = 2#1110_0000# then
 205                W := U and 2#00001111#;
 206                Get_UTF_Byte;
 207                Get_UTF_Byte;
 208
 209                if W not in 16#00_0800# .. 16#00_FFFF# then
 210                   Bad;
 211                end if;
 212
 213                Result := Wide_Wide_Character'Val (W);
 214
 215             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 216
 217             elsif (U and 2#11111000#) = 2#11110_000# then
 218                W := U and 2#00000111#;
 219
 220                for K in 1 .. 3 loop
 221                   Get_UTF_Byte;
 222                end loop;
 223
 224                if W not in 16#01_0000# .. 16#10_FFFF# then
 225                   Bad;
 226                end if;
 227
 228                Result := Wide_Wide_Character'Val (W);
 229
 230             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 231             --                               10xxxxxx 10xxxxxx
 232
 233             elsif (U and 2#11111100#) = 2#111110_00# then
 234                W := U and 2#00000011#;
 235
 236                for K in 1 .. 4 loop
 237                   Get_UTF_Byte;
 238                end loop;
 239
 240                if W not in 16#0020_0000# .. 16#03FF_FFFF# then
 241                   Bad;
 242                end if;
 243
 244                Result := Wide_Wide_Character'Val (W);
 245
 246             --  All other cases are invalid, note that this includes:
 247
 248             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 249             --                               10xxxxxx 10xxxxxx 10xxxxxx
 250
 251             --  since Wide_Wide_Character does not include code values
 252             --  greater than 16#03FF_FFFF#.
 253
 254             else
 255                Bad;
 256             end if;
 257          end UTF8;
 258
 259       --  All encoding functions other than UTF-8
 260
 261       else
 262          Non_UTF8 : declare
 263             function Char_Sequence_To_UTF is
 264               new Char_Sequence_To_UTF_32 (In_Char);
 265
 266          begin
 267             --  For brackets, must test for specific case of [ not followed by
 268             --  quotation, where we must not call Char_Sequence_To_UTF, but
 269             --  instead just return the bracket unchanged.
 270
 271             if Encoding_Method = WCEM_Brackets
 272               and then C = '['
 273               and then (Ptr > Input'Last or else Input (Ptr) /= '"')
 274             then
 275                Result := '[';
 276
 277             --  All other cases including [" with Brackets
 278
 279             else
 280                Result :=
 281                  Wide_Wide_Character'Val
 282                    (Char_Sequence_To_UTF (C, Encoding_Method));
 283             end if;
 284          end Non_UTF8;
 285       end if;
 286    end Decode_Wide_Wide_Character;
 287
 288    -----------------------------
 289    -- Decode_Wide_Wide_String --
 290    -----------------------------
 291
 292    function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
 293       Result : Wide_Wide_String (1 .. S'Length);
 294       Length : Natural;
 295    begin
 296       Decode_Wide_Wide_String (S, Result, Length);
 297       return Result (1 .. Length);
 298    end Decode_Wide_Wide_String;
 299
 300    procedure Decode_Wide_Wide_String
 301      (S      : String;
 302       Result : out Wide_Wide_String;
 303       Length : out Natural)
 304    is
 305       Ptr : Natural;
 306
 307    begin
 308       Ptr := S'First;
 309       Length := 0;
 310       while Ptr <= S'Last loop
 311          if Length >= Result'Last then
 312             Past_End;
 313          end if;
 314
 315          Length := Length + 1;
 316          Decode_Wide_Wide_Character (S, Ptr, Result (Length));
 317       end loop;
 318    end Decode_Wide_Wide_String;
 319
 320    -------------------------
 321    -- Next_Wide_Character --
 322    -------------------------
 323
 324    procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
 325       Discard : Wide_Character;
 326    begin
 327       Decode_Wide_Character (Input, Ptr, Discard);
 328    end Next_Wide_Character;
 329
 330    ------------------------------
 331    -- Next_Wide_Wide_Character --
 332    ------------------------------
 333
 334    procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 335       Discard : Wide_Wide_Character;
 336    begin
 337       Decode_Wide_Wide_Character (Input, Ptr, Discard);
 338    end Next_Wide_Wide_Character;
 339
 340    --------------
 341    -- Past_End --
 342    --------------
 343
 344    procedure Past_End is
 345    begin
 346       raise Constraint_Error with "past end of string";
 347    end Past_End;
 348
 349    -------------------------
 350    -- Prev_Wide_Character --
 351    -------------------------
 352
 353    procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
 354    begin
 355       if Ptr > Input'Last + 1 then
 356          Past_End;
 357       end if;
 358
 359       --  Special efficient encoding for UTF-8 case
 360
 361       if Encoding_Method = WCEM_UTF8 then
 362          UTF8 : declare
 363             U : Unsigned_32;
 364
 365             procedure Getc;
 366             pragma Inline (Getc);
 367             --  Gets the character at Input (Ptr - 1) and returns code in U as
 368             --  Unsigned_32 value. On return Ptr is decremented by one.
 369
 370             procedure Skip_UTF_Byte;
 371             pragma Inline (Skip_UTF_Byte);
 372             --  Checks that U is 2#10xxxxxx# and then calls Get
 373
 374             ----------
 375             -- Getc --
 376             ----------
 377
 378             procedure Getc is
 379             begin
 380                if Ptr <= Input'First then
 381                   Past_End;
 382                else
 383                   Ptr := Ptr - 1;
 384                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 385                end if;
 386             end Getc;
 387
 388             -------------------
 389             -- Skip_UTF_Byte --
 390             -------------------
 391
 392             procedure Skip_UTF_Byte is
 393             begin
 394                if (U and 2#11000000#) = 2#10_000000# then
 395                   Getc;
 396                else
 397                   Bad;
 398                end if;
 399             end Skip_UTF_Byte;
 400
 401          --  Start of processing for UTF-8 case
 402
 403          begin
 404             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 405
 406             Getc;
 407
 408             if (U and 2#10000000#) = 2#00000000# then
 409                return;
 410
 411             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 412
 413             else
 414                Skip_UTF_Byte;
 415
 416                if (U and 2#11100000#) = 2#110_00000# then
 417                   return;
 418
 419                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 420
 421                else
 422                   Skip_UTF_Byte;
 423
 424                   if (U and 2#11110000#) = 2#1110_0000# then
 425                      return;
 426
 427                      --  Any other code is invalid, note that this includes:
 428
 429                      --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 430                      --                           10xxxxxx
 431
 432                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 433                      --                               10xxxxxx 10xxxxxx
 434                      --                               10xxxxxx
 435
 436                      --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 437                      --                               10xxxxxx 10xxxxxx
 438                      --                               10xxxxxx 10xxxxxx
 439
 440                      --  since Wide_Character does not allow codes > 16#FFFF#
 441
 442                   else
 443                      Bad;
 444                   end if;
 445                end if;
 446             end if;
 447          end UTF8;
 448
 449       --  Special efficient encoding for brackets case
 450
 451       elsif Encoding_Method = WCEM_Brackets then
 452          Brackets : declare
 453             P : Natural;
 454             S : Natural;
 455
 456          begin
 457             --  See if we have "] at end positions
 458
 459             if Ptr > Input'First + 1
 460               and then Input (Ptr - 1) = ']'
 461               and then Input (Ptr - 2) = '"'
 462             then
 463                P := Ptr - 2;
 464
 465                --  Loop back looking for [" at start
 466
 467                while P >= Ptr - 10 loop
 468                   if P <= Input'First + 1 then
 469                      Bad;
 470
 471                   elsif Input (P - 1) = '"'
 472                     and then Input (P - 2) = '['
 473                   then
 474                      --  Found ["..."], scan forward to check it
 475
 476                      S := P - 2;
 477                      P := S;
 478                      Next_Wide_Character (Input, P);
 479
 480                      --  OK if at original pointer, else error
 481
 482                      if P = Ptr then
 483                         Ptr := S;
 484                         return;
 485                      else
 486                         Bad;
 487                      end if;
 488                   end if;
 489
 490                   P := P - 1;
 491                end loop;
 492
 493                --  Falling through loop means more than 8 chars between the
 494                --  enclosing brackets (or simply a missing left bracket)
 495
 496                Bad;
 497
 498             --  Here if no bracket sequence present
 499
 500             else
 501                if Ptr = Input'First then
 502                   Past_End;
 503                else
 504                   Ptr := Ptr - 1;
 505                end if;
 506             end if;
 507          end Brackets;
 508
 509       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 510       --  go to the start of the string and skip forwards till Ptr matches.
 511
 512       else
 513          Non_UTF_Brackets : declare
 514             Discard : Wide_Character;
 515             PtrS    : Natural;
 516             PtrP    : Natural;
 517
 518          begin
 519             PtrS := Input'First;
 520
 521             if Ptr <= PtrS then
 522                Past_End;
 523             end if;
 524
 525             loop
 526                PtrP := PtrS;
 527                Decode_Wide_Character (Input, PtrS, Discard);
 528
 529                if PtrS = Ptr then
 530                   Ptr := PtrP;
 531                   return;
 532
 533                elsif PtrS > Ptr then
 534                   Bad;
 535                end if;
 536             end loop;
 537
 538          exception
 539             when Constraint_Error =>
 540                Bad;
 541          end Non_UTF_Brackets;
 542       end if;
 543    end Prev_Wide_Character;
 544
 545    ------------------------------
 546    -- Prev_Wide_Wide_Character --
 547    ------------------------------
 548
 549    procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 550    begin
 551       if Ptr > Input'Last + 1 then
 552          Past_End;
 553       end if;
 554
 555       --  Special efficient encoding for UTF-8 case
 556
 557       if Encoding_Method = WCEM_UTF8 then
 558          UTF8 : declare
 559             U : Unsigned_32;
 560
 561             procedure Getc;
 562             pragma Inline (Getc);
 563             --  Gets the character at Input (Ptr - 1) and returns code in U as
 564             --  Unsigned_32 value. On return Ptr is decremented by one.
 565
 566             procedure Skip_UTF_Byte;
 567             pragma Inline (Skip_UTF_Byte);
 568             --  Checks that U is 2#10xxxxxx# and then calls Get
 569
 570             ----------
 571             -- Getc --
 572             ----------
 573
 574             procedure Getc is
 575             begin
 576                if Ptr <= Input'First then
 577                   Past_End;
 578                else
 579                   Ptr := Ptr - 1;
 580                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 581                end if;
 582             end Getc;
 583
 584             -------------------
 585             -- Skip_UTF_Byte --
 586             -------------------
 587
 588             procedure Skip_UTF_Byte is
 589             begin
 590                if (U and 2#11000000#) = 2#10_000000# then
 591                   Getc;
 592                else
 593                   Bad;
 594                end if;
 595             end Skip_UTF_Byte;
 596
 597          --  Start of processing for UTF-8 case
 598
 599          begin
 600             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 601
 602             Getc;
 603
 604             if (U and 2#10000000#) = 2#00000000# then
 605                return;
 606
 607             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 608
 609             else
 610                Skip_UTF_Byte;
 611
 612                if (U and 2#11100000#) = 2#110_00000# then
 613                   return;
 614
 615                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 616
 617                else
 618                   Skip_UTF_Byte;
 619
 620                   if (U and 2#11110000#) = 2#1110_0000# then
 621                      return;
 622
 623                   --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 624                   --                           10xxxxxx
 625
 626                   else
 627                      Skip_UTF_Byte;
 628
 629                      if (U and 2#11111000#) = 2#11110_000# then
 630                         return;
 631
 632                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 633                      --                               10xxxxxx 10xxxxxx
 634                      --                               10xxxxxx
 635
 636                      else
 637                         Skip_UTF_Byte;
 638
 639                         if (U and 2#11111100#) = 2#111110_00# then
 640                            return;
 641
 642                         --  Any other code is invalid, note that this includes:
 643
 644                         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 645                         --                               10xxxxxx 10xxxxxx
 646                         --                               10xxxxxx 10xxxxxx
 647
 648                         --  since Wide_Wide_Character does not allow codes
 649                         --  greater than 16#03FF_FFFF#
 650
 651                         else
 652                            Bad;
 653                         end if;
 654                      end if;
 655                   end if;
 656                end if;
 657             end if;
 658          end UTF8;
 659
 660       --  Special efficient encoding for brackets case
 661
 662       elsif Encoding_Method = WCEM_Brackets then
 663          Brackets : declare
 664             P : Natural;
 665             S : Natural;
 666
 667          begin
 668             --  See if we have "] at end positions
 669
 670             if Ptr > Input'First + 1
 671               and then Input (Ptr - 1) = ']'
 672               and then Input (Ptr - 2) = '"'
 673             then
 674                P := Ptr - 2;
 675
 676                --  Loop back looking for [" at start
 677
 678                while P >= Ptr - 10 loop
 679                   if P <= Input'First + 1 then
 680                      Bad;
 681
 682                   elsif Input (P - 1) = '"'
 683                     and then Input (P - 2) = '['
 684                   then
 685                      --  Found ["..."], scan forward to check it
 686
 687                      S := P - 2;
 688                      P := S;
 689                      Next_Wide_Wide_Character (Input, P);
 690
 691                      --  OK if at original pointer, else error
 692
 693                      if P = Ptr then
 694                         Ptr := S;
 695                         return;
 696                      else
 697                         Bad;
 698                      end if;
 699                   end if;
 700
 701                   P := P - 1;
 702                end loop;
 703
 704                --  Falling through loop means more than 8 chars between the
 705                --  enclosing brackets (or simply a missing left bracket)
 706
 707                Bad;
 708
 709             --  Here if no bracket sequence present
 710
 711             else
 712                if Ptr = Input'First then
 713                   Past_End;
 714                else
 715                   Ptr := Ptr - 1;
 716                end if;
 717             end if;
 718          end Brackets;
 719
 720       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 721       --  go to the start of the string and skip forwards till Ptr matches.
 722
 723       else
 724          Non_UTF8_Brackets : declare
 725             Discard : Wide_Wide_Character;
 726             PtrS    : Natural;
 727             PtrP    : Natural;
 728
 729          begin
 730             PtrS := Input'First;
 731
 732             if Ptr <= PtrS then
 733                Past_End;
 734             end if;
 735
 736             loop
 737                PtrP := PtrS;
 738                Decode_Wide_Wide_Character (Input, PtrS, Discard);
 739
 740                if PtrS = Ptr then
 741                   Ptr := PtrP;
 742                   return;
 743
 744                elsif PtrS > Ptr then
 745                   Bad;
 746                end if;
 747             end loop;
 748
 749          exception
 750             when Constraint_Error =>
 751                Bad;
 752          end Non_UTF8_Brackets;
 753       end if;
 754    end Prev_Wide_Wide_Character;
 755
 756    --------------------------
 757    -- Validate_Wide_String --
 758    --------------------------
 759
 760    function Validate_Wide_String (S : String) return Boolean is
 761       Ptr : Natural;
 762
 763    begin
 764       Ptr := S'First;
 765       while Ptr <= S'Last loop
 766          Next_Wide_Character (S, Ptr);
 767       end loop;
 768
 769       return True;
 770
 771    exception
 772       when Constraint_Error =>
 773          return False;
 774    end Validate_Wide_String;
 775
 776    -------------------------------
 777    -- Validate_Wide_Wide_String --
 778    -------------------------------
 779
 780    function Validate_Wide_Wide_String (S : String) return Boolean is
 781       Ptr : Natural;
 782
 783    begin
 784       Ptr := S'First;
 785       while Ptr <= S'Last loop
 786          Next_Wide_Wide_Character (S, Ptr);
 787       end loop;
 788
 789       return True;
 790
 791    exception
 792       when Constraint_Error =>
 793          return False;
 794    end Validate_Wide_Wide_String;
 795
 796 end GNAT.Decode_String;