gcc/ada/g-decstr.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                    G N A T . D E C O D E _ S T R I N G                   --
   6 --                                                                          --
   7 --                                 S p e c                                  --
   8 --                                                                          --
   9 --                     Copyright (C) 2007-2013, AdaCore                     --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 -- As a special exception under Section 7 of GPL version 3, you are granted --
  19 -- additional permissions described in the GCC Runtime Library Exception,   --
  20 -- version 3.1, as published by the Free Software Foundation.               --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31
  32 --  This package provides a utility routine for converting from an encoded
  33 --  string to a corresponding Wide_String or Wide_Wide_String value.
  34
  35 with Interfaces; use Interfaces;
  36
  37 with System.WCh_Cnv; use System.WCh_Cnv;
  38 with System.WCh_Con; use System.WCh_Con;
  39
  40 package body GNAT.Decode_String is
  41
  42    -----------------------
  43    -- Local Subprograms --
  44    -----------------------
  45
  46    procedure Bad;
  47    pragma No_Return (Bad);
  48    --  Raise error for bad encoding
  49
  50    procedure Past_End;
  51    pragma No_Return (Past_End);
  52    --  Raise error for off end of string
  53
  54    ---------
  55    -- Bad --
  56    ---------
  57
  58    procedure Bad is
  59    begin
  60       raise Constraint_Error with
  61         "bad encoding or character out of range";
  62    end Bad;
  63
  64    ---------------------------
  65    -- Decode_Wide_Character --
  66    ---------------------------
  67
  68    procedure Decode_Wide_Character
  69      (Input  : String;
  70       Ptr    : in out Natural;
  71       Result : out Wide_Character)
  72    is
  73       Char : Wide_Wide_Character;
  74    begin
  75       Decode_Wide_Wide_Character (Input, Ptr, Char);
  76
  77       if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
  78          Bad;
  79       else
  80          Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
  81       end if;
  82    end Decode_Wide_Character;
  83
  84    ------------------------
  85    -- Decode_Wide_String --
  86    ------------------------
  87
  88    function Decode_Wide_String (S : String) return Wide_String is
  89       Result : Wide_String (1 .. S'Length);
  90       Length : Natural;
  91    begin
  92       Decode_Wide_String (S, Result, Length);
  93       return Result (1 .. Length);
  94    end Decode_Wide_String;
  95
  96    procedure Decode_Wide_String
  97      (S      : String;
  98       Result : out Wide_String;
  99       Length : out Natural)
 100    is
 101       Ptr : Natural;
 102
 103    begin
 104       Ptr := S'First;
 105       Length := 0;
 106       while Ptr <= S'Last loop
 107          if Length >= Result'Last then
 108             Past_End;
 109          end if;
 110
 111          Length := Length + 1;
 112          Decode_Wide_Character (S, Ptr, Result (Length));
 113       end loop;
 114    end Decode_Wide_String;
 115
 116    --------------------------------
 117    -- Decode_Wide_Wide_Character --
 118    --------------------------------
 119
 120    procedure Decode_Wide_Wide_Character
 121      (Input  : String;
 122       Ptr    : in out Natural;
 123       Result : out Wide_Wide_Character)
 124    is
 125       C : Character;
 126
 127       function In_Char return Character;
 128       pragma Inline (In_Char);
 129       --  Function to get one input character
 130
 131       -------------
 132       -- In_Char --
 133       -------------
 134
 135       function In_Char return Character is
 136       begin
 137          if Ptr <= Input'Last then
 138             Ptr := Ptr + 1;
 139             return Input (Ptr - 1);
 140          else
 141             Past_End;
 142          end if;
 143       end In_Char;
 144
 145    --  Start of processing for Decode_Wide_Wide_Character
 146
 147    begin
 148       C := In_Char;
 149
 150       --  Special fast processing for UTF-8 case
 151
 152       if Encoding_Method = WCEM_UTF8 then
 153          UTF8 : declare
 154             U : Unsigned_32;
 155             W : Unsigned_32;
 156
 157             procedure Get_UTF_Byte;
 158             pragma Inline (Get_UTF_Byte);
 159             --  Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
 160             --  Reads a byte, and raises CE if the first two bits are not 10.
 161             --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
 162
 163             ------------------
 164             -- Get_UTF_Byte --
 165             ------------------
 166
 167             procedure Get_UTF_Byte is
 168             begin
 169                U := Unsigned_32 (Character'Pos (In_Char));
 170
 171                if (U and 2#11000000#) /= 2#10_000000# then
 172                   Bad;
 173                end if;
 174
 175                W := Shift_Left (W, 6) or (U and 2#00111111#);
 176             end Get_UTF_Byte;
 177
 178          --  Start of processing for UTF8 case
 179
 180          begin
 181             --  Note: for details of UTF8 encoding see RFC 3629
 182
 183             U := Unsigned_32 (Character'Pos (C));
 184
 185             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 186
 187             if (U and 2#10000000#) = 2#00000000# then
 188                Result := Wide_Wide_Character'Val (Character'Pos (C));
 189
 190             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 191
 192             elsif (U and 2#11100000#) = 2#110_00000# then
 193                W := U and 2#00011111#;
 194                Get_UTF_Byte;
 195
 196                if W not in 16#00_0080# .. 16#00_07FF# then
 197                   Bad;
 198                end if;
 199
 200                Result := Wide_Wide_Character'Val (W);
 201
 202             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 203
 204             elsif (U and 2#11110000#) = 2#1110_0000# then
 205                W := U and 2#00001111#;
 206                Get_UTF_Byte;
 207                Get_UTF_Byte;
 208
 209                if W not in 16#00_0800# .. 16#00_FFFF# then
 210                   Bad;
 211                end if;
 212
 213                Result := Wide_Wide_Character'Val (W);
 214
 215             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 216
 217             elsif (U and 2#11111000#) = 2#11110_000# then
 218                W := U and 2#00000111#;
 219
 220                for K in 1 .. 3 loop
 221                   Get_UTF_Byte;
 222                end loop;
 223
 224                if W not in 16#01_0000# .. 16#10_FFFF# then
 225                   Bad;
 226                end if;
 227
 228                Result := Wide_Wide_Character'Val (W);
 229
 230             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 231             --                               10xxxxxx 10xxxxxx
 232
 233             elsif (U and 2#11111100#) = 2#111110_00# then
 234                W := U and 2#00000011#;
 235
 236                for K in 1 .. 4 loop
 237                   Get_UTF_Byte;
 238                end loop;
 239
 240                if W not in 16#0020_0000# .. 16#03FF_FFFF# then
 241                   Bad;
 242                end if;
 243
 244                Result := Wide_Wide_Character'Val (W);
 245
 246             --  All other cases are invalid, note that this includes:
 247
 248             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 249             --                               10xxxxxx 10xxxxxx 10xxxxxx
 250
 251             --  since Wide_Wide_Character does not include code values
 252             --  greater than 16#03FF_FFFF#.
 253
 254             else
 255                Bad;
 256             end if;
 257          end UTF8;
 258
 259       --  All encoding functions other than UTF-8
 260
 261       else
 262          Non_UTF8 : declare
 263             function Char_Sequence_To_UTF is
 264               new Char_Sequence_To_UTF_32 (In_Char);
 265
 266          begin
 267             --  For brackets, must test for specific case of [ not followed by
 268             --  quotation, where we must not call Char_Sequence_To_UTF, but
 269             --  instead just return the bracket unchanged.
 270
 271             if Encoding_Method = WCEM_Brackets
 272               and then C = '['
 273               and then (Ptr > Input'Last or else Input (Ptr) /= '"')
 274             then
 275                Result := '[';
 276
 277             --  All other cases including [" with Brackets
 278
 279             else
 280                Result :=
 281                  Wide_Wide_Character'Val
 282                    (Char_Sequence_To_UTF (C, Encoding_Method));
 283             end if;
 284          end Non_UTF8;
 285       end if;
 286    end Decode_Wide_Wide_Character;
 287
 288    -----------------------------
 289    -- Decode_Wide_Wide_String --
 290    -----------------------------
 291
 292    function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
 293       Result : Wide_Wide_String (1 .. S'Length);
 294       Length : Natural;
 295    begin
 296       Decode_Wide_Wide_String (S, Result, Length);
 297       return Result (1 .. Length);
 298    end Decode_Wide_Wide_String;
 299
 300    procedure Decode_Wide_Wide_String
 301      (S      : String;
 302       Result : out Wide_Wide_String;
 303       Length : out Natural)
 304    is
 305       Ptr : Natural;
 306
 307    begin
 308       Ptr := S'First;
 309       Length := 0;
 310       while Ptr <= S'Last loop
 311          if Length >= Result'Last then
 312             Past_End;
 313          end if;
 314
 315          Length := Length + 1;
 316          Decode_Wide_Wide_Character (S, Ptr, Result (Length));
 317       end loop;
 318    end Decode_Wide_Wide_String;
 319
 320    -------------------------
 321    -- Next_Wide_Character --
 322    -------------------------
 323
 324    procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
 325       Discard : Wide_Character;
 326       pragma Unreferenced (Discard);
 327    begin
 328       Decode_Wide_Character (Input, Ptr, Discard);
 329    end Next_Wide_Character;
 330
 331    ------------------------------
 332    -- Next_Wide_Wide_Character --
 333    ------------------------------
 334
 335    procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 336       Discard : Wide_Wide_Character;
 337       pragma Unreferenced (Discard);
 338    begin
 339       Decode_Wide_Wide_Character (Input, Ptr, Discard);
 340    end Next_Wide_Wide_Character;
 341
 342    --------------
 343    -- Past_End --
 344    --------------
 345
 346    procedure Past_End is
 347    begin
 348       raise Constraint_Error with "past end of string";
 349    end Past_End;
 350
 351    -------------------------
 352    -- Prev_Wide_Character --
 353    -------------------------
 354
 355    procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
 356    begin
 357       if Ptr > Input'Last + 1 then
 358          Past_End;
 359       end if;
 360
 361       --  Special efficient encoding for UTF-8 case
 362
 363       if Encoding_Method = WCEM_UTF8 then
 364          UTF8 : declare
 365             U : Unsigned_32;
 366
 367             procedure Getc;
 368             pragma Inline (Getc);
 369             --  Gets the character at Input (Ptr - 1) and returns code in U as
 370             --  Unsigned_32 value. On return Ptr is decremented by one.
 371
 372             procedure Skip_UTF_Byte;
 373             pragma Inline (Skip_UTF_Byte);
 374             --  Checks that U is 2#10xxxxxx# and then calls Get
 375
 376             ----------
 377             -- Getc --
 378             ----------
 379
 380             procedure Getc is
 381             begin
 382                if Ptr <= Input'First then
 383                   Past_End;
 384                else
 385                   Ptr := Ptr - 1;
 386                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 387                end if;
 388             end Getc;
 389
 390             -------------------
 391             -- Skip_UTF_Byte --
 392             -------------------
 393
 394             procedure Skip_UTF_Byte is
 395             begin
 396                if (U and 2#11000000#) = 2#10_000000# then
 397                   Getc;
 398                else
 399                   Bad;
 400                end if;
 401             end Skip_UTF_Byte;
 402
 403          --  Start of processing for UTF-8 case
 404
 405          begin
 406             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 407
 408             Getc;
 409
 410             if (U and 2#10000000#) = 2#00000000# then
 411                return;
 412
 413             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 414
 415             else
 416                Skip_UTF_Byte;
 417
 418                if (U and 2#11100000#) = 2#110_00000# then
 419                   return;
 420
 421                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 422
 423                else
 424                   Skip_UTF_Byte;
 425
 426                   if (U and 2#11110000#) = 2#1110_0000# then
 427                      return;
 428
 429                      --  Any other code is invalid, note that this includes:
 430
 431                      --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 432                      --                           10xxxxxx
 433
 434                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 435                      --                               10xxxxxx 10xxxxxx
 436                      --                               10xxxxxx
 437
 438                      --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 439                      --                               10xxxxxx 10xxxxxx
 440                      --                               10xxxxxx 10xxxxxx
 441
 442                      --  since Wide_Character does not allow codes > 16#FFFF#
 443
 444                   else
 445                      Bad;
 446                   end if;
 447                end if;
 448             end if;
 449          end UTF8;
 450
 451       --  Special efficient encoding for brackets case
 452
 453       elsif Encoding_Method = WCEM_Brackets then
 454          Brackets : declare
 455             P : Natural;
 456             S : Natural;
 457
 458          begin
 459             --  See if we have "] at end positions
 460
 461             if Ptr > Input'First + 1
 462               and then Input (Ptr - 1) = ']'
 463               and then Input (Ptr - 2) = '"'
 464             then
 465                P := Ptr - 2;
 466
 467                --  Loop back looking for [" at start
 468
 469                while P >= Ptr - 10 loop
 470                   if P <= Input'First + 1 then
 471                      Bad;
 472
 473                   elsif Input (P - 1) = '"'
 474                     and then Input (P - 2) = '['
 475                   then
 476                      --  Found ["..."], scan forward to check it
 477
 478                      S := P - 2;
 479                      P := S;
 480                      Next_Wide_Character (Input, P);
 481
 482                      --  OK if at original pointer, else error
 483
 484                      if P = Ptr then
 485                         Ptr := S;
 486                         return;
 487                      else
 488                         Bad;
 489                      end if;
 490                   end if;
 491
 492                   P := P - 1;
 493                end loop;
 494
 495                --  Falling through loop means more than 8 chars between the
 496                --  enclosing brackets (or simply a missing left bracket)
 497
 498                Bad;
 499
 500             --  Here if no bracket sequence present
 501
 502             else
 503                if Ptr = Input'First then
 504                   Past_End;
 505                else
 506                   Ptr := Ptr - 1;
 507                end if;
 508             end if;
 509          end Brackets;
 510
 511       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 512       --  go to the start of the string and skip forwards till Ptr matches.
 513
 514       else
 515          Non_UTF_Brackets : declare
 516             Discard : Wide_Character;
 517             PtrS    : Natural;
 518             PtrP    : Natural;
 519
 520          begin
 521             PtrS := Input'First;
 522
 523             if Ptr <= PtrS then
 524                Past_End;
 525             end if;
 526
 527             loop
 528                PtrP := PtrS;
 529                Decode_Wide_Character (Input, PtrS, Discard);
 530
 531                if PtrS = Ptr then
 532                   Ptr := PtrP;
 533                   return;
 534
 535                elsif PtrS > Ptr then
 536                   Bad;
 537                end if;
 538             end loop;
 539
 540          exception
 541             when Constraint_Error =>
 542                Bad;
 543          end Non_UTF_Brackets;
 544       end if;
 545    end Prev_Wide_Character;
 546
 547    ------------------------------
 548    -- Prev_Wide_Wide_Character --
 549    ------------------------------
 550
 551    procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
 552    begin
 553       if Ptr > Input'Last + 1 then
 554          Past_End;
 555       end if;
 556
 557       --  Special efficient encoding for UTF-8 case
 558
 559       if Encoding_Method = WCEM_UTF8 then
 560          UTF8 : declare
 561             U : Unsigned_32;
 562
 563             procedure Getc;
 564             pragma Inline (Getc);
 565             --  Gets the character at Input (Ptr - 1) and returns code in U as
 566             --  Unsigned_32 value. On return Ptr is decremented by one.
 567
 568             procedure Skip_UTF_Byte;
 569             pragma Inline (Skip_UTF_Byte);
 570             --  Checks that U is 2#10xxxxxx# and then calls Get
 571
 572             ----------
 573             -- Getc --
 574             ----------
 575
 576             procedure Getc is
 577             begin
 578                if Ptr <= Input'First then
 579                   Past_End;
 580                else
 581                   Ptr := Ptr - 1;
 582                   U := Unsigned_32 (Character'Pos (Input (Ptr)));
 583                end if;
 584             end Getc;
 585
 586             -------------------
 587             -- Skip_UTF_Byte --
 588             -------------------
 589
 590             procedure Skip_UTF_Byte is
 591             begin
 592                if (U and 2#11000000#) = 2#10_000000# then
 593                   Getc;
 594                else
 595                   Bad;
 596                end if;
 597             end Skip_UTF_Byte;
 598
 599          --  Start of processing for UTF-8 case
 600
 601          begin
 602             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 603
 604             Getc;
 605
 606             if (U and 2#10000000#) = 2#00000000# then
 607                return;
 608
 609             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 610
 611             else
 612                Skip_UTF_Byte;
 613
 614                if (U and 2#11100000#) = 2#110_00000# then
 615                   return;
 616
 617                --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 618
 619                else
 620                   Skip_UTF_Byte;
 621
 622                   if (U and 2#11110000#) = 2#1110_0000# then
 623                      return;
 624
 625                   --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
 626                   --                           10xxxxxx
 627
 628                   else
 629                      Skip_UTF_Byte;
 630
 631                      if (U and 2#11111000#) = 2#11110_000# then
 632                         return;
 633
 634                      --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
 635                      --                               10xxxxxx 10xxxxxx
 636                      --                               10xxxxxx
 637
 638                      else
 639                         Skip_UTF_Byte;
 640
 641                         if (U and 2#11111100#) = 2#111110_00# then
 642                            return;
 643
 644                         --  Any other code is invalid, note that this includes:
 645
 646                         --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
 647                         --                               10xxxxxx 10xxxxxx
 648                         --                               10xxxxxx 10xxxxxx
 649
 650                         --  since Wide_Wide_Character does not allow codes
 651                         --  greater than 16#03FF_FFFF#
 652
 653                         else
 654                            Bad;
 655                         end if;
 656                      end if;
 657                   end if;
 658                end if;
 659             end if;
 660          end UTF8;
 661
 662       --  Special efficient encoding for brackets case
 663
 664       elsif Encoding_Method = WCEM_Brackets then
 665          Brackets : declare
 666             P : Natural;
 667             S : Natural;
 668
 669          begin
 670             --  See if we have "] at end positions
 671
 672             if Ptr > Input'First + 1
 673               and then Input (Ptr - 1) = ']'
 674               and then Input (Ptr - 2) = '"'
 675             then
 676                P := Ptr - 2;
 677
 678                --  Loop back looking for [" at start
 679
 680                while P >= Ptr - 10 loop
 681                   if P <= Input'First + 1 then
 682                      Bad;
 683
 684                   elsif Input (P - 1) = '"'
 685                     and then Input (P - 2) = '['
 686                   then
 687                      --  Found ["..."], scan forward to check it
 688
 689                      S := P - 2;
 690                      P := S;
 691                      Next_Wide_Wide_Character (Input, P);
 692
 693                      --  OK if at original pointer, else error
 694
 695                      if P = Ptr then
 696                         Ptr := S;
 697                         return;
 698                      else
 699                         Bad;
 700                      end if;
 701                   end if;
 702
 703                   P := P - 1;
 704                end loop;
 705
 706                --  Falling through loop means more than 8 chars between the
 707                --  enclosing brackets (or simply a missing left bracket)
 708
 709                Bad;
 710
 711             --  Here if no bracket sequence present
 712
 713             else
 714                if Ptr = Input'First then
 715                   Past_End;
 716                else
 717                   Ptr := Ptr - 1;
 718                end if;
 719             end if;
 720          end Brackets;
 721
 722       --  Non-UTF-8/Brackets. These are the inefficient cases where we have to
 723       --  go to the start of the string and skip forwards till Ptr matches.
 724
 725       else
 726          Non_UTF8_Brackets : declare
 727             Discard : Wide_Wide_Character;
 728             PtrS    : Natural;
 729             PtrP    : Natural;
 730
 731          begin
 732             PtrS := Input'First;
 733
 734             if Ptr <= PtrS then
 735                Past_End;
 736             end if;
 737
 738             loop
 739                PtrP := PtrS;
 740                Decode_Wide_Wide_Character (Input, PtrS, Discard);
 741
 742                if PtrS = Ptr then
 743                   Ptr := PtrP;
 744                   return;
 745
 746                elsif PtrS > Ptr then
 747                   Bad;
 748                end if;
 749             end loop;
 750
 751          exception
 752             when Constraint_Error =>
 753                Bad;
 754          end Non_UTF8_Brackets;
 755       end if;
 756    end Prev_Wide_Wide_Character;
 757
 758    --------------------------
 759    -- Validate_Wide_String --
 760    --------------------------
 761
 762    function Validate_Wide_String (S : String) return Boolean is
 763       Ptr : Natural;
 764
 765    begin
 766       Ptr := S'First;
 767       while Ptr <= S'Last loop
 768          Next_Wide_Character (S, Ptr);
 769       end loop;
 770
 771       return True;
 772
 773    exception
 774       when Constraint_Error =>
 775          return False;
 776    end Validate_Wide_String;
 777
 778    -------------------------------
 779    -- Validate_Wide_Wide_String --
 780    -------------------------------
 781
 782    function Validate_Wide_Wide_String (S : String) return Boolean is
 783       Ptr : Natural;
 784
 785    begin
 786       Ptr := S'First;
 787       while Ptr <= S'Last loop
 788          Next_Wide_Wide_Character (S, Ptr);
 789       end loop;
 790
 791       return True;
 792
 793    exception
 794       when Constraint_Error =>
 795          return False;
 796    end Validate_Wide_Wide_String;
 797
 798 end GNAT.Decode_String;