gcc/ada/s-wchcnv.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                       S Y S T E M . W C H _ C N V                        --
   6 --                                                                          --
   7 --                                 B o d y                                  --
   8 --                                                                          --
   9 --          Copyright (C) 1992-2016, Free Software Foundation, Inc.         --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 -- As a special exception under Section 7 of GPL version 3, you are granted --
  19 -- additional permissions described in the GCC Runtime Library Exception,   --
  20 -- version 3.1, as published by the Free Software Foundation.               --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31
  32 pragma Compiler_Unit_Warning;
  33
  34 with Interfaces;     use Interfaces;
  35 with System.WCh_Con; use System.WCh_Con;
  36 with System.WCh_JIS; use System.WCh_JIS;
  37
  38 package body System.WCh_Cnv is
  39
  40    -----------------------------
  41    -- Char_Sequence_To_UTF_32 --
  42    -----------------------------
  43
  44    function Char_Sequence_To_UTF_32
  45      (C  : Character;
  46       EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code
  47    is
  48       B1 : Unsigned_32;
  49       C1 : Character;
  50       U  : Unsigned_32;
  51       W  : Unsigned_32;
  52
  53       procedure Get_Hex (N : Character);
  54       --  If N is a hex character, then set B1 to 16 * B1 + character N.
  55       --  Raise Constraint_Error if character N is not a hex character.
  56
  57       procedure Get_UTF_Byte;
  58       pragma Inline (Get_UTF_Byte);
  59       --  Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
  60       --  Reads a byte, and raises CE if the first two bits are not 10.
  61       --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
  62
  63       -------------
  64       -- Get_Hex --
  65       -------------
  66
  67       procedure Get_Hex (N : Character) is
  68          B2 : constant Unsigned_32 := Character'Pos (N);
  69       begin
  70          if B2 in Character'Pos ('0') .. Character'Pos ('9') then
  71             B1 := B1 * 16 + B2 - Character'Pos ('0');
  72          elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then
  73             B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10);
  74          elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then
  75             B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10);
  76          else
  77             raise Constraint_Error;
  78          end if;
  79       end Get_Hex;
  80
  81       ------------------
  82       -- Get_UTF_Byte --
  83       ------------------
  84
  85       procedure Get_UTF_Byte is
  86       begin
  87          U := Unsigned_32 (Character'Pos (In_Char));
  88
  89          if (U and 2#11000000#) /= 2#10_000000# then
  90             raise Constraint_Error;
  91          end if;
  92
  93          W := Shift_Left (W, 6) or (U and 2#00111111#);
  94       end Get_UTF_Byte;
  95
  96    --  Start of processing for Char_Sequence_To_UTF_32
  97
  98    begin
  99       case EM is
 100          when WCEM_Hex =>
 101             if C /= ASCII.ESC then
 102                return Character'Pos (C);
 103
 104             else
 105                B1 := 0;
 106                Get_Hex (In_Char);
 107                Get_Hex (In_Char);
 108                Get_Hex (In_Char);
 109                Get_Hex (In_Char);
 110
 111                return UTF_32_Code (B1);
 112             end if;
 113
 114          when WCEM_Upper =>
 115             if C > ASCII.DEL then
 116                return 256 * Character'Pos (C) + Character'Pos (In_Char);
 117             else
 118                return Character'Pos (C);
 119             end if;
 120
 121          when WCEM_Shift_JIS =>
 122             if C > ASCII.DEL then
 123                return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));
 124             else
 125                return Character'Pos (C);
 126             end if;
 127
 128          when WCEM_EUC =>
 129             if C > ASCII.DEL then
 130                return Wide_Character'Pos (EUC_To_JIS (C, In_Char));
 131             else
 132                return Character'Pos (C);
 133             end if;
 134
 135          when WCEM_UTF8 =>
 136
 137             --  Note: for details of UTF8 encoding see RFC 3629
 138
 139             U := Unsigned_32 (Character'Pos (C));
 140
 141             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 142
 143             if (U and 2#10000000#) = 2#00000000# then
 144                return Character'Pos (C);
 145
 146             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 147
 148             elsif (U and 2#11100000#) = 2#110_00000# then
 149                W := U and 2#00011111#;
 150                Get_UTF_Byte;
 151                return UTF_32_Code (W);
 152
 153             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 154
 155             elsif (U and 2#11110000#) = 2#1110_0000# then
 156                W := U and 2#00001111#;
 157                Get_UTF_Byte;
 158                Get_UTF_Byte;
 159                return UTF_32_Code (W);
 160
 161             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 162
 163             elsif (U and 2#11111000#) = 2#11110_000# then
 164                W := U and 2#00000111#;
 165
 166                for K in 1 .. 3 loop
 167                   Get_UTF_Byte;
 168                end loop;
 169
 170                return UTF_32_Code (W);
 171
 172             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 173             --                               10xxxxxx 10xxxxxx
 174
 175             elsif (U and 2#11111100#) = 2#111110_00# then
 176                W := U and 2#00000011#;
 177
 178                for K in 1 .. 4 loop
 179                   Get_UTF_Byte;
 180                end loop;
 181
 182                return UTF_32_Code (W);
 183
 184             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 185             --                               10xxxxxx 10xxxxxx 10xxxxxx
 186
 187             elsif (U and 2#11111110#) = 2#1111110_0# then
 188                W := U and 2#00000001#;
 189
 190                for K in 1 .. 5 loop
 191                   Get_UTF_Byte;
 192                end loop;
 193
 194                return UTF_32_Code (W);
 195
 196             else
 197                raise Constraint_Error;
 198             end if;
 199
 200          when WCEM_Brackets =>
 201             if C /= '[' then
 202                return Character'Pos (C);
 203             end if;
 204
 205             if In_Char /= '"' then
 206                raise Constraint_Error;
 207             end if;
 208
 209             B1 := 0;
 210             Get_Hex (In_Char);
 211             Get_Hex (In_Char);
 212
 213             C1 := In_Char;
 214
 215             if C1 /= '"' then
 216                Get_Hex (C1);
 217                Get_Hex (In_Char);
 218
 219                C1 := In_Char;
 220
 221                if C1 /= '"' then
 222                   Get_Hex (C1);
 223                   Get_Hex (In_Char);
 224
 225                   C1 := In_Char;
 226
 227                   if C1 /= '"' then
 228                      Get_Hex (C1);
 229                      Get_Hex (In_Char);
 230
 231                      if B1 > Unsigned_32 (UTF_32_Code'Last) then
 232                         raise Constraint_Error;
 233                      end if;
 234
 235                      if In_Char /= '"' then
 236                         raise Constraint_Error;
 237                      end if;
 238                   end if;
 239                end if;
 240             end if;
 241
 242             if In_Char /= ']' then
 243                raise Constraint_Error;
 244             end if;
 245
 246             return UTF_32_Code (B1);
 247       end case;
 248    end Char_Sequence_To_UTF_32;
 249
 250    --------------------------------
 251    -- Char_Sequence_To_Wide_Char --
 252    --------------------------------
 253
 254    function Char_Sequence_To_Wide_Char
 255      (C  : Character;
 256       EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
 257    is
 258       function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char);
 259
 260       U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM);
 261
 262    begin
 263       if U > 16#FFFF# then
 264          raise Constraint_Error;
 265       else
 266          return Wide_Character'Val (U);
 267       end if;
 268    end Char_Sequence_To_Wide_Char;
 269
 270    -----------------------------
 271    -- UTF_32_To_Char_Sequence --
 272    -----------------------------
 273
 274    procedure UTF_32_To_Char_Sequence
 275      (Val : UTF_32_Code;
 276       EM  : System.WCh_Con.WC_Encoding_Method)
 277    is
 278       Hexc : constant array (UTF_32_Code range 0 .. 15) of Character :=
 279                "0123456789ABCDEF";
 280
 281       C1, C2 : Character;
 282       U      : Unsigned_32;
 283
 284    begin
 285       --  Raise CE for invalid UTF_32_Code
 286
 287       if not Val'Valid then
 288          raise Constraint_Error;
 289       end if;
 290
 291       --  Processing depends on encoding mode
 292
 293       case EM is
 294          when WCEM_Hex =>
 295             if Val < 256 then
 296                Out_Char (Character'Val (Val));
 297             elsif Val <= 16#FFFF# then
 298                Out_Char (ASCII.ESC);
 299                Out_Char (Hexc (Val / (16**3)));
 300                Out_Char (Hexc ((Val / (16**2)) mod 16));
 301                Out_Char (Hexc ((Val / 16) mod 16));
 302                Out_Char (Hexc (Val mod 16));
 303             else
 304                raise Constraint_Error;
 305             end if;
 306
 307          when WCEM_Upper =>
 308             if Val < 128 then
 309                Out_Char (Character'Val (Val));
 310             elsif Val < 16#8000# or else Val > 16#FFFF# then
 311                raise Constraint_Error;
 312             else
 313                Out_Char (Character'Val (Val / 256));
 314                Out_Char (Character'Val (Val mod 256));
 315             end if;
 316
 317          when WCEM_Shift_JIS =>
 318             if Val < 128 then
 319                Out_Char (Character'Val (Val));
 320             elsif Val <= 16#FFFF# then
 321                JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2);
 322                Out_Char (C1);
 323                Out_Char (C2);
 324             else
 325                raise Constraint_Error;
 326             end if;
 327
 328          when WCEM_EUC =>
 329             if Val < 128 then
 330                Out_Char (Character'Val (Val));
 331             elsif Val <= 16#FFFF# then
 332                JIS_To_EUC (Wide_Character'Val (Val), C1, C2);
 333                Out_Char (C1);
 334                Out_Char (C2);
 335             else
 336                raise Constraint_Error;
 337             end if;
 338
 339          when WCEM_UTF8 =>
 340
 341             --  Note: for details of UTF8 encoding see RFC 3629
 342
 343             U := Unsigned_32 (Val);
 344
 345             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 346
 347             if U <= 16#00_007F# then
 348                Out_Char (Character'Val (U));
 349
 350             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 351
 352             elsif U <= 16#00_07FF# then
 353                Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
 354                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 355
 356             --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
 357
 358             elsif U <= 16#00_FFFF# then
 359                Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
 360                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 361                                                           and 2#00111111#)));
 362                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 363
 364             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 365
 366             elsif U <= 16#10_FFFF# then
 367                Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
 368                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 369                                                           and 2#00111111#)));
 370                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 371                                                           and 2#00111111#)));
 372                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 373
 374             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 375             --                               10xxxxxx 10xxxxxx
 376
 377             elsif U <= 16#03FF_FFFF# then
 378                Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
 379                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
 380                                                           and 2#00111111#)));
 381                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 382                                                           and 2#00111111#)));
 383                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 384                                                           and 2#00111111#)));
 385                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 386
 387             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 388             --                               10xxxxxx 10xxxxxx 10xxxxxx
 389
 390             elsif U <= 16#7FFF_FFFF# then
 391                Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30)));
 392                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24)
 393                                                           and 2#00111111#)));
 394                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
 395                                                           and 2#00111111#)));
 396                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 397                                                           and 2#00111111#)));
 398                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 399                                                           and 2#00111111#)));
 400                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 401
 402             else
 403                raise Constraint_Error;
 404             end if;
 405
 406          when WCEM_Brackets =>
 407
 408             --  Values in the range 0-255 are directly output. Note that there
 409             --  is an issue with [ (16#5B#) since this will cause confusion
 410             --  if the resulting string is interpreted using brackets encoding.
 411
 412             --  One possibility would be to always output [ as ["5B"] but in
 413             --  practice this is undesirable, since for example normal use of
 414             --  Wide_Text_IO for output (much more common than input), really
 415             --  does want to be able to say something like
 416
 417             --     Put_Line ("Start of output [first run]");
 418
 419             --  and have it come out as intended, rather than contaminated by
 420             --  a ["5B"] sequence in place of the left bracket.
 421
 422             if Val < 256 then
 423                Out_Char (Character'Val (Val));
 424
 425             --  Otherwise use brackets notation for vales greater than 255
 426
 427             else
 428                Out_Char ('[');
 429                Out_Char ('"');
 430
 431                if Val > 16#FFFF# then
 432                   if Val > 16#00FF_FFFF# then
 433                      Out_Char (Hexc (Val / 16 ** 7));
 434                      Out_Char (Hexc ((Val / 16 ** 6) mod 16));
 435                   end if;
 436
 437                   Out_Char (Hexc ((Val / 16 ** 5) mod 16));
 438                   Out_Char (Hexc ((Val / 16 ** 4) mod 16));
 439                end if;
 440
 441                Out_Char (Hexc ((Val / 16 ** 3) mod 16));
 442                Out_Char (Hexc ((Val / 16 ** 2) mod 16));
 443                Out_Char (Hexc ((Val / 16) mod 16));
 444                Out_Char (Hexc (Val mod 16));
 445
 446                Out_Char ('"');
 447                Out_Char (']');
 448             end if;
 449       end case;
 450    end UTF_32_To_Char_Sequence;
 451
 452    --------------------------------
 453    -- Wide_Char_To_Char_Sequence --
 454    --------------------------------
 455
 456    procedure Wide_Char_To_Char_Sequence
 457      (WC : Wide_Character;
 458       EM : System.WCh_Con.WC_Encoding_Method)
 459    is
 460       procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char);
 461    begin
 462       UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM);
 463    end Wide_Char_To_Char_Sequence;
 464
 465 end System.WCh_Cnv;