gcc/ada/s-wchcnv.adb

   1 ------------------------------------------------------------------------------
   2 --                                                                          --
   3 --                         GNAT RUN-TIME COMPONENTS                         --
   4 --                                                                          --
   5 --                       S Y S T E M . W C H _ C N V                        --
   6 --                                                                          --
   7 --                                 B o d y                                  --
   8 --                                                                          --
   9 --          Copyright (C) 1992-2009, Free Software Foundation, Inc.         --
  10 --                                                                          --
  11 -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  12 -- terms of the  GNU General Public License as published  by the Free Soft- --
  13 -- ware  Foundation;  either version 3,  or (at your option) any later ver- --
  14 -- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
  15 -- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
  16 -- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
  17 --                                                                          --
  18 -- As a special exception under Section 7 of GPL version 3, you are granted --
  19 -- additional permissions described in the GCC Runtime Library Exception,   --
  20 -- version 3.1, as published by the Free Software Foundation.               --
  21 --                                                                          --
  22 -- You should have received a copy of the GNU General Public License and    --
  23 -- a copy of the GCC Runtime Library Exception along with this program;     --
  24 -- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
  25 -- <http://www.gnu.org/licenses/>.                                          --
  26 --                                                                          --
  27 -- GNAT was originally developed  by the GNAT team at  New York University. --
  28 -- Extensive contributions were provided by Ada Core Technologies Inc.      --
  29 --                                                                          --
  30 ------------------------------------------------------------------------------
  31
  32 pragma Compiler_Unit;
  33
  34 with Interfaces;     use Interfaces;
  35 with System.WCh_Con; use System.WCh_Con;
  36 with System.WCh_JIS; use System.WCh_JIS;
  37
  38 package body System.WCh_Cnv is
  39
  40    -----------------------------
  41    -- Char_Sequence_To_UTF_32 --
  42    -----------------------------
  43
  44    function Char_Sequence_To_UTF_32
  45      (C  : Character;
  46       EM : System.WCh_Con.WC_Encoding_Method) return UTF_32_Code
  47    is
  48       B1 : Unsigned_32;
  49       C1 : Character;
  50       U  : Unsigned_32;
  51       W  : Unsigned_32;
  52
  53       procedure Get_Hex (N : Character);
  54       --  If N is a hex character, then set B1 to 16 * B1 + character N.
  55       --  Raise Constraint_Error if character N is not a hex character.
  56
  57       procedure Get_UTF_Byte;
  58       pragma Inline (Get_UTF_Byte);
  59       --  Used to interpret a 2#10xxxxxx# continuation byte in UTF-8 mode.
  60       --  Reads a byte, and raises CE if the first two bits are not 10.
  61       --  Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
  62
  63       -------------
  64       -- Get_Hex --
  65       -------------
  66
  67       procedure Get_Hex (N : Character) is
  68          B2 : constant Unsigned_32 := Character'Pos (N);
  69       begin
  70          if B2 in Character'Pos ('0') .. Character'Pos ('9') then
  71             B1 := B1 * 16 + B2 - Character'Pos ('0');
  72          elsif B2 in Character'Pos ('A') .. Character'Pos ('F') then
  73             B1 := B1 * 16 + B2 - (Character'Pos ('A') - 10);
  74          elsif B2 in Character'Pos ('a') .. Character'Pos ('f') then
  75             B1 := B1 * 16 + B2 - (Character'Pos ('a') - 10);
  76          else
  77             raise Constraint_Error;
  78          end if;
  79       end Get_Hex;
  80
  81       ------------------
  82       -- Get_UTF_Byte --
  83       ------------------
  84
  85       procedure Get_UTF_Byte is
  86       begin
  87          U := Unsigned_32 (Character'Pos (In_Char));
  88
  89          if (U and 2#11000000#) /= 2#10_000000# then
  90             raise Constraint_Error;
  91          end if;
  92
  93          W := Shift_Left (W, 6) or (U and 2#00111111#);
  94       end Get_UTF_Byte;
  95
  96    --  Start of processing for Char_Sequence_To_Wide
  97
  98    begin
  99       case EM is
 100
 101          when WCEM_Hex =>
 102             if C /= ASCII.ESC then
 103                return Character'Pos (C);
 104
 105             else
 106                B1 := 0;
 107                Get_Hex (In_Char);
 108                Get_Hex (In_Char);
 109                Get_Hex (In_Char);
 110                Get_Hex (In_Char);
 111
 112                return UTF_32_Code (B1);
 113             end if;
 114
 115          when WCEM_Upper =>
 116             if C > ASCII.DEL then
 117                return 256 * Character'Pos (C) + Character'Pos (In_Char);
 118             else
 119                return Character'Pos (C);
 120             end if;
 121
 122          when WCEM_Shift_JIS =>
 123             if C > ASCII.DEL then
 124                return Wide_Character'Pos (Shift_JIS_To_JIS (C, In_Char));
 125             else
 126                return Character'Pos (C);
 127             end if;
 128
 129          when WCEM_EUC =>
 130             if C > ASCII.DEL then
 131                return Wide_Character'Pos (EUC_To_JIS (C, In_Char));
 132             else
 133                return Character'Pos (C);
 134             end if;
 135
 136          when WCEM_UTF8 =>
 137
 138             --  Note: for details of UTF8 encoding see RFC 3629
 139
 140             U := Unsigned_32 (Character'Pos (C));
 141
 142             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 143
 144             if (U and 2#10000000#) = 2#00000000# then
 145                return Character'Pos (C);
 146
 147             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 148
 149             elsif (U and 2#11100000#) = 2#110_00000# then
 150                W := U and 2#00011111#;
 151                Get_UTF_Byte;
 152                return UTF_32_Code (W);
 153
 154             --  16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
 155
 156             elsif (U and 2#11110000#) = 2#1110_0000# then
 157                W := U and 2#00001111#;
 158                Get_UTF_Byte;
 159                Get_UTF_Byte;
 160                return UTF_32_Code (W);
 161
 162             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 163
 164             elsif (U and 2#11111000#) = 2#11110_000# then
 165                W := U and 2#00000111#;
 166
 167                for K in 1 .. 3 loop
 168                   Get_UTF_Byte;
 169                end loop;
 170
 171                return UTF_32_Code (W);
 172
 173             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 174             --                               10xxxxxx 10xxxxxx
 175
 176             elsif (U and 2#11111100#) = 2#111110_00# then
 177                W := U and 2#00000011#;
 178
 179                for K in 1 .. 4 loop
 180                   Get_UTF_Byte;
 181                end loop;
 182
 183                return UTF_32_Code (W);
 184
 185             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 186             --                               10xxxxxx 10xxxxxx 10xxxxxx
 187
 188             elsif (U and 2#11111110#) = 2#1111110_0# then
 189                W := U and 2#00000001#;
 190
 191                for K in 1 .. 5 loop
 192                   Get_UTF_Byte;
 193                end loop;
 194
 195                return UTF_32_Code (W);
 196
 197             else
 198                raise Constraint_Error;
 199             end if;
 200
 201          when WCEM_Brackets =>
 202             if C /= '[' then
 203                return Character'Pos (C);
 204             end if;
 205
 206             if In_Char /= '"' then
 207                raise Constraint_Error;
 208             end if;
 209
 210             B1 := 0;
 211             Get_Hex (In_Char);
 212             Get_Hex (In_Char);
 213
 214             C1 := In_Char;
 215
 216             if C1 /= '"' then
 217                Get_Hex (C1);
 218                Get_Hex (In_Char);
 219
 220                C1 := In_Char;
 221
 222                if C1 /= '"' then
 223                   Get_Hex (C1);
 224                   Get_Hex (In_Char);
 225
 226                   C1 := In_Char;
 227
 228                   if C1 /= '"' then
 229                      Get_Hex (C1);
 230                      Get_Hex (In_Char);
 231
 232                      if B1 > Unsigned_32 (UTF_32_Code'Last) then
 233                         raise Constraint_Error;
 234                      end if;
 235
 236                      if In_Char /= '"' then
 237                         raise Constraint_Error;
 238                      end if;
 239                   end if;
 240                end if;
 241             end if;
 242
 243             if In_Char /= ']' then
 244                raise Constraint_Error;
 245             end if;
 246
 247             return UTF_32_Code (B1);
 248
 249       end case;
 250    end Char_Sequence_To_UTF_32;
 251
 252    --------------------------------
 253    -- Char_Sequence_To_Wide_Char --
 254    --------------------------------
 255
 256    function Char_Sequence_To_Wide_Char
 257      (C  : Character;
 258       EM : System.WCh_Con.WC_Encoding_Method) return Wide_Character
 259    is
 260       function Char_Sequence_To_UTF is new Char_Sequence_To_UTF_32 (In_Char);
 261
 262       U : constant UTF_32_Code := Char_Sequence_To_UTF (C, EM);
 263
 264    begin
 265       if U > 16#FFFF# then
 266          raise Constraint_Error;
 267       else
 268          return Wide_Character'Val (U);
 269       end if;
 270    end Char_Sequence_To_Wide_Char;
 271
 272    -----------------------------
 273    -- UTF_32_To_Char_Sequence --
 274    -----------------------------
 275
 276    procedure UTF_32_To_Char_Sequence
 277      (Val : UTF_32_Code;
 278       EM  : System.WCh_Con.WC_Encoding_Method)
 279    is
 280       Hexc : constant array (UTF_32_Code range 0 .. 15) of Character :=
 281                "0123456789ABCDEF";
 282
 283       C1, C2 : Character;
 284       U      : Unsigned_32;
 285
 286    begin
 287       --  Raise CE for invalid UTF_32_Code
 288
 289       if not Val'Valid then
 290          raise Constraint_Error;
 291       end if;
 292
 293       --  Processing depends on encoding mode
 294
 295       case EM is
 296
 297          when WCEM_Hex =>
 298             if Val < 256 then
 299                Out_Char (Character'Val (Val));
 300             elsif Val <= 16#FFFF# then
 301                Out_Char (ASCII.ESC);
 302                Out_Char (Hexc (Val / (16**3)));
 303                Out_Char (Hexc ((Val / (16**2)) mod 16));
 304                Out_Char (Hexc ((Val / 16) mod 16));
 305                Out_Char (Hexc (Val mod 16));
 306             else
 307                raise Constraint_Error;
 308             end if;
 309
 310          when WCEM_Upper =>
 311             if Val < 128 then
 312                Out_Char (Character'Val (Val));
 313             elsif Val < 16#8000# or else Val > 16#FFFF# then
 314                raise Constraint_Error;
 315             else
 316                Out_Char (Character'Val (Val / 256));
 317                Out_Char (Character'Val (Val mod 256));
 318             end if;
 319
 320          when WCEM_Shift_JIS =>
 321             if Val < 128 then
 322                Out_Char (Character'Val (Val));
 323             elsif Val <= 16#FFFF# then
 324                JIS_To_Shift_JIS (Wide_Character'Val (Val), C1, C2);
 325                Out_Char (C1);
 326                Out_Char (C2);
 327             else
 328                raise Constraint_Error;
 329             end if;
 330
 331          when WCEM_EUC =>
 332             if Val < 128 then
 333                Out_Char (Character'Val (Val));
 334             elsif Val <= 16#FFFF# then
 335                JIS_To_EUC (Wide_Character'Val (Val), C1, C2);
 336                Out_Char (C1);
 337                Out_Char (C2);
 338             else
 339                raise Constraint_Error;
 340             end if;
 341
 342          when WCEM_UTF8 =>
 343
 344             --  Note: for details of UTF8 encoding see RFC 3629
 345
 346             U := Unsigned_32 (Val);
 347
 348             --  16#00_0000#-16#00_007F#: 0xxxxxxx
 349
 350             if U <= 16#00_007F# then
 351                Out_Char (Character'Val (U));
 352
 353             --  16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
 354
 355             elsif U <= 16#00_07FF# then
 356                Out_Char (Character'Val (2#11000000# or Shift_Right (U, 6)));
 357                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 358
 359             --  16#00_0800#-16#00_FFFF#: 1110xxxx 10xxxxxx 10xxxxxx
 360
 361             elsif U <= 16#00_FFFF# then
 362                Out_Char (Character'Val (2#11100000# or Shift_Right (U, 12)));
 363                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 364                                                           and 2#00111111#)));
 365                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 366
 367             --  16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 368
 369             elsif U <= 16#10_FFFF# then
 370                Out_Char (Character'Val (2#11110000# or Shift_Right (U, 18)));
 371                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 372                                                           and 2#00111111#)));
 373                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 374                                                           and 2#00111111#)));
 375                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 376
 377             --  16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
 378             --                               10xxxxxx 10xxxxxx
 379
 380             elsif U <= 16#03FF_FFFF# then
 381                Out_Char (Character'Val (2#11111000# or Shift_Right (U, 24)));
 382                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
 383                                                           and 2#00111111#)));
 384                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 385                                                           and 2#00111111#)));
 386                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 387                                                           and 2#00111111#)));
 388                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 389
 390             --  16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
 391             --                               10xxxxxx 10xxxxxx 10xxxxxx
 392
 393             elsif U <= 16#7FFF_FFFF# then
 394                Out_Char (Character'Val (2#11111100# or Shift_Right (U, 30)));
 395                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 24)
 396                                                           and 2#00111111#)));
 397                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 18)
 398                                                           and 2#00111111#)));
 399                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 12)
 400                                                           and 2#00111111#)));
 401                Out_Char (Character'Val (2#10000000# or (Shift_Right (U, 6)
 402                                                           and 2#00111111#)));
 403                Out_Char (Character'Val (2#10000000# or (U and 2#00111111#)));
 404
 405             else
 406                raise Constraint_Error;
 407             end if;
 408
 409          when WCEM_Brackets =>
 410
 411             --  Values in the range 0-255 are directly output. Note that there
 412             --  is some issue with [ (16#5B#] since this will cause confusion
 413             --  if the resulting string is interpreted using brackets encoding.
 414
 415             --  One possibility would be to always output [ as ["5B"] but in
 416             --  practice this is undesirable, since for example normal use of
 417             --  Wide_Text_IO for output (much more common than input), really
 418             --  does want to be able to say something like
 419
 420             --     Put_Line ("Start of output [first run]");
 421
 422             --  and have it come out as intended, rather than contaminated by
 423             --  a ["5B"] sequence in place of the left bracket.
 424
 425             if Val < 256 then
 426                Out_Char (Character'Val (Val));
 427
 428             --  Otherwise use brackets notation for vales greater than 255
 429
 430             else
 431                Out_Char ('[');
 432                Out_Char ('"');
 433
 434                if Val > 16#FFFF# then
 435                   if Val > 16#00FF_FFFF# then
 436                      Out_Char (Hexc (Val / 16 ** 7));
 437                      Out_Char (Hexc ((Val / 16 ** 6) mod 16));
 438                   end if;
 439
 440                   Out_Char (Hexc ((Val / 16 ** 5) mod 16));
 441                   Out_Char (Hexc ((Val / 16 ** 4) mod 16));
 442                end if;
 443
 444                Out_Char (Hexc ((Val / 16 ** 3) mod 16));
 445                Out_Char (Hexc ((Val / 16 ** 2) mod 16));
 446                Out_Char (Hexc ((Val / 16) mod 16));
 447                Out_Char (Hexc (Val mod 16));
 448
 449                Out_Char ('"');
 450                Out_Char (']');
 451             end if;
 452       end case;
 453    end UTF_32_To_Char_Sequence;
 454
 455    --------------------------------
 456    -- Wide_Char_To_Char_Sequence --
 457    --------------------------------
 458
 459    procedure Wide_Char_To_Char_Sequence
 460      (WC : Wide_Character;
 461       EM : System.WCh_Con.WC_Encoding_Method)
 462    is
 463       procedure UTF_To_Char_Sequence is new UTF_32_To_Char_Sequence (Out_Char);
 464    begin
 465       UTF_To_Char_Sequence (Wide_Character'Pos (WC), EM);
 466    end Wide_Char_To_Char_Sequence;
 467
 468 end System.WCh_Cnv;