1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- A D A . S T R I N G S . U T F _ E N C O D I N G --
9 -- This specification is derived from the Ada Reference Manual for use with --
10 -- GNAT. The copyright notice above, and the license provisions that follow --
11 -- apply solely to the contents of the part following the private keyword. --
13 -- GNAT is free software; you can redistribute it and/or modify it under --
14 -- terms of the GNU General Public License as published by the Free Soft- --
15 -- ware Foundation; either version 3, or (at your option) any later ver- --
16 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
17 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
18 -- or FITNESS FOR A PARTICULAR PURPOSE. --
20 -- As a special exception under Section 7 of GPL version 3, you are granted --
21 -- additional permissions described in the GCC Runtime Library Exception, --
22 -- version 3.1, as published by the Free Software Foundation. --
24 -- You should have received a copy of the GNU General Public License and --
25 -- a copy of the GCC Runtime Library Exception along with this program; --
26 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
27 -- <http://www.gnu.org/licenses/>. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
--  This is one of the Ada 2012 packages defined in AI05-0137-1. It is a
--  parent package that contains declarations used in the child packages for
--  handling UTF encoded strings. Note: this package is consistent with
--  Ada 95, and may be used in Ada 95 or Ada 2005 mode.
with Interfaces;
with Unchecked_Conversion;
package Ada.Strings.UTF_Encoding is

   pragma Pure (UTF_Encoding);
   --  Categorize this library unit as pure
subtype UTF_String is String;
--  A sequence of 8-bit values holding data encoded in one of three ways
--  (UTF-8, UTF-16BE, or UTF-16LE). It is typically accompanied by a Scheme
--  parameter that says which of the encodings applies. Such a value is not
--  strictly a String in the sense of the Ada RM, but in practice type String
--  accommodates all 256 possible codes and can therefore hold any sequence
--  of 8-bit codes. String is used directly, rather than a new type, so that
--  all existing facilities for manipulating String (e.g. the child packages
--  of Ada.Strings) remain available for UTF_Strings.
type Encoding_Scheme is
  (UTF_8,
   UTF_16BE,
   UTF_16LE);
--  Identifies which of the three possible encodings applies to a UTF_String
subtype UTF_8_String is String;
--  Like UTF_String, but specifically denotes a string encoded in UTF-8
subtype UTF_16_Wide_String is Wide_String;
--  The 16-bit counterpart of UTF_8_String: a Wide_String value holding a
--  sequence of 16-bit values encoded using UTF-16. As before, this is not
--  strictly a Wide_String in the sense of the Ada RM, but Wide_String can
--  represent a sequence of arbitrary 16-bit values, and using it is more
--  convenient than introducing a new type.
Encoding_Error : exception;
--  Raised in any of the following situations:
--    a) a UTF encoded string contains an invalid encoding sequence;
--    b) a UTF-16BE or UTF-16LE input string has an odd length;
--    c) an incorrect character value is present in the Input string;
--    d) the result for a Wide_Character output exceeds 16#FFFF#.
--  The exception message carries the index value where the error occurred.
77 -- The BOM (BYTE_ORDER_MARK) values defined here are used at the start of
78 -- a string to indicate the encoding. The convention in this package is
79 -- that on input a correct BOM is ignored and an incorrect BOM causes an
80 -- Encoding_Error exception. On output, the output string may or may not
81 -- include a BOM depending on the setting of Output_BOM.
BOM_8 : constant UTF_8_String :=
  Character'Val (16#EF#) & Character'Val (16#BB#) & Character'Val (16#BF#);
--  Byte order mark for UTF-8: the three 8-bit values EF, BB, BF
BOM_16BE : constant UTF_String :=
  Character'Val (16#FE#) & Character'Val (16#FF#);
--  Byte order mark for UTF-16BE: the two 8-bit values FE, FF
BOM_16LE : constant UTF_String :=
  Character'Val (16#FF#) & Character'Val (16#FE#);
--  Byte order mark for UTF-16LE: the two 8-bit values FF, FE
BOM_16 : constant UTF_16_Wide_String :=
  (1 => Wide_Character'Val (16#FEFF#));
--  Byte order mark for UTF-16: the single 16-bit value FEFF
function Encoding
  (Item    : UTF_String;
   Default : Encoding_Scheme := UTF_8) return Encoding_Scheme;
--  Inspects the UTF_String value Item to determine whether it starts with
--  a BOM for UTF-8, UTF-16BE, or UTF-16LE. If so, the result is the scheme
--  corresponding to that BOM. If no valid BOM is present, the result is the
--  specified Default value.
function To_Unsigned_8 is new Unchecked_Conversion
  (Source => Character,
   Target => Interfaces.Unsigned_8);
--  Unchecked conversion of a Character to Interfaces.Unsigned_8
function To_Unsigned_16 is new Unchecked_Conversion
  (Source => Wide_Character,
   Target => Interfaces.Unsigned_16);
--  Unchecked conversion of a Wide_Character to Interfaces.Unsigned_16
function To_Unsigned_32 is new Unchecked_Conversion
  (Source => Wide_Wide_Character,
   Target => Interfaces.Unsigned_32);
--  Unchecked conversion of a Wide_Wide_Character to Interfaces.Unsigned_32
subtype UTF_XE_Encoding is
  Encoding_Scheme range UTF_16BE .. UTF_16LE;
--  Restricts Encoding_Scheme to its UTF_16BE and UTF_16LE entries only
120 -- Utility routines for converting between UTF-16 and UTF-16LE/BE
function From_UTF_16
  (Item          : UTF_16_Wide_String;
   Output_Scheme : UTF_XE_Encoding;
   Output_BOM    : Boolean := False) return UTF_String;
--  The input string Item is encoded in UTF-16. The result is the same data
--  encoded using Output_Scheme, which is either UTF-16LE or UTF-16BE. There
--  are no error cases. The result starts with BOM_16BE or BOM_16LE (as
--  appropriate for Output_Scheme) if Output_BOM is True.
--
--  NOTE(review): the "function <name>" header line was absent from this
--  copy of the spec and has been restored as From_UTF_16; confirm the name
--  against the package body and the child packages that call it.
function To_UTF_16
  (Item         : UTF_String;
   Input_Scheme : UTF_XE_Encoding;
   Output_BOM   : Boolean := False) return UTF_16_Wide_String;
--  The input string Item is encoded using Input_Scheme, which is either
--  UTF-16LE or UTF-16BE. The result is the corresponding UTF-16 wide
--  string. Encoding_Error is raised if the length of the input is odd.
--  The result starts with BOM_16 if Output_BOM is True.
--
--  NOTE(review): the function name and the first parameter (Item) were
--  absent from this copy of the spec and have been restored; confirm both
--  against the package body and the child packages that call it.
procedure Raise_Encoding_Error
  (Index : Natural);
pragma No_Return (Raise_Encoding_Error);
--  Raises the Encoding_Error exception to report a bad encoding in an input
--  item. Index is the index of the location in Item at which the error was
--  found. This procedure never returns.
end Ada.Strings.UTF_Encoding;