% Copyright 2013 Taco Hoekwater <taco@@luatex.org>
%
% This file is part of LuaTeX.
%
% LuaTeX is free software; you can redistribute it and/or modify it under
% the terms of the GNU General Public License as published by the Free
% Software Foundation; either version 2 of the License, or (at your
% option) any later version.
%
% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
% FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
% License for more details.
%
% You should have received a copy of the GNU General Public License along
% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
28 static void utf_error
(void
)
31 { "A funny symbol that I can't read has just been (re)read.",
32 "Just continue, I'll change it to 0xFFFD.",
35 deletions_allowed
= false
;
36 tex_error
("String contains an invalid utf-8 sequence", hlp
);
37 deletions_allowed
= true
;
/* True when |c| is a UTF-8 continuation byte (binary 10xxxxxx). */
static int str2uni_is_cont(unsigned char c)
{
    return c >= 0x80 && c < 0xc0;
}

/*
 * Decode the UTF-8 sequence that starts at |k| and return its code point.
 *
 * Anything that cannot be decoded (a stray continuation byte, a lead byte
 * whose continuation bytes are missing or malformed, or a 5-/6-byte lead)
 * yields 0xFFFD after |utf_error| has been called. A well-formed encoding
 * of U+FFFD itself takes the same path and is reported as well.
 *
 * Continuation bytes are tested left to right with short-circuit logic, so
 * a sequence cut short by a NUL terminator is never read past that NUL.
 */
unsigned str2uni(const unsigned char *k)
{
    const unsigned char *text = k;
    int ch = *text++;
    int val = 0xFFFD;           /* replacement value until proven valid */

    if (ch < 0x80) {
        /* single byte (ASCII) */
        val = ch;
    } else if (ch <= 0xbf) {
        /* error: continuation byte in lead position */
    } else if (ch <= 0xdf) {
        /* two-byte sequence */
        if (str2uni_is_cont(text[0])) {
            val = ((ch & 0x1f) << 6) | (text[0] & 0x3f);
        }
    } else if (ch <= 0xef) {
        /* three-byte sequence */
        if (str2uni_is_cont(text[0]) && str2uni_is_cont(text[1])) {
            val = ((ch & 0xf) << 12) | ((text[0] & 0x3f) << 6) |
                (text[1] & 0x3f);
        }
    } else if (ch <= 0xf7) {
        /*
         * Four-byte sequence. The plain shift form gives the same value as
         * the surrogate-style |w * 0x400 + w2 + 0x10000| computation, but
         * never shifts a negative intermediate.
         */
        if (str2uni_is_cont(text[0]) && str2uni_is_cont(text[1])
            && str2uni_is_cont(text[2])) {
            val = ((ch & 0x7) << 18) | ((text[0] & 0x3f) << 12) |
                ((text[1] & 0x3f) << 6) | (text[2] & 0x3f);
        }
    } else {
        /* the 5- and 6-byte UTF-8 sequences generate integers
           that are outside of the valid UCS range, and therefore
           unsupported
         */
    }

    if (val == 0xFFFD) {
        utf_error();
    }
    return (unsigned) val;
}
@ This is a very basic helper.
/*
 * Encode |unic| as a NUL-terminated UTF-8 string in a newly allocated
 * 5-byte buffer obtained from |xmalloc| (at most 4 bytes plus the
 * terminator). The caller receives the buffer pointer; presumably the
 * caller is responsible for releasing it -- confirm against callers.
 *
 * Values of 0x110000 and above are not UTF-8 encoded: they produce the
 * single byte |unic - 0x110000| (truncated to unsigned char).
 */
unsigned char *uni2str(unsigned unic)
{
    unsigned char *buf = xmalloc(5);
    unsigned char *pt = buf;

    if (unic >= 0x110000) {
        /* out-of-range value: emitted as one raw byte */
        *pt++ = (unsigned char) (unic - 0x110000);
    } else if (unic < 0x80) {
        *pt++ = (unsigned char) unic;
    } else if (unic < 0x800) {
        *pt++ = (unsigned char) (0xc0 | (unic >> 6));
        *pt++ = (unsigned char) (0x80 | (unic & 0x3f));
    } else if (unic < 0x10000) {
        *pt++ = (unsigned char) (0xe0 | (unic >> 12));
        *pt++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3f));
        *pt++ = (unsigned char) (0x80 | (unic & 0x3f));
    } else {
        /* 0x10000 .. 0x10FFFF: four bytes */
        *pt++ = (unsigned char) (0xf0 | (unic >> 18));
        *pt++ = (unsigned char) (0x80 | ((unic >> 12) & 0x3f));
        *pt++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3f));
        *pt++ = (unsigned char) (0x80 | (unic & 0x3f));
    }
    *pt = '\0';
    return buf;
}
@ |buffer_to_unichar| converts a sequence of bytes in the |buffer|
into a unicode character value. It does not check for overflow
of the |buffer|, but it is careful to check the validity of the
UTF-8 encoding.
115 int buffer_to_unichar
(int k
)
117 return str2uni
((const unsigned char
*)(buffer
+k
));
@ These came from texlang.w.
/*
 * Append the UTF-8 encoding of |ch| at |utf8_text| and return a pointer
 * just past the last byte written.
 *
 * Between one and four bytes are stored; no NUL terminator is added, so
 * the caller must provide the room and terminate the string itself.
 * Values of 17 * 65536 (0x110000) and above write nothing and return
 * |utf8_text| unchanged.
 */
char *uni2string(char *utf8_text, unsigned ch)
{
    /* Increment and deposit character */
    if (ch >= 17 * 65536) {
        return utf8_text;
    }

    if (ch <= 0x7f) {
        *utf8_text++ = (char) ch;
    } else if (ch <= 0x7ff) {
        *utf8_text++ = (char) (0xc0 | (ch >> 6));
        *utf8_text++ = (char) (0x80 | (ch & 0x3f));
    } else if (ch <= 0xffff) {
        *utf8_text++ = (char) (0xe0 | (ch >> 12));
        *utf8_text++ = (char) (0x80 | ((ch >> 6) & 0x3f));
        *utf8_text++ = (char) (0x80 | (ch & 0x3f));
    } else {
        /* 0x10000 .. 0x10FFFF: four bytes */
        *utf8_text++ = (char) (0xf0 | (ch >> 18));
        *utf8_text++ = (char) (0x80 | ((ch >> 12) & 0x3f));
        *utf8_text++ = (char) (0x80 | ((ch >> 6) & 0x3f));
        *utf8_text++ = (char) (0x80 | (ch & 0x3f));
    }
    return utf8_text;
}
/*
 * Return the number of elements in |str| that precede the first zero
 * element (the length of a zero-terminated array of unsigned ints).
 */
unsigned u_length(register unsigned int *str)
{
    unsigned len = 0;

    while (str[len] != 0) {
        ++len;
    }
    return len;
}
/*
 * Decode the NUL-terminated UTF-8 string |utf8buf| into |ubuf|, one code
 * point per element, and terminate |ubuf| with a zero element.
 *
 * |ubuf| must have room for |strlen(utf8buf) + 1| elements. The lead byte
 * alone selects the sequence length; continuation bytes are not validated.
 * If the input ends in the middle of a multi-byte sequence, U+FFFD is
 * stored for it and decoding stops, so bytes beyond the terminator are
 * never read.
 */
void utf2uni_strcpy(unsigned int *ubuf, const char *utf8buf)
{
    const size_t nbytes = strlen(utf8buf);
    const unsigned char *pt = (const unsigned char *) utf8buf;
    const unsigned char *end = pt + nbytes;
    unsigned int *upt = ubuf;
    /* every code point consumes at least one byte, so this is never hit
       before the input runs out; kept as a hard upper bound */
    unsigned int *uend = ubuf + nbytes;

    while (pt < end && upt < uend) {
        size_t need;

        if (*pt <= 0x7f) {
            need = 1;
        } else if (*pt <= 0xdf) {
            need = 2;
        } else if (*pt <= 0xef) {
            need = 3;
        } else {
            need = 4;
        }

        if ((size_t) (end - pt) < need) {
            /* truncated trailing sequence */
            *upt++ = 0xFFFD;
            break;
        }

        switch (need) {
        case 1:
            *upt = pt[0];
            break;
        case 2:
            *upt = (unsigned int) (((pt[0] & 0x1f) << 6) | (pt[1] & 0x3f));
            break;
        case 3:
            *upt = (unsigned int) (((pt[0] & 0xf) << 12) |
                                   ((pt[1] & 0x3f) << 6) | (pt[2] & 0x3f));
            break;
        default:
            /* same value as the |w * 0x400 + w2 + 0x10000| form, without
               shifting a possibly negative intermediate */
            *upt = (unsigned int) (((pt[0] & 0x7) << 18) |
                                   ((pt[1] & 0x3f) << 12) |
                                   ((pt[2] & 0x3f) << 6) | (pt[3] & 0x3f));
            break;
        }
        pt += need;
        ++upt;
    }
    *upt = 0;
}
191 char
*utf16be_str
(long code
)
193 static char buf
[SMALL_BUF_SIZE
];
200 sprintf
(buf
, "%04lX", code
);
203 vh
= (unsigned
) (v
/ 0x400 + 0xD800);
204 vl
= (unsigned
) (v
% 0x400 + 0xDC00);
205 sprintf
(buf
, "%04X%04X", vh
, vl
);