2 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 2005, 2006, 2008
5 * Free Software Foundation, Inc.
6 * Written by Werner Lemberg (wl@gnu.org)
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
32 #include "file_case.h"
34 #include "localcharset.h"
36 #include "stringclass.h"
40 # ifdef WORDS_BIGENDIAN
41 # define UNICODE "UTF-32BE"
43 # define UNICODE "UTF-32LE"
47 char default_encoding
[MAX_VAR_LEN
];
48 char user_encoding
[MAX_VAR_LEN
];
49 char encoding_string
[MAX_VAR_LEN
];
58 // The official list of MIME tags can be found at
60 // http://www.iana.org/assignments/character-sets
62 // For encodings which don't have a MIME tag we use GNU iconv's encoding
63 // names (which also work with the portable GNU libiconv package). They
64 // are marked with `*'.
66 // Encodings specific to XEmacs and Emacs are marked as such; no mark means
67 // that they are used by both Emacs and XEmacs.
69 // Encodings marked with `--' are special to Emacs, XEmacs, or other
70 // applications and shouldn't be used for data exchange.
72 // `Not covered' means that the encoding can be handled neither by GNU iconv
73 // nor by libiconv, or that only one of them has support for it.
75 // A special case is VIQR encoding: Despite having a MIME tag, it is
76 // missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
78 // Finally, we add all aliases of GNU iconv for `ascii', `latin1', and
79 // `utf8' to catch those encoding names before iconv is called.
81 // Note that most entries are commented out -- only a small, (rather)
82 // reliable and stable subset of encodings that are still in wide use
83 // today (January 2006) is recognized (for coding tags). Most notably, all
84 // Windows-specific encodings are not selected because they lack stability:
85 // Microsoft has changed the mappings instead of creating new versions.
87 // Please contact the groff list if you find the selection inadequate.
89 static const conversion
90 emacs_to_mime
[] = { // FIXME what kind of shit is THIS?? P.S.: it's documented!
91 {"ascii", "US-ASCII"}, // Emacs
93 {"chinese-big5", "Big5"}, // Emacs
94 {"chinese-euc", "GB2312"}, // XEmacs
95 {"chinese-iso-8bit", "GB2312"}, // Emacs
97 {"cn-gb", "GB2312"}, // Emacs
98 {"cn-gb-2312", "GB2312"},
99 {"cp878", "KOI8-R"}, // Emacs
100 {"cp1047", "CP1047"}, // EBCDIC
101 {"csascii", "US-ASCII"}, // alias
102 {"csisolatin1", "ISO-8859-1"}, // alias
103 {"cyrillic-iso-8bit", "ISO-8859-5"}, // Emacs
104 {"cyrillic-koi8", "KOI8-R"}, // not KOI8!, Emacs
105 {"euc-china", "GB2312"}, // Emacs
106 {"euc-cn", "GB2312"}, // Emacs
107 {"euc-japan", "EUC-JP"},
108 {"euc-japan-1990", "EUC-JP"}, // Emacs
109 {"euc-jp", "EUC-JP"},
110 {"euc-korea", "EUC-KR"},
111 {"euc-kr", "EUC-KR"},
112 {"gb2312", "GB2312"},
113 {"greek-iso-8bit", "ISO-8859-7"},
114 {"iso-10646/utf8", "UTF-8"}, // alias
115 {"iso-10646/utf-8", "UTF-8"}, // alias
116 {"iso-8859-1", "ISO-8859-1"},
117 {"iso-8859-13", "ISO-8859-13"}, // Emacs
118 {"iso-8859-15", "ISO-8859-15"},
119 {"iso-8859-2", "ISO-8859-2"},
120 {"iso-8859-5", "ISO-8859-5"},
121 {"iso-8859-7", "ISO-8859-7"},
122 {"iso-8859-9", "ISO-8859-9"},
123 {"iso-latin-1", "ISO-8859-1"},
124 {"iso-latin-2", "ISO-8859-2"}, // Emacs
125 {"iso-latin-5", "ISO-8859-9"}, // Emacs
126 {"iso-latin-7", "ISO-8859-13"}, // Emacs
127 {"iso-latin-9", "ISO-8859-15"}, // Emacs
128 {"japanese-iso-8bit", "EUC-JP"}, // Emacs
129 {"japanese-euc", "EUC-JP"}, // XEmacs
130 {"jis8", "EUC-JP"}, // XEmacs
131 {"koi8", "KOI8-R"}, // not KOI8!, Emacs
132 {"koi8-r", "KOI8-R"},
133 {"korean-euc", "EUC-KR"}, // XEmacs
134 {"korean-iso-8bit", "EUC-KR"}, // Emacs
135 {"latin1", "ISO-8859-1"}, // alias
136 {"latin-0", "ISO-8859-15"}, // Emacs
137 {"latin-1", "ISO-8859-1"}, // Emacs
138 {"latin-2", "ISO-8859-2"}, // Emacs
139 {"latin-5", "ISO-8859-9"}, // Emacs
140 {"latin-7", "ISO-8859-13"}, // Emacs
141 {"latin-9", "ISO-8859-15"}, // Emacs
142 {"mule-utf-16", "UTF-16"}, // Emacs
143 {"mule-utf-16be", "UTF-16BE"}, // Emacs
144 {"mule-utf-16-be", "UTF-16BE"}, // Emacs
145 {"mule-utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
146 {"mule-utf-16le", "UTF-16LE"}, // Emacs
147 {"mule-utf-16-le", "UTF-16LE"}, // Emacs
148 {"mule-utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
149 {"mule-utf-8", "UTF-8"}, // Emacs
150 {"us-ascii", "US-ASCII"}, // Emacs
151 {"utf8", "UTF-8"}, // alias
152 {"utf-16", "UTF-16"}, // Emacs
153 {"utf-16be", "UTF-16BE"}, // Emacs
154 {"utf-16-be", "UTF-16BE"}, // Emacs
155 {"utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
156 {"utf-16-be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
157 {"utf-16le", "UTF-16LE"}, // Emacs
158 {"utf-16-le", "UTF-16LE"}, // Emacs
159 {"utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
160 {"utf-16-le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
161 {"utf-8", "UTF-8"}, // Emacs
163 // {"alternativnyj", ""}, // ?
164 // {"arabic-iso-8bit", "ISO-8859-6"}, // Emacs
165 // {"binary", ""}, // --
166 // {"chinese-hz", "HZ-GB-2312"}, // Emacs
167 // {"chinese-iso-7bit", "ISO-2022-CN"}, // Emacs
168 // {"chinese-iso-8bit-with-esc", ""}, // --
169 // {"compound-text", ""}, // --
170 // {"compound-text-with-extension", ""}, // --
171 // {"cp1125", "cp1125"}, // *
172 // {"cp1250", "windows-1250"},// Emacs
173 // {"cp1251", "windows-1251"},// Emacs
174 // {"cp1252", "windows-1252"},// Emacs
175 // {"cp1253", "windows-1253"},// Emacs
176 // {"cp1254", "windows-1254"},// Emacs
177 // {"cp1255", "windows-1255"},// Emacs
178 // {"cp1256", "windows-1256"},// Emacs
179 // {"cp1257", "windows-1257"},// Emacs
180 // {"cp1258", "windows-1258"},// Emacs
181 // {"cp437", "cp437"}, // Emacs
182 // {"cp720", ""}, // not covered
183 // {"cp737", "cp737"}, // *, Emacs
184 // {"cp775", "cp775"}, // Emacs
185 // {"cp850", "cp850"}, // Emacs
186 // {"cp851", "cp851"}, // Emacs
187 // {"cp852", "cp852"}, // Emacs
188 // {"cp855", "cp855"}, // Emacs
189 // {"cp857", "cp857"}, // Emacs
190 // {"cp860", "cp860"}, // Emacs
191 // {"cp861", "cp861"}, // Emacs
192 // {"cp862", "cp862"}, // Emacs
193 // {"cp863", "cp863"}, // Emacs
194 // {"cp864", "cp864"}, // Emacs
195 // {"cp865", "cp865"}, // Emacs
196 // {"cp866", "cp866"}, // Emacs
197 // {"cp866u", "cp1125"}, // *, Emacs
198 // {"cp869", "cp869"}, // Emacs
199 // {"cp874", "cp874"}, // *, Emacs
200 // {"cp932", "cp932"}, // *, Emacs
201 // {"cp936", "cp936"}, // Emacs
202 // {"cp949", "cp949"}, // *, Emacs
203 // {"cp950", "cp950"}, // *, Emacs
204 // {"ctext", ""}, // --
205 // {"ctext-no-compositions", ""}, // --
206 // {"ctext-with-extensions", ""}, // --
207 // {"cyrillic-alternativnyj", ""}, // ?, Emacs
208 // {"cyrillic-iso-8bit-with-esc", ""}, // --
209 // {"cyrillic-koi8-t", "KOI8-T"}, // *, Emacs
210 // {"devanagari", ""}, // not covered
211 // {"dos", ""}, // --
212 // {"emacs-mule", ""}, // --
213 // {"euc-jisx0213", "EUC-JISX0213"},// *, XEmacs?
214 // {"euc-jisx0213-with-esc", ""}, // XEmacs?
215 // {"euc-taiwan", "EUC-TW"}, // *, Emacs
216 // {"euc-tw", "EUC-TW"}, // *, Emacs
217 // {"georgian-ps", "GEORGIAN-PS"}, // *, Emacs
218 // {"greek-iso-8bit-with-esc", ""}, // --
219 // {"hebrew-iso-8bit", "ISO-8859-8"}, // Emacs
220 // {"hebrew-iso-8bit-with-esc", ""}, // --
221 // {"hz", "HZ-GB-2312"},
222 // {"hz-gb-2312", "HZ-GB-2312"},
223 // {"in-is13194", ""}, // not covered
224 // {"in-is13194-devanagari", ""}, // not covered
225 // {"in-is13194-with-esc", ""}, // --
226 // {"iso-2022-7", ""}, // XEmacs?
227 // {"iso-2022-7bit", ""}, // --
228 // {"iso-2022-7bit-lock", ""}, // --
229 // {"iso-2022-7bit-lock-ss2", ""}, // --
230 // {"iso-2022-7bit-ss2", ""}, // --
231 // {"iso-2022-8", ""}, // XEmacs?
232 // {"iso-2022-8bit", ""}, // XEmacs?
233 // {"iso-2022-8bit-lock", ""}, // XEmacs?
234 // {"iso-2022-8bit-lock-ss2", ""}, // XEmacs?
235 // {"iso-2022-8bit-ss2", ""}, // --
236 // {"iso-2022-cjk", ""}, // --
237 // {"iso-2022-cn", "ISO-2022-CN"}, // Emacs
238 // {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},// Emacs
239 // {"iso-2022-int-1", ""}, // --
240 // {"iso-2022-jp", "ISO-2022-JP"},
241 // {"iso-2022-jp-1978-irv", "ISO-2022-JP"},
242 // {"iso-2022-jp-2", "ISO-2022-JP-2"},
243 // {"iso-2022-jp-3", "ISO-2022-JP-3"},// *, XEmacs?
244 // {"iso-2022-jp-3-compatible", ""}, // XEmacs?
245 // {"iso-2022-jp-3-strict", "ISO-2022-JP-3"},// *, XEmacs?
246 // {"iso-2022-kr", "ISO-2022-KR"},
247 // {"iso-2022-lock", ""}, // XEmacs?
248 // {"iso-8859-10", "ISO-8859-10"}, // Emacs
249 // {"iso-8859-11", "ISO-8859-11"}, // *, Emacs
250 // {"iso-8859-14", "ISO-8859-14"}, // Emacs
251 // {"iso-8859-16", "ISO-8859-16"},
252 // {"iso-8859-3", "ISO-8859-3"},
253 // {"iso-8859-4", "ISO-8859-4"},
254 // {"iso-8859-6", "ISO-8859-6"},
255 // {"iso-8859-8", "ISO-8859-8"},
256 // {"iso-8859-8-e", "ISO-8859-8"},
257 // {"iso-8859-8-i", "ISO-8859-8"}, // Emacs
258 // {"iso-latin-10", "ISO-8859-16"}, // Emacs
259 // {"iso-latin-1-with-esc", ""}, // --
260 // {"iso-latin-2-with-esc", ""}, // --
261 // {"iso-latin-3", "ISO-8859-3"}, // Emacs
262 // {"iso-latin-3-with-esc", ""}, // --
263 // {"iso-latin-4", "ISO-8859-4"}, // Emacs
264 // {"iso-latin-4-with-esc", ""}, // --
265 // {"iso-latin-5-with-esc", ""}, // --
266 // {"iso-latin-6", "ISO-8859-10"}, // Emacs
267 // {"iso-latin-8", "ISO-8859-14"}, // Emacs
268 // {"iso-safe", ""}, // --
269 // {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, // Emacs
270 // {"japanese-iso-8bit-with-esc", ""}, // --
271 // {"japanese-shift-jis", "Shift_JIS"}, // Emacs
272 // {"japanese-shift-jisx0213", ""}, // XEmacs?
273 // {"jis7", "ISO-2022-JP"}, // Xemacs
274 // {"junet", "ISO-2022-JP"},
275 // {"koi8-t", "KOI8-T"}, // *, Emacs
276 // {"koi8-u", "KOI8-U"}, // Emacs
277 // {"korean-iso-7bit-lock", "ISO-2022-KR"},
278 // {"korean-iso-8bit-with-esc", ""}, // --
279 // {"lao", ""}, // not covered
280 // {"lao-with-esc", ""}, // --
281 // {"latin-10", "ISO-8859-16"}, // Emacs
282 // {"latin-3", "ISO-8859-3"}, // Emacs
283 // {"latin-4", "ISO-8859-4"}, // Emacs
284 // {"latin-6", "ISO-8859-10"}, // Emacs
285 // {"latin-8", "ISO-8859-14"}, // Emacs
286 // {"mac", ""}, // --
287 // {"mac-roman", "MACINTOSH"}, // Emacs
288 // {"mik", ""}, // not covered
289 // {"next", "NEXTSTEP"}, // *, Emacs
290 // {"no-conversion", ""}, // --
291 // {"old-jis", "ISO-2022-JP"},
292 // {"pt154", "PT154"}, // Emacs
293 // {"raw-text", ""}, // --
294 // {"ruscii", "cp1125"}, // *, Emacs
295 // {"shift-jis", "Shift_JIS"}, // XEmacs
296 // {"shift_jis", "Shift_JIS"},
297 // {"shift_jisx0213", "Shift_JISX0213"},// *, XEmacs?
298 // {"sjis", "Shift_JIS"}, // Emacs
299 // {"tcvn", "TCVN"}, // *, Emacs
300 // {"tcvn-5712", "TCVN"}, // *, Emacs
301 // {"thai-tis620", "TIS-620"},
302 // {"thai-tis620-with-esc", ""}, // --
303 // {"th-tis620", "TIS-620"},
304 // {"tibetan", ""}, // not covered
305 // {"tibetan-iso-8bit", ""}, // not covered
306 // {"tibetan-iso-8bit-with-esc", ""}, // --
307 // {"tis-620", "TIS-620"},
308 // {"tis620", "TIS-620"},
309 // {"undecided", ""}, // --
310 // {"unix", ""}, // --
311 // {"utf-7", "UTF-7"}, // Emacs
312 // {"utf-7-safe", ""}, // XEmacs?
313 // {"utf-8-ws", "UTF-8"}, // XEmacs?
314 // {"vietnamese-tcvn", "TCVN"}, // *, Emacs
315 // {"vietnamese-viqr", "VIQR"}, // not covered
316 // {"vietnamese-viscii", "VISCII"},
317 // {"vietnamese-vscii", ""}, // not covered
318 // {"viqr", "VIQR"}, // not covered
319 // {"viscii", "VISCII"},
320 // {"vscii", ""}, // not covered
321 // {"windows-037", ""}, // not covered
322 // {"windows-10000", ""}, // not covered
323 // {"windows-10001", ""}, // not covered
324 // {"windows-10006", ""}, // not covered
325 // {"windows-10007", ""}, // not covered
326 // {"windows-10029", ""}, // not covered
327 // {"windows-10079", ""}, // not covered
328 // {"windows-10081", ""}, // not covered
329 // {"windows-1026", ""}, // not covered
330 // {"windows-1200", ""}, // not covered
331 // {"windows-1250", "windows-1250"},
332 // {"windows-1251", "windows-1251"},
333 // {"windows-1252", "windows-1252"},
334 // {"windows-1253", "windows-1253"},
335 // {"windows-1254", "windows-1254"},
336 // {"windows-1255", "windows-1255"},
337 // {"windows-1256", "windows-1256"},
338 // {"windows-1257", "windows-1257"},
339 // {"windows-1258", "windows-1258"},
340 // {"windows-1361", "cp1361"}, // *, XEmacs
341 // {"windows-437", "cp437"}, // XEmacs
342 // {"windows-500", ""}, // not covered
343 // {"windows-708", ""}, // not covered
344 // {"windows-709", ""}, // not covered
345 // {"windows-710", ""}, // not covered
346 // {"windows-720", ""}, // not covered
347 // {"windows-737", "cp737"}, // *, XEmacs
348 // {"windows-775", "cp775"}, // XEmacs
349 // {"windows-850", "cp850"}, // XEmacs
350 // {"windows-852", "cp852"}, // XEmacs
351 // {"windows-855", "cp855"}, // XEmacs
352 // {"windows-857", "cp857"}, // XEmacs
353 // {"windows-860", "cp860"}, // XEmacs
354 // {"windows-861", "cp861"}, // XEmacs
355 // {"windows-862", "cp862"}, // XEmacs
356 // {"windows-863", "cp863"}, // XEmacs
357 // {"windows-864", "cp864"}, // XEmacs
358 // {"windows-865", "cp865"}, // XEmacs
359 // {"windows-866", "cp866"}, // XEmacs
360 // {"windows-869", "cp869"}, // XEmacs
361 // {"windows-874", "cp874"}, // XEmacs
362 // {"windows-875", ""}, // not covered
363 // {"windows-932", "cp932"}, // *, XEmacs
364 // {"windows-936", "cp936"}, // XEmacs
365 // {"windows-949", "cp949"}, // *, XEmacs
366 // {"windows-950", "cp950"}, // *, XEmacs
367 // {"x-ctext", ""}, // --
368 // {"x-ctext-with-extensions", ""}, // --
373 // Convert encoding name from emacs to mime.
375 emacs2mime(char *emacs_enc
)
377 int emacs_enc_len
= strlen(emacs_enc
);
378 if (emacs_enc_len
> 4
379 && !strcasecmp(emacs_enc
+ emacs_enc_len
- 4, "-dos"))
380 emacs_enc
[emacs_enc_len
- 4] = 0;
381 if (emacs_enc_len
> 4
382 && !strcasecmp(emacs_enc
+ emacs_enc_len
- 4, "-mac"))
383 emacs_enc
[emacs_enc_len
- 4] = 0;
384 if (emacs_enc_len
> 5
385 && !strcasecmp(emacs_enc
+ emacs_enc_len
- 5, "-unix"))
386 emacs_enc
[emacs_enc_len
- 5] = 0;
387 for (const conversion
*table
= emacs_to_mime
; table
->from
; table
++)
388 if (!strcasecmp(emacs_enc
, table
->from
))
389 return (char *)table
->to
;
393 // Print out Unicode entity if value is greater than 0x7F.
395 unicode_entity(int u
)
400 // Handle soft hyphen specially -- it is an input character only,
407 printf("\\[u%04X]", u
);
411 // Conversion functions. All functions take `data', which
412 // normally holds the first two lines, and a file pointer.
414 // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
416 conversion_latin1(file_case
*fcp
, const string
&data
)
418 int len
= data
.length();
419 const unsigned char *ptr
= (const unsigned char *)data
.contents();
420 for (int i
= 0; i
< len
; i
++)
421 unicode_entity(ptr
[i
]);
423 while ((c
= fcp
->get_c()) != EOF
)
427 // A future version of groff shall support UTF-8 natively.
428 // In this case, the UTF-8 stuff here in this file will be
429 // moved to the troff program.
446 int incomplete_warning
;
450 void add(unsigned char);
455 utf8::utf8(file_case
*fcp
)
456 : _fcp(fcp
), byte(FIRST
),
457 expected_bytes(1), invalid_warning(1), incomplete_warning(1)
467 utf8::add(unsigned char c
)
499 if (c
< 0x80 || c
> 0xBF) {
509 if (expected_bytes
== 2) {
513 unicode_entity(((s
[0] & 0x1F) << 6)
521 if (expected_bytes
== 3) {
522 if (!(s
[0] >= 0xE1 || s
[1] >= 0xA0))
525 unicode_entity(((s
[0] & 0x1F) << 12)
526 | ((s
[1] ^ 0x80) << 6)
534 // We reject everything greater than 0x10FFFF.
535 if (expected_bytes
== 4) {
536 if (!((s
[0] >= 0xF1 || s
[1] >= 0x90)
537 && (s
[0] < 0xF4 || (s
[0] == 0xF4 && s
[1] < 0x90))))
540 unicode_entity(((s
[0] & 0x07) << 18)
541 | ((s
[1] ^ 0x80) << 12)
542 | ((s
[2] ^ 0x80) << 6)
550 if (expected_bytes
== 5) {
567 if (debug_flag
&& invalid_warning
) {
568 fprintf(stderr
, " invalid byte(s) found in input stream --\n"
569 " each such sequence replaced with 0xFFFD\n");
572 unicode_entity(0xFFFD);
579 if (debug_flag
&& incomplete_warning
) {
580 fprintf(stderr
, " incomplete sequence(s) found in input stream --\n"
581 " each such sequence replaced with 0xFFFD\n");
582 incomplete_warning
= 0;
584 unicode_entity(0xFFFD);
588 // Conversion from UTF-8 to Unicode.
590 conversion_utf8(file_case
*fcp
, const string
&data
)
593 int len
= data
.length();
594 const unsigned char *ptr
= (const unsigned char *)data
.contents();
595 for (int i
= 0; i
< len
; i
++)
598 while ((c
= fcp
->get_c()) != EOF
)
603 // Conversion from cp1047 (EBCDIC) to UTF-8.
605 conversion_cp1047(file_case
*fcp
, const string
&data
)
607 static unsigned char cp1047
[] = { // FIXME const
608 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, // 0x00
609 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
610 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, // 0x10
611 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
612 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, // 0x20
613 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
614 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, // 0x30
615 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
616 0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, // 0x40
617 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
618 0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, // 0x50
619 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
620 0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, // 0x60
621 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
622 0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, // 0x70
623 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
624 0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x80
625 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
626 0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, // 0x90
627 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
628 0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, // 0xA0
629 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
630 0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, // 0xB0
631 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
632 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0xC0
633 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
634 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, // 0xD0
635 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
636 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, // 0xE0
637 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
638 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0xF0
639 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
641 int len
= data
.length();
642 const unsigned char *ptr
= (const unsigned char *)data
.contents();
643 for (int i
= 0; i
< len
; i
++)
644 unicode_entity(cp1047
[ptr
[i
]]);
646 while ((c
= fcp
->get_c()) != EOF
)
647 unicode_entity(cp1047
[c
]);
650 // Locale-sensible conversion.
653 conversion_iconv(file_case
*fcp
, const string
&data
, char *enc
)
655 iconv_t handle
= iconv_open(UNICODE
, enc
);
656 if (handle
== (iconv_t
)-1) {
657 if (errno
== EINVAL
) {
658 error("encoding system `%1' not supported by iconv()", enc
);
661 fatal("iconv_open failed");
665 char *outptr
= (char *)outbuf
;
666 size_t outbytes_left
= BUFSIZ
* sizeof (int);
668 char *inptr
= (char *)data
.contents();
669 size_t inbytes_left
= data
.length();
671 while (inbytes_left
> 0) {
672 size_t status
= iconv(handle
,
673 (ICONV_CONST
char **)&inptr
, &inbytes_left
,
674 &outptr
, &outbytes_left
);
675 if (status
== (size_t)-1) {
676 if (errno
== EILSEQ
) {
677 // Invalid byte sequence. XXX
681 else if (errno
== E2BIG
) {
682 // Output buffer is full.
683 limit
= (char *)outbuf
+ BUFSIZ
* sizeof (int) - outbytes_left
;
684 for (int *ptr
= outbuf
; (char *)ptr
< limit
; ptr
++)
685 unicode_entity(*ptr
);
686 memmove(outbuf
, outptr
, outbytes_left
);
687 outptr
= (char *)outbuf
+ outbytes_left
;
688 outbytes_left
= BUFSIZ
* sizeof (int) - outbytes_left
;
690 else if (errno
== EINVAL
) {
691 // `data' ends with partial input sequence.
692 memcpy(inbuf
, inptr
, inbytes_left
);
697 // Handle `fp' and switch to `inbuf'.
699 char *read_start
= inbuf
+ inbytes_left
;
700 while ((read_bytes
= fcp
->get_buf(read_start
, BUFSIZ
- inbytes_left
)) > 0) {
702 inbytes_left
+= read_bytes
;
703 while (inbytes_left
> 0) {
704 size_t status
= iconv(handle
,
705 (ICONV_CONST
char **)&inptr
, &inbytes_left
,
706 &outptr
, &outbytes_left
);
707 if (status
== (size_t)-1) {
708 if (errno
== EILSEQ
) {
709 // Invalid byte sequence. XXX
713 else if (errno
== E2BIG
) {
714 // Output buffer is full.
715 limit
= (char *)outbuf
+ BUFSIZ
* sizeof (int) - outbytes_left
;
716 for (int *ptr
= outbuf
; (char *)ptr
< limit
; ptr
++)
717 unicode_entity(*ptr
);
718 memmove(outbuf
, outptr
, outbytes_left
);
719 outptr
= (char *)outbuf
+ outbytes_left
;
720 outbytes_left
= BUFSIZ
* sizeof (int) - outbytes_left
;
722 else if (errno
== EINVAL
) {
723 // `inbuf' ends with partial input sequence.
724 memmove(inbuf
, inptr
, inbytes_left
);
729 read_start
= inbuf
+ inbytes_left
;
733 limit
= (char *)outbuf
+ BUFSIZ
* sizeof (int) - outbytes_left
;
734 for (int *ptr
= outbuf
; (char *)ptr
< limit
; ptr
++)
735 unicode_entity(*ptr
);
739 // Handle Byte Order Mark.
741 // Since we have a chicken-and-egg problem it's necessary
742 // to handle the BOM manually if it is in the data stream.
743 // As documented in the Unicode book it is very unlikely
744 // that any normal text file (regardless of the encoding)
745 // starts with the bytes which represent a BOM.
747 // Return the BOM in string `BOM'; `data' then starts with
748 // the byte after the BOM. This function reads (at most)
749 // four bytes from the data stream.
751 // Return encoding if a BOM is found, NULL otherwise.
753 get_BOM(file_case
*fcp
, string
&BOM
, string
&data
)
755 // The BOM is U+FEFF. We have thus the following possible
759 // UTF-16: 0xFEFF or 0xFFFE
760 // UTF-32: 0x0000FEFF or 0xFFFE0000
765 } BOM_table
[] = { // FIXME const
766 {4, "\x00\x00\xFE\xFF", "UTF-32"},
767 {4, "\xFF\xFE\x00\x00", "UTF-32"},
768 {3, "\xEF\xBB\xBF", "UTF-8"},
769 {2, "\xFE\xFF", "UTF-16"},
770 {2, "\xFF\xFE", "UTF-16"},
772 const int BOM_table_len
= NELEM(BOM_table
);
774 const char *retval
= NULL
;
776 for (len
= 0; len
< 4; len
++) {
777 int c
= fcp
->get_c();
780 BOM_string
[len
] = char(c
);
783 for (i
= 0; i
< BOM_table_len
; i
++) {
784 if (BOM_table
[i
].len
<= len
785 && memcmp(BOM_string
, BOM_table
[i
].str
, BOM_table
[i
].len
) == 0)
789 if (i
< BOM_table_len
) {
790 for (; j
< BOM_table
[i
].len
; j
++)
791 BOM
+= BOM_string
[j
];
792 retval
= BOM_table
[i
].name
;
795 data
+= BOM_string
[j
];
799 // Get first two lines from input stream.
801 // Return string (allocated with `new') without zero bytes
802 // or NULL in case no coding tag can occur in the data
803 // (which is stored unmodified in `data').
805 get_tag_lines(file_case
*fcp
, string
&data
)
807 int newline_count
= 0;
809 // Handle CR, LF, and CRLF as line separators.
810 for (int i
= 0; i
< data
.length(); i
++) {
812 if (c
== '\n' || c
== '\r')
814 if (c
== '\n' && prev
== '\r')
818 if (newline_count
> 1)
820 int emit_warning
= 1;
821 for (int lines
= newline_count
; lines
< 2; lines
++) {
822 while ((c
= fcp
->get_c()) != EOF
) {
823 if (c
== '\0' && debug_flag
&& emit_warning
) {
825 " null byte(s) found in input stream --\n"
826 " search for coding tag might return false result\n");
830 if (c
== '\n' || c
== '\r')
833 // Handle CR, LF, and CRLF as line separators.
836 if (c
!= EOF
&& c
!= '\n')
842 return data
.extract();
845 // Check whether C string starts with a comment.
847 // Return 1 if true, 0 otherwise.
849 is_comment_line(char *s
)
853 if (*s
== '.' || *s
== '\'')
856 while (*s
== ' ' || *s
== '\t')
858 if (*s
&& *s
== '\\')
861 if (*s
== '"' || *s
== '#')
874 // Get a value/variable pair from a local variables list
875 // in a C string which looks like this:
877 // <variable1>: <value1>; <variable2>: <value2>; ...
879 // Leading and trailing blanks are ignored. There might be
880 // more than one blank after `:' and `;'.
882 // Return position of next value/variable pair or NULL if
885 get_variable_value_pair(char *d1
, char **variable
, char **value
)
887 static char var
[MAX_VAR_LEN
], val
[MAX_VAR_LEN
];
890 while (*d1
== ' ' || *d1
== '\t')
894 while (l
< MAX_VAR_LEN
- 1 && *d1
&& !strchr(";: \t", *d1
))
897 // Skip everything until `:', `;', or end of data.
898 while (*d1
&& *d1
!= ':' && *d1
!= ';')
906 while (*d1
== ' ' || *d1
== '\t')
910 while (l
< MAX_VAR_LEN
- 1 && *d1
&& !strchr("; \t", *d1
))
913 // Skip everything until `;' or end of data.
914 while (*d1
&& *d1
!= ';')
921 // Check coding tag in the read buffer.
923 // We search for the following line:
925 // <comment> ... -*-<local variables list>-*-
927 // (`...' might be anything).
929 // <comment> can be one of the following syntax forms at the
930 // beginning of the line:
932 // .\" .\# '\" '\# \#
934 // There can be whitespace after the leading `.' or "'".
936 // The local variables list must occur within the first
937 // comment block at the very beginning of the data stream.
939 // Within the <local variables list>, we search for
943 // which specifies the coding system used for the data
946 // Return <value> if found, NULL otherwise.
948 // Note that null bytes in the data are skipped before applying
949 // the algorithm. This should work even with files encoded as
950 // UTF-16 or UTF-32 (or their siblings) in most cases.
952 // XXX Add support for tag at the end of buffer.
954 check_coding_tag(file_case
*fcp
, string
&data
)
956 char *inbuf
= get_tag_lines(fcp
, data
);
958 for (char *p
= inbuf
; is_comment_line(p
); p
= lineend
+ 1) {
959 if ((lineend
= strchr(p
, '\n')) == NULL
)
961 *lineend
= 0; // switch temporarily to '\0'
962 char *d1
= strstr(p
, "-*-");
965 d2
= strstr(d1
+ 3, "-*-");
966 *lineend
= '\n'; // restore newline
969 *d2
= 0; // switch temporarily to '\0'
972 char *variable
, *value
;
973 d1
= get_variable_value_pair(d1
, &variable
, &value
);
974 if (!strcasecmp(variable
, "coding")) {
975 *d2
= '-'; // restore '-'
980 *d2
= '-'; // restore '-'
986 // Handle an input file. If filename is `-' handle stdin.
988 // Return 1 on success, 0 otherwise.
990 do_file(const char *filename
)
993 fprintf(stderr
, "file `%s':\n", filename
);
995 if ((fcp
= file_case::muxer(filename
, fcp
->mux_need_binary
)) == NULL
) {
996 assert(strcmp(filename
, "-"));
997 error("can't open `%1': %2", filename
, strerror(errno
));
1002 const char *BOM_encoding
= get_BOM(fcp
, BOM
, data
);
1003 // Determine the encoding.
1005 if (user_encoding
[0]) {
1007 fprintf(stderr
, " user-specified encoding `%s', "
1008 "no search for coding tag\n",
1010 if (BOM_encoding
&& strcmp(BOM_encoding
, user_encoding
))
1011 fprintf(stderr
, " but BOM in data stream implies encoding `%s'!\n",
1014 encoding
= (char *)user_encoding
;
1016 else if (BOM_encoding
) {
1018 fprintf(stderr
, " found BOM, no search for coding tag\n");
1019 encoding
= (char *)BOM_encoding
;
1022 // `check_coding_tag' returns a pointer to a static array (or NULL).
1023 char *file_encoding
= check_coding_tag(fcp
, data
);
1024 if (!file_encoding
) {
1026 fprintf(stderr
, " no file encoding\n");
1027 file_encoding
= default_encoding
;
1031 fprintf(stderr
, " file encoding: `%s'\n", file_encoding
);
1032 encoding
= file_encoding
;
1034 strncpy(encoding_string
, encoding
, MAX_VAR_LEN
- 1);
1035 encoding_string
[MAX_VAR_LEN
- 1] = 0;
1036 encoding
= encoding_string
;
1037 // Translate from MIME & Emacs encoding names to locale encoding names.
1038 encoding
= emacs2mime(encoding_string
);
1039 if (encoding
[0] == '\0') {
1040 error("encoding `%1' not supported, not a portable encoding",
1045 fprintf(stderr
, " encoding used: `%s'\n", encoding
);
1047 printf(".lf 1 %s\n", filename
);
1049 // Call converter (converters write to stdout).
1050 if (!strcasecmp(encoding
, "ISO-8859-1"))
1051 conversion_latin1(fcp
, BOM
+ data
);
1052 else if (!strcasecmp(encoding
, "UTF-8"))
1053 conversion_utf8(fcp
, data
);
1054 else if (!strcasecmp(encoding
, "cp1047"))
1055 conversion_cp1047(fcp
, BOM
+ data
);
1058 conversion_iconv(fcp
, BOM
+ data
, encoding
);
1060 error("encoding system `%1' not supported", encoding
);
1062 #endif /* HAVE_ICONV */
1073 fprintf(stream
, "Synopsis: %s [ option ] [ files ]\n"
1075 "-d show debugging messages\n"
1076 "-D encoding specify default encoding\n"
1077 "-e encoding specify input encoding\n"
1078 "-h print this message\n"
1079 "-r don't add .lf requests\n"
1080 "-v print version number\n"
1082 "The default encoding is `%s'.\n",
1083 program_name
, default_encoding
);
1087 main(int argc
, char **argv
)
1089 program_name
= argv
[0];
1090 // Determine the default encoding. This must be done before
1091 // getopt() is called since the usage message shows the default
1093 setlocale(LC_ALL
, "");
1094 char *locale
= getlocale(LC_CTYPE
);
1095 if (!locale
|| !strcmp(locale
, "C") || !strcmp(locale
, "POSIX"))
1096 strcpy(default_encoding
, "latin1");
1098 strncpy(default_encoding
, locale_charset(), MAX_VAR_LEN
- 1);
1099 default_encoding
[MAX_VAR_LEN
- 1] = 0;
1102 program_name
= argv
[0];
1104 static const struct option long_options
[] = {
1105 { "help", no_argument
, 0, 'h' },
1106 { "version", no_argument
, 0, 'v' },
1109 // Parse the command line options.
1110 while ((opt
= getopt_long(argc
, argv
,
1111 "dD:e:hrv", long_options
, NULL
)) != EOF
)
1114 puts(L_P_PRECONV
" (" T_ROFF
") v " VERSION
1129 strncpy(user_encoding
, optarg
, MAX_VAR_LEN
- 1);
1130 user_encoding
[MAX_VAR_LEN
- 1] = 0;
1133 user_encoding
[0] = 0;
1137 strncpy(default_encoding
, optarg
, MAX_VAR_LEN
- 1);
1138 default_encoding
[MAX_VAR_LEN
- 1] = 0;
1157 fprintf(stderr
, "default encoding: `%s'\n", default_encoding
);
1159 nbad
+= !do_file("-");
1161 for (int i
= optind
; i
< argc
; i
++)
1162 nbad
+= !do_file(argv
[i
]);
1163 if (ferror(stdout
) || fflush(stdout
) < 0)
1164 fatal("output error");