Adapt src/pre-pic (src/preproc/pic)
[s-roff.git] / src / preproc / preconv / preconv.cpp
blob6dfef10ace82b7789c28ed4f9fdfb37b6237d979
1 // -*- C++ -*-
2 /* Copyright (C) 2005, 2006, 2008
3 Free Software Foundation, Inc.
4 Written by Werner Lemberg (wl@gnu.org)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "lib.h"
24 #include <assert.h>
25 #include <stdlib.h>
26 #include <errno.h>
27 #include "errarg.h"
28 #include "error.h"
29 #include "file_case.h"
30 #include "localcharset.h"
31 #include "nonposix.h"
32 #include "stringclass.h"
34 #include <locale.h>
36 #if HAVE_ICONV
37 # include <iconv.h>
38 # ifdef WORDS_BIGENDIAN
39 # define UNICODE "UTF-32BE"
40 # else
41 # define UNICODE "UTF-32LE"
42 # endif
43 #endif
// Size of every fixed encoding-name buffer in this file (including the
// terminating null byte).
#define MAX_VAR_LEN 100

extern "C" const char *Version_string;

// Encoding used when neither the user, a BOM, nor a coding tag selects one.
char default_encoding[MAX_VAR_LEN];
// Encoding requested by the user; if non-empty it overrides BOM and coding tag.
char user_encoding[MAX_VAR_LEN];
// Scratch copy of the selected encoding name; emacs2mime() edits it in place.
char encoding_string[MAX_VAR_LEN];
// Non-zero: report encoding detection details on stderr.
int debug_flag = 0;
// Non-zero: do not emit the leading `.lf 1 <filename>' request.
int raw_flag = 0;
// One entry of an encoding-name translation table.
struct conversion {
  const char *from;	// name as found in the input (matched case-insensitively)
  const char *to;	// name returned to the caller instead of `from'
};
60 // The official list of MIME tags can be found at
62 // http://www.iana.org/assignments/character-sets
64 // For encodings which don't have a MIME tag we use GNU iconv's encoding
65 // names (which also work with the portable GNU libiconv package). They
66 // are marked with `*'.
68 // Encodings specific to XEmacs and Emacs are marked as such; no mark means
69 // that they are used by both Emacs and XEmacs.
71 // Encodings marked with `--' are special to Emacs, XEmacs, or other
72 // applications and shouldn't be used for data exchange.
74 // `Not covered' means that the encoding can be handled neither by GNU iconv
75 // nor by libiconv, or just one of them has support for it.
// A special case is VIQR encoding: despite having a MIME tag, it is
// missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
80 // Finally, we add all aliases of GNU iconv for `ascii', `latin1', and
81 // `utf8' to catch those encoding names before iconv is called.
83 // Note that most entries are commented out -- only a small, (rather)
84 // reliable and stable subset of encodings is recognized (for coding tags)
85 // which are still in greater use today (January 2006). Most notably, all
86 // Windows-specific encodings are not selected because they lack stability:
87 // Microsoft has changed the mappings instead of creating new versions.
89 // Please contact the groff list if you find the selection inadequate.
91 static const conversion
92 emacs_to_mime[] = {
93 {"ascii", "US-ASCII"}, // Emacs
94 {"big5", "Big5"},
95 {"chinese-big5", "Big5"}, // Emacs
96 {"chinese-euc", "GB2312"}, // XEmacs
97 {"chinese-iso-8bit", "GB2312"}, // Emacs
98 {"cn-big5", "Big5"},
99 {"cn-gb", "GB2312"}, // Emacs
100 {"cn-gb-2312", "GB2312"},
101 {"cp878", "KOI8-R"}, // Emacs
102 {"cp1047", "CP1047"}, // EBCDIC
103 {"csascii", "US-ASCII"}, // alias
104 {"csisolatin1", "ISO-8859-1"}, // alias
105 {"cyrillic-iso-8bit", "ISO-8859-5"}, // Emacs
106 {"cyrillic-koi8", "KOI8-R"}, // not KOI8!, Emacs
107 {"euc-china", "GB2312"}, // Emacs
108 {"euc-cn", "GB2312"}, // Emacs
109 {"euc-japan", "EUC-JP"},
110 {"euc-japan-1990", "EUC-JP"}, // Emacs
111 {"euc-jp", "EUC-JP"},
112 {"euc-korea", "EUC-KR"},
113 {"euc-kr", "EUC-KR"},
114 {"gb2312", "GB2312"},
115 {"greek-iso-8bit", "ISO-8859-7"},
116 {"iso-10646/utf8", "UTF-8"}, // alias
117 {"iso-10646/utf-8", "UTF-8"}, // alias
118 {"iso-8859-1", "ISO-8859-1"},
119 {"iso-8859-13", "ISO-8859-13"}, // Emacs
120 {"iso-8859-15", "ISO-8859-15"},
121 {"iso-8859-2", "ISO-8859-2"},
122 {"iso-8859-5", "ISO-8859-5"},
123 {"iso-8859-7", "ISO-8859-7"},
124 {"iso-8859-9", "ISO-8859-9"},
125 {"iso-latin-1", "ISO-8859-1"},
126 {"iso-latin-2", "ISO-8859-2"}, // Emacs
127 {"iso-latin-5", "ISO-8859-9"}, // Emacs
128 {"iso-latin-7", "ISO-8859-13"}, // Emacs
129 {"iso-latin-9", "ISO-8859-15"}, // Emacs
130 {"japanese-iso-8bit", "EUC-JP"}, // Emacs
131 {"japanese-euc", "EUC-JP"}, // XEmacs
132 {"jis8", "EUC-JP"}, // XEmacs
133 {"koi8", "KOI8-R"}, // not KOI8!, Emacs
134 {"koi8-r", "KOI8-R"},
135 {"korean-euc", "EUC-KR"}, // XEmacs
136 {"korean-iso-8bit", "EUC-KR"}, // Emacs
137 {"latin1", "ISO-8859-1"}, // alias
138 {"latin-0", "ISO-8859-15"}, // Emacs
139 {"latin-1", "ISO-8859-1"}, // Emacs
140 {"latin-2", "ISO-8859-2"}, // Emacs
141 {"latin-5", "ISO-8859-9"}, // Emacs
142 {"latin-7", "ISO-8859-13"}, // Emacs
143 {"latin-9", "ISO-8859-15"}, // Emacs
144 {"mule-utf-16", "UTF-16"}, // Emacs
145 {"mule-utf-16be", "UTF-16BE"}, // Emacs
146 {"mule-utf-16-be", "UTF-16BE"}, // Emacs
147 {"mule-utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
148 {"mule-utf-16le", "UTF-16LE"}, // Emacs
149 {"mule-utf-16-le", "UTF-16LE"}, // Emacs
150 {"mule-utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
151 {"mule-utf-8", "UTF-8"}, // Emacs
152 {"us-ascii", "US-ASCII"}, // Emacs
153 {"utf8", "UTF-8"}, // alias
154 {"utf-16", "UTF-16"}, // Emacs
155 {"utf-16be", "UTF-16BE"}, // Emacs
156 {"utf-16-be", "UTF-16BE"}, // Emacs
157 {"utf-16be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
158 {"utf-16-be-with-signature", "UTF-16"}, // Emacs, not UTF-16BE
159 {"utf-16le", "UTF-16LE"}, // Emacs
160 {"utf-16-le", "UTF-16LE"}, // Emacs
161 {"utf-16le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
162 {"utf-16-le-with-signature", "UTF-16"}, // Emacs, not UTF-16LE
163 {"utf-8", "UTF-8"}, // Emacs
165 // {"alternativnyj", ""}, // ?
166 // {"arabic-iso-8bit", "ISO-8859-6"}, // Emacs
167 // {"binary", ""}, // --
168 // {"chinese-hz", "HZ-GB-2312"}, // Emacs
169 // {"chinese-iso-7bit", "ISO-2022-CN"}, // Emacs
170 // {"chinese-iso-8bit-with-esc", ""}, // --
171 // {"compound-text", ""}, // --
172 // {"compound-text-with-extension", ""}, // --
173 // {"cp1125", "cp1125"}, // *
174 // {"cp1250", "windows-1250"},// Emacs
175 // {"cp1251", "windows-1251"},// Emacs
176 // {"cp1252", "windows-1252"},// Emacs
177 // {"cp1253", "windows-1253"},// Emacs
178 // {"cp1254", "windows-1254"},// Emacs
179 // {"cp1255", "windows-1255"},// Emacs
180 // {"cp1256", "windows-1256"},// Emacs
181 // {"cp1257", "windows-1257"},// Emacs
182 // {"cp1258", "windows-1258"},// Emacs
183 // {"cp437", "cp437"}, // Emacs
184 // {"cp720", ""}, // not covered
185 // {"cp737", "cp737"}, // *, Emacs
186 // {"cp775", "cp775"}, // Emacs
187 // {"cp850", "cp850"}, // Emacs
188 // {"cp851", "cp851"}, // Emacs
189 // {"cp852", "cp852"}, // Emacs
190 // {"cp855", "cp855"}, // Emacs
191 // {"cp857", "cp857"}, // Emacs
192 // {"cp860", "cp860"}, // Emacs
193 // {"cp861", "cp861"}, // Emacs
194 // {"cp862", "cp862"}, // Emacs
195 // {"cp863", "cp863"}, // Emacs
196 // {"cp864", "cp864"}, // Emacs
197 // {"cp865", "cp865"}, // Emacs
198 // {"cp866", "cp866"}, // Emacs
199 // {"cp866u", "cp1125"}, // *, Emacs
200 // {"cp869", "cp869"}, // Emacs
201 // {"cp874", "cp874"}, // *, Emacs
202 // {"cp932", "cp932"}, // *, Emacs
203 // {"cp936", "cp936"}, // Emacs
204 // {"cp949", "cp949"}, // *, Emacs
205 // {"cp950", "cp950"}, // *, Emacs
206 // {"ctext", ""}, // --
207 // {"ctext-no-compositions", ""}, // --
208 // {"ctext-with-extensions", ""}, // --
209 // {"cyrillic-alternativnyj", ""}, // ?, Emacs
210 // {"cyrillic-iso-8bit-with-esc", ""}, // --
211 // {"cyrillic-koi8-t", "KOI8-T"}, // *, Emacs
212 // {"devanagari", ""}, // not covered
213 // {"dos", ""}, // --
214 // {"emacs-mule", ""}, // --
215 // {"euc-jisx0213", "EUC-JISX0213"},// *, XEmacs?
216 // {"euc-jisx0213-with-esc", ""}, // XEmacs?
217 // {"euc-taiwan", "EUC-TW"}, // *, Emacs
218 // {"euc-tw", "EUC-TW"}, // *, Emacs
219 // {"georgian-ps", "GEORGIAN-PS"}, // *, Emacs
220 // {"greek-iso-8bit-with-esc", ""}, // --
221 // {"hebrew-iso-8bit", "ISO-8859-8"}, // Emacs
222 // {"hebrew-iso-8bit-with-esc", ""}, // --
223 // {"hz", "HZ-GB-2312"},
224 // {"hz-gb-2312", "HZ-GB-2312"},
225 // {"in-is13194", ""}, // not covered
226 // {"in-is13194-devanagari", ""}, // not covered
227 // {"in-is13194-with-esc", ""}, // --
228 // {"iso-2022-7", ""}, // XEmacs?
229 // {"iso-2022-7bit", ""}, // --
230 // {"iso-2022-7bit-lock", ""}, // --
231 // {"iso-2022-7bit-lock-ss2", ""}, // --
232 // {"iso-2022-7bit-ss2", ""}, // --
233 // {"iso-2022-8", ""}, // XEmacs?
234 // {"iso-2022-8bit", ""}, // XEmacs?
235 // {"iso-2022-8bit-lock", ""}, // XEmacs?
236 // {"iso-2022-8bit-lock-ss2", ""}, // XEmacs?
237 // {"iso-2022-8bit-ss2", ""}, // --
238 // {"iso-2022-cjk", ""}, // --
239 // {"iso-2022-cn", "ISO-2022-CN"}, // Emacs
240 // {"iso-2022-cn-ext", "ISO-2022-CN-EXT"},// Emacs
241 // {"iso-2022-int-1", ""}, // --
242 // {"iso-2022-jp", "ISO-2022-JP"},
243 // {"iso-2022-jp-1978-irv", "ISO-2022-JP"},
244 // {"iso-2022-jp-2", "ISO-2022-JP-2"},
245 // {"iso-2022-jp-3", "ISO-2022-JP-3"},// *, XEmacs?
246 // {"iso-2022-jp-3-compatible", ""}, // XEmacs?
247 // {"iso-2022-jp-3-strict", "ISO-2022-JP-3"},// *, XEmacs?
248 // {"iso-2022-kr", "ISO-2022-KR"},
249 // {"iso-2022-lock", ""}, // XEmacs?
250 // {"iso-8859-10", "ISO-8859-10"}, // Emacs
251 // {"iso-8859-11", "ISO-8859-11"}, // *, Emacs
252 // {"iso-8859-14", "ISO-8859-14"}, // Emacs
253 // {"iso-8859-16", "ISO-8859-16"},
254 // {"iso-8859-3", "ISO-8859-3"},
255 // {"iso-8859-4", "ISO-8859-4"},
256 // {"iso-8859-6", "ISO-8859-6"},
257 // {"iso-8859-8", "ISO-8859-8"},
258 // {"iso-8859-8-e", "ISO-8859-8"},
259 // {"iso-8859-8-i", "ISO-8859-8"}, // Emacs
260 // {"iso-latin-10", "ISO-8859-16"}, // Emacs
261 // {"iso-latin-1-with-esc", ""}, // --
262 // {"iso-latin-2-with-esc", ""}, // --
263 // {"iso-latin-3", "ISO-8859-3"}, // Emacs
264 // {"iso-latin-3-with-esc", ""}, // --
265 // {"iso-latin-4", "ISO-8859-4"}, // Emacs
266 // {"iso-latin-4-with-esc", ""}, // --
267 // {"iso-latin-5-with-esc", ""}, // --
268 // {"iso-latin-6", "ISO-8859-10"}, // Emacs
269 // {"iso-latin-8", "ISO-8859-14"}, // Emacs
270 // {"iso-safe", ""}, // --
271 // {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, // Emacs
272 // {"japanese-iso-8bit-with-esc", ""}, // --
273 // {"japanese-shift-jis", "Shift_JIS"}, // Emacs
274 // {"japanese-shift-jisx0213", ""}, // XEmacs?
275 // {"jis7", "ISO-2022-JP"}, // Xemacs
276 // {"junet", "ISO-2022-JP"},
277 // {"koi8-t", "KOI8-T"}, // *, Emacs
278 // {"koi8-u", "KOI8-U"}, // Emacs
279 // {"korean-iso-7bit-lock", "ISO-2022-KR"},
280 // {"korean-iso-8bit-with-esc", ""}, // --
281 // {"lao", ""}, // not covered
282 // {"lao-with-esc", ""}, // --
283 // {"latin-10", "ISO-8859-16"}, // Emacs
284 // {"latin-3", "ISO-8859-3"}, // Emacs
285 // {"latin-4", "ISO-8859-4"}, // Emacs
286 // {"latin-6", "ISO-8859-10"}, // Emacs
287 // {"latin-8", "ISO-8859-14"}, // Emacs
288 // {"mac", ""}, // --
289 // {"mac-roman", "MACINTOSH"}, // Emacs
290 // {"mik", ""}, // not covered
291 // {"next", "NEXTSTEP"}, // *, Emacs
292 // {"no-conversion", ""}, // --
293 // {"old-jis", "ISO-2022-JP"},
294 // {"pt154", "PT154"}, // Emacs
295 // {"raw-text", ""}, // --
296 // {"ruscii", "cp1125"}, // *, Emacs
297 // {"shift-jis", "Shift_JIS"}, // XEmacs
298 // {"shift_jis", "Shift_JIS"},
299 // {"shift_jisx0213", "Shift_JISX0213"},// *, XEmacs?
300 // {"sjis", "Shift_JIS"}, // Emacs
301 // {"tcvn", "TCVN"}, // *, Emacs
302 // {"tcvn-5712", "TCVN"}, // *, Emacs
303 // {"thai-tis620", "TIS-620"},
304 // {"thai-tis620-with-esc", ""}, // --
305 // {"th-tis620", "TIS-620"},
306 // {"tibetan", ""}, // not covered
307 // {"tibetan-iso-8bit", ""}, // not covered
308 // {"tibetan-iso-8bit-with-esc", ""}, // --
309 // {"tis-620", "TIS-620"},
310 // {"tis620", "TIS-620"},
311 // {"undecided", ""}, // --
312 // {"unix", ""}, // --
313 // {"utf-7", "UTF-7"}, // Emacs
314 // {"utf-7-safe", ""}, // XEmacs?
315 // {"utf-8-ws", "UTF-8"}, // XEmacs?
316 // {"vietnamese-tcvn", "TCVN"}, // *, Emacs
317 // {"vietnamese-viqr", "VIQR"}, // not covered
318 // {"vietnamese-viscii", "VISCII"},
319 // {"vietnamese-vscii", ""}, // not covered
320 // {"viqr", "VIQR"}, // not covered
321 // {"viscii", "VISCII"},
322 // {"vscii", ""}, // not covered
323 // {"windows-037", ""}, // not covered
324 // {"windows-10000", ""}, // not covered
325 // {"windows-10001", ""}, // not covered
326 // {"windows-10006", ""}, // not covered
327 // {"windows-10007", ""}, // not covered
328 // {"windows-10029", ""}, // not covered
329 // {"windows-10079", ""}, // not covered
330 // {"windows-10081", ""}, // not covered
331 // {"windows-1026", ""}, // not covered
332 // {"windows-1200", ""}, // not covered
333 // {"windows-1250", "windows-1250"},
334 // {"windows-1251", "windows-1251"},
335 // {"windows-1252", "windows-1252"},
336 // {"windows-1253", "windows-1253"},
337 // {"windows-1254", "windows-1254"},
338 // {"windows-1255", "windows-1255"},
339 // {"windows-1256", "windows-1256"},
340 // {"windows-1257", "windows-1257"},
341 // {"windows-1258", "windows-1258"},
342 // {"windows-1361", "cp1361"}, // *, XEmacs
343 // {"windows-437", "cp437"}, // XEmacs
344 // {"windows-500", ""}, // not covered
345 // {"windows-708", ""}, // not covered
346 // {"windows-709", ""}, // not covered
347 // {"windows-710", ""}, // not covered
348 // {"windows-720", ""}, // not covered
349 // {"windows-737", "cp737"}, // *, XEmacs
350 // {"windows-775", "cp775"}, // XEmacs
351 // {"windows-850", "cp850"}, // XEmacs
352 // {"windows-852", "cp852"}, // XEmacs
353 // {"windows-855", "cp855"}, // XEmacs
354 // {"windows-857", "cp857"}, // XEmacs
355 // {"windows-860", "cp860"}, // XEmacs
356 // {"windows-861", "cp861"}, // XEmacs
357 // {"windows-862", "cp862"}, // XEmacs
358 // {"windows-863", "cp863"}, // XEmacs
359 // {"windows-864", "cp864"}, // XEmacs
360 // {"windows-865", "cp865"}, // XEmacs
361 // {"windows-866", "cp866"}, // XEmacs
362 // {"windows-869", "cp869"}, // XEmacs
363 // {"windows-874", "cp874"}, // XEmacs
364 // {"windows-875", ""}, // not covered
365 // {"windows-932", "cp932"}, // *, XEmacs
366 // {"windows-936", "cp936"}, // XEmacs
367 // {"windows-949", "cp949"}, // *, XEmacs
368 // {"windows-950", "cp950"}, // *, XEmacs
369 // {"x-ctext", ""}, // --
370 // {"x-ctext-with-extensions", ""}, // --
372 {NULL, NULL},
375 // ---------------------------------------------------------
376 // Convert encoding name from emacs to mime.
377 // ---------------------------------------------------------
378 char *
379 emacs2mime(char *emacs_enc)
381 int emacs_enc_len = strlen(emacs_enc);
382 if (emacs_enc_len > 4
383 && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
384 emacs_enc[emacs_enc_len - 4] = 0;
385 if (emacs_enc_len > 4
386 && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
387 emacs_enc[emacs_enc_len - 4] = 0;
388 if (emacs_enc_len > 5
389 && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
390 emacs_enc[emacs_enc_len - 5] = 0;
391 for (const conversion *table = emacs_to_mime; table->from; table++)
392 if (!strcasecmp(emacs_enc, table->from))
393 return (char *)table->to;
394 return emacs_enc;
// ---------------------------------------------------------
// Write code point `u' to stdout: values below 0x80 are
// written as a single character, everything else as the
// escape `\[uXXXX]' (at least four uppercase hex digits).
// ---------------------------------------------------------
inline void
unicode_entity(int u)
{
  if (u < 0x80) {
    putchar(u);
    return;
  }
  // Handle soft hyphen specially -- it is an input character only,
  // not a glyph.
  if (u == 0xAD) {
    putchar('\\');
    putchar('%');
  }
  else
    printf("\\[u%04X]", u);
}
417 // ---------------------------------------------------------
418 // Conversion functions. All functions take `data', which
419 // normally holds the first two lines, and a file pointer.
420 // ---------------------------------------------------------
422 // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
423 void
424 conversion_latin1(file_case *fcp, const string &data)
426 int len = data.length();
427 const unsigned char *ptr = (const unsigned char *)data.contents();
428 for (int i = 0; i < len; i++)
429 unicode_entity(ptr[i]);
430 int c = -1;
431 while ((c = fcp->get_c()) != EOF)
432 unicode_entity(c);
435 // A future version of groff shall support UTF-8 natively.
436 // In this case, the UTF-8 stuff here in this file will be
437 // moved to the troff program.
439 struct utf8 {
440 file_case *_fcp;
441 unsigned char s[6];
442 enum {
443 FIRST = 0,
444 SECOND,
445 THIRD,
446 FOURTH,
447 FIFTH,
448 SIXTH
449 } byte;
450 int expected_bytes;
451 int invalid_warning;
452 int incomplete_warning;
453 utf8(file_case *);
454 ~utf8();
455 void add(unsigned char);
456 void invalid();
457 void incomplete();
460 utf8::utf8(file_case *fcp) : _fcp(fcp), byte(FIRST), expected_bytes(1),
461 invalid_warning(1), incomplete_warning(1)
463 // empty
466 utf8::~utf8()
468 if (byte != FIRST)
469 incomplete();
472 inline void
473 utf8::add(unsigned char c)
475 s[byte] = c;
476 if (byte == FIRST) {
477 if (c < 0x80)
478 unicode_entity(c);
479 else if (c < 0xC0)
480 invalid();
481 else if (c < 0xE0) {
482 expected_bytes = 2;
483 byte = SECOND;
485 else if (c < 0xF0) {
486 expected_bytes = 3;
487 byte = SECOND;
489 else if (c < 0xF8) {
490 expected_bytes = 4;
491 byte = SECOND;
493 else if (c < 0xFC) {
494 expected_bytes = 5;
495 byte = SECOND;
497 else if (c < 0xFE) {
498 expected_bytes = 6;
499 byte = SECOND;
501 else
502 invalid();
503 return;
505 if (c < 0x80 || c > 0xBF) {
506 incomplete();
507 add(c);
508 return;
510 switch (byte) {
511 case FIRST:
512 // can't happen
513 break;
514 case SECOND:
515 if (expected_bytes == 2) {
516 if (s[0] < 0xC2)
517 invalid();
518 else
519 unicode_entity(((s[0] & 0x1F) << 6)
520 | (s[1] ^ 0x80));
521 byte = FIRST;
523 else
524 byte = THIRD;
525 break;
526 case THIRD:
527 if (expected_bytes == 3) {
528 if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
529 invalid();
530 else
531 unicode_entity(((s[0] & 0x1F) << 12)
532 | ((s[1] ^ 0x80) << 6)
533 | (s[2] ^ 0x80));
534 byte = FIRST;
536 else
537 byte = FOURTH;
538 break;
539 case FOURTH:
540 // We reject everything greater than 0x10FFFF.
541 if (expected_bytes == 4) {
542 if (!((s[0] >= 0xF1 || s[1] >= 0x90)
543 && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
544 invalid();
545 else
546 unicode_entity(((s[0] & 0x07) << 18)
547 | ((s[1] ^ 0x80) << 12)
548 | ((s[2] ^ 0x80) << 6)
549 | (s[3] ^ 0x80));
550 byte = FIRST;
552 else
553 byte = FIFTH;
554 break;
555 case FIFTH:
556 if (expected_bytes == 5) {
557 invalid();
558 byte = FIRST;
560 else
561 byte = SIXTH;
562 break;
563 case SIXTH:
564 invalid();
565 byte = FIRST;
566 break;
570 void
571 utf8::invalid()
573 if (debug_flag && invalid_warning) {
574 fprintf(stderr, " invalid byte(s) found in input stream --\n"
575 " each such sequence replaced with 0xFFFD\n");
576 invalid_warning = 0;
578 unicode_entity(0xFFFD);
579 byte = FIRST;
582 void
583 utf8::incomplete()
585 if (debug_flag && incomplete_warning) {
586 fprintf(stderr, " incomplete sequence(s) found in input stream --\n"
587 " each such sequence replaced with 0xFFFD\n");
588 incomplete_warning = 0;
590 unicode_entity(0xFFFD);
591 byte = FIRST;
594 // Conversion from UTF-8 to Unicode.
595 void
596 conversion_utf8(file_case *fcp, const string &data)
598 utf8 u(fcp);
599 int len = data.length();
600 const unsigned char *ptr = (const unsigned char *)data.contents();
601 for (int i = 0; i < len; i++)
602 u.add(ptr[i]);
603 int c = -1;
604 while ((c = fcp->get_c()) != EOF)
605 u.add(c);
606 return;
609 // Conversion from cp1047 (EBCDIC) to UTF-8.
610 void
611 conversion_cp1047(file_case *fcp, const string &data)
613 static unsigned char cp1047[] = {
614 0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, // 0x00
615 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
616 0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, // 0x10
617 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
618 0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, // 0x20
619 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
620 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04, // 0x30
621 0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
622 0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5, // 0x40
623 0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
624 0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF, // 0x50
625 0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
626 0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5, // 0x60
627 0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
628 0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF, // 0x70
629 0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
630 0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, // 0x80
631 0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
632 0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, // 0x90
633 0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
634 0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, // 0xA0
635 0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
636 0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC, // 0xB0
637 0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
638 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, // 0xC0
639 0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
640 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, // 0xD0
641 0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
642 0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, // 0xE0
643 0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
644 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, // 0xF0
645 0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
647 int len = data.length();
648 const unsigned char *ptr = (const unsigned char *)data.contents();
649 for (int i = 0; i < len; i++)
650 unicode_entity(cp1047[ptr[i]]);
651 int c = -1;
652 while ((c = fcp->get_c()) != EOF)
653 unicode_entity(cp1047[c]);
656 // Locale-sensible conversion.
657 #if HAVE_ICONV
658 void
659 conversion_iconv(file_case *fcp, const string &data, char *enc)
661 iconv_t handle = iconv_open(UNICODE, enc);
662 if (handle == (iconv_t)-1) {
663 if (errno == EINVAL) {
664 error("encoding system `%1' not supported by iconv()", enc);
665 return;
667 fatal("iconv_open failed");
669 char inbuf[BUFSIZ];
670 int outbuf[BUFSIZ];
671 char *outptr = (char *)outbuf;
672 size_t outbytes_left = BUFSIZ * sizeof (int);
673 // Handle `data'.
674 char *inptr = (char *)data.contents();
675 size_t inbytes_left = data.length();
676 char *limit;
677 while (inbytes_left > 0) {
678 size_t status = iconv(handle,
679 (ICONV_CONST char **)&inptr, &inbytes_left,
680 &outptr, &outbytes_left);
681 if (status == (size_t)-1) {
682 if (errno == EILSEQ) {
683 // Invalid byte sequence. XXX
684 inptr++;
685 inbytes_left--;
687 else if (errno == E2BIG) {
688 // Output buffer is full.
689 limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
690 for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
691 unicode_entity(*ptr);
692 memmove(outbuf, outptr, outbytes_left);
693 outptr = (char *)outbuf + outbytes_left;
694 outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
696 else if (errno == EINVAL) {
697 // `data' ends with partial input sequence.
698 memcpy(inbuf, inptr, inbytes_left);
699 break;
703 // Handle `fp' and switch to `inbuf'.
704 size_t read_bytes;
705 char *read_start = inbuf + inbytes_left;
706 while ((read_bytes = fcp->get_buf(read_start, BUFSIZ - inbytes_left)) > 0) {
707 inptr = inbuf;
708 inbytes_left += read_bytes;
709 while (inbytes_left > 0) {
710 size_t status = iconv(handle,
711 (ICONV_CONST char **)&inptr, &inbytes_left,
712 &outptr, &outbytes_left);
713 if (status == (size_t)-1) {
714 if (errno == EILSEQ) {
715 // Invalid byte sequence. XXX
716 inptr++;
717 inbytes_left--;
719 else if (errno == E2BIG) {
720 // Output buffer is full.
721 limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
722 for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
723 unicode_entity(*ptr);
724 memmove(outbuf, outptr, outbytes_left);
725 outptr = (char *)outbuf + outbytes_left;
726 outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
728 else if (errno == EINVAL) {
729 // `inbuf' ends with partial input sequence.
730 memmove(inbuf, inptr, inbytes_left);
731 break;
735 read_start = inbuf + inbytes_left;
737 iconv_close(handle);
738 // XXX use ferror?
739 limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
740 for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
741 unicode_entity(*ptr);
743 #endif /* HAVE_ICONV */
745 // ---------------------------------------------------------
746 // Handle Byte Order Mark.
748 // Since we have a chicken-and-egg problem it's necessary
749 // to handle the BOM manually if it is in the data stream.
750 // As documented in the Unicode book it is very unlikely
751 // that any normal text file (regardless of the encoding)
752 // starts with the bytes which represent a BOM.
754 // Return the BOM in string `BOM'; `data' then starts with
755 // the byte after the BOM. This function reads (at most)
756 // four bytes from the data stream.
758 // Return encoding if a BOM is found, NULL otherwise.
759 // ---------------------------------------------------------
760 const char *
761 get_BOM(file_case *fcp, string &BOM, string &data)
763 // The BOM is U+FEFF. We have thus the following possible
764 // representations.
766 // UTF-8: 0xEFBBBF
767 // UTF-16: 0xFEFF or 0xFFFE
768 // UTF-32: 0x0000FEFF or 0xFFFE0000
769 static struct {
770 int len;
771 const char *str;
772 const char *name;
773 } BOM_table[] = {
774 {4, "\x00\x00\xFE\xFF", "UTF-32"},
775 {4, "\xFF\xFE\x00\x00", "UTF-32"},
776 {3, "\xEF\xBB\xBF", "UTF-8"},
777 {2, "\xFE\xFF", "UTF-16"},
778 {2, "\xFF\xFE", "UTF-16"},
780 const int BOM_table_len = sizeof (BOM_table) / sizeof (BOM_table[0]);
781 char BOM_string[4];
782 const char *retval = NULL;
783 int len;
784 for (len = 0; len < 4; len++) {
785 int c = fcp->get_c();
786 if (c == EOF)
787 break;
788 BOM_string[len] = char(c);
790 int i;
791 for (i = 0; i < BOM_table_len; i++) {
792 if (BOM_table[i].len <= len
793 && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0)
794 break;
796 int j = 0;
797 if (i < BOM_table_len) {
798 for (; j < BOM_table[i].len; j++)
799 BOM += BOM_string[j];
800 retval = BOM_table[i].name;
802 for (; j < len; j++)
803 data += BOM_string[j];
804 return retval;
807 // ---------------------------------------------------------
808 // Get first two lines from input stream.
810 // Return string (allocated with `new') without zero bytes
811 // or NULL in case no coding tag can occur in the data
812 // (which is stored unmodified in `data').
813 // ---------------------------------------------------------
814 char *
815 get_tag_lines(file_case *fcp, string &data)
817 int newline_count = 0;
818 int c, prev = -1;
819 // Handle CR, LF, and CRLF as line separators.
820 for (int i = 0; i < data.length(); i++) {
821 c = data[i];
822 if (c == '\n' || c == '\r')
823 newline_count++;
824 if (c == '\n' && prev == '\r')
825 newline_count--;
826 prev = c;
828 if (newline_count > 1)
829 return NULL;
830 int emit_warning = 1;
831 for (int lines = newline_count; lines < 2; lines++) {
832 while ((c = fcp->get_c()) != EOF) {
833 if (c == '\0' && debug_flag && emit_warning) {
834 fprintf(stderr,
835 " null byte(s) found in input stream --\n"
836 " search for coding tag might return false result\n");
837 emit_warning = 0;
839 data += char(c);
840 if (c == '\n' || c == '\r')
841 break;
843 // Handle CR, LF, and CRLF as line separators.
844 if (c == '\r') {
845 c = fcp->get_c();
846 if (c != EOF && c != '\n')
847 fcp->unget_c(c);
848 else
849 data += char(c);
852 return data.extract();
// ---------------------------------------------------------
// Check whether C string starts with a comment.
//
// Recognized forms are a control character (`.' or "'"),
// optional blanks and tabs, and then `\"' or `\#'; or `\#'
// directly at the start of the string.
//
// Return 1 if true, 0 otherwise (also for NULL and for the
// empty string).
// ---------------------------------------------------------
int
is_comment_line(char *s)
{
  if (!s || !*s)
    return 0;
  if (*s == '.' || *s == '\'') {
    s++;
    while (*s == ' ' || *s == '\t')
      s++;
    if (*s == '\\') {
      s++;
      if (*s == '"' || *s == '#')
        return 1;
    }
  }
  else if (*s == '\\') {
    // Without a control character only `\#' starts a comment line.
    s++;
    if (*s == '#')
      return 1;
  }
  return 0;
}
886 // ---------------------------------------------------------
887 // Get a value/variable pair from a local variables list
888 // in a C string which look like this:
890 // <variable1>: <value1>; <variable2>: <value2>; ...
892 // Leading and trailing blanks are ignored. There might be
893 // more than one blank after `:' and `;'.
895 // Return position of next value/variable pair or NULL if
896 // at end of data.
897 // ---------------------------------------------------------
898 char *
899 get_variable_value_pair(char *d1, char **variable, char **value)
901 static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
902 *variable = var;
903 *value = val;
904 while (*d1 == ' ' || *d1 == '\t')
905 d1++;
906 // Get variable.
907 int l = 0;
908 while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1))
909 var[l++] = *(d1++);
910 var[l] = 0;
911 // Skip everything until `:', `;', or end of data.
912 while (*d1 && *d1 != ':' && *d1 != ';')
913 d1++;
914 val[0] = 0;
915 if (!*d1)
916 return NULL;
917 if (*d1 == ';')
918 return d1 + 1;
919 d1++;
920 while (*d1 == ' ' || *d1 == '\t')
921 d1++;
922 // Get value.
923 l = 0;
924 while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1))
925 val[l++] = *(d1++);
926 val[l] = 0;
927 // Skip everything until `;' or end of data.
928 while (*d1 && *d1 != ';')
929 d1++;
930 if (*d1 == ';')
931 return d1 + 1;
932 return NULL;
935 // ---------------------------------------------------------
936 // Check coding tag in the read buffer.
938 // We search for the following line:
940 // <comment> ... -*-<local variables list>-*-
942 // (`...' might be anything).
944 // <comment> can be one of the following syntax forms at the
945 // beginning of the line:
947 // .\" .\# '\" '\# \#
949 // There can be whitespace after the leading `.' or "'".
951 // The local variables list must occur within the first
952 // comment block at the very beginning of the data stream.
954 // Within the <local variables list>, we search for
956 // coding: <value>
958 // which specifies the coding system used for the data
959 // stream.
961 // Return <value> if found, NULL otherwise.
963 // Note that null bytes in the data are skipped before applying
964 // the algorithm. This should work even with files encoded as
965 // UTF-16 or UTF-32 (or its siblings) in most cases.
967 // XXX Add support for tag at the end of buffer.
968 // ---------------------------------------------------------
969 char *
970 check_coding_tag(file_case *fcp, string &data)
972 char *inbuf = get_tag_lines(fcp, data);
973 char *lineend;
974 for (char *p = inbuf; is_comment_line(p); p = lineend + 1) {
975 if ((lineend = strchr(p, '\n')) == NULL)
976 break;
977 *lineend = 0; // switch temporarily to '\0'
978 char *d1 = strstr(p, "-*-");
979 char *d2 = 0;
980 if (d1)
981 d2 = strstr(d1 + 3, "-*-");
982 *lineend = '\n'; // restore newline
983 if (!d1 || !d2)
984 continue;
985 *d2 = 0; // switch temporarily to '\0'
986 d1 += 3;
987 while (d1) {
988 char *variable, *value;
989 d1 = get_variable_value_pair(d1, &variable, &value);
990 if (!strcasecmp(variable, "coding")) {
991 *d2 = '-'; // restore '-'
992 a_delete inbuf;
993 return value;
996 *d2 = '-'; // restore '-'
998 a_delete inbuf;
999 return NULL;
1002 // ---------------------------------------------------------
1003 // Handle an input file. If filename is `-' handle stdin.
1005 // Return 1 on success, 0 otherwise.
1006 // ---------------------------------------------------------
1008 do_file(const char *filename)
1010 if (debug_flag)
1011 fprintf(stderr, "file `%s':\n", filename);
1012 file_case *fcp;
1013 if ((fcp = file_case::muxer(filename, fcp->mux_need_binary)) == NULL) {
1014 assert(strcmp(filename, "-"));
1015 error("can't open `%1': %2", filename, strerror(errno));
1016 return 0;
1019 string BOM, data;
1020 const char *BOM_encoding = get_BOM(fcp, BOM, data);
1021 // Determine the encoding.
1022 char *encoding;
1023 if (user_encoding[0]) {
1024 if (debug_flag) {
1025 fprintf(stderr, " user-specified encoding `%s', "
1026 "no search for coding tag\n",
1027 user_encoding);
1028 if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
1029 fprintf(stderr, " but BOM in data stream implies encoding `%s'!\n",
1030 BOM_encoding);
1032 encoding = (char *)user_encoding;
1034 else if (BOM_encoding) {
1035 if (debug_flag)
1036 fprintf(stderr, " found BOM, no search for coding tag\n");
1037 encoding = (char *)BOM_encoding;
1039 else {
1040 // `check_coding_tag' returns a pointer to a static array (or NULL).
1041 char *file_encoding = check_coding_tag(fcp, data);
1042 if (!file_encoding) {
1043 if (debug_flag)
1044 fprintf(stderr, " no file encoding\n");
1045 file_encoding = default_encoding;
1047 else
1048 if (debug_flag)
1049 fprintf(stderr, " file encoding: `%s'\n", file_encoding);
1050 encoding = file_encoding;
1052 strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
1053 encoding_string[MAX_VAR_LEN - 1] = 0;
1054 encoding = encoding_string;
1055 // Translate from MIME & Emacs encoding names to locale encoding names.
1056 encoding = emacs2mime(encoding_string);
1057 if (encoding[0] == '\0') {
1058 error("encoding `%1' not supported, not a portable encoding",
1059 encoding_string);
1060 return 0;
1062 if (debug_flag)
1063 fprintf(stderr, " encoding used: `%s'\n", encoding);
1064 if (!raw_flag)
1065 printf(".lf 1 %s\n", filename);
1066 int success = 1;
1067 // Call converter (converters write to stdout).
1068 if (!strcasecmp(encoding, "ISO-8859-1"))
1069 conversion_latin1(fcp, BOM + data);
1070 else if (!strcasecmp(encoding, "UTF-8"))
1071 conversion_utf8(fcp, data);
1072 else if (!strcasecmp(encoding, "cp1047"))
1073 conversion_cp1047(fcp, BOM + data);
1074 else {
1075 #if HAVE_ICONV
1076 conversion_iconv(fcp, BOM + data, encoding);
1077 #else
1078 error("encoding system `%1' not supported", encoding);
1079 success = 0;
1080 #endif /* HAVE_ICONV */
1083 delete fcp;
1084 return success;
1087 // ---------------------------------------------------------
1088 // Print usage.
1089 // ---------------------------------------------------------
1090 void
1091 usage(FILE *stream)
1093 fprintf(stream, "usage: %s [ option ] [ files ]\n"
1094 "\n"
1095 "-d show debugging messages\n"
1096 "-D encoding specify default encoding\n"
1097 "-e encoding specify input encoding\n"
1098 "-h print this message\n"
1099 "-r don't add .lf requests\n"
1100 "-v print version number\n"
1101 "\n"
1102 "The default encoding is `%s'.\n",
1103 program_name, default_encoding);
1106 // ---------------------------------------------------------
1107 // Main routine.
1108 // ---------------------------------------------------------
1110 main(int argc, char **argv)
1112 program_name = argv[0];
1113 // Determine the default encoding. This must be done before
1114 // getopt() is called since the usage message shows the default
1115 // encoding.
1116 setlocale(LC_ALL, "");
1117 char *locale = getlocale(LC_CTYPE);
1118 if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
1119 strcpy(default_encoding, "latin1");
1120 else {
1121 strncpy(default_encoding, locale_charset(), MAX_VAR_LEN - 1);
1122 default_encoding[MAX_VAR_LEN - 1] = 0;
1125 program_name = argv[0];
1126 int opt;
1127 static const struct option long_options[] = {
1128 { "help", no_argument, 0, 'h' },
1129 { "version", no_argument, 0, 'v' },
1130 { NULL, 0, 0, 0 }
1132 // Parse the command line options.
1133 while ((opt = getopt_long(argc, argv,
1134 "dD:e:hrv", long_options, NULL)) != EOF)
1135 switch (opt) {
1136 case 'v':
1137 printf("GNU preconv (groff) version %s %s iconv support\n",
1138 Version_string,
1139 #ifdef HAVE_ICONV
1140 "with"
1141 #else
1142 "without"
1143 #endif /* HAVE_ICONV */
1145 exit(0);
1146 break;
1147 case 'd':
1148 debug_flag = 1;
1149 break;
1150 case 'e':
1151 if (optarg) {
1152 strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
1153 user_encoding[MAX_VAR_LEN - 1] = 0;
1155 else
1156 user_encoding[0] = 0;
1157 break;
1158 case 'D':
1159 if (optarg) {
1160 strncpy(default_encoding, optarg, MAX_VAR_LEN - 1);
1161 default_encoding[MAX_VAR_LEN - 1] = 0;
1163 break;
1164 case 'r':
1165 raw_flag = 1;
1166 break;
1167 case 'h':
1168 usage(stdout);
1169 exit(0);
1170 break;
1171 case '?':
1172 usage(stderr);
1173 exit(1);
1174 break;
1175 default:
1176 assert(0);
1178 int nbad = 0;
1179 if (debug_flag)
1180 fprintf(stderr, "default encoding: `%s'\n", default_encoding);
1181 if (optind >= argc)
1182 nbad += !do_file("-");
1183 else
1184 for (int i = optind; i < argc; i++)
1185 nbad += !do_file(argv[i]);
1186 if (ferror(stdout) || fflush(stdout) < 0)
1187 fatal("output error");
1188 return nbad != 0;
1191 /* end of preconv.cpp */