src/preproc/preconv/preconv.cpp

   1 // -*- C++ -*-
   2 /* Copyright (C) 2005, 2006, 2008
   3    Free Software Foundation, Inc.
   4      Written by Werner Lemberg (wl@gnu.org)
   5
   6 This file is part of groff.
   7
   8 groff is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 2, or (at your option) any later
  11 version.
  12
  13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License along
  19 with groff; see the file COPYING.  If not, write to the Free Software
  20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  21
  22 #include "lib.h"
  23
  24 #include <assert.h>
  25 #include <stdlib.h>
  26 #include <errno.h>
  27 #include "errarg.h"
  28 #include "error.h"
  29 #include "file_case.h"
  30 #include "localcharset.h"
  31 #include "nonposix.h"
  32 #include "stringclass.h"
  33
  34 #include <locale.h>
  35
  36 #if HAVE_ICONV
  37 # include <iconv.h>
  38 # ifdef WORDS_BIGENDIAN
  39 #  define UNICODE "UTF-32BE"
  40 # else
  41 #  define UNICODE "UTF-32LE"
  42 # endif
  43 #endif
  44
// Size of each of the fixed encoding-name buffers declared below.
#define MAX_VAR_LEN 100

// Declared here, defined elsewhere (C linkage).
extern "C" const char *Version_string;

// Buffers for encoding names.  They are not written to in the part of
// the file reviewed here.
// NOTE(review): the names suggest "locale default", "user supplied" and
// "taken from a coding tag" -- confirm against the code that fills them.
char default_encoding[MAX_VAR_LEN];
char user_encoding[MAX_VAR_LEN];
char encoding_string[MAX_VAR_LEN];
// Nonzero makes the UTF-8 decoder report invalid and incomplete input
// sequences on stderr (once per kind).
int debug_flag = 0;
// NOTE(review): not referenced in the part of the file reviewed here.
int raw_flag = 0;
  54
// One row of a name-translation table.
struct conversion {
  const char *from;	// name to look for; NULL marks the end of a table
  const char *to;	// name returned in its place
};
  59
  60 // The official list of MIME tags can be found at
  61 //
  62 //   http://www.iana.org/assignments/character-sets
  63 //
  64 // For encodings which don't have a MIME tag we use GNU iconv's encoding
  65 // names (which also work with the portable GNU libiconv package).  They
  66 // are marked with `*'.
  67 //
  68 // Encodings specific to XEmacs and Emacs are marked as such; no mark means
  69 // that they are used by both Emacs and XEmacs.
  70 //
  71 // Encodings marked with `--' are special to Emacs, XEmacs, or other
  72 // applications and shouldn't be used for data exchange.
  73 //
  74 // `Not covered' means that the encoding can be handled neither by GNU iconv
  75 // nor by libiconv, or just one of them has support for it.
  76 //
  77 // A special case is VIQR encoding: Despite having a MIME tag it is
  78 // missing in both libiconv 1.10 and iconv (coming with GNU libc 2.3.6).
  79 //
  80 // Finally, we add all aliases of GNU iconv for `ascii', `latin1', and
  81 // `utf8' to catch those encoding names before iconv is called.
  82 //
  83 // Note that most entries are commented out -- only a small, (rather)
  84 // reliable and stable subset of encodings is recognized (for coding tags)
  85 // which are still in greater use today (January 2006).  Most notably, all
  86 // Windows-specific encodings are not selected because they lack stability:
  87 // Microsoft has changed the mappings instead of creating new versions.
  88 //
  89 // Please contact the groff list if you find the selection inadequate.
  90
// Translation table from Emacs/XEmacs coding-system names (and a few
// iconv aliases) to the encoding names used for conversion.  The `from'
// names are compared case-insensitively by emacs2mime().  The list is
// terminated by a {NULL, NULL} row.
static const conversion
emacs_to_mime[] = {
  {"ascii",                             "US-ASCII"},    // Emacs
  {"big5",                              "Big5"},
  {"chinese-big5",                      "Big5"},        // Emacs
  {"chinese-euc",                       "GB2312"},      // XEmacs
  {"chinese-iso-8bit",                  "GB2312"},      // Emacs
  {"cn-big5",                           "Big5"},
  {"cn-gb",                             "GB2312"},      // Emacs
  {"cn-gb-2312",                        "GB2312"},
  {"cp878",                             "KOI8-R"},      // Emacs
  {"cp1047",                            "CP1047"},      // EBCDIC
  {"csascii",                           "US-ASCII"},    // alias
  {"csisolatin1",                       "ISO-8859-1"},  // alias
  {"cyrillic-iso-8bit",                 "ISO-8859-5"},  // Emacs
  {"cyrillic-koi8",                     "KOI8-R"},      // not KOI8!, Emacs
  {"euc-china",                         "GB2312"},      // Emacs
  {"euc-cn",                            "GB2312"},      // Emacs
  {"euc-japan",                         "EUC-JP"},
  {"euc-japan-1990",                    "EUC-JP"},      // Emacs
  {"euc-jp",                            "EUC-JP"},
  {"euc-korea",                         "EUC-KR"},
  {"euc-kr",                            "EUC-KR"},
  {"gb2312",                            "GB2312"},
  {"greek-iso-8bit",                    "ISO-8859-7"},
  {"iso-10646/utf8",                    "UTF-8"},       // alias
  {"iso-10646/utf-8",                   "UTF-8"},       // alias
  {"iso-8859-1",                        "ISO-8859-1"},
  {"iso-8859-13",                       "ISO-8859-13"}, // Emacs
  {"iso-8859-15",                       "ISO-8859-15"},
  {"iso-8859-2",                        "ISO-8859-2"},
  {"iso-8859-5",                        "ISO-8859-5"},
  {"iso-8859-7",                        "ISO-8859-7"},
  {"iso-8859-9",                        "ISO-8859-9"},
  {"iso-latin-1",                       "ISO-8859-1"},
  {"iso-latin-2",                       "ISO-8859-2"},  // Emacs
  {"iso-latin-5",                       "ISO-8859-9"},  // Emacs
  {"iso-latin-7",                       "ISO-8859-13"}, // Emacs
  {"iso-latin-9",                       "ISO-8859-15"}, // Emacs
  {"japanese-iso-8bit",                 "EUC-JP"},      // Emacs
  {"japanese-euc",                      "EUC-JP"},      // XEmacs
  {"jis8",                              "EUC-JP"},      // XEmacs
  {"koi8",                              "KOI8-R"},      // not KOI8!, Emacs
  {"koi8-r",                            "KOI8-R"},
  {"korean-euc",                        "EUC-KR"},      // XEmacs
  {"korean-iso-8bit",                   "EUC-KR"},      // Emacs
  {"latin1",                            "ISO-8859-1"},  // alias
  {"latin-0",                           "ISO-8859-15"}, // Emacs
  {"latin-1",                           "ISO-8859-1"},  // Emacs
  {"latin-2",                           "ISO-8859-2"},  // Emacs
  {"latin-5",                           "ISO-8859-9"},  // Emacs
  {"latin-7",                           "ISO-8859-13"}, // Emacs
  {"latin-9",                           "ISO-8859-15"}, // Emacs
  {"mule-utf-16",                       "UTF-16"},      // Emacs
  {"mule-utf-16be",                     "UTF-16BE"},    // Emacs
  {"mule-utf-16-be",                    "UTF-16BE"},    // Emacs
  {"mule-utf-16be-with-signature",      "UTF-16"},      // Emacs, not UTF-16BE
  {"mule-utf-16le",                     "UTF-16LE"},    // Emacs
  {"mule-utf-16-le",                    "UTF-16LE"},    // Emacs
  {"mule-utf-16le-with-signature",      "UTF-16"},      // Emacs, not UTF-16LE
  {"mule-utf-8",                        "UTF-8"},       // Emacs
  {"us-ascii",                          "US-ASCII"},    // Emacs
  {"utf8",                              "UTF-8"},       // alias
  {"utf-16",                            "UTF-16"},      // Emacs
  {"utf-16be",                          "UTF-16BE"},    // Emacs
  {"utf-16-be",                         "UTF-16BE"},    // Emacs
  {"utf-16be-with-signature",           "UTF-16"},      // Emacs, not UTF-16BE
  {"utf-16-be-with-signature",          "UTF-16"},      // Emacs, not UTF-16BE
  {"utf-16le",                          "UTF-16LE"},    // Emacs
  {"utf-16-le",                         "UTF-16LE"},    // Emacs
  {"utf-16le-with-signature",           "UTF-16"},      // Emacs, not UTF-16LE
  {"utf-16-le-with-signature",          "UTF-16"},      // Emacs, not UTF-16LE
  {"utf-8",                             "UTF-8"},       // Emacs

// The entries below are deliberately disabled; they are kept as a
// reference for names that are known but currently not recognized.
//  {"alternativnyj",                   ""},            // ?
//  {"arabic-iso-8bit",                 "ISO-8859-6"},  // Emacs
//  {"binary",                          ""},            // --
//  {"chinese-hz",                      "HZ-GB-2312"},  // Emacs
//  {"chinese-iso-7bit",                "ISO-2022-CN"}, // Emacs
//  {"chinese-iso-8bit-with-esc",       ""},            // --
//  {"compound-text",                   ""},            // --
//  {"compound-text-with-extension",    ""},            // --
//  {"cp1125",                          "cp1125"},      // *
//  {"cp1250",                          "windows-1250"},// Emacs
//  {"cp1251",                          "windows-1251"},// Emacs
//  {"cp1252",                          "windows-1252"},// Emacs
//  {"cp1253",                          "windows-1253"},// Emacs
//  {"cp1254",                          "windows-1254"},// Emacs
//  {"cp1255",                          "windows-1255"},// Emacs
//  {"cp1256",                          "windows-1256"},// Emacs
//  {"cp1257",                          "windows-1257"},// Emacs
//  {"cp1258",                          "windows-1258"},// Emacs
//  {"cp437",                           "cp437"},       // Emacs
//  {"cp720",                           ""},            // not covered
//  {"cp737",                           "cp737"},       // *, Emacs
//  {"cp775",                           "cp775"},       // Emacs
//  {"cp850",                           "cp850"},       // Emacs
//  {"cp851",                           "cp851"},       // Emacs
//  {"cp852",                           "cp852"},       // Emacs
//  {"cp855",                           "cp855"},       // Emacs
//  {"cp857",                           "cp857"},       // Emacs
//  {"cp860",                           "cp860"},       // Emacs
//  {"cp861",                           "cp861"},       // Emacs
//  {"cp862",                           "cp862"},       // Emacs
//  {"cp863",                           "cp863"},       // Emacs
//  {"cp864",                           "cp864"},       // Emacs
//  {"cp865",                           "cp865"},       // Emacs
//  {"cp866",                           "cp866"},       // Emacs
//  {"cp866u",                          "cp1125"},      // *, Emacs
//  {"cp869",                           "cp869"},       // Emacs
//  {"cp874",                           "cp874"},       // *, Emacs
//  {"cp932",                           "cp932"},       // *, Emacs
//  {"cp936",                           "cp936"},       // Emacs
//  {"cp949",                           "cp949"},       // *, Emacs
//  {"cp950",                           "cp950"},       // *, Emacs
//  {"ctext",                           ""},            // --
//  {"ctext-no-compositions",           ""},            // --
//  {"ctext-with-extensions",           ""},            // --
//  {"cyrillic-alternativnyj",          ""},            // ?, Emacs
//  {"cyrillic-iso-8bit-with-esc",      ""},            // --
//  {"cyrillic-koi8-t",                 "KOI8-T"},      // *, Emacs
//  {"devanagari",                      ""},            // not covered
//  {"dos",                             ""},            // --
//  {"emacs-mule",                      ""},            // --
//  {"euc-jisx0213",                    "EUC-JISX0213"},// *, XEmacs?
//  {"euc-jisx0213-with-esc",           ""},            // XEmacs?
//  {"euc-taiwan",                      "EUC-TW"},      // *, Emacs
//  {"euc-tw",                          "EUC-TW"},      // *, Emacs
//  {"georgian-ps",                     "GEORGIAN-PS"}, // *, Emacs
//  {"greek-iso-8bit-with-esc",         ""},            // --
//  {"hebrew-iso-8bit",                 "ISO-8859-8"},  // Emacs
//  {"hebrew-iso-8bit-with-esc",        ""},            // --
//  {"hz",                              "HZ-GB-2312"},
//  {"hz-gb-2312",                      "HZ-GB-2312"},
//  {"in-is13194",                      ""},            // not covered
//  {"in-is13194-devanagari",           ""},            // not covered
//  {"in-is13194-with-esc",             ""},            // --
//  {"iso-2022-7",                      ""},            // XEmacs?
//  {"iso-2022-7bit",                   ""},            // --
//  {"iso-2022-7bit-lock",              ""},            // --
//  {"iso-2022-7bit-lock-ss2",          ""},            // --
//  {"iso-2022-7bit-ss2",               ""},            // --
//  {"iso-2022-8",                      ""},            // XEmacs?
//  {"iso-2022-8bit",                   ""},            // XEmacs?
//  {"iso-2022-8bit-lock",              ""},            // XEmacs?
//  {"iso-2022-8bit-lock-ss2",          ""},            // XEmacs?
//  {"iso-2022-8bit-ss2",               ""},            // --
//  {"iso-2022-cjk",                    ""},            // --
//  {"iso-2022-cn",                     "ISO-2022-CN"}, // Emacs
//  {"iso-2022-cn-ext",                 "ISO-2022-CN-EXT"},// Emacs
//  {"iso-2022-int-1",                  ""},            // --
//  {"iso-2022-jp",                     "ISO-2022-JP"},
//  {"iso-2022-jp-1978-irv",            "ISO-2022-JP"},
//  {"iso-2022-jp-2",                   "ISO-2022-JP-2"},
//  {"iso-2022-jp-3",                   "ISO-2022-JP-3"},// *, XEmacs?
//  {"iso-2022-jp-3-compatible",        ""},            // XEmacs?
//  {"iso-2022-jp-3-strict",            "ISO-2022-JP-3"},// *, XEmacs?
//  {"iso-2022-kr",                     "ISO-2022-KR"},
//  {"iso-2022-lock",                   ""},            // XEmacs?
//  {"iso-8859-10",                     "ISO-8859-10"}, // Emacs
//  {"iso-8859-11",                     "ISO-8859-11"}, // *, Emacs
//  {"iso-8859-14",                     "ISO-8859-14"}, // Emacs
//  {"iso-8859-16",                     "ISO-8859-16"},
//  {"iso-8859-3",                      "ISO-8859-3"},
//  {"iso-8859-4",                      "ISO-8859-4"},
//  {"iso-8859-6",                      "ISO-8859-6"},
//  {"iso-8859-8",                      "ISO-8859-8"},
//  {"iso-8859-8-e",                    "ISO-8859-8"},
//  {"iso-8859-8-i",                    "ISO-8859-8"},  // Emacs
//  {"iso-latin-10",                    "ISO-8859-16"}, // Emacs
//  {"iso-latin-1-with-esc",            ""},            // --
//  {"iso-latin-2-with-esc",            ""},            // --
//  {"iso-latin-3",                     "ISO-8859-3"},  // Emacs
//  {"iso-latin-3-with-esc",            ""},            // --
//  {"iso-latin-4",                     "ISO-8859-4"},  // Emacs
//  {"iso-latin-4-with-esc",            ""},            // --
//  {"iso-latin-5-with-esc",            ""},            // --
//  {"iso-latin-6",                     "ISO-8859-10"}, // Emacs
//  {"iso-latin-8",                     "ISO-8859-14"}, // Emacs
//  {"iso-safe",                        ""},            // --
//  {"japanese-iso-7bit-1978-irv",      "ISO-2022-JP"}, // Emacs
//  {"japanese-iso-8bit-with-esc",      ""},            // --
//  {"japanese-shift-jis",              "Shift_JIS"},   // Emacs
//  {"japanese-shift-jisx0213",         ""},            // XEmacs?
//  {"jis7",                            "ISO-2022-JP"}, // XEmacs
//  {"junet",                           "ISO-2022-JP"},
//  {"koi8-t",                          "KOI8-T"},      // *, Emacs
//  {"koi8-u",                          "KOI8-U"},      // Emacs
//  {"korean-iso-7bit-lock",            "ISO-2022-KR"},
//  {"korean-iso-8bit-with-esc",        ""},            // --
//  {"lao",                             ""},            // not covered
//  {"lao-with-esc",                    ""},            // --
//  {"latin-10",                        "ISO-8859-16"}, // Emacs
//  {"latin-3",                         "ISO-8859-3"},  // Emacs
//  {"latin-4",                         "ISO-8859-4"},  // Emacs
//  {"latin-6",                         "ISO-8859-10"}, // Emacs
//  {"latin-8",                         "ISO-8859-14"}, // Emacs
//  {"mac",                             ""},            // --
//  {"mac-roman",                       "MACINTOSH"},   // Emacs
//  {"mik",                             ""},            // not covered
//  {"next",                            "NEXTSTEP"},    // *, Emacs
//  {"no-conversion",                   ""},            // --
//  {"old-jis",                         "ISO-2022-JP"},
//  {"pt154",                           "PT154"},       // Emacs
//  {"raw-text",                        ""},            // --
//  {"ruscii",                          "cp1125"},      // *, Emacs
//  {"shift-jis",                       "Shift_JIS"},   // XEmacs
//  {"shift_jis",                       "Shift_JIS"},
//  {"shift_jisx0213",                  "Shift_JISX0213"},// *, XEmacs?
//  {"sjis",                            "Shift_JIS"},   // Emacs
//  {"tcvn",                            "TCVN"},        // *, Emacs
//  {"tcvn-5712",                       "TCVN"},        // *, Emacs
//  {"thai-tis620",                     "TIS-620"},
//  {"thai-tis620-with-esc",            ""},            // --
//  {"th-tis620",                       "TIS-620"},
//  {"tibetan",                         ""},            // not covered
//  {"tibetan-iso-8bit",                ""},            // not covered
//  {"tibetan-iso-8bit-with-esc",       ""},            // --
//  {"tis-620",                         "TIS-620"},
//  {"tis620",                          "TIS-620"},
//  {"undecided",                       ""},            // --
//  {"unix",                            ""},            // --
//  {"utf-7",                           "UTF-7"},       // Emacs
//  {"utf-7-safe",                      ""},            // XEmacs?
//  {"utf-8-ws",                        "UTF-8"},       // XEmacs?
//  {"vietnamese-tcvn",                 "TCVN"},        // *, Emacs
//  {"vietnamese-viqr",                 "VIQR"},        // not covered
//  {"vietnamese-viscii",               "VISCII"},
//  {"vietnamese-vscii",                ""},            // not covered
//  {"viqr",                            "VIQR"},        // not covered
//  {"viscii",                          "VISCII"},
//  {"vscii",                           ""},            // not covered
//  {"windows-037",                     ""},            // not covered
//  {"windows-10000",                   ""},            // not covered
//  {"windows-10001",                   ""},            // not covered
//  {"windows-10006",                   ""},            // not covered
//  {"windows-10007",                   ""},            // not covered
//  {"windows-10029",                   ""},            // not covered
//  {"windows-10079",                   ""},            // not covered
//  {"windows-10081",                   ""},            // not covered
//  {"windows-1026",                    ""},            // not covered
//  {"windows-1200",                    ""},            // not covered
//  {"windows-1250",                    "windows-1250"},
//  {"windows-1251",                    "windows-1251"},
//  {"windows-1252",                    "windows-1252"},
//  {"windows-1253",                    "windows-1253"},
//  {"windows-1254",                    "windows-1254"},
//  {"windows-1255",                    "windows-1255"},
//  {"windows-1256",                    "windows-1256"},
//  {"windows-1257",                    "windows-1257"},
//  {"windows-1258",                    "windows-1258"},
//  {"windows-1361",                    "cp1361"},      // *, XEmacs
//  {"windows-437",                     "cp437"},       // XEmacs
//  {"windows-500",                     ""},            // not covered
//  {"windows-708",                     ""},            // not covered
//  {"windows-709",                     ""},            // not covered
//  {"windows-710",                     ""},            // not covered
//  {"windows-720",                     ""},            // not covered
//  {"windows-737",                     "cp737"},       // *, XEmacs
//  {"windows-775",                     "cp775"},       // XEmacs
//  {"windows-850",                     "cp850"},       // XEmacs
//  {"windows-852",                     "cp852"},       // XEmacs
//  {"windows-855",                     "cp855"},       // XEmacs
//  {"windows-857",                     "cp857"},       // XEmacs
//  {"windows-860",                     "cp860"},       // XEmacs
//  {"windows-861",                     "cp861"},       // XEmacs
//  {"windows-862",                     "cp862"},       // XEmacs
//  {"windows-863",                     "cp863"},       // XEmacs
//  {"windows-864",                     "cp864"},       // XEmacs
//  {"windows-865",                     "cp865"},       // XEmacs
//  {"windows-866",                     "cp866"},       // XEmacs
//  {"windows-869",                     "cp869"},       // XEmacs
//  {"windows-874",                     "cp874"},       // XEmacs
//  {"windows-875",                     ""},            // not covered
//  {"windows-932",                     "cp932"},       // *, XEmacs
//  {"windows-936",                     "cp936"},       // XEmacs
//  {"windows-949",                     "cp949"},       // *, XEmacs
//  {"windows-950",                     "cp950"},       // *, XEmacs
//  {"x-ctext",                         ""},            // --
//  {"x-ctext-with-extensions",         ""},            // --

  // End-of-table marker.
  {NULL,                                NULL},
};
 374
 375 // ---------------------------------------------------------
 376 // Convert encoding name from emacs to mime.
 377 // ---------------------------------------------------------
 378 char *
 379 emacs2mime(char *emacs_enc)
 380 {
 381   int emacs_enc_len = strlen(emacs_enc);
 382   if (emacs_enc_len > 4
 383       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos"))
 384     emacs_enc[emacs_enc_len - 4] = 0;
 385   if (emacs_enc_len > 4
 386       && !strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac"))
 387     emacs_enc[emacs_enc_len - 4] = 0;
 388   if (emacs_enc_len > 5
 389       && !strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix"))
 390     emacs_enc[emacs_enc_len - 5] = 0;
 391   for (const conversion *table = emacs_to_mime; table->from; table++)
 392     if (!strcasecmp(emacs_enc, table->from))
 393       return (char *)table->to;
 394   return emacs_enc;
 395 }
 396
 397 // ---------------------------------------------------------
 398 // Print out Unicode entity if value is greater than 0x7F.
 399 // ---------------------------------------------------------
 400 inline void
 401 unicode_entity(int u)
 402 {
 403   if (u < 0x80)
 404     putchar(u);
 405   else {
 406     // Handle soft hyphen specially -- it is an input character only,
 407     // not a glyph.
 408     if (u == 0xAD) {
 409       putchar('\\');
 410       putchar('%');
 411     }
 412     else
 413       printf("\\[u%04X]", u);
 414   }
 415 }
 416
 417 // ---------------------------------------------------------
 418 // Conversion functions.  All functions take `data', which
 419 // normally holds the first two lines, and a file pointer.
 420 // ---------------------------------------------------------
 421
 422 // Conversion from ISO-8859-1 (aka Latin-1) to Unicode.
 423 void
 424 conversion_latin1(file_case *fcp, const string &data)
 425 {
 426   int len = data.length();
 427   const unsigned char *ptr = (const unsigned char *)data.contents();
 428   for (int i = 0; i < len; i++)
 429     unicode_entity(ptr[i]);
 430   int c = -1;
 431   while ((c = fcp->get_c()) != EOF)
 432     unicode_entity(c);
 433 }
 434
 435 // A future version of groff shall support UTF-8 natively.
 436 // In this case, the UTF-8 stuff here in this file will be
 437 // moved to the troff program.
 438
// Incremental UTF-8 decoder.  Bytes are fed in one at a time with
// add(); each completed sequence is written out via unicode_entity(),
// and each ill-formed one is replaced with U+FFFD.
struct utf8 {
  file_case *_fcp;		// stored by the constructor
  unsigned char s[6];		// bytes of the sequence being collected
  // Position in `s' at which the next input byte is stored; FIRST
  // means that no sequence is pending.
  enum {
    FIRST = 0,
    SECOND,
    THIRD,
    FOURTH,
    FIFTH,
    SIXTH
  } byte;
  int expected_bytes;		// sequence length announced by the lead byte
  int invalid_warning;		// nonzero until invalid() has warned once
  int incomplete_warning;	// nonzero until incomplete() has warned once
  utf8(file_case *);
  ~utf8();			// reports a sequence that is still pending
  void add(unsigned char);	// feed one input byte
  void invalid();		// emit U+FFFD for an invalid sequence
  void incomplete();		// emit U+FFFD for a truncated sequence
};
 459
 460 utf8::utf8(file_case *fcp) : _fcp(fcp), byte(FIRST), expected_bytes(1),
 461                       invalid_warning(1), incomplete_warning(1)
 462 {
 463   // empty
 464 }
 465
 466 utf8::~utf8()
 467 {
 468   if (byte != FIRST)
 469     incomplete();
 470 }
 471
 472 inline void
 473 utf8::add(unsigned char c)
 474 {
 475   s[byte] = c;
 476   if (byte == FIRST) {
 477     if (c < 0x80)
 478       unicode_entity(c);
 479     else if (c < 0xC0)
 480       invalid();
 481     else if (c < 0xE0) {
 482       expected_bytes = 2;
 483       byte = SECOND;
 484     }
 485     else if (c < 0xF0) {
 486       expected_bytes = 3;
 487       byte = SECOND;
 488     }
 489     else if (c < 0xF8) {
 490       expected_bytes = 4;
 491       byte = SECOND;
 492     }
 493     else if (c < 0xFC) {
 494       expected_bytes = 5;
 495       byte = SECOND;
 496     }
 497     else if (c < 0xFE) {
 498       expected_bytes = 6;
 499       byte = SECOND;
 500     }
 501     else
 502       invalid();
 503     return;
 504   }
 505   if (c < 0x80 || c > 0xBF) {
 506     incomplete();
 507     add(c);
 508     return;
 509   }
 510   switch (byte) {
 511   case FIRST:
 512     // can't happen
 513     break;
 514   case SECOND:
 515     if (expected_bytes == 2) {
 516       if (s[0] < 0xC2)
 517         invalid();
 518       else
 519         unicode_entity(((s[0] & 0x1F) << 6)
 520                        | (s[1] ^ 0x80));
 521       byte = FIRST;
 522     }
 523     else
 524       byte = THIRD;
 525     break;
 526   case THIRD:
 527     if (expected_bytes == 3) {
 528       if (!(s[0] >= 0xE1 || s[1] >= 0xA0))
 529         invalid();
 530       else
 531         unicode_entity(((s[0] & 0x1F) << 12)
 532                        | ((s[1] ^ 0x80) << 6)
 533                        | (s[2] ^ 0x80));
 534       byte = FIRST;
 535     }
 536     else
 537       byte = FOURTH;
 538     break;
 539   case FOURTH:
 540     // We reject everything greater than 0x10FFFF.
 541     if (expected_bytes == 4) {
 542       if (!((s[0] >= 0xF1 || s[1] >= 0x90)
 543             && (s[0] < 0xF4 || (s[0] == 0xF4 && s[1] < 0x90))))
 544         invalid();
 545       else
 546         unicode_entity(((s[0] & 0x07) << 18)
 547                        | ((s[1] ^ 0x80) << 12)
 548                        | ((s[2] ^ 0x80) << 6)
 549                        | (s[3] ^ 0x80));
 550       byte = FIRST;
 551     }
 552     else
 553       byte = FIFTH;
 554     break;
 555   case FIFTH:
 556     if (expected_bytes == 5) {
 557       invalid();
 558       byte = FIRST;
 559     }
 560     else
 561       byte = SIXTH;
 562     break;
 563   case SIXTH:
 564     invalid();
 565     byte = FIRST;
 566     break;
 567   }
 568 }
 569
 570 void
 571 utf8::invalid()
 572 {
 573   if (debug_flag && invalid_warning) {
 574     fprintf(stderr, "  invalid byte(s) found in input stream --\n"
 575                     "  each such sequence replaced with 0xFFFD\n");
 576     invalid_warning = 0;
 577   }
 578   unicode_entity(0xFFFD);
 579   byte = FIRST;
 580 }
 581
 582 void
 583 utf8::incomplete()
 584 {
 585   if (debug_flag && incomplete_warning) {
 586     fprintf(stderr, "  incomplete sequence(s) found in input stream --\n"
 587                     "  each such sequence replaced with 0xFFFD\n");
 588     incomplete_warning = 0;
 589   }
 590   unicode_entity(0xFFFD);
 591   byte = FIRST;
 592 }
 593
 594 // Conversion from UTF-8 to Unicode.
 595 void
 596 conversion_utf8(file_case *fcp, const string &data)
 597 {
 598   utf8 u(fcp);
 599   int len = data.length();
 600   const unsigned char *ptr = (const unsigned char *)data.contents();
 601   for (int i = 0; i < len; i++)
 602     u.add(ptr[i]);
 603   int c = -1;
 604   while ((c = fcp->get_c()) != EOF)
 605     u.add(c);
 606   return;
 607 }
 608
 609 // Conversion from cp1047 (EBCDIC) to UTF-8.
 610 void
 611 conversion_cp1047(file_case *fcp, const string &data)
 612 {
 613   static unsigned char cp1047[] = {
 614     0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F,     // 0x00
 615     0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
 616     0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87,     // 0x10
 617     0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
 618     0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B,     // 0x20
 619     0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
 620     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,     // 0x30
 621     0x98, 0x99, 0x9A, 0x9B, 0x14, 0x15, 0x9E, 0x1A,
 622     0x20, 0xA0, 0xE2, 0xE4, 0xE0, 0xE1, 0xE3, 0xE5,     // 0x40
 623     0xE7, 0xF1, 0xA2, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
 624     0x26, 0xE9, 0xEA, 0xEB, 0xE8, 0xED, 0xEE, 0xEF,     // 0x50
 625     0xEC, 0xDF, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
 626     0x2D, 0x2F, 0xC2, 0xC4, 0xC0, 0xC1, 0xC3, 0xC5,     // 0x60
 627     0xC7, 0xD1, 0xA6, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
 628     0xF8, 0xC9, 0xCA, 0xCB, 0xC8, 0xCD, 0xCE, 0xCF,     // 0x70
 629     0xCC, 0x60, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
 630     0xD8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,     // 0x80
 631     0x68, 0x69, 0xAB, 0xBB, 0xF0, 0xFD, 0xFE, 0xB1,
 632     0xB0, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,     // 0x90
 633     0x71, 0x72, 0xAA, 0xBA, 0xE6, 0xB8, 0xC6, 0xA4,
 634     0xB5, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,     // 0xA0
 635     0x79, 0x7A, 0xA1, 0xBF, 0xD0, 0x5B, 0xDE, 0xAE,
 636     0xAC, 0xA3, 0xA5, 0xB7, 0xA9, 0xA7, 0xB6, 0xBC,     // 0xB0
 637     0xBD, 0xBE, 0xDD, 0xA8, 0xAF, 0x5D, 0xB4, 0xD7,
 638     0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,     // 0xC0
 639     0x48, 0x49, 0xAD, 0xF4, 0xF6, 0xF2, 0xF3, 0xF5,
 640     0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,     // 0xD0
 641     0x51, 0x52, 0xB9, 0xFB, 0xFC, 0xF9, 0xFA, 0xFF,
 642     0x5C, 0xF7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,     // 0xE0
 643     0x59, 0x5A, 0xB2, 0xD4, 0xD6, 0xD2, 0xD3, 0xD5,
 644     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     // 0xF0
 645     0x38, 0x39, 0xB3, 0xDB, 0xDC, 0xD9, 0xDA, 0x9F,
 646   };
 647   int len = data.length();
 648   const unsigned char *ptr = (const unsigned char *)data.contents();
 649   for (int i = 0; i < len; i++)
 650     unicode_entity(cp1047[ptr[i]]);
 651   int c = -1;
 652   while ((c = fcp->get_c()) != EOF)
 653     unicode_entity(cp1047[c]);
 654 }
 655
 656 // Locale-sensible conversion.
 657 #if HAVE_ICONV
 658 void
 659 conversion_iconv(file_case *fcp, const string &data, char *enc)
 660 {
 661   iconv_t handle = iconv_open(UNICODE, enc);
 662   if (handle == (iconv_t)-1) {
 663     if (errno == EINVAL) {
 664       error("encoding system `%1' not supported by iconv()", enc);
 665       return;
 666     }
 667     fatal("iconv_open failed");
 668   }
 669   char inbuf[BUFSIZ];
 670   int outbuf[BUFSIZ];
 671   char *outptr = (char *)outbuf;
 672   size_t outbytes_left = BUFSIZ * sizeof (int);
 673   // Handle `data'.
 674   char *inptr = (char *)data.contents();
 675   size_t inbytes_left = data.length();
 676   char *limit;
 677   while (inbytes_left > 0) {
 678     size_t status = iconv(handle,
 679                           (ICONV_CONST char **)&inptr, &inbytes_left,
 680                           &outptr, &outbytes_left);
 681     if (status == (size_t)-1) {
 682       if (errno == EILSEQ) {
 683         // Invalid byte sequence.  XXX
 684         inptr++;
 685         inbytes_left--;
 686       }
 687       else if (errno == E2BIG) {
 688         // Output buffer is full.
 689         limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
 690         for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
 691           unicode_entity(*ptr);
 692         memmove(outbuf, outptr, outbytes_left);
 693         outptr = (char *)outbuf + outbytes_left;
 694         outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
 695       }
 696       else if (errno == EINVAL) {
 697         // `data' ends with partial input sequence.
 698         memcpy(inbuf, inptr, inbytes_left);
 699         break;
 700       }
 701     }
 702   }
 703   // Handle `fp' and switch to `inbuf'.
 704   size_t read_bytes;
 705   char *read_start = inbuf + inbytes_left;
 706   while ((read_bytes = fcp->get_buf(read_start, BUFSIZ - inbytes_left)) > 0) {
 707     inptr = inbuf;
 708     inbytes_left += read_bytes;
 709     while (inbytes_left > 0) {
 710       size_t status = iconv(handle,
 711                             (ICONV_CONST char **)&inptr, &inbytes_left,
 712                             &outptr, &outbytes_left);
 713       if (status == (size_t)-1) {
 714         if (errno == EILSEQ) {
 715           // Invalid byte sequence.  XXX
 716           inptr++;
 717           inbytes_left--;
 718         }
 719         else if (errno == E2BIG) {
 720           // Output buffer is full.
 721           limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
 722           for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
 723             unicode_entity(*ptr);
 724           memmove(outbuf, outptr, outbytes_left);
 725           outptr = (char *)outbuf + outbytes_left;
 726           outbytes_left = BUFSIZ * sizeof (int) - outbytes_left;
 727         }
 728         else if (errno == EINVAL) {
 729           // `inbuf' ends with partial input sequence.
 730           memmove(inbuf, inptr, inbytes_left);
 731           break;
 732         }
 733       }
 734     }
 735     read_start = inbuf + inbytes_left;
 736   }
 737   iconv_close(handle);
 738   // XXX use ferror?
 739   limit = (char *)outbuf + BUFSIZ * sizeof (int) - outbytes_left;
 740   for (int *ptr = outbuf; (char *)ptr < limit; ptr++)
 741     unicode_entity(*ptr);
 742 }
 743 #endif /* HAVE_ICONV */
 744
 745 // ---------------------------------------------------------
 746 // Handle Byte Order Mark.
 747 //
 748 // Since we have a chicken-and-egg problem it's necessary
 749 // to handle the BOM manually if it is in the data stream.
 750 // As documented in the Unicode book it is very unlikely
 751 // that any normal text file (regardless of the encoding)
 752 // starts with the bytes which represent a BOM.
 753 //
 754 // Return the BOM in string `BOM'; `data' then starts with
 755 // the byte after the BOM.  This function reads (at most)
 756 // four bytes from the data stream.
 757 //
 758 // Return encoding if a BOM is found, NULL otherwise.
 759 // ---------------------------------------------------------
 760 const char *
 761 get_BOM(file_case *fcp, string &BOM, string &data)
 762 {
 763   // The BOM is U+FEFF.  We have thus the following possible
 764   // representations.
 765   //
 766   //   UTF-8: 0xEFBBBF
 767   //   UTF-16: 0xFEFF or 0xFFFE
 768   //   UTF-32: 0x0000FEFF or 0xFFFE0000
 769   static struct {
 770     int len;
 771     const char *str;
 772     const char *name;
 773   } BOM_table[] = {
 774     {4, "\x00\x00\xFE\xFF", "UTF-32"},
 775     {4, "\xFF\xFE\x00\x00", "UTF-32"},
 776     {3, "\xEF\xBB\xBF", "UTF-8"},
 777     {2, "\xFE\xFF", "UTF-16"},
 778     {2, "\xFF\xFE", "UTF-16"},
 779   };
 780   const int BOM_table_len = sizeof (BOM_table) / sizeof (BOM_table[0]);
 781   char BOM_string[4];
 782   const char *retval = NULL;
 783   int len;
 784   for (len = 0; len < 4; len++) {
 785     int c = fcp->get_c();
 786     if (c == EOF)
 787       break;
 788     BOM_string[len] = char(c);
 789   }
 790   int i;
 791   for (i = 0; i < BOM_table_len; i++) {
 792     if (BOM_table[i].len <= len
 793         && memcmp(BOM_string, BOM_table[i].str, BOM_table[i].len) == 0)
 794       break;
 795   }
 796   int j = 0;
 797   if (i < BOM_table_len) {
 798     for (; j < BOM_table[i].len; j++)
 799       BOM += BOM_string[j];
 800     retval = BOM_table[i].name;
 801   }
 802   for (; j < len; j++)
 803     data += BOM_string[j];
 804   return retval;
 805 }
 806
 807 // ---------------------------------------------------------
 808 // Get first two lines from input stream.
 809 //
 810 // Return string (allocated with `new') without zero bytes
 811 // or NULL in case no coding tag can occur in the data
 812 // (which is stored unmodified in `data').
 813 // ---------------------------------------------------------
 814 char *
 815 get_tag_lines(file_case *fcp, string &data)
 816 {
 817   int newline_count = 0;
 818   int c, prev = -1;
 819   // Handle CR, LF, and CRLF as line separators.
 820   for (int i = 0; i < data.length(); i++) {
 821     c = data[i];
 822     if (c == '\n' || c == '\r')
 823       newline_count++;
 824     if (c == '\n' && prev == '\r')
 825       newline_count--;
 826     prev = c;
 827   }
 828   if (newline_count > 1)
 829     return NULL;
 830   int emit_warning = 1;
 831   for (int lines = newline_count; lines < 2; lines++) {
 832     while ((c = fcp->get_c()) != EOF) {
 833       if (c == '\0' && debug_flag && emit_warning) {
 834         fprintf(stderr,
 835                 "  null byte(s) found in input stream --\n"
 836                 "  search for coding tag might return false result\n");
 837         emit_warning = 0;
 838       }
 839       data += char(c);
 840       if (c == '\n' || c == '\r')
 841         break;
 842     }
 843     // Handle CR, LF, and CRLF as line separators.
 844     if (c == '\r') {
 845       c = fcp->get_c();
 846       if (c != EOF && c != '\n')
 847         fcp->unget_c(c);
 848       else
 849         data += char(c);
 850     }
 851   }
 852   return data.extract();
 853 }
 854
 855 // ---------------------------------------------------------
 856 // Check whether C string starts with a comment.
 857 //
 858 // Return 1 if true, 0 otherwise.
 859 // ---------------------------------------------------------
 860 int
 861 is_comment_line(char *s)
 862 {
 863   if (!s || !*s)
 864     return 0;
 865   if (*s == '.' || *s == '\'')
 866   {
 867     s++;
 868     while (*s == ' ' || *s == '\t')
 869       s++;
 870     if (*s && *s == '\\')
 871     {
 872       s++;
 873       if (*s == '"' || *s == '#')
 874         return 1;
 875     }
 876   }
 877   else if (*s == '\\')
 878   {
 879     s++;
 880     if (*s == '#')
 881       return 1;
 882   }
 883   return 0;
 884 }
 885
 886 // ---------------------------------------------------------
 887 // Get a value/variable pair from a local variables list
 888 // in a C string which look like this:
 889 //
 890 //   <variable1>: <value1>; <variable2>: <value2>; ...
 891 //
 892 // Leading and trailing blanks are ignored.  There might be
 893 // more than one blank after `:' and `;'.
 894 //
 895 // Return position of next value/variable pair or NULL if
 896 // at end of data.
 897 // ---------------------------------------------------------
 898 char *
 899 get_variable_value_pair(char *d1, char **variable, char **value)
 900 {
 901   static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
 902   *variable = var;
 903   *value = val;
 904   while (*d1 == ' ' || *d1 == '\t')
 905     d1++;
 906   // Get variable.
 907   int l = 0;
 908   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr(";: \t", *d1))
 909     var[l++] = *(d1++);
 910   var[l] = 0;
 911   // Skip everything until `:', `;', or end of data.
 912   while (*d1 && *d1 != ':' && *d1 != ';')
 913     d1++;
 914   val[0] = 0;
 915   if (!*d1)
 916     return NULL;
 917   if (*d1 == ';')
 918     return d1 + 1;
 919   d1++;
 920   while (*d1 == ' ' || *d1 == '\t')
 921     d1++;
 922   // Get value.
 923   l = 0;
 924   while (l < MAX_VAR_LEN - 1 && *d1 && !strchr("; \t", *d1))
 925     val[l++] = *(d1++);
 926   val[l] = 0;
 927   // Skip everything until `;' or end of data.
 928   while (*d1 && *d1 != ';')
 929     d1++;
 930   if (*d1 == ';')
 931     return d1 + 1;
 932   return NULL;
 933 }
 934
 935 // ---------------------------------------------------------
 936 // Check coding tag in the read buffer.
 937 //
 938 // We search for the following line:
 939 //
 940 //   <comment> ... -*-<local variables list>-*-
 941 //
 942 // (`...' might be anything).
 943 //
 944 // <comment> can be one of the following syntax forms at the
 945 // beginning of the line:
 946 //
 947 //   .\"   .\#   '\"   '\#   \#
 948 //
 949 // There can be whitespace after the leading `.' or "'".
 950 //
 951 // The local variables list must occur within the first
 952 // comment block at the very beginning of the data stream.
 953 //
 954 // Within the <local variables list>, we search for
 955 //
 956 //   coding: <value>
 957 //
 958 // which specifies the coding system used for the data
 959 // stream.
 960 //
 961 // Return <value> if found, NULL otherwise.
 962 //
 963 // Note that null bytes in the data are skipped before applying
 964 // the algorithm.  This should work even with files encoded as
 965 // UTF-16 or UTF-32 (or its siblings) in most cases.
 966 //
 967 // XXX Add support for tag at the end of buffer.
 968 // ---------------------------------------------------------
 969 char *
 970 check_coding_tag(file_case *fcp, string &data)
 971 {
 972   char *inbuf = get_tag_lines(fcp, data);
 973   char *lineend;
 974   for (char *p = inbuf; is_comment_line(p); p = lineend + 1) {
 975     if ((lineend = strchr(p, '\n')) == NULL)
 976       break;
 977     *lineend = 0;               // switch temporarily to '\0'
 978     char *d1 = strstr(p, "-*-");
 979     char *d2 = 0;
 980     if (d1)
 981       d2 = strstr(d1 + 3, "-*-");
 982     *lineend = '\n';            // restore newline
 983     if (!d1 || !d2)
 984       continue;
 985     *d2 = 0;                    // switch temporarily to '\0'
 986     d1 += 3;
 987     while (d1) {
 988       char *variable, *value;
 989       d1 = get_variable_value_pair(d1, &variable, &value);
 990       if (!strcasecmp(variable, "coding")) {
 991         *d2 = '-';              // restore '-'
 992         a_delete inbuf;
 993         return value;
 994       }
 995     }
 996     *d2 = '-';                  // restore '-'
 997   }
 998   a_delete inbuf;
 999   return NULL;
1000 }
1001
1002 // ---------------------------------------------------------
1003 // Handle an input file.  If filename is `-' handle stdin.
1004 //
1005 // Return 1 on success, 0 otherwise.
1006 // ---------------------------------------------------------
1007 int
1008 do_file(const char *filename)
1009 {
1010   if (debug_flag)
1011     fprintf(stderr, "file `%s':\n", filename);
1012   file_case *fcp;
1013   if ((fcp = file_case::muxer(filename, fcp->mux_need_binary)) == NULL) {
1014     assert(strcmp(filename, "-"));
1015     error("can't open `%1': %2", filename, strerror(errno));
1016     return 0;
1017   }
1018
1019   string BOM, data;
1020   const char *BOM_encoding = get_BOM(fcp, BOM, data);
1021   // Determine the encoding.
1022   char *encoding;
1023   if (user_encoding[0]) {
1024     if (debug_flag) {
1025       fprintf(stderr, "  user-specified encoding `%s', "
1026                       "no search for coding tag\n",
1027                       user_encoding);
1028       if (BOM_encoding && strcmp(BOM_encoding, user_encoding))
1029         fprintf(stderr, "  but BOM in data stream implies encoding `%s'!\n",
1030                         BOM_encoding);
1031     }
1032     encoding = (char *)user_encoding;
1033   }
1034   else if (BOM_encoding) {
1035     if (debug_flag)
1036       fprintf(stderr, "  found BOM, no search for coding tag\n");
1037     encoding = (char *)BOM_encoding;
1038   }
1039   else {
1040     // `check_coding_tag' returns a pointer to a static array (or NULL).
1041     char *file_encoding = check_coding_tag(fcp, data);
1042     if (!file_encoding) {
1043       if (debug_flag)
1044         fprintf(stderr, "  no file encoding\n");
1045       file_encoding = default_encoding;
1046     }
1047     else
1048       if (debug_flag)
1049         fprintf(stderr, "  file encoding: `%s'\n", file_encoding);
1050     encoding = file_encoding;
1051   }
1052   strncpy(encoding_string, encoding, MAX_VAR_LEN - 1);
1053   encoding_string[MAX_VAR_LEN - 1] = 0;
1054   encoding = encoding_string;
1055   // Translate from MIME & Emacs encoding names to locale encoding names.
1056   encoding = emacs2mime(encoding_string);
1057   if (encoding[0] == '\0') {
1058     error("encoding `%1' not supported, not a portable encoding",
1059           encoding_string);
1060     return 0;
1061   }
1062   if (debug_flag)
1063     fprintf(stderr, "  encoding used: `%s'\n", encoding);
1064   if (!raw_flag)
1065     printf(".lf 1 %s\n", filename);
1066   int success = 1;
1067   // Call converter (converters write to stdout).
1068   if (!strcasecmp(encoding, "ISO-8859-1"))
1069     conversion_latin1(fcp, BOM + data);
1070   else if (!strcasecmp(encoding, "UTF-8"))
1071     conversion_utf8(fcp, data);
1072   else if (!strcasecmp(encoding, "cp1047"))
1073     conversion_cp1047(fcp, BOM + data);
1074   else {
1075 #if HAVE_ICONV
1076     conversion_iconv(fcp, BOM + data, encoding);
1077 #else
1078     error("encoding system `%1' not supported", encoding);
1079     success = 0;
1080 #endif /* HAVE_ICONV */
1081   }
1082
1083   delete fcp;
1084   return success;
1085 }
1086
1087 // ---------------------------------------------------------
1088 // Print usage.
1089 // ---------------------------------------------------------
1090 void
1091 usage(FILE *stream)
1092 {
1093   fprintf(stream, "usage: %s [ option ] [ files ]\n"
1094                   "\n"
1095                   "-d           show debugging messages\n"
1096                   "-D encoding  specify default encoding\n"
1097                   "-e encoding  specify input encoding\n"
1098                   "-h           print this message\n"
1099                   "-r           don't add .lf requests\n"
1100                   "-v           print version number\n"
1101                   "\n"
1102                   "The default encoding is `%s'.\n",
1103                   program_name, default_encoding);
1104 }
1105
1106 // ---------------------------------------------------------
1107 // Main routine.
1108 // ---------------------------------------------------------
1109 int
1110 main(int argc, char **argv)
1111 {
1112   program_name = argv[0];
1113   // Determine the default encoding.  This must be done before
1114   // getopt() is called since the usage message shows the default
1115   // encoding.
1116   setlocale(LC_ALL, "");
1117   char *locale = getlocale(LC_CTYPE);
1118   if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX"))
1119     strcpy(default_encoding, "latin1");
1120   else {
1121     strncpy(default_encoding, locale_charset(), MAX_VAR_LEN - 1);
1122     default_encoding[MAX_VAR_LEN - 1] = 0;
1123   }
1124
1125   program_name = argv[0];
1126   int opt;
1127   static const struct option long_options[] = {
1128     { "help", no_argument, 0, 'h' },
1129     { "version", no_argument, 0, 'v' },
1130     { NULL, 0, 0, 0 }
1131   };
1132   // Parse the command line options.
1133   while ((opt = getopt_long(argc, argv,
1134                             "dD:e:hrv", long_options, NULL)) != EOF)
1135     switch (opt) {
1136     case 'v':
1137       printf("GNU preconv (groff) version %s %s iconv support\n",
1138              Version_string,
1139 #ifdef HAVE_ICONV
1140              "with"
1141 #else
1142              "without"
1143 #endif /* HAVE_ICONV */
1144             );
1145       exit(0);
1146       break;
1147     case 'd':
1148       debug_flag = 1;
1149       break;
1150     case 'e':
1151       if (optarg) {
1152         strncpy(user_encoding, optarg, MAX_VAR_LEN - 1);
1153         user_encoding[MAX_VAR_LEN - 1] = 0;
1154       }
1155       else
1156         user_encoding[0] = 0;
1157       break;
1158     case 'D':
1159       if (optarg) {
1160         strncpy(default_encoding, optarg, MAX_VAR_LEN - 1);
1161         default_encoding[MAX_VAR_LEN - 1] = 0;
1162       }
1163       break;
1164     case 'r':
1165       raw_flag = 1;
1166       break;
1167     case 'h':
1168       usage(stdout);
1169       exit(0);
1170       break;
1171     case '?':
1172       usage(stderr);
1173       exit(1);
1174       break;
1175     default:
1176       assert(0);
1177     }
1178   int nbad = 0;
1179   if (debug_flag)
1180     fprintf(stderr, "default encoding: `%s'\n", default_encoding);
1181   if (optind >= argc)
1182     nbad += !do_file("-");
1183   else
1184     for (int i = optind; i < argc; i++)
1185       nbad += !do_file(argv[i]);
1186   if (ferror(stdout) || fflush(stdout) < 0)
1187     fatal("output error");
1188   return nbad != 0;
1189 }
1190
1191 /* end of preconv.cpp */