src/lib-roff/glyphuni.cpp

   1 /*@
   2  * Copyright (c) 2014 - 2017 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
   3  *
   4  * Copyright (C) 2002, 2003, 2004, 2006
   5  *    Free Software Foundation, Inc.
   6  *      Written by Werner Lemberg <wl@gnu.org>
   7  *
   8  * This is free software; you can redistribute it and/or modify it under
   9  * the terms of the GNU General Public License as published by the Free
  10  * Software Foundation; either version 2, or (at your option) any later
  11  * version.
  12  *
  13  * This is distributed in the hope that it will be useful, but WITHOUT ANY
  14  * WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16  * for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with groff; see the file COPYING.  If not, write to the Free Software
  20  * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
  21  */
  22
  23 #include "config.h"
  24 #include "lib.h"
  25
  26 #include "ptable.h"
  27 #include "stringclass.h"
  28 #include "unicode.h"
  29
  30 struct glyph_to_unicode {
  31   char *value;
  32 };
  33
// Declare and implement the pointer-table type for glyph_to_unicode
// values (macros presumably provided by "ptable.h" -- confirm there).
declare_ptable(glyph_to_unicode)
implement_ptable(glyph_to_unicode)

// Maps a glyph name to its glyph_to_unicode record.  Filled by the
// glyph_to_unicode_init constructor via define() and queried by
// glyph_name_to_unicode() via lookup().
PTABLE(glyph_to_unicode) glyph_to_unicode_table;
  38
  39 // The entries commented out in the table below can't be used in glyph
  40 // names.
  41
  42 struct S {
  43   const char *key;
  44   const char *value;
  45 } glyph_to_unicode_list[] = { // FIXME const?
  46   { "!", "0021" },
  47   { "\"", "0022" },
  48   { "dq", "0022" },
  49   { "#", "0023" },
  50   { "sh", "0023" },
  51   { "$", "0024" },
  52   { "Do", "0024" },
  53   { "%", "0025" },
  54   { "&", "0026" },
  55   { "aq", "0027" },
  56   { "(", "0028" },
  57   { ")", "0029" },
  58   { "*", "002A" },
  59   { "+", "002B" },
  60   { "pl", "002B" },
  61   { ",", "002C" },
  62   { ".", "002E" },
  63   { "/", "002F" },
  64   { "sl", "002F" },
  65   { "0", "0030" },
  66   { "1", "0031" },
  67   { "2", "0032" },
  68   { "3", "0033" },
  69   { "4", "0034" },
  70   { "5", "0035" },
  71   { "6", "0036" },
  72   { "7", "0037" },
  73   { "8", "0038" },
  74   { "9", "0039" },
  75   { ":", "003A" },
  76   { ";", "003B" },
  77   { "<", "003C" },
  78   { "=", "003D" },
  79   { "eq", "003D" },
  80   { ">", "003E" },
  81   { "?", "003F" },
  82   { "@", "0040" },
  83   { "at", "0040" },
  84   { "A", "0041" },
  85   { "B", "0042" },
  86   { "C", "0043" },
  87   { "D", "0044" },
  88   { "E", "0045" },
  89   { "F", "0046" },
  90   { "G", "0047" },
  91   { "H", "0048" },
  92   { "I", "0049" },
  93   { "J", "004A" },
  94   { "K", "004B" },
  95   { "L", "004C" },
  96   { "M", "004D" },
  97   { "N", "004E" },
  98   { "O", "004F" },
  99   { "P", "0050" },
 100   { "Q", "0051" },
 101   { "R", "0052" },
 102   { "S", "0053" },
 103   { "T", "0054" },
 104   { "U", "0055" },
 105   { "V", "0056" },
 106   { "W", "0057" },
 107   { "X", "0058" },
 108   { "Y", "0059" },
 109   { "Z", "005A" },
 110 //{ "[", "005B" },
 111   { "lB", "005B" },
 112 //{ "\\", "005C" },
 113   { "rs", "005C" },
 114 //{ "]", "005D" },
 115   { "rB", "005D" },
 116   { "a^", "005E" },
 117   { "^", "005E" },
 118   { "ha", "005E" },
 119   { "_", "005F" },
 120   { "ru", "005F" },
 121   { "ul", "005F" },
 122   { "ga", "0060" },
 123   { "a", "0061" },
 124   { "b", "0062" },
 125   { "c", "0063" },
 126   { "d", "0064" },
 127   { "e", "0065" },
 128   { "f", "0066" },
 129   { "ff", "0066_0066" },
 130   { "Fi", "0066_0066_0069" },
 131   { "Fl", "0066_0066_006C" },
 132   { "fi", "0066_0069" },
 133   { "fl", "0066_006C" },
 134   { "g", "0067" },
 135   { "h", "0068" },
 136   { "i", "0069" },
 137   { "j", "006A" },
 138   { "k", "006B" },
 139   { "l", "006C" },
 140   { "m", "006D" },
 141   { "n", "006E" },
 142   { "o", "006F" },
 143   { "p", "0070" },
 144   { "q", "0071" },
 145   { "r", "0072" },
 146   { "s", "0073" },
 147   { "t", "0074" },
 148   { "u", "0075" },
 149   { "v", "0076" },
 150   { "w", "0077" },
 151   { "x", "0078" },
 152   { "y", "0079" },
 153   { "z", "007A" },
 154   { "lC", "007B" },
 155   { "{", "007B" },
 156   { "ba", "007C" },
 157   { "or", "007C" },
 158   { "|", "007C" },
 159   { "rC", "007D" },
 160   { "}", "007D" },
 161   { "a~", "007E" },
 162   { "~", "007E" },
 163   { "ti", "007E" },
 164   { "r!", "00A1" },
 165   { "ct", "00A2" },
 166   { "Po", "00A3" },
 167   { "Cs", "00A4" },
 168   { "Ye", "00A5" },
 169   { "bb", "00A6" },
 170   { "sc", "00A7" },
 171   { "ad", "00A8" },
 172   { "co", "00A9" },
 173   { "Of", "00AA" },
 174   { "Fo", "00AB" },
 175   { "no", "00AC" },
 176   { "tno", "00AC" },
 177   // The soft hypen U+00AD is meaningful only in the input file,
 178   // not in the output.
 179   { "rg", "00AE" },
 180   { "a-", "00AF" },
 181   { "de", "00B0" },
 182   { "+-", "00B1" },
 183   { "t+-", "00B1" },
 184   { "S2", "00B2" },
 185   { "S3", "00B3" },
 186   { "aa", "00B4" },
 187   { "mc", "00B5" },
 188   { "ps", "00B6" },
 189   { "pc", "00B7" },
 190   { "ac", "00B8" },
 191   { "S1", "00B9" },
 192   { "Om", "00BA" },
 193   { "Fc", "00BB" },
 194   { "14", "00BC" },
 195   { "12", "00BD" },
 196   { "34", "00BE" },
 197   { "r?", "00BF" },
 198   { "`A", "00C0" },
 199   { "'A", "00C1" },
 200   { "^A", "00C2" },
 201   { "~A", "00C3" },
 202   { ":A", "00C4" },
 203   { "oA", "00C5" },
 204   { "AE", "00C6" },
 205   { ",C", "00C7" },
 206   { "`E", "00C8" },
 207   { "'E", "00C9" },
 208   { "^E", "00CA" },
 209   { ":E", "00CB" },
 210   { "`I", "00CC" },
 211   { "'I", "00CD" },
 212   { "^I", "00CE" },
 213   { ":I", "00CF" },
 214   { "-D", "00D0" },
 215   { "~N", "00D1" },
 216   { "`O", "00D2" },
 217   { "'O", "00D3" },
 218   { "^O", "00D4" },
 219   { "~O", "00D5" },
 220   { ":O", "00D6" },
 221   { "mu", "00D7" },
 222   { "tmu", "00D7" },
 223   { "/O", "00D8" },
 224   { "`U", "00D9" },
 225   { "'U", "00DA" },
 226   { "^U", "00DB" },
 227   { ":U", "00DC" },
 228   { "'Y", "00DD" },
 229   { "TP", "00DE" },
 230   { "ss", "00DF" },
 231   { "`a", "00E0" },
 232   { "'a", "00E1" },
 233   { "^a", "00E2" },
 234   { "~a", "00E3" },
 235   { ":a", "00E4" },
 236   { "oa", "00E5" },
 237   { "ae", "00E6" },
 238   { ",c", "00E7" },
 239   { "`e", "00E8" },
 240   { "'e", "00E9" },
 241   { "^e", "00EA" },
 242   { ":e", "00EB" },
 243   { "`i", "00EC" },
 244   { "'i", "00ED" },
 245   { "^i", "00EE" },
 246   { ":i", "00EF" },
 247   { "Sd", "00F0" },
 248   { "~n", "00F1" },
 249   { "`o", "00F2" },
 250   { "'o", "00F3" },
 251   { "^o", "00F4" },
 252   { "~o", "00F5" },
 253   { ":o", "00F6" },
 254   { "di", "00F7" },
 255   { "tdi", "00F7" },
 256   { "/o", "00F8" },
 257   { "`u", "00F9" },
 258   { "'u", "00FA" },
 259   { "^u", "00FB" },
 260   { ":u", "00FC" },
 261   { "'y", "00FD" },
 262   { "Tp", "00FE" },
 263   { ":y", "00FF" },
 264   { "'C", "0106" },
 265   { "'c", "0107" },
 266   { ".i", "0131" },
 267   { "IJ", "0132" },
 268   { "ij", "0133" },
 269   { "/L", "0141" },
 270   { "/l", "0142" },
 271   { "OE", "0152" },
 272   { "oe", "0153" },
 273   { "vS", "0160" },
 274   { "vs", "0161" },
 275   { ":Y", "0178" },
 276   { "vZ", "017D" },
 277   { "vz", "017E" },
 278   { "Fn", "0192" },
 279   { "ah", "02C7" },
 280   { "ab", "02D8" },
 281   { "a.", "02D9" },
 282   { "ao", "02DA" },
 283   { "ho", "02DB" },
 284   { "a\"", "02DD" },
 285   { "*A", "0391" },
 286   { "*B", "0392" },
 287   { "*G", "0393" },
 288   { "*D", "0394" },
 289   { "*E", "0395" },
 290   { "*Z", "0396" },
 291   { "*Y", "0397" },
 292   { "*H", "0398" },
 293   { "*I", "0399" },
 294   { "*K", "039A" },
 295   { "*L", "039B" },
 296   { "*M", "039C" },
 297   { "*N", "039D" },
 298   { "*C", "039E" },
 299   { "*O", "039F" },
 300   { "*P", "03A0" },
 301   { "*R", "03A1" },
 302   { "*S", "03A3" },
 303   { "*T", "03A4" },
 304   { "*U", "03A5" },
 305   { "*F", "03A6" },
 306   { "*X", "03A7" },
 307   { "*Q", "03A8" },
 308   { "*W", "03A9" },
 309   { "*a", "03B1" },
 310   { "*b", "03B2" },
 311   { "*g", "03B3" },
 312   { "*d", "03B4" },
 313   { "*e", "03B5" },
 314   { "*z", "03B6" },
 315   { "*y", "03B7" },
 316   { "*h", "03B8" },
 317   { "*i", "03B9" },
 318   { "*k", "03BA" },
 319   { "*l", "03BB" },
 320   { "*m", "03BC" },
 321   { "*n", "03BD" },
 322   { "*c", "03BE" },
 323   { "*o", "03BF" },
 324   { "*p", "03C0" },
 325   { "*r", "03C1" },
 326   { "ts", "03C2" },
 327   { "*s", "03C3" },
 328   { "*t", "03C4" },
 329   { "*u", "03C5" },
 330   // the curly phi variant
 331   { "+f", "03C6" },
 332   { "*x", "03C7" },
 333   { "*q", "03C8" },
 334   { "*w", "03C9" },
 335   { "+h", "03D1" },
 336   // the stroked phi variant
 337   { "*f", "03D5" },
 338   { "+p", "03D6" },
 339   { "+e", "03F5" },
 340   // `-' and `hy' denote a HYPHEN, usually a glyph with a smaller width than
 341   // the MINUS sign.  Users who are viewing broken man pages that assume
 342   // that `-' denotes a U+002D character can either fix the broken man pages
 343   // or apply the workaround described in the PROBLEMS file.
 344   { "-", "2010" },
 345   { "hy", "2010" },
 346   { "en", "2013" },
 347   { "em", "2014" },
 348   { "`", "2018" },
 349   { "oq", "2018" },
 350   { "'", "2019" },
 351   { "cq", "2019" },
 352   { "bq", "201A" },
 353   { "lq", "201C" },
 354   { "rq", "201D" },
 355   { "Bq", "201E" },
 356   { "dg", "2020" },
 357   { "dd", "2021" },
 358   { "bu", "2022" },
 359   { "%0", "2030" },
 360   { "fm", "2032" },
 361   { "sd", "2033" },
 362   { "fo", "2039" },
 363   { "fc", "203A" },
 364   { "rn", "203E" },
 365   { "f/", "2044" },
 366   { "eu", "20AC" },
 367   { "Eu", "20AC" },
 368   { "-h", "210F" },
 369   { "hbar", "210F" },
 370   { "Im", "2111" },
 371   { "wp", "2118" },
 372   { "Re", "211C" },
 373   { "tm", "2122" },
 374   { "Ah", "2135" },
 375   { "18", "215B" },
 376   { "38", "215C" },
 377   { "58", "215D" },
 378   { "78", "215E" },
 379   { "<-", "2190" },
 380   { "ua", "2191" },
 381   { "->", "2192" },
 382   { "da", "2193" },
 383   { "<>", "2194" },
 384   { "va", "2195" },
 385   { "CR", "21B5" },
 386   { "lA", "21D0" },
 387   { "uA", "21D1" },
 388   { "rA", "21D2" },
 389   { "dA", "21D3" },
 390   { "hA", "21D4" },
 391   { "vA", "21D5" },
 392   { "fa", "2200" },
 393   { "pd", "2202" },
 394   { "te", "2203" },
 395   { "es", "2205" },
 396   { "gr", "2207" },
 397   { "mo", "2208" },
 398   { "nm", "2209" },
 399   { "st", "220B" },
 400   { "product", "220F" },
 401   { "coproduct", "2210" },
 402   { "sum", "2211" },
 403   // `mi' and `\-' represent a MINUS sign.  But it is used in many man pages
 404   // to denote the U+002D character that introduces a command-line option.
 405   // For devices that support copy&paste, such as devhtml and devutf8, the
 406   // user can apply the workaround described in the PROBLEMS file.
 407   { "\\-", "2212" },
 408   { "mi", "2212" },
 409   { "-+", "2213" },
 410   { "**", "2217" },
 411   { "sqrt", "221A" },
 412   { "sr", "221A" },
 413   { "pt", "221D" },
 414   { "if", "221E" },
 415   { "/_", "2220" },
 416   { "AN", "2227" },
 417   { "OR", "2228" },
 418   { "ca", "2229" },
 419   { "cu", "222A" },
 420   { "is", "222B" },
 421   { "integral", "222B" },
 422   { "tf", "2234" },
 423   { "3d", "2234" },
 424   { "ap", "223C" },
 425   { "|=", "2243" },
 426   { "=~", "2245" },
 427   { "~~", "2248" },
 428   { "~=", "2248" },
 429   { "!=", "2260" },
 430   { "==", "2261" },
 431   { "ne", "2262" },
 432   { "<=", "2264" },
 433   { ">=", "2265" },
 434   { "<<", "226A" },
 435   { ">>", "226B" },
 436   { "sb", "2282" },
 437   { "sp", "2283" },
 438   { "nb", "2284" },
 439   { "nc", "2285" },
 440   { "ib", "2286" },
 441   { "ip", "2287" },
 442   { "c+", "2295" },
 443   { "c*", "2297" },
 444   { "pp", "22A5" },
 445   { "md", "22C5" },
 446   { "lc", "2308" },
 447   { "rc", "2309" },
 448   { "lf", "230A" },
 449   { "rf", "230B" },
 450   { "parenlefttp", "239B" },
 451   { "parenleftex", "239C" },
 452   { "parenleftbt", "239D" },
 453   { "parenrighttp", "239E" },
 454   { "parenrightex", "239F" },
 455   { "parenrightbt", "23A0" },
 456   { "bracketlefttp", "23A1" },
 457   { "bracketleftex", "23A2" },
 458   { "bracketleftbt", "23A3" },
 459   { "bracketrighttp", "23A4" },
 460   { "bracketrightex", "23A5" },
 461   { "bracketrightbt", "23A6" },
 462   { "lt", "23A7" },
 463   { "bracelefttp", "23A7" },
 464   { "lk", "23A8" },
 465   { "braceleftmid", "23A8" },
 466   { "lb", "23A9" },
 467   { "braceleftbt", "23A9" },
 468   { "bv", "23AA" },
 469   { "braceex", "23AA" },
 470   { "braceleftex", "23AA" },
 471   { "bracerightex", "23AA" },
 472   { "rt", "23AB" },
 473   { "bracerighttp", "23AB" },
 474   { "rk", "23AC" },
 475   { "bracerightmid", "23AC" },
 476   { "rb", "23AD" },
 477   { "bracerightbt", "23AD" },
 478   { "an", "23AF" },
 479   { "br", "2502" },
 480   { "sq", "25A1" },
 481   { "lz", "25CA" },
 482   { "ci", "25CB" },
 483   { "lh", "261C" },
 484   { "rh", "261E" },
 485   { "SP", "2660" },
 486   { "CL", "2663" },
 487   { "HE", "2665" },
 488   { "DI", "2666" },
 489   { "OK", "2713" },
 490   // The `left angle bracket' and `right angle bracket' could be mapped to
 491   // either U+2329,U+232A or U+3008,U+3009 or U+27E8,U+27E9.  But the first
 492   // and second possibility are double-width characters (see Unicode's
 493   // `DerivedEastAsianWidth.txt' file) and are therefore not suitable for
 494   // general use, whereas the third possibility is single-width.
 495   //
 496   // The devhtml device overrides this mapping, because
 497   //
 498   //   http://www.w3.org/TR/html401/sgml/entities.html
 499   //
 500   // says that in HTML, `&lang;' and `&rang;' are U+2329,U+232A,
 501   // respectively.
 502   { "la", "27E8" },
 503   { "ra", "27E9" },
 504 };
 505
 506 // global constructor FIXME static CTOR
 507 static struct glyph_to_unicode_init {
 508   glyph_to_unicode_init();
 509 } _glyph_to_unicode_init;
 510
 511 glyph_to_unicode_init::glyph_to_unicode_init()
 512 {
 513   for (unsigned int i = 0;
 514        i < sizeof(glyph_to_unicode_list)/sizeof(glyph_to_unicode_list[0]);
 515        i++) {
 516     glyph_to_unicode *gtu = new glyph_to_unicode[1];
 517     gtu->value = (char *)glyph_to_unicode_list[i].value;
 518     glyph_to_unicode_table.define(glyph_to_unicode_list[i].key, gtu);
 519   }
 520 }
 521
 522 const char *glyph_name_to_unicode(const char *s)
 523 {
 524   glyph_to_unicode *result = glyph_to_unicode_table.lookup(s);
 525   return result ? result->value : 0;
 526 }
 527
 528 // s-it2-mode