Merge branch 'topic/sync-to-go-2'
[s-roff.git] / src / lib-roff / glyphuni.cpp
bloba34f6b18493574f2b56bc90cdbf828a83213ea39
1 /*@
2 * Copyright (c) 2014 - 2017 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
4 * Copyright (C) 2002, 2003, 2004, 2006
5 * Free Software Foundation, Inc.
6 * Written by Werner Lemberg <wl@gnu.org>
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
11 * version.
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
23 #include "config.h"
24 #include "lib.h"
26 #include "ptable.h"
27 #include "stringclass.h"
28 #include "unicode.h"
// Record type stored in glyph_to_unicode_table: the Unicode code point
// string that belongs to one glyph name.  The pointer is const-qualified
// because the records only alias the string literals of
// glyph_to_unicode_list and must never be written through.
struct glyph_to_unicode {
  const char *value;
};
34 declare_ptable(glyph_to_unicode)
35 implement_ptable(glyph_to_unicode)
37 PTABLE(glyph_to_unicode) glyph_to_unicode_table;
// Static list of glyph names and the Unicode code points they stand for.
// A value is written as hexadecimal digits; glyphs that expand to several
// code points (the ligatures) join them with `_'.  Several glyph names
// may share one value.
//
// The entries commented out in the table below can't be used in glyph
// names.

struct S {
  const char *key;    // glyph name
  const char *value;  // code point string
} glyph_to_unicode_list[] = { // FIXME const?
  { "!", "0021" },
  { "\"", "0022" },
  { "dq", "0022" },
  { "#", "0023" },
  { "sh", "0023" },
  { "$", "0024" },
  { "Do", "0024" },
  { "%", "0025" },
  { "&", "0026" },
  { "aq", "0027" },
  { "(", "0028" },
  { ")", "0029" },
  { "*", "002A" },
  { "+", "002B" },
  { "pl", "002B" },
  { ",", "002C" },
  { ".", "002E" },
  { "/", "002F" },
  { "sl", "002F" },
  { "0", "0030" },
  { "1", "0031" },
  { "2", "0032" },
  { "3", "0033" },
  { "4", "0034" },
  { "5", "0035" },
  { "6", "0036" },
  { "7", "0037" },
  { "8", "0038" },
  { "9", "0039" },
  { ":", "003A" },
  { ";", "003B" },
  { "<", "003C" },
  { "=", "003D" },
  { "eq", "003D" },
  { ">", "003E" },
  { "?", "003F" },
  { "@", "0040" },
  { "at", "0040" },
  { "A", "0041" },
  { "B", "0042" },
  { "C", "0043" },
  { "D", "0044" },
  { "E", "0045" },
  { "F", "0046" },
  { "G", "0047" },
  { "H", "0048" },
  { "I", "0049" },
  { "J", "004A" },
  { "K", "004B" },
  { "L", "004C" },
  { "M", "004D" },
  { "N", "004E" },
  { "O", "004F" },
  { "P", "0050" },
  { "Q", "0051" },
  { "R", "0052" },
  { "S", "0053" },
  { "T", "0054" },
  { "U", "0055" },
  { "V", "0056" },
  { "W", "0057" },
  { "X", "0058" },
  { "Y", "0059" },
  { "Z", "005A" },
//{ "[", "005B" },
  { "lB", "005B" },
//{ "\\", "005C" },
  { "rs", "005C" },
//{ "]", "005D" },
  { "rB", "005D" },
  { "a^", "005E" },
  { "^", "005E" },
  { "ha", "005E" },
  { "_", "005F" },
  { "ru", "005F" },
  { "ul", "005F" },
  { "ga", "0060" },
  { "a", "0061" },
  { "b", "0062" },
  { "c", "0063" },
  { "d", "0064" },
  { "e", "0065" },
  { "f", "0066" },
  { "ff", "0066_0066" },
  { "Fi", "0066_0066_0069" },
  { "Fl", "0066_0066_006C" },
  { "fi", "0066_0069" },
  { "fl", "0066_006C" },
  { "g", "0067" },
  { "h", "0068" },
  { "i", "0069" },
  { "j", "006A" },
  { "k", "006B" },
  { "l", "006C" },
  { "m", "006D" },
  { "n", "006E" },
  { "o", "006F" },
  { "p", "0070" },
  { "q", "0071" },
  { "r", "0072" },
  { "s", "0073" },
  { "t", "0074" },
  { "u", "0075" },
  { "v", "0076" },
  { "w", "0077" },
  { "x", "0078" },
  { "y", "0079" },
  { "z", "007A" },
  { "lC", "007B" },
  { "{", "007B" },
  { "ba", "007C" },
  { "or", "007C" },
  { "|", "007C" },
  { "rC", "007D" },
  { "}", "007D" },
  { "a~", "007E" },
  { "~", "007E" },
  { "ti", "007E" },
  { "r!", "00A1" },
  { "ct", "00A2" },
  { "Po", "00A3" },
  { "Cs", "00A4" },
  { "Ye", "00A5" },
  { "bb", "00A6" },
  { "sc", "00A7" },
  { "ad", "00A8" },
  { "co", "00A9" },
  { "Of", "00AA" },
  { "Fo", "00AB" },
  { "no", "00AC" },
  { "tno", "00AC" },
  // The soft hyphen U+00AD is meaningful only in the input file,
  // not in the output.
  { "rg", "00AE" },
  { "a-", "00AF" },
  { "de", "00B0" },
  { "+-", "00B1" },
  { "t+-", "00B1" },
  { "S2", "00B2" },
  { "S3", "00B3" },
  { "aa", "00B4" },
  { "mc", "00B5" },
  { "ps", "00B6" },
  { "pc", "00B7" },
  { "ac", "00B8" },
  { "S1", "00B9" },
  { "Om", "00BA" },
  { "Fc", "00BB" },
  { "14", "00BC" },
  { "12", "00BD" },
  { "34", "00BE" },
  { "r?", "00BF" },
  { "`A", "00C0" },
  { "'A", "00C1" },
  { "^A", "00C2" },
  { "~A", "00C3" },
  { ":A", "00C4" },
  { "oA", "00C5" },
  { "AE", "00C6" },
  { ",C", "00C7" },
  { "`E", "00C8" },
  { "'E", "00C9" },
  { "^E", "00CA" },
  { ":E", "00CB" },
  { "`I", "00CC" },
  { "'I", "00CD" },
  { "^I", "00CE" },
  { ":I", "00CF" },
  { "-D", "00D0" },
  { "~N", "00D1" },
  { "`O", "00D2" },
  { "'O", "00D3" },
  { "^O", "00D4" },
  { "~O", "00D5" },
  { ":O", "00D6" },
  { "mu", "00D7" },
  { "tmu", "00D7" },
  { "/O", "00D8" },
  { "`U", "00D9" },
  { "'U", "00DA" },
  { "^U", "00DB" },
  { ":U", "00DC" },
  { "'Y", "00DD" },
  { "TP", "00DE" },
  { "ss", "00DF" },
  { "`a", "00E0" },
  { "'a", "00E1" },
  { "^a", "00E2" },
  { "~a", "00E3" },
  { ":a", "00E4" },
  { "oa", "00E5" },
  { "ae", "00E6" },
  { ",c", "00E7" },
  { "`e", "00E8" },
  { "'e", "00E9" },
  { "^e", "00EA" },
  { ":e", "00EB" },
  { "`i", "00EC" },
  { "'i", "00ED" },
  { "^i", "00EE" },
  { ":i", "00EF" },
  { "Sd", "00F0" },
  { "~n", "00F1" },
  { "`o", "00F2" },
  { "'o", "00F3" },
  { "^o", "00F4" },
  { "~o", "00F5" },
  { ":o", "00F6" },
  { "di", "00F7" },
  { "tdi", "00F7" },
  { "/o", "00F8" },
  { "`u", "00F9" },
  { "'u", "00FA" },
  { "^u", "00FB" },
  { ":u", "00FC" },
  { "'y", "00FD" },
  { "Tp", "00FE" },
  { ":y", "00FF" },
  { "'C", "0106" },
  { "'c", "0107" },
  { ".i", "0131" },
  { "IJ", "0132" },
  { "ij", "0133" },
  { "/L", "0141" },
  { "/l", "0142" },
  { "OE", "0152" },
  { "oe", "0153" },
  { "vS", "0160" },
  { "vs", "0161" },
  { ":Y", "0178" },
  { "vZ", "017D" },
  { "vz", "017E" },
  { "Fn", "0192" },
  { "ah", "02C7" },
  { "ab", "02D8" },
  { "a.", "02D9" },
  { "ao", "02DA" },
  { "ho", "02DB" },
  { "a\"", "02DD" },
  { "*A", "0391" },
  { "*B", "0392" },
  { "*G", "0393" },
  { "*D", "0394" },
  { "*E", "0395" },
  { "*Z", "0396" },
  { "*Y", "0397" },
  { "*H", "0398" },
  { "*I", "0399" },
  { "*K", "039A" },
  { "*L", "039B" },
  { "*M", "039C" },
  { "*N", "039D" },
  { "*C", "039E" },
  { "*O", "039F" },
  { "*P", "03A0" },
  { "*R", "03A1" },
  { "*S", "03A3" },
  { "*T", "03A4" },
  { "*U", "03A5" },
  { "*F", "03A6" },
  { "*X", "03A7" },
  { "*Q", "03A8" },
  { "*W", "03A9" },
  { "*a", "03B1" },
  { "*b", "03B2" },
  { "*g", "03B3" },
  { "*d", "03B4" },
  { "*e", "03B5" },
  { "*z", "03B6" },
  { "*y", "03B7" },
  { "*h", "03B8" },
  { "*i", "03B9" },
  { "*k", "03BA" },
  { "*l", "03BB" },
  { "*m", "03BC" },
  { "*n", "03BD" },
  { "*c", "03BE" },
  { "*o", "03BF" },
  { "*p", "03C0" },
  { "*r", "03C1" },
  { "ts", "03C2" },
  { "*s", "03C3" },
  { "*t", "03C4" },
  { "*u", "03C5" },
  // the curly phi variant
  { "+f", "03C6" },
  { "*x", "03C7" },
  { "*q", "03C8" },
  { "*w", "03C9" },
  { "+h", "03D1" },
  // the stroked phi variant
  { "*f", "03D5" },
  { "+p", "03D6" },
  { "+e", "03F5" },
  // `-' and `hy' denote a HYPHEN, usually a glyph with a smaller width than
  // the MINUS sign.  Users who are viewing broken man pages that assume
  // that `-' denotes a U+002D character can either fix the broken man pages
  // or apply the workaround described in the PROBLEMS file.
  { "-", "2010" },
  { "hy", "2010" },
  { "en", "2013" },
  { "em", "2014" },
  { "`", "2018" },
  { "oq", "2018" },
  { "'", "2019" },
  { "cq", "2019" },
  { "bq", "201A" },
  { "lq", "201C" },
  { "rq", "201D" },
  { "Bq", "201E" },
  { "dg", "2020" },
  { "dd", "2021" },
  { "bu", "2022" },
  { "%0", "2030" },
  { "fm", "2032" },
  { "sd", "2033" },
  { "fo", "2039" },
  { "fc", "203A" },
  { "rn", "203E" },
  { "f/", "2044" },
  { "eu", "20AC" },
  { "Eu", "20AC" },
  { "-h", "210F" },
  { "hbar", "210F" },
  { "Im", "2111" },
  { "wp", "2118" },
  { "Re", "211C" },
  { "tm", "2122" },
  { "Ah", "2135" },
  { "18", "215B" },
  { "38", "215C" },
  { "58", "215D" },
  { "78", "215E" },
  { "<-", "2190" },
  { "ua", "2191" },
  { "->", "2192" },
  { "da", "2193" },
  { "<>", "2194" },
  { "va", "2195" },
  { "CR", "21B5" },
  { "lA", "21D0" },
  { "uA", "21D1" },
  { "rA", "21D2" },
  { "dA", "21D3" },
  { "hA", "21D4" },
  { "vA", "21D5" },
  { "fa", "2200" },
  { "pd", "2202" },
  { "te", "2203" },
  { "es", "2205" },
  { "gr", "2207" },
  { "mo", "2208" },
  { "nm", "2209" },
  { "st", "220B" },
  { "product", "220F" },
  { "coproduct", "2210" },
  { "sum", "2211" },
  // `mi' and `\-' represent a MINUS sign.  But it is used in many man pages
  // to denote the U+002D character that introduces a command-line option.
  // For devices that support copy&paste, such as devhtml and devutf8, the
  // user can apply the workaround described in the PROBLEMS file.
  { "\\-", "2212" },
  { "mi", "2212" },
  { "-+", "2213" },
  { "**", "2217" },
  { "sqrt", "221A" },
  { "sr", "221A" },
  { "pt", "221D" },
  { "if", "221E" },
  { "/_", "2220" },
  { "AN", "2227" },
  { "OR", "2228" },
  { "ca", "2229" },
  { "cu", "222A" },
  { "is", "222B" },
  { "integral", "222B" },
  { "tf", "2234" },
  { "3d", "2234" },
  { "ap", "223C" },
  { "|=", "2243" },
  { "=~", "2245" },
  { "~~", "2248" },
  { "~=", "2248" },
  { "!=", "2260" },
  { "==", "2261" },
  { "ne", "2262" },
  { "<=", "2264" },
  { ">=", "2265" },
  { "<<", "226A" },
  { ">>", "226B" },
  { "sb", "2282" },
  { "sp", "2283" },
  { "nb", "2284" },
  { "nc", "2285" },
  { "ib", "2286" },
  { "ip", "2287" },
  { "c+", "2295" },
  { "c*", "2297" },
  { "pp", "22A5" },
  { "md", "22C5" },
  { "lc", "2308" },
  { "rc", "2309" },
  { "lf", "230A" },
  { "rf", "230B" },
  { "parenlefttp", "239B" },
  { "parenleftex", "239C" },
  { "parenleftbt", "239D" },
  { "parenrighttp", "239E" },
  { "parenrightex", "239F" },
  { "parenrightbt", "23A0" },
  { "bracketlefttp", "23A1" },
  { "bracketleftex", "23A2" },
  { "bracketleftbt", "23A3" },
  { "bracketrighttp", "23A4" },
  { "bracketrightex", "23A5" },
  { "bracketrightbt", "23A6" },
  { "lt", "23A7" },
  { "bracelefttp", "23A7" },
  { "lk", "23A8" },
  { "braceleftmid", "23A8" },
  { "lb", "23A9" },
  { "braceleftbt", "23A9" },
  { "bv", "23AA" },
  { "braceex", "23AA" },
  { "braceleftex", "23AA" },
  { "bracerightex", "23AA" },
  { "rt", "23AB" },
  { "bracerighttp", "23AB" },
  { "rk", "23AC" },
  { "bracerightmid", "23AC" },
  { "rb", "23AD" },
  { "bracerightbt", "23AD" },
  { "an", "23AF" },
  { "br", "2502" },
  { "sq", "25A1" },
  { "lz", "25CA" },
  { "ci", "25CB" },
  { "lh", "261C" },
  { "rh", "261E" },
  { "SP", "2660" },
  { "CL", "2663" },
  { "HE", "2665" },
  { "DI", "2666" },
  { "OK", "2713" },
  // The `left angle bracket' and `right angle bracket' could be mapped to
  // either U+2329,U+232A or U+3008,U+3009 or U+27E8,U+27E9.  But the first
  // and second possibility are double-width characters (see Unicode's
  // `DerivedEastAsianWidth.txt' file) and are therefore not suitable for
  // general use, whereas the third possibility is single-width.
  //
  // The devhtml device overrides this mapping, because
  //
  //   http://www.w3.org/TR/html401/sgml/entities.html
  //
  // says that in HTML, `&lang;' and `&rang;' are U+2329,U+232A,
  // respectively.
  { "la", "27E8" },
  { "ra", "27E9" },
};
// global constructor FIXME static CTOR
//
// Helper type that exists only for its constructor.  The single
// file-scope instance is constructed during static initialization, which
// runs the constructor that fills glyph_to_unicode_table.  The table is
// defined earlier in this file than the instance, so within this
// translation unit it is constructed first.
static struct glyph_to_unicode_init {
  glyph_to_unicode_init();
} _glyph_to_unicode_init;
511 glyph_to_unicode_init::glyph_to_unicode_init()
513 for (unsigned int i = 0;
514 i < sizeof(glyph_to_unicode_list)/sizeof(glyph_to_unicode_list[0]);
515 i++) {
516 glyph_to_unicode *gtu = new glyph_to_unicode[1];
517 gtu->value = (char *)glyph_to_unicode_list[i].value;
518 glyph_to_unicode_table.define(glyph_to_unicode_list[i].key, gtu);
522 const char *glyph_name_to_unicode(const char *s)
524 glyph_to_unicode *result = glyph_to_unicode_table.lookup(s);
525 return result ? result->value : 0;
528 // s-it2-mode