3 * This file is part of LyX, the document processor.
4 * Licence details can be found in the file COPYING.
6 * \author Lars Gullik Bjønnes
7 * \author Jean-Marc Lasgouttes
10 * Full author contact details are available in file CREDITS.
18 #include "BufferList.h"
19 #include "InsetIterator.h"
20 #include "LaTeXFeatures.h"
24 #include "support/debug.h"
25 #include "support/FileName.h"
26 #include "support/lstrings.h"
27 #include "support/textutils.h"
28 #include "support/unicode.h"
30 #include <boost/cstdint.hpp>
35 using namespace lyx::support
;
41 Encodings::MathCommandSet
Encodings::mathcmd
;
42 Encodings::TextCommandSet
Encodings::textcmd
;
43 Encodings::MathSymbolSet
Encodings::mathsym
;
47 char_type arabic_table
[172][4] = {
48 {0xfe80, 0xfe80, 0xfe80, 0xfe80}, // 0x0621 = hamza
49 {0xfe81, 0xfe82, 0xfe81, 0xfe82}, // 0x0622 = ligature madda on alef
50 {0xfe83, 0xfe84, 0xfe83, 0xfe84}, // 0x0623 = ligature hamza on alef
51 {0xfe85, 0xfe86, 0xfe85, 0xfe86}, // 0x0624 = ligature hamza on waw
52 {0xfe87, 0xfe88, 0xfe87, 0xfe88}, // 0x0625 = ligature hamza under alef
53 {0xfe89, 0xfe8a, 0xfe8b, 0xfe8c}, // 0x0626 = ligature hamza on ya
54 {0xfe8d, 0xfe8e, 0xfe8d, 0xfe8e}, // 0x0627 = alef
55 {0xfe8f, 0xfe90, 0xfe91, 0xfe92}, // 0x0628 = baa
56 {0xfe93, 0xfe94, 0xfe93, 0xfe94}, // 0x0629 = taa marbuta
57 {0xfe95, 0xfe96, 0xfe97, 0xfe98}, // 0x062a = taa
58 {0xfe99, 0xfe9a, 0xfe9b, 0xfe9c}, // 0x062b = thaa
59 {0xfe9d, 0xfe9e, 0xfe9f, 0xfea0}, // 0x062c = jeem
60 {0xfea1, 0xfea2, 0xfea3, 0xfea4}, // 0x062d = haa
61 {0xfea5, 0xfea6, 0xfea7, 0xfea8}, // 0x062e = khaa
62 {0xfea9, 0xfeaa, 0xfea9, 0xfeaa}, // 0x062f = dal
64 {0xfeab, 0xfeac, 0xfeab, 0xfeac}, // 0x0630 = thal
65 {0xfead, 0xfeae, 0xfead, 0xfeae}, // 0x0631 = ra
66 {0xfeaf, 0xfeb0, 0xfeaf, 0xfeb0}, // 0x0632 = zain
67 {0xfeb1, 0xfeb2, 0xfeb3, 0xfeb4}, // 0x0633 = seen
68 {0xfeb5, 0xfeb6, 0xfeb7, 0xfeb8}, // 0x0634 = sheen
69 {0xfeb9, 0xfeba, 0xfebb, 0xfebc}, // 0x0635 = sad
70 {0xfebd, 0xfebe, 0xfebf, 0xfec0}, // 0x0636 = dad
71 {0xfec1, 0xfec2, 0xfec3, 0xfec4}, // 0x0637 = tah
72 {0xfec5, 0xfec6, 0xfec7, 0xfec8}, // 0x0638 = zah
73 {0xfec9, 0xfeca, 0xfecb, 0xfecc}, // 0x0639 = ain
74 {0xfecd, 0xfece, 0xfecf, 0xfed0}, // 0x063a = ghain
75 {0, 0, 0, 0}, // 0x063b
76 {0, 0, 0, 0}, // 0x063c
77 {0, 0, 0, 0}, // 0x063d
78 {0, 0, 0, 0}, // 0x063e
79 {0, 0, 0, 0}, // 0x063f
81 {0, 0, 0, 0}, // 0x0640
82 {0xfed1, 0xfed2, 0xfed3, 0xfed4}, // 0x0641 = fa
83 {0xfed5, 0xfed6, 0xfed7, 0xfed8}, // 0x0642 = qaf
84 {0xfed9, 0xfeda, 0xfedb, 0xfedc}, // 0x0643 = kaf
85 {0xfedd, 0xfede, 0xfedf, 0xfee0}, // 0x0644 = lam
86 {0xfee1, 0xfee2, 0xfee3, 0xfee4}, // 0x0645 = meem
87 {0xfee5, 0xfee6, 0xfee7, 0xfee8}, // 0x0646 = noon
88 {0xfee9, 0xfeea, 0xfeeb, 0xfeec}, // 0x0647 = ha
89 {0xfeed, 0xfeee, 0xfeed, 0xfeee}, // 0x0648 = waw
90 {0xfeef, 0xfef0, 0xfeef, 0xfef0}, // 0x0649 = alef maksura
91 {0xfef1, 0xfef2, 0xfef3, 0xfef4}, // 0x064a = ya
92 {0x065b, 0x065b, 0x065b, 0x065b}, // 0x064b = fathatan
93 {0x065c, 0x065c, 0x065c, 0x065c}, // 0x064c = dammatan
94 {0x064d, 0x064d, 0x064d, 0x064d}, // 0x064d = kasratan
95 {0x064e, 0x064e, 0x064e, 0x064e}, // 0x064e = fatha
96 {0x064f, 0x064f, 0x064f, 0x064f}, // 0x064f = damma
98 {0x0650, 0x0650, 0x0650, 0x0650}, // 0x0650 = kasra
99 {0x0651, 0x0651, 0x0651, 0x0651}, // 0x0651 = shadda
100 {0x0652, 0x0652, 0x0652, 0x0652}, // 0x0652 = sukun
102 {0, 0, 0, 0}, // 0x0653
103 {0, 0, 0, 0}, // 0x0654
104 {0, 0, 0, 0}, // 0x0655
105 {0, 0, 0, 0}, // 0x0656
106 {0, 0, 0, 0}, // 0x0657
107 {0, 0, 0, 0}, // 0x0658
108 {0, 0, 0, 0}, // 0x0659
109 {0, 0, 0, 0}, // 0x065a
110 {0, 0, 0, 0}, // 0x065b
111 {0, 0, 0, 0}, // 0x065c
112 {0, 0, 0, 0}, // 0x065d
113 {0, 0, 0, 0}, // 0x065e
114 {0, 0, 0, 0}, // 0x065f
115 {0, 0, 0, 0}, // 0x0660
116 {0, 0, 0, 0}, // 0x0661
117 {0, 0, 0, 0}, // 0x0662
118 {0, 0, 0, 0}, // 0x0663
119 {0, 0, 0, 0}, // 0x0664
120 {0, 0, 0, 0}, // 0x0665
121 {0, 0, 0, 0}, // 0x0666
122 {0, 0, 0, 0}, // 0x0667
123 {0, 0, 0, 0}, // 0x0668
124 {0, 0, 0, 0}, // 0x0669
125 {0, 0, 0, 0}, // 0x066a
126 {0, 0, 0, 0}, // 0x066b
127 {0, 0, 0, 0}, // 0x066c
128 {0, 0, 0, 0}, // 0x066d
129 {0, 0, 0, 0}, // 0x066e
130 {0, 0, 0, 0}, // 0x066f
131 {0, 0, 0, 0}, // 0x0670
132 {0, 0, 0, 0}, // 0x0671
133 {0, 0, 0, 0}, // 0x0672
134 {0, 0, 0, 0}, // 0x0673
135 {0, 0, 0, 0}, // 0x0674
136 {0, 0, 0, 0}, // 0x0675
137 {0, 0, 0, 0}, // 0x0676
138 {0, 0, 0, 0}, // 0x0677
139 {0, 0, 0, 0}, // 0x0678
140 {0, 0, 0, 0}, // 0x0679
141 {0, 0, 0, 0}, // 0x067a
142 {0, 0, 0, 0}, // 0x067b
143 {0, 0, 0, 0}, // 0x067c
144 {0, 0, 0, 0}, // 0x067d
145 {0xfb56, 0xfb57, 0xfb58, 0xfb59}, // 0x067e = peh
146 {0, 0, 0, 0}, // 0x067f
147 {0, 0, 0, 0}, // 0x0680
148 {0, 0, 0, 0}, // 0x0681
149 {0, 0, 0, 0}, // 0x0682
150 {0, 0, 0, 0}, // 0x0683
151 {0, 0, 0, 0}, // 0x0684
152 {0, 0, 0, 0}, // 0x0685
153 {0xfb7a, 0xfb7b, 0xfb7c, 0xfb7d}, // 0x0686 = tcheh
154 {0, 0, 0, 0}, // 0x0687
155 {0, 0, 0, 0}, // 0x0688
156 {0, 0, 0, 0}, // 0x0689
157 {0, 0, 0, 0}, // 0x068a
158 {0, 0, 0, 0}, // 0x068b
159 {0, 0, 0, 0}, // 0x068c
160 {0, 0, 0, 0}, // 0x068d
161 {0, 0, 0, 0}, // 0x068e
162 {0, 0, 0, 0}, // 0x068f
163 {0, 0, 0, 0}, // 0x0690
164 {0, 0, 0, 0}, // 0x0691
165 {0, 0, 0, 0}, // 0x0692
166 {0, 0, 0, 0}, // 0x0693
167 {0, 0, 0, 0}, // 0x0694
168 {0, 0, 0, 0}, // 0x0695
169 {0, 0, 0, 0}, // 0x0696
170 {0, 0, 0, 0}, // 0x0697
171 {0xfb8a, 0xfb8b, 0xfb8a, 0xfb8b}, // 0x0698 = jeh
172 {0, 0, 0, 0}, // 0x0699
173 {0, 0, 0, 0}, // 0x069a
174 {0, 0, 0, 0}, // 0x069b
175 {0, 0, 0, 0}, // 0x069c
176 {0, 0, 0, 0}, // 0x069d
177 {0, 0, 0, 0}, // 0x069e
178 {0, 0, 0, 0}, // 0x069f
179 {0, 0, 0, 0}, // 0x06a0
180 {0, 0, 0, 0}, // 0x06a1
181 {0, 0, 0, 0}, // 0x06a2
182 {0, 0, 0, 0}, // 0x06a3
183 {0, 0, 0, 0}, // 0x06a4
184 {0, 0, 0, 0}, // 0x06a5
185 {0, 0, 0, 0}, // 0x06a6
186 {0, 0, 0, 0}, // 0x06a7
187 {0, 0, 0, 0}, // 0x06a8
188 {0xfb8e, 0xfb8f, 0xfb90, 0xfb91}, // 0x06a9 = farsi kaf
189 {0, 0, 0, 0}, // 0x06aa
190 {0, 0, 0, 0}, // 0x06ab
191 {0, 0, 0, 0}, // 0x06ac
192 {0, 0, 0, 0}, // 0x06ad
193 {0, 0, 0, 0}, // 0x06ae
194 {0xfb92, 0xfb93, 0xfb94, 0xfb95}, // 0x06af = gaf
195 {0, 0, 0, 0}, // 0x06b0
196 {0, 0, 0, 0}, // 0x06b1
197 {0, 0, 0, 0}, // 0x06b2
198 {0, 0, 0, 0}, // 0x06b3
199 {0, 0, 0, 0}, // 0x06b4
200 {0, 0, 0, 0}, // 0x06b5
201 {0, 0, 0, 0}, // 0x06b6
202 {0, 0, 0, 0}, // 0x06b7
203 {0, 0, 0, 0}, // 0x06b8
204 {0, 0, 0, 0}, // 0x06b9
205 {0, 0, 0, 0}, // 0x06ba
206 {0, 0, 0, 0}, // 0x06bb
207 {0, 0, 0, 0}, // 0x06bc
208 {0, 0, 0, 0}, // 0x06bd
209 {0, 0, 0, 0}, // 0x06be
210 {0, 0, 0, 0}, // 0x06bf
211 {0, 0, 0, 0}, // 0x06c0
212 {0, 0, 0, 0}, // 0x06c1
213 {0, 0, 0, 0}, // 0x06c2
214 {0, 0, 0, 0}, // 0x06c3
215 {0, 0, 0, 0}, // 0x06c4
216 {0, 0, 0, 0}, // 0x06c5
217 {0, 0, 0, 0}, // 0x06c6
218 {0, 0, 0, 0}, // 0x06c7
219 {0, 0, 0, 0}, // 0x06c8
220 {0, 0, 0, 0}, // 0x06c9
221 {0, 0, 0, 0}, // 0x06ca
222 {0, 0, 0, 0}, // 0x06cb
223 {0xfbfc, 0xfbfd, 0xfbfe, 0xfbff} // 0x06cc = farsi yeh
227 char_type
const arabic_start
= 0x0621;
228 char_type
const arabic_end
= 0x06cc;
231 /// Information about a single UCS4 character
233 /// LaTeX command (text mode) for this character
234 docstring textcommand
;
235 /// LaTeX command (math mode) for this character
236 docstring mathcommand
;
237 /// Needed LaTeX preamble (or feature) for text mode
239 /// Needed LaTeX preamble (or feature) for math mode
241 /// Is this a combining character?
243 /// Is \c textpreamble a feature known by LaTeXFeatures, or a raw LaTeX
246 /// Is \c mathpreamble a feature known by LaTeXFeatures, or a raw LaTeX
249 /// Always force the LaTeX command, even if the encoding contains
255 typedef map
<char_type
, CharInfo
> CharInfoMap
;
256 CharInfoMap unicodesymbols
;
258 typedef std::set
<char_type
> CharSet
;
261 typedef std::set
<char_type
> MathAlphaSet
;
262 MathAlphaSet mathalpha
;
265 /// The highest code point in UCS4 encoding (1<<20 + 1<<16)
266 char_type
const max_ucs4
= 0x110000;
271 EncodingException::EncodingException(char_type c
)
272 : failed_char(c
), par_id(0), pos(0)
277 const char * EncodingException::what() const throw()
279 return "Could not find LaTeX command for a character";
283 Encoding::Encoding(string
const & n
, string
const & l
, string
const & g
,
284 string
const & i
, bool f
, Encoding::Package p
)
285 : name_(n
), latexName_(l
), guiName_(g
), iconvName_(i
), fixedwidth_(f
), package_(p
)
288 // ASCII can encode 128 code points and nothing else
289 start_encodable_
= 128;
291 } else if (i
== "UTF-8") {
292 // UTF8 can encode all UCS4 code points
293 start_encodable_
= max_ucs4
;
301 void Encoding::init() const
306 start_encodable_
= 0;
307 // temporarily switch off lyxerr, since we will generate iconv errors
310 // We do not need to check all UCS4 code points, it is enough
311 // if we check all 256 code points of this encoding.
312 for (unsigned short j
= 0; j
< 256; ++j
) {
313 char const c
= char(j
);
314 vector
<char_type
> const ucs4
= eightbit_to_ucs4(&c
, 1, iconvName_
);
315 if (ucs4
.size() != 1)
317 char_type
const uc
= ucs4
[0];
318 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(uc
);
319 if (it
== unicodesymbols
.end() || !it
->second
.force
)
320 encodable_
.insert(uc
);
323 // We do not know how many code points this encoding has, and
324 // they do not have a direct representation as a single byte,
325 // therefore we need to check all UCS4 code points.
326 // This is expensive!
327 for (char_type c
= 0; c
< max_ucs4
; ++c
) {
328 vector
<char> const eightbit
= ucs4_to_eightbit(&c
, 1, iconvName_
);
329 if (!eightbit
.empty()) {
330 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
331 if (it
== unicodesymbols
.end() || !it
->second
.force
)
332 encodable_
.insert(c
);
337 CharSet::iterator it
= encodable_
.find(start_encodable_
);
338 while (it
!= encodable_
.end()) {
339 encodable_
.erase(it
);
341 it
= encodable_
.find(start_encodable_
);
347 docstring
Encoding::latexChar(char_type c
, bool for_mathed
) const
349 // assure the used encoding is properly initialized
352 if (iconvName_
== "UTF-8" && package_
== none
)
353 return docstring(1, c
);
354 if (c
< start_encodable_
&& !encodings
.isForced(c
))
355 return docstring(1, c
);
356 if (encodable_
.find(c
) != encodable_
.end())
357 return docstring(1, c
);
361 // c cannot (or should not) be encoded in this encoding
362 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
363 if (it
== unicodesymbols
.end())
364 throw EncodingException(c
);
365 // at least one of mathcommand and textcommand is nonempty
366 if (it
->second
.textcommand
.empty())
367 return "\\ensuremath{" + it
->second
.mathcommand
+ '}';
368 return it
->second
.textcommand
;
372 vector
<char_type
> Encoding::symbolsList() const
374 // assure the used encoding is properly initialized
377 // first all encodable characters
378 vector
<char_type
> symbols(encodable_
.begin(), encodable_
.end());
379 // add those below start_encodable_
380 for (char_type c
= 0; c
< start_encodable_
; ++c
)
381 symbols
.push_back(c
);
382 // now the ones from the unicodesymbols file
383 CharInfoMap::const_iterator
const end
= unicodesymbols
.end();
384 CharInfoMap::const_iterator it
= unicodesymbols
.begin();
385 for (; it
!= end
; ++it
)
386 symbols
.push_back(it
->first
);
391 bool Encodings::latexMathChar(char_type c
, bool mathmode
,
392 Encoding
const * encoding
, docstring
& command
)
395 command
= encoding
->latexChar(c
, true);
397 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
398 if (it
== unicodesymbols
.end()) {
399 if (!encoding
|| command
.empty())
400 throw EncodingException(c
);
405 // at least one of mathcommand and textcommand is nonempty
406 bool use_math
= (mathmode
&& !it
->second
.mathcommand
.empty()) ||
407 (!mathmode
&& it
->second
.textcommand
.empty());
409 command
= it
->second
.mathcommand
;
412 if (!encoding
|| command
.empty()) {
413 command
= it
->second
.textcommand
;
423 char_type
Encodings::fromLaTeXCommand(docstring
const & cmd
, bool & combining
)
425 CharInfoMap::const_iterator
const end
= unicodesymbols
.end();
426 CharInfoMap::const_iterator it
= unicodesymbols
.begin();
427 for (combining
= false; it
!= end
; ++it
) {
428 docstring
const math
= it
->second
.mathcommand
;
429 docstring
const text
= it
->second
.textcommand
;
430 if (math
== cmd
|| text
== cmd
) {
431 combining
= it
->second
.combining
;
439 docstring
Encodings::fromLaTeXCommand(docstring
const & cmd
, docstring
& rem
,
442 bool const mathmode
= cmdtype
& MATH_CMD
;
443 bool const textmode
= cmdtype
& TEXT_CMD
;
446 size_t const cmdend
= cmd
.size();
447 CharInfoMap::const_iterator
const uniend
= unicodesymbols
.end();
448 for (size_t j
= 0; j
< cmdend
; ++j
) {
449 // Also get the char after a backslash
450 if (j
+ 1 < cmdend
&& cmd
[j
] == '\\')
452 // If a macro argument follows, get it, too
453 if (j
+ 1 < cmdend
&& cmd
[j
+ 1] == '{') {
456 while (k
< cmdend
&& count
&& k
!= docstring::npos
) {
457 k
= cmd
.find_first_of(from_ascii("{}"), k
+ 1);
463 if (k
!= docstring::npos
)
466 // Start with this substring and try augmenting it when it is
467 // the prefix of some command in the unicodesymbols file
468 docstring
const subcmd
= cmd
.substr(i
, j
- i
+ 1);
470 CharInfoMap::const_iterator it
= unicodesymbols
.begin();
471 size_t unicmd_size
= 0;
473 for (; it
!= uniend
; ++it
) {
474 docstring
const math
= mathmode
? it
->second
.mathcommand
476 docstring
const text
= textmode
? it
->second
.textcommand
478 size_t cur_size
= max(math
.size(), text
.size());
479 // The current math or text unicode command cannot
480 // match, or we already matched a longer one
481 if (cur_size
< subcmd
.size() || cur_size
<= unicmd_size
)
484 docstring tmp
= subcmd
;
486 while (prefixIs(math
, tmp
) || prefixIs(text
, tmp
)) {
488 if (k
>= cmdend
|| cur_size
<= tmp
.size())
496 // The last added char caused a mismatch, because
497 // we didn't exhaust the chars in cmd and didn't
498 // exceed the maximum size of the current unicmd
499 if (k
< cmdend
&& cur_size
> tmp
.size())
500 tmp
.resize(tmp
.size() - 1);
502 // If this is an exact match, we found a (longer)
503 // matching entry in the unicodesymbols file.
504 // If the entry doesn't start with '\', we take note
505 // of the match and continue (this is not a ultimate
506 // acceptance, as some other entry may match a longer
507 // portion of the cmd string). However, if the entry
508 // does start with '\', we accept the match only if
509 // this is a valid macro, i.e., either it is a single
510 // (nonletter) char macro, or nothing else follows,
511 // or what follows is a nonletter char, or the last
513 if ((math
== tmp
|| text
== tmp
)
515 || (tmp
.size() == 2 && !isAlphaASCII(tmp
[1]))
517 || !isAlphaASCII(cmd
[k
])
518 || tmp
[tmp
.size() - 1] == '}')
523 unicmd_size
= cur_size
;
528 else if (j
+ 1 == cmdend
)
529 // No luck. Return what remains
536 void Encodings::initUnicodeMath(Buffer
const & buffer
, bool clear_sets
)
539 // The code below is not needed in tex2lyx and requires additional stuff
550 Inset
& inset
= buffer
.inset();
551 InsetIterator it
= inset_iterator_begin(inset
);
552 InsetIterator
const end
= inset_iterator_end(inset
);
553 for (; it
!= end
; ++it
)
554 it
->initUnicodeMath();
557 BufferList::iterator bit
= theBufferList().begin();
558 BufferList::iterator
const bend
= theBufferList().end();
559 for (; bit
!= bend
; ++bit
)
560 if (buffer
.isChild(*bit
))
561 initUnicodeMath(**bit
, false);
566 void Encodings::validate(char_type c
, LaTeXFeatures
& features
, bool for_mathed
)
569 // The code below is not needed in tex2lyx and requires additional stuff
574 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
575 if (it
!= unicodesymbols
.end()) {
576 // In mathed, c could be used both in textmode and mathmode
577 bool const use_math
= (for_mathed
&& isMathCmd(c
)) ||
578 (!for_mathed
&& it
->second
.textcommand
.empty());
579 bool const use_text
= (for_mathed
&& isTextCmd(c
)) ||
580 (!for_mathed
&& !it
->second
.textcommand
.empty());
582 if (!it
->second
.mathpreamble
.empty()) {
583 if (it
->second
.mathfeature
) {
584 string feats
= it
->second
.mathpreamble
;
585 while (!feats
.empty()) {
587 feats
= split(feats
, feat
, ',');
588 features
.require(feat
);
591 features
.addPreambleSnippet(it
->second
.mathpreamble
);
595 if (!it
->second
.textpreamble
.empty()) {
596 if (it
->second
.textfeature
) {
597 string feats
= it
->second
.textpreamble
;
598 while (!feats
.empty()) {
600 feats
= split(feats
, feat
, ',');
601 features
.require(feat
);
604 features
.addPreambleSnippet(it
->second
.textpreamble
);
608 if (for_mathed
&& isMathSym(c
)) {
609 features
.require("amstext");
610 features
.require("lyxmathsym");
616 bool Encodings::isHebrewComposeChar(char_type c
)
618 return c
<= 0x05c2 && c
>= 0x05b0 && c
!= 0x05be && c
!= 0x05c0;
622 // Special Arabic letters are ones that do not get connected from left
623 // they are hamza, alef_madda, alef_hamza, waw_hamza, alef_hamza_under,
624 // alef, tah_marbota, dal, thal, rah, zai, wow, alef_maksoura
626 bool Encodings::isArabicSpecialChar(char_type c
)
628 return (c
>= 0x0621 && c
<= 0x0625) || (c
>= 0x0630 && c
<= 0x0632)
629 || c
== 0x0627 || c
== 0x0629 || c
== 0x062f || c
== 0x0648
630 || c
== 0x0649 || c
== 0x0698;
634 bool Encodings::isArabicComposeChar(char_type c
)
636 return c
>= 0x064b && c
<= 0x0652;
640 bool Encodings::isArabicChar(char_type c
)
642 return c
>= arabic_start
&& c
<= arabic_end
643 && arabic_table
[c
-arabic_start
][0];
647 char_type
Encodings::transformChar(char_type c
, Encodings::LetterForm form
)
649 return isArabicChar(c
) ? arabic_table
[c
-arabic_start
][form
] : c
;
653 bool Encodings::isCombiningChar(char_type c
)
655 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
656 if (it
!= unicodesymbols
.end())
657 return it
->second
.combining
;
662 bool Encodings::isKnownScriptChar(char_type
const c
, string
& preamble
)
664 CharInfoMap::const_iterator
const it
= unicodesymbols
.find(c
);
666 if (it
== unicodesymbols
.end())
669 if (it
->second
.textpreamble
!= "textgreek" && it
->second
.textpreamble
!= "textcyr")
672 if (preamble
.empty()) {
673 preamble
= it
->second
.textpreamble
;
676 return it
->second
.textpreamble
== preamble
;
680 bool Encodings::isForced(char_type c
)
682 return (!forced
.empty() && forced
.find(c
) != forced
.end());
686 bool Encodings::isMathAlpha(char_type c
)
688 return mathalpha
.count(c
);
692 Encoding
const * Encodings::fromLyXName(string
const & name
) const
694 EncodingList::const_iterator
const it
= encodinglist
.find(name
);
695 return it
!= encodinglist
.end() ? &it
->second
: 0;
699 Encoding
const * Encodings::fromLaTeXName(string
const & name
) const
701 // We don't use find_if because it makes copies of the pairs in
703 // This linear search is OK since we don't have many encodings.
704 // Users could even optimize it by putting the encodings they use
705 // most at the top of lib/encodings.
706 EncodingList::const_iterator
const end
= encodinglist
.end();
707 for (EncodingList::const_iterator it
= encodinglist
.begin(); it
!= end
; ++it
)
708 if (it
->second
.latexName() == name
)
714 Encodings::Encodings()
719 void Encodings::read(FileName
const & encfile
, FileName
const & symbolsfile
)
721 // We must read the symbolsfile first, because the Encoding
722 // constructor depends on it.
724 symbolslex
.setFile(symbolsfile
);
725 bool getNextToken
= true;
726 while (symbolslex
.isOK()) {
732 if (!symbolslex
.next(true))
737 istringstream
is(symbolslex
.getString());
738 // reading symbol directly does not work if
739 // char_type == wchar_t.
741 if(!(is
>> hex
>> tmp
))
745 if (!symbolslex
.next(true))
747 info
.textcommand
= symbolslex
.getDocString();
748 if (!symbolslex
.next(true))
750 info
.textpreamble
= symbolslex
.getString();
751 if (!symbolslex
.next(true))
753 flags
= symbolslex
.getString();
755 info
.combining
= false;
756 info
.textfeature
= false;
758 while (!flags
.empty()) {
760 flags
= split(flags
, flag
, ',');
761 if (flag
== "combining") {
762 info
.combining
= true;
763 } else if (flag
== "force") {
765 forced
.insert(symbol
);
766 } else if (flag
== "mathalpha") {
767 mathalpha
.insert(symbol
);
769 lyxerr
<< "Ignoring unknown flag `" << flag
770 << "' for symbol `0x"
771 << hex
<< symbol
<< dec
775 // mathcommand and mathpreamble have been added for 1.6.0.
776 // make them optional so that old files still work.
777 int const lineno
= symbolslex
.lineNumber();
778 bool breakout
= false;
779 if (symbolslex
.next(true)) {
780 if (symbolslex
.lineNumber() != lineno
) {
781 // line in old format without mathcommand and mathpreamble
782 getNextToken
= false;
784 info
.mathcommand
= symbolslex
.getDocString();
785 if (symbolslex
.next(true)) {
786 if (symbolslex
.lineNumber() != lineno
) {
787 // line in new format with mathcommand only
788 getNextToken
= false;
790 // line in new format with mathcommand and mathpreamble
791 info
.mathpreamble
= symbolslex
.getString();
800 if (!info
.textpreamble
.empty())
801 info
.textfeature
= info
.textpreamble
[0] != '\\';
802 if (!info
.mathpreamble
.empty())
803 info
.mathfeature
= info
.mathpreamble
[0] != '\\';
805 LYXERR(Debug::INFO
, "Read unicode symbol " << symbol
<< " '"
806 << to_utf8(info
.textcommand
) << "' '" << info
.textpreamble
807 << "' " << info
.combining
<< ' ' << info
.textfeature
808 << " '" << to_utf8(info
.mathcommand
) << "' '"
809 << info
.mathpreamble
<< "' " << info
.mathfeature
);
811 // we assume that at least one command is nonempty when using unicodesymbols
812 if (!info
.textcommand
.empty() || !info
.mathcommand
.empty())
813 unicodesymbols
[symbol
] = info
;
819 // Now read the encodings
825 LexerKeyword encodingtags
[] = {
826 { "encoding", et_encoding
},
830 Lexer
lex(encodingtags
);
831 lex
.setFile(encfile
);
832 lex
.setContext("Encodings::read");
838 string
const name
= lex
.getString();
840 string
const latexname
= lex
.getString();
842 string
const guiname
= lex
.getString();
844 string
const iconvname
= lex
.getString();
846 string
const width
= lex
.getString();
847 bool fixedwidth
= false;
848 if (width
== "fixed")
850 else if (width
== "variable")
853 lex
.printError("Unknown width");
856 string
const p
= lex
.getString();
857 Encoding::Package package
= Encoding::none
;
859 package
= Encoding::none
;
860 else if (p
== "inputenc")
861 package
= Encoding::inputenc
;
863 package
= Encoding::CJK
;
864 else if (p
== "japanese")
865 package
= Encoding::japanese
;
867 lex
.printError("Unknown package");
869 LYXERR(Debug::INFO
, "Reading encoding " << name
);
870 encodinglist
[name
] = Encoding(name
, latexname
,
871 guiname
, iconvname
, fixedwidth
, package
);
873 if (lex
.lex() != et_end
)
874 lex
.printError("Missing end");
878 lex
.printError("Misplaced end");
880 case Lexer::LEX_FEOF
:
883 lex
.printError("Unknown tag");