1 if not modules then modules = { } end modules ['char-ini'] = {
3 comment = "companion to char-ini.mkiv",
4 author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5 copyright = "PRAGMA ADE / ConTeXt Development Team",
6 license = "see context related readme files"
9 -- todo: make two files, one for format generation, one for format use
11 -- we can remove the tag range starting at 0xE0000 (special applications)
13 local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring
14 local concat, unpack, tohash = table.concat, table.unpack, table.tohash
15 local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-- Local aliases for the string library functions used in this file.
-- There must be exactly one value per name: a surplus value on the right-hand
-- side shifts every alias after it onto the wrong function (gmatch used to
-- receive string.match that way).
local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.gmatch
17 local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
19 local utf8byte = patterns.utf8byte
20 local utf8char = patterns.utf8char
22 local allocate = utilities.storage.allocate
23 local mark = utilities.storage.mark
25 local setmetatableindex = table.setmetatableindex
-- Switch for tracing character definitions, registered with the tracker
-- "characters.defining". The callback has to assign this very local (it is an
-- upvalue of the callback); assigning any other name creates a global and
-- leaves the switch at false forever.
local trace_defining = false

trackers.register("characters.defining", function(v)
    trace_defining = v
end)
29 local report_defining = logs.reporter("characters")
<p>This module implements some methods and creates additional data structures
from the big character table that we use for all kinds of purposes:
<type>char-def.lua</type>.</p>
36 <p>We assume that at this point <type>characters.data</type> is already
40 characters = characters or { }
41 local characters = characters
42 local data = characters.data
45 mark(data) -- why does this fail
47 report_defining("fatal error: 'char-def.lua' is not loaded")
52 <p>This converts a string (if given) into a number.</p>
-- Matches a complete hexadecimal character reference written as "0x..." or
-- "U+..." and produces its numeric value via tonumber(s,16). Hex digits are
-- accepted in upper and lower case, so "0xe9" and "0xE9" both give 233.
-- P(-1) anchors the match at the end of the string: trailing text of any kind
-- makes the whole match fail.
local pattern = (P("0x") + P("U+")) * ((R("09","AF","af")^1 * P(-1)) / function(s) return tonumber(s,16) end)

-- The same pattern is also published as patterns.chartonumber.
patterns.chartonumber = pattern
59 local function chartonumber(k)
60 if type(k) == "string" then
61 local u = lpegmatch(pattern,k)
65 return utfbyte(k) or 0
72 local function charfromnumber(k)
73 if type(k) == "number" then
74 return utfchar(k) or ""
76 local u = lpegmatch(pattern,k)
85 --~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
87 characters.tonumber = chartonumber
88 characters.fromnumber = charfromnumber
91 description = "PRIVATE SLOT",
94 local ranges = allocate()
95 characters.ranges = ranges
97 setmetatableindex(data, function(t,k)
99 if tk == "string" then
100 k = lpegmatch(pattern,k) or utfbyte(k)
102 local v = rawget(t,k)
106 tk = "number" -- fall through to range
112 if tk == "number" and k < 0xF0000 then
115 if k >= rr.first and k <= rr.last then
116 local extender = rr.extender
118 local v = extender(k,v)
125 return private -- handy for when we loop over characters in fonts and check for a property
128 local blocks = allocate {
129 ["aegeannumbers"] = { first = 0x10100, last = 0x1013F, description = "Aegean Numbers" },
130 ["alchemicalsymbols"] = { first = 0x1F700, last = 0x1F77F, description = "Alchemical Symbols" },
131 ["alphabeticpresentationforms"] = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" },
132 ["ancientgreekmusicalnotation"] = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" },
133 ["ancientgreeknumbers"] = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" },
134 ["ancientsymbols"] = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" },
135 ["arabic"] = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" },
136 ["arabicextendeda"] = { first = 0x008A0, last = 0x008FF, description = "Arabic Extended-A" },
137 ["arabicmathematicalalphabeticsymbols"] = { first = 0x1EE00, last = 0x1EEFF, description = "Arabic Mathematical Alphabetic Symbols" },
138 ["arabicpresentationformsa"] = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" },
139 ["arabicpresentationformsb"] = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" },
140 ["arabicsupplement"] = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" },
141 ["armenian"] = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" },
142 ["arrows"] = { first = 0x02190, last = 0x021FF, description = "Arrows" },
143 ["avestan"] = { first = 0x10B00, last = 0x10B3F, description = "Avestan" },
144 ["balinese"] = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" },
145 ["bamum"] = { first = 0x0A6A0, last = 0x0A6FF, description = "Bamum" },
146 ["bamumsupplement"] = { first = 0x16800, last = 0x16A3F, description = "Bamum Supplement" },
147 ["basiclatin"] = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
148 ["batak"] = { first = 0x01BC0, last = 0x01BFF, description = "Batak" },
149 ["bengali"] = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
150 ["blockelements"] = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
151 ["bopomofo"] = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
152 ["bopomofoextended"] = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
153 ["boxdrawing"] = { first = 0x02500, last = 0x0257F, description = "Box Drawing" },
154 ["brahmi"] = { first = 0x11000, last = 0x1107F, description = "Brahmi" },
155 ["braillepatterns"] = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" },
156 ["buginese"] = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
157 ["buhid"] = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
158 ["byzantinemusicalsymbols"] = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
159 ["commonindicnumberforms"] = { first = 0x0A830, last = 0x0A83F, description = "Common Indic Number Forms" },
160 ["carian"] = { first = 0x102A0, last = 0x102DF, description = "Carian" },
161 ["cham"] = { first = 0x0AA00, last = 0x0AA5F, description = "Cham" },
162 ["cherokee"] = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
163 ["cjkcompatibility"] = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
164 ["cjkcompatibilityforms"] = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" },
165 ["cjkcompatibilityideographs"] = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" },
166 ["cjkcompatibilityideographssupplement"] = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" },
167 ["cjkradicalssupplement"] = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" },
168 ["cjkstrokes"] = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" },
169 ["cjksymbolsandpunctuation"] = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" },
170 ["cjkunifiedideographs"] = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" },
171 ["cjkunifiedideographsextensiona"] = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
172 ["cjkunifiedideographsextensionb"] = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
173 ["combiningdiacriticalmarks"] = { first = 0x00300, last = 0x0036F, description = "Combining Diacritical Marks" },
174 ["combiningdiacriticalmarksforsymbols"] = { first = 0x020D0, last = 0x020FF, description = "Combining Diacritical Marks for Symbols" },
175 ["combiningdiacriticalmarkssupplement"] = { first = 0x01DC0, last = 0x01DFF, description = "Combining Diacritical Marks Supplement" },
176 ["combininghalfmarks"] = { first = 0x0FE20, last = 0x0FE2F, description = "Combining Half Marks" },
177 ["controlpictures"] = { first = 0x02400, last = 0x0243F, description = "Control Pictures" },
178 ["coptic"] = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
179 ["countingrodnumerals"] = { first = 0x1D360, last = 0x1D37F, description = "Counting Rod Numerals" },
180 ["cuneiform"] = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
181 ["cuneiformnumbersandpunctuation"] = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
182 ["currencysymbols"] = { first = 0x020A0, last = 0x020CF, description = "Currency Symbols" },
183 ["cypriotsyllabary"] = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" },
184 ["cyrillic"] = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
185 ["cyrillicextendeda"] = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
186 ["cyrillicextendedb"] = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
187 ["cyrillicsupplement"] = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
188 ["deseret"] = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
189 ["devanagari"] = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
190 ["devanagariextended"] = { first = 0x0A8E0, last = 0x0A8FF, description = "Devanagari Extended" },
191 ["dingbats"] = { first = 0x02700, last = 0x027BF, description = "Dingbats" },
192 ["dominotiles"] = { first = 0x1F030, last = 0x1F09F, description = "Domino Tiles" },
193 ["egyptianhieroglyphs"] = { first = 0x13000, last = 0x1342F, description = "Egyptian Hieroglyphs" },
194 ["emoticons"] = { first = 0x1F600, last = 0x1F64F, description = "Emoticons" },
195 ["enclosedalphanumericsupplement"] = { first = 0x1F100, last = 0x1F1FF, description = "Enclosed Alphanumeric Supplement" },
196 ["enclosedalphanumerics"] = { first = 0x02460, last = 0x024FF, description = "Enclosed Alphanumerics" },
197 ["enclosedcjklettersandmonths"] = { first = 0x03200, last = 0x032FF, description = "Enclosed CJK Letters and Months" },
198 ["enclosedideographicsupplement"] = { first = 0x1F200, last = 0x1F2FF, description = "Enclosed Ideographic Supplement" },
199 ["ethiopic"] = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
200 ["ethiopicextended"] = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
201 ["ethiopicextendeda"] = { first = 0x0AB00, last = 0x0AB2F, description = "Ethiopic Extended-A" },
202 ["ethiopicsupplement"] = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
203 ["generalpunctuation"] = { first = 0x02000, last = 0x0206F, description = "General Punctuation" },
204 ["geometricshapes"] = { first = 0x025A0, last = 0x025FF, description = "Geometric Shapes" },
205 ["georgian"] = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
206 ["georgiansupplement"] = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
207 ["glagolitic"] = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
208 ["gothic"] = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
209 ["greekandcoptic"] = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
210 ["greekextended"] = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
211 ["gujarati"] = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
212 ["gurmukhi"] = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
213 ["halfwidthandfullwidthforms"] = { first = 0x0FF00, last = 0x0FFEF, description = "Halfwidth and Fullwidth Forms" },
214 ["hangulcompatibilityjamo"] = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
215 ["hanguljamo"] = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
216 ["hanguljamoextendeda"] = { first = 0x0A960, last = 0x0A97F, description = "Hangul Jamo Extended-A" },
217 ["hanguljamoextendedb"] = { first = 0x0D7B0, last = 0x0D7FF, description = "Hangul Jamo Extended-B" },
218 ["hangulsyllables"] = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
219 ["hanunoo"] = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
220 ["hebrew"] = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
221 ["highprivateusesurrogates"] = { first = 0x0DB80, last = 0x0DBFF, description = "High Private Use Surrogates" },
222 ["highsurrogates"] = { first = 0x0D800, last = 0x0DB7F, description = "High Surrogates" },
223 ["hiragana"] = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
224 ["ideographicdescriptioncharacters"] = { first = 0x02FF0, last = 0x02FFF, description = "Ideographic Description Characters" },
225 ["imperialaramaic"] = { first = 0x10840, last = 0x1085F, description = "Imperial Aramaic" },
226 ["inscriptionalpahlavi"] = { first = 0x10B60, last = 0x10B7F, description = "Inscriptional Pahlavi" },
227 ["inscriptionalparthian"] = { first = 0x10B40, last = 0x10B5F, description = "Inscriptional Parthian" },
228 ["ipaextensions"] = { first = 0x00250, last = 0x002AF, description = "IPA Extensions" },
229 ["javanese"] = { first = 0x0A980, last = 0x0A9DF, description = "Javanese" },
230 ["kaithi"] = { first = 0x11080, last = 0x110CF, description = "Kaithi" },
231 ["kanasupplement"] = { first = 0x1B000, last = 0x1B0FF, description = "Kana Supplement" },
232 ["kanbun"] = { first = 0x03190, last = 0x0319F, description = "Kanbun" },
233 ["kangxiradicals"] = { first = 0x02F00, last = 0x02FDF, description = "Kangxi Radicals" },
234 ["kannada"] = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
235 ["katakana"] = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
236 ["katakanaphoneticextensions"] = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
237 ["kayahli"] = { first = 0x0A900, last = 0x0A92F, description = "Kayah Li" },
238 ["kharoshthi"] = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
239 ["khmer"] = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
240 ["khmersymbols"] = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
241 ["lao"] = { first = 0x00E80, last = 0x00EFF, otf="lao", description = "Lao" },
242 ["latinextendeda"] = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
243 ["latinextendedadditional"] = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
244 ["latinextendedb"] = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
245 ["latinextendedc"] = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
246 ["latinextendedd"] = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
247 ["latinsupplement"] = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
248 ["lepcha"] = { first = 0x01C00, last = 0x01C4F, description = "Lepcha" },
249 ["letterlikesymbols"] = { first = 0x02100, last = 0x0214F, description = "Letterlike Symbols" },
250 ["limbu"] = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
251 ["linearbideograms"] = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
252 ["linearbsyllabary"] = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
253 ["lisu"] = { first = 0x0A4D0, last = 0x0A4FF, description = "Lisu" },
254 ["lowsurrogates"] = { first = 0x0DC00, last = 0x0DFFF, description = "Low Surrogates" },
255 ["lycian"] = { first = 0x10280, last = 0x1029F, description = "Lycian" },
256 ["lydian"] = { first = 0x10920, last = 0x1093F, description = "Lydian" },
257 ["mahjongtiles"] = { first = 0x1F000, last = 0x1F02F, description = "Mahjong Tiles" },
258 ["malayalam"] = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
-- key spelled after the block name "Mandaic" (was "mandiac"), so that a lookup by normalized block name can reach it
["mandaic"] = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
260 ["mathematicalalphanumericsymbols"] = { first = 0x1D400, last = 0x1D7FF, description = "Mathematical Alphanumeric Symbols" },
261 ["mathematicaloperators"] = { first = 0x02200, last = 0x022FF, description = "Mathematical Operators" },
262 ["meeteimayek"] = { first = 0x0ABC0, last = 0x0ABFF, description = "Meetei Mayek" },
263 ["meeteimayekextensions"] = { first = 0x0AAE0, last = 0x0AAFF, description = "Meetei Mayek Extensions" },
264 ["meroiticcursive"] = { first = 0x109A0, last = 0x109FF, description = "Meroitic Cursive" },
265 ["meroitichieroglyphs"] = { first = 0x10980, last = 0x1099F, description = "Meroitic Hieroglyphs" },
266 ["miao"] = { first = 0x16F00, last = 0x16F9F, description = "Miao" },
267 ["miscellaneousmathematicalsymbolsa"] = { first = 0x027C0, last = 0x027EF, description = "Miscellaneous Mathematical Symbols-A" },
268 ["miscellaneousmathematicalsymbolsb"] = { first = 0x02980, last = 0x029FF, description = "Miscellaneous Mathematical Symbols-B" },
269 ["miscellaneoussymbols"] = { first = 0x02600, last = 0x026FF, description = "Miscellaneous Symbols" },
270 ["miscellaneoussymbolsandarrows"] = { first = 0x02B00, last = 0x02BFF, description = "Miscellaneous Symbols and Arrows" },
271 ["miscellaneoussymbolsandpictographs"] = { first = 0x1F300, last = 0x1F5FF, description = "Miscellaneous Symbols And Pictographs" },
272 ["miscellaneoustechnical"] = { first = 0x02300, last = 0x023FF, description = "Miscellaneous Technical" },
273 ["modifiertoneletters"] = { first = 0x0A700, last = 0x0A71F, description = "Modifier Tone Letters" },
274 ["mongolian"] = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
275 ["musicalsymbols"] = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
276 ["myanmar"] = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
277 ["myanmarextendeda"] = { first = 0x0AA60, last = 0x0AA7F, description = "Myanmar Extended-A" },
278 ["newtailue"] = { first = 0x01980, last = 0x019DF, description = "New Tai Lue" },
279 ["nko"] = { first = 0x007C0, last = 0x007FF, otf="nko", description = "NKo" },
280 ["numberforms"] = { first = 0x02150, last = 0x0218F, description = "Number Forms" },
281 ["ogham"] = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
282 ["olchiki"] = { first = 0x01C50, last = 0x01C7F, description = "Ol Chiki" },
283 ["olditalic"] = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
284 ["oldpersian"] = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
285 ["oldsoutharabian"] = { first = 0x10A60, last = 0x10A7F, description = "Old South Arabian" },
-- key spelled after the block name "Old Turkic" (was "odlturkic"), so that a lookup by normalized block name can reach it
["oldturkic"] = { first = 0x10C00, last = 0x10C4F, description = "Old Turkic" },
287 ["opticalcharacterrecognition"] = { first = 0x02440, last = 0x0245F, description = "Optical Character Recognition" },
288 ["oriya"] = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
289 ["osmanya"] = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
290 ["phagspa"] = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
291 ["phaistosdisc"] = { first = 0x101D0, last = 0x101FF, description = "Phaistos Disc" },
292 ["phoenician"] = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
293 ["phoneticextensions"] = { first = 0x01D00, last = 0x01D7F, description = "Phonetic Extensions" },
294 ["phoneticextensionssupplement"] = { first = 0x01D80, last = 0x01DBF, description = "Phonetic Extensions Supplement" },
295 ["playingcards"] = { first = 0x1F0A0, last = 0x1F0FF, description = "Playing Cards" },
296 ["privateusearea"] = { first = 0x0E000, last = 0x0F8FF, description = "Private Use Area" },
297 ["rejang"] = { first = 0x0A930, last = 0x0A95F, description = "Rejang" },
298 ["ruminumeralsymbols"] = { first = 0x10E60, last = 0x10E7F, description = "Rumi Numeral Symbols" },
299 ["runic"] = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
300 ["samaritan"] = { first = 0x00800, last = 0x0083F, description = "Samaritan" },
301 ["saurashtra"] = { first = 0x0A880, last = 0x0A8DF, description = "Saurashtra" },
302 ["sharada"] = { first = 0x11180, last = 0x111DF, description = "Sharada" },
303 ["shavian"] = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
304 ["sinhala"] = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
305 ["smallformvariants"] = { first = 0x0FE50, last = 0x0FE6F, description = "Small Form Variants" },
306 ["sorasompeng"] = { first = 0x110D0, last = 0x110FF, description = "Sora Sompeng" },
307 ["spacingmodifierletters"] = { first = 0x002B0, last = 0x002FF, description = "Spacing Modifier Letters" },
308 ["specials"] = { first = 0x0FFF0, last = 0x0FFFF, description = "Specials" },
309 ["sundanese"] = { first = 0x01B80, last = 0x01BBF, description = "Sundanese" },
310 ["sundanesesupplement"] = { first = 0x01CC0, last = 0x01CCF, description = "Sundanese Supplement" },
311 ["superscriptsandsubscripts"] = { first = 0x02070, last = 0x0209F, description = "Superscripts and Subscripts" },
312 ["supplementalarrowsa"] = { first = 0x027F0, last = 0x027FF, description = "Supplemental Arrows-A" },
313 ["supplementalarrowsb"] = { first = 0x02900, last = 0x0297F, description = "Supplemental Arrows-B" },
314 ["supplementalmathematicaloperators"] = { first = 0x02A00, last = 0x02AFF, description = "Supplemental Mathematical Operators" },
315 ["supplementalpunctuation"] = { first = 0x02E00, last = 0x02E7F, description = "Supplemental Punctuation" },
316 ["supplementaryprivateuseareaa"] = { first = 0xF0000, last = 0xFFFFF, description = "Supplementary Private Use Area-A" },
317 ["supplementaryprivateuseareab"] = { first = 0x100000,last = 0x10FFFF, description = "Supplementary Private Use Area-B" },
318 ["sylotinagri"] = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
319 ["syriac"] = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
320 ["tagalog"] = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
321 ["tagbanwa"] = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
322 ["tags"] = { first = 0xE0000, last = 0xE007F, description = "Tags" },
323 ["taile"] = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
324 ["taitham"] = { first = 0x01A20, last = 0x01AAF, description = "Tai Tham" },
325 ["taiviet"] = { first = 0x0AA80, last = 0x0AADF, description = "Tai Viet" },
326 ["taixuanjingsymbols"] = { first = 0x1D300, last = 0x1D35F, description = "Tai Xuan Jing Symbols" },
327 ["takri"] = { first = 0x11680, last = 0x116CF, description = "Takri" },
328 ["tamil"] = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
329 ["telugu"] = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
330 ["thaana"] = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
331 ["thai"] = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
332 ["tibetan"] = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
333 ["tifinagh"] = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
334 ["transportandmapsymbols"] = { first = 0x1F680, last = 0x1F6FF, description = "Transport And Map Symbols" },
335 ["ugaritic"] = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
336 ["unifiedcanadianaboriginalsyllabics"] = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
337 ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF, description = "Unified Canadian Aboriginal Syllabics Extended" },
338 ["vai"] = { first = 0x0A500, last = 0x0A63F, description = "Vai" },
339 ["variationselectors"] = { first = 0x0FE00, last = 0x0FE0F, description = "Variation Selectors" },
340 ["variationselectorssupplement"] = { first = 0xE0100, last = 0xE01EF, description = "Variation Selectors Supplement" },
341 ["vedicextensions"] = { first = 0x01CD0, last = 0x01CFF, description = "Vedic Extensions" },
342 ["verticalforms"] = { first = 0x0FE10, last = 0x0FE1F, description = "Vertical Forms" },
343 ["yijinghexagramsymbols"] = { first = 0x04DC0, last = 0x04DFF, otf="yi", description = "Yijing Hexagram Symbols" },
344 ["yiradicals"] = { first = 0x0A490, last = 0x0A4CF, otf="yi", description = "Yi Radicals" },
345 ["yisyllables"] = { first = 0x0A000, last = 0x0A48F, otf="yi", description = "Yi Syllables" },
348 characters.blocks = blocks
350 function characters.blockrange(name)
351 local b = blocks[name]
353 return b.first, b.last
359 setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
360 return k and rawget(t,lower(gsub(k,"[^a-zA-Z]","")))
-- Table published as characters.otfscripts; it starts out empty and gets its
-- index function right below. Allocated through the file-level 'allocate'
-- alias, like the other tables in this file.
local otfscripts = allocate()

characters.otfscripts = otfscripts
366 setmetatableindex(otfscripts,function(t,unicode)
367 for k, v in next, blocks do
368 local first, last = v.first, v.last
369 if unicode >= first and unicode <= last then
370 local script = v.otf or "dflt"
377 -- pretty slow when we're here
382 function characters.getrange(name) -- used in font fallback definitions (name or range)
383 local range = blocks[name]
385 return range.first, range.last, range.description
387 name = gsub(name,'"',"0x") -- goodie: tex hex notation
388 local start, stop = match(name,"^(.-)[%-%:](.-)$")
389 if start and stop then
390 start, stop = tonumber(start,16) or tonumber(start), tonumber(stop,16) or tonumber(stop)
391 if start and stop then
392 return start, stop, nil
395 local slot = tonumber(name,16) or tonumber(name)
396 return slot, slot, nil
399 local categorytags = allocate {
400 lu = "Letter Uppercase",
401 ll = "Letter Lowercase",
402 lt = "Letter Titlecase",
403 lm = "Letter Modifier",
405 mn = "Mark Nonspacing",
406 mc = "Mark Spacing Combining",
407 me = "Mark Enclosing",
408 nd = "Number Decimal Digit",
409 nl = "Number Letter",
411 pc = "Punctuation Connector",
412 pd = "Punctuation Dash",
413 ps = "Punctuation Open",
414 pe = "Punctuation Close",
415 pi = "Punctuation Initial Quote",
416 pf = "Punctuation Final Quote",
417 po = "Punctuation Other",
419 sc = "Symbol Currency",
420 sk = "Symbol Modifier",
422 zs = "Separator Space",
423 zl = "Separator Line",
424 zp = "Separator Paragraph",
425 cc = "Other Control",
427 cs = "Other Surrogate",
428 co = "Other Private Use",
429 cn = "Other Not Assigned",
432 characters.categorytags = categorytags
434 --~ special : cf (softhyphen) zs (emspace)
435 --~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
437 local is_character = allocate ( tohash {
438 "lu","ll","lt","lm","lo",
442 "pc","pd","ps","pe","pi","pf","po",
446 local is_letter = allocate ( tohash {
447 "ll","lm","lo","lt","lu"
450 local is_command = allocate ( tohash {
454 local is_spacing = allocate ( tohash {
458 local is_mark = allocate ( tohash {
462 -- to be redone: store checked characters
464 characters.is_character = is_character
465 characters.is_letter = is_letter
466 characters.is_command = is_command
467 characters.is_spacing = is_spacing
468 characters.is_mark = is_mark
470 local mt = { -- yes or no ?
471 __index = function(t,k)
472 if type(k) == "number" then
473 local c = data[k].category
474 return c and rawget(t,c)
476 -- avoid auto conversion in data.characters lookups
-- Hand the shared lookup table 'mt' to each classifier table. The lookup is
-- meant to let a classifier be indexed by character number as well as by
-- category tag.
-- NOTE(review): 'mt' is a table that carries an __index field. If
-- setmetatableindex installs its second argument as the __index value (it is
-- given a plain function elsewhere in this file), numeric lookups would
-- consult 'mt' itself instead of calling mt.__index -- confirm against the
-- helper's implementation.
-- NOTE(review): characters.is_mark is not hooked up in the lines visible
-- here -- confirm whether that is intentional.
for _, classifier in ipairs {
    characters.is_character,
    characters.is_letter,
    characters.is_command,
    characters.is_spacing,
} do
    setmetatableindex(classifier, mt)
end
486 -- linebreak: todo: hash
488 -- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
489 -- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
495 characters.bidi = allocate {
497 lre = "Left-to-Right Embedding",
498 lro = "Left-to-Right Override",
500 al = "Right-to-Left Arabic",
501 rle = "Right-to-Left Embedding",
502 rlo = "Right-to-Left Override",
503 pdf = "Pop Directional Format",
504 en = "European Number",
505 es = "European Number Separator",
506 et = "European Number Terminator",
507 an = "Arabic Number",
508 cs = "Common Number Separator",
509 nsm = "Non-Spacing Mark",
510 bn = "Boundary Neutral",
511 b = "Paragraph Separator",
512 s = "Segment Separator",
514 on = "Other Neutrals",
518 <p>At this point we assume that the big data table is loaded. From this
519 table we derive a few more.</p>
522 if not characters.fallbacks then
524 characters.fallbacks = { } -- not than many
526 local fallbacks = characters.fallbacks
528 for k, d in next, data do
529 local specials = d.specials
530 if specials and specials[1] == "compat" and specials[2] == 0x0020 then
531 local s = specials[3]
542 storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
545 characters.directions = { }
547 setmetatableindex(characters.directions,function(t,k)
550 local v = d.direction
556 t[k] = false -- maybe 'l'
561 <p>Next comes a whole series of helper methods. These are (will be) part
562 of the official <l n='api'/>.</p>
565 -- we could make them virtual: characters.contextnames[n]
-- Field accessors on the character data table. Each one returns the requested
-- field of the entry for 'n', or an empty string when that field is not set.

function characters.contextname(n)
    local entry = data[n]
    return entry.contextname or ""
end

function characters.adobename(n)
    local entry = data[n]
    return entry.adobename or ""
end

function characters.description(n)
    local entry = data[n]
    return entry.description or ""
end
570 -------- characters.category (n) return data[n].category or "" end
572 function characters.category(n,verbose)
573 local c = data[n].category
577 return categorytags[c]
583 -- -- some day we will make a table .. not that many calls to utfchar
585 -- local utfchar = utf.char
586 -- local utfbyte = utf.byte
587 -- local utfbytes = { }
588 -- local utfchars = { }
590 -- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
591 -- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
593 local function toutfstring(s)
594 if type(s) == "table" then
595 return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
601 utf.tostring = toutfstring
-- Lazy table: maps a character to the category of its data entry and caches
-- the answer in the table itself, so the index function runs once per key.
local categories = allocate()

characters.categories = categories

setmetatableindex(categories, function(cache, key)
    if not key then
        return -- nil/false keys are neither resolved nor cached
    end
    local entry = data[key]
    -- without an entry or a category the key itself is stored
    local category = entry and entry.category or key
    cache[key] = category
    return category
end)
-- Lazy tables: each maps a character to the code found in one field of its
-- data entry (lccode, uccode, shcode or fscode) and caches the result.
local lccodes = allocate()  characters.lccodes = lccodes
local uccodes = allocate()  characters.uccodes = uccodes
local shcodes = allocate()  characters.shcodes = shcodes
local fscodes = allocate()  characters.fscodes = fscodes

do
    -- Builds the index function for one field of the data entries.
    local function codeindexer(field)
        return function(cache, key)
            if not key then
                return -- nil/false keys are neither resolved nor cached
            end
            local entry = data[key]
            -- no code stored: utfbyte(key) for a string key, otherwise the key itself
            local code = entry and entry[field] or (type(key) == "string" and utfbyte(key)) or key
            cache[key] = code
            return code
        end
    end

    setmetatableindex(lccodes, codeindexer("lccode"))
    setmetatableindex(uccodes, codeindexer("uccode"))
    setmetatableindex(shcodes, codeindexer("shcode"))
    setmetatableindex(fscodes, codeindexer("fscode"))
end
617 local lcchars = allocate() characters.lcchars = lcchars -- lazy table
618 local ucchars = allocate() characters.ucchars = ucchars -- lazy table
619 local shchars = allocate() characters.shchars = shchars -- lazy table
620 local fschars = allocate() characters.fschars = fschars -- lazy table
do
    -- Builds the lazy __index handler shared by the four character tables.
    -- The requested code field of the character entry is passed through
    -- toutfstring; when that gives no result a numeric key is converted with
    -- utfchar and any other key is returned as is. The result is cached.
    local function charindexer(field)
        return function(cache,key)
            if not key then
                return
            end
            local entry = data[key]
            local code = entry and entry[field]
            local str = code and toutfstring(code)
            if not str then
                str = (type(key) == "number" and utfchar(key)) or key
            end
            cache[key] = str
            return str
        end
    end

    setmetatableindex(lcchars, charindexer("lccode"))
    setmetatableindex(ucchars, charindexer("uccode"))
    setmetatableindex(shchars, charindexer("shcode"))
    setmetatableindex(fschars, charindexer("fscode"))
end
627 local decomposed = allocate() characters.decomposed = decomposed -- lazy table
628 local specials = allocate() characters.specials = specials -- lazy table
630 setmetatableindex(decomposed, function(t,u) -- either a table or false
633 local s = c and c.decomposed or false -- could fall back to specials
639 setmetatableindex(specials, function(t,u) -- either a table or false
642 local s = c and c.specials or false
648 local specialchars = allocate() characters.specialchars = specialchars -- lazy table
649 local descriptions = allocate() characters.descriptions = descriptions -- lazy table
651 setmetatableindex(specialchars, function(t,u)
654 local s = c and c.specials
656 local tt, ttn = { }, 0
660 if is_letter[c.category] then
662 tt[ttn] = utfchar(si)
669 if type(u) == "number" then
678 setmetatableindex(descriptions, function(t,k)
680 for u, c in next, data do
681 local d = c.description
688 local d = rawget(t,k)
695 function characters.unicodechar(asked)
696 local n = tonumber(asked)
699 elseif type(asked) == "string" then
700 return descriptions[asked] or descriptions[gsub(asked," ","")]
704 -- function characters.lower(str)
705 -- local new, n = { }, 0
706 -- for u in utfvalues(str) do
708 -- new[n] = lcchars[u]
710 -- return concat(new)
713 -- function characters.upper(str)
714 -- local new, n = { }, 0
715 -- for u in utfvalues(str) do
717 -- new[n] = ucchars[u]
719 -- return concat(new)
722 -- function characters.shaped(str)
723 -- local new, n = { }, 0
724 -- for u in utfvalues(str) do
726 -- new[n] = shchars[u]
728 -- return concat(new)
731 ----- tolower = Cs((utf8byte/lcchars)^0)
732 ----- toupper = Cs((utf8byte/ucchars)^0)
733 ----- toshape = Cs((utf8byte/shchars)^0)
735 local tolower = Cs((utf8char/lcchars)^0)
736 local toupper = Cs((utf8char/ucchars)^0)
737 local toshape = Cs((utf8char/shchars)^0)
739 patterns.tolower = tolower
740 patterns.toupper = toupper
741 patterns.toshape = toshape
-- Rewrites str by running the tolower pattern over it, which substitutes each
-- utf character via the lcchars table.
function characters.lower(str)
    return lpegmatch(tolower,str)
end

-- Rewrites str by running the toupper pattern over it, which substitutes each
-- utf character via the ucchars table.
function characters.upper(str)
    return lpegmatch(toupper,str)
end

-- Rewrites str by running the toshape pattern over it, which substitutes each
-- utf character via the shchars table.
function characters.shaped(str)
    return lpegmatch(toshape,str)
end
747 function characters.lettered(str,spacing)
748 local new, n = { }, 0
751 for u in utfvalues(str) do
752 local c = data[u].category
754 if done and n > 1 then
761 elseif spacing and is_spacing[c] then
766 for u in utfvalues(str) do
767 if is_letter[data[u].category] then
777 <p>Requesting lower and uppercase codes:</p>
-- obsolete: plain accessor for the lazy uccodes table
function characters.uccode(n)
    return uccodes[n]
end

-- obsolete: plain accessor for the lazy lccodes table
function characters.lccode(n)
    return lccodes[n]
end
783 function characters.safechar(n)
785 if c and c.contextname then
786 return "\\" .. c.contextname
792 function characters.shape(n)
793 local shcode = shcodes[n]
796 elseif type(shcode) == "table" then
797 return shcode[1], shcode[#shcode]
803 -- -- some day we might go this route, but it does not really save that much
804 -- -- so not now (we can generate a lot using mtx-unicode that operates on the
807 -- -- category cjkwd direction linebreak
809 -- -- adobename comment contextcommand contextname description fallback lccode
810 -- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
811 -- -- range shcode specials uccode uccodes unicodeslot
822 -- description="NULL",
824 -- unicodeslot=0x0000,
827 -- description="START OF HEADING",
829 -- unicodeslot=0x0001,
835 -- local chardata, groupdata = { }, { }
837 -- for group, gdata in next, data do
838 -- local common, vector = { __index = gdata.common }, gdata.vector
839 -- for character, cdata in next, vector do
840 -- chardata[character] = cdata
841 -- setmetatable(cdata,common)
843 -- groupdata[group] = gdata
846 --~ characters.data, characters.groups = chardata, groupdata
851 --~ description="<Plane 0x000F Private Use, First>",
853 --~ unicodeslot=0xF0000,
858 --~ description="<Plane 0x000F Private Use, Last>",
860 --~ unicodeslot=0xFFFFD,
865 --~ description="<Plane 0x0010 Private Use, First>",
867 --~ unicodeslot=0x100000,
872 --~ description="<Plane 0x0010 Private Use, Last>",
874 --~ unicodeslot=0x10FFFD,
877 if not characters.superscripts then
879 local superscripts = allocate() characters.superscripts = superscripts
880 local subscripts = allocate() characters.subscripts = subscripts
882 -- skipping U+02120 (service mark) U+02122 (trademark)
884 for k, v in next, data do
885 local specials = v.specials
887 local what = specials[1]
888 if what == "super" then
889 if #specials == 2 then
890 superscripts[k] = specials[2]
892 report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description)
894 elseif what == "sub" then
895 if #specials == 2 then
896 subscripts[k] = specials[2]
898 report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
904 -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
905 -- print(table.serialize(subscripts, "subscripts", { hexify = true }))
908 storage.register("characters/superscripts", superscripts, "characters.superscripts")
909 storage.register("characters/subscripts", subscripts, "characters.subscripts")
914 -- for the moment only a few
916 local tracedchars = utilities.strings.tracers
918 tracedchars[0x00] = "[signal]"
919 tracedchars[0x20] = "[space]"
921 -- the following code will move to char-tex.lua
925 if not tex or not context or not commands then return characters end
928 local texsetlccode = tex.setlccode
929 local texsetuccode = tex.setuccode
930 local texsetsfcode = tex.setsfcode
931 local texsetcatcode = tex.setcatcode
933 local contextsprint = context.sprint
934 local ctxcatcodes = catcodes.numbers.ctxcatcodes
937 <p>Instead of using a <l n='tex'/> file to define the named glyphs, we
938 use the table. After all, we have this information available anyway.</p>
941 function commands.makeactive(n,name) --
942 contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
943 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
946 function commands.utfchar(c,n)
948 -- contextsprint(c,charfromnumber(n))
949 contextsprint(c,utfchar(n))
951 -- contextsprint(charfromnumber(c))
952 contextsprint(utfchar(c))
956 function commands.safechar(n)
958 if c and c.contextname then
959 contextsprint("\\" .. c.contextname) -- context[c.contextname]()
961 contextsprint(utfchar(n))
965 tex.uprint = commands.utfchar
967 local forbidden = tohash { -- at least now
969 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D,
975 function characters.define(tobelettered, tobeactivated) -- catcodetables
977 if trace_defining then
978 report_defining("defining active character commands")
981 local activated, a = { }, 0
983 for u, chr in next, data do -- these will be commands
984 local fallback = chr.fallback
986 contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
990 local contextname = chr.contextname
992 local category = chr.category
993 if is_character[category] then
994 if chr.unicodeslot < 128 then
995 if is_letter[category] then
996 contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
998 contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
1001 contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
1003 elseif is_command[category] and not forbidden[u] then
1004 contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
1012 if tobelettered then -- shared
1013 local saved = tex.catcodetable
1014 for i=1,#tobelettered do
1015 tex.catcodetable = tobelettered[i]
1016 if trace_defining then
1017 report_defining("defining letters (global, shared)")
1019 for u, chr in next, data do
1020 if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
1023 local range = chr.range
1025 for i=1,range.first,range.last do
1030 texsetcatcode(0x200C,11) -- non-joiner
1031 texsetcatcode(0x200D,11) -- joiner
1033 tex.catcodetable = saved
-- Take the length only when a list was actually passed: applying # to nil
-- raises an error before the guard below would get a chance to skip this part.
local nofactivated = tobeactivated and #tobeactivated or 0
if nofactivated > 0 then
1038 for i=1,nofactivated do
1039 local u = activated[i]
1041 report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
1044 local saved = tex.catcodetable
1045 for i=1,#tobeactivated do
1046 local vector = tobeactivated[i]
1047 if trace_defining then
1048 report_defining("defining %a active characters in vector %a",nofactivated,vector)
1050 tex.catcodetable = vector
1051 for i=1,nofactivated do
1052 local u = activated[i]
1058 tex.catcodetable = saved
1064 <p>Setting the lccodes is also done in a loop over the data table.</p>
-- Space factor state shared by characters.setcodes and setuppersfcodes. Both
-- read and write the name sfstate, so the local has to carry that name;
-- otherwise they end up using an accidental global that starts out as nil
-- instead of "unset".
local sfstate = "unset" -- unset, traditional, normal
1069 function characters.setcodes()
1070 if trace_defining then
1071 report_defining("defining lc and uc codes")
1073 local traditional = sfstate == "traditional" or sfstate == "unset"
1074 for code, chr in next, data do
1075 local cc = chr.category
1076 if is_letter[cc] then
1077 local range = chr.range
1079 for i=range.first,range.last do
1080 texsetcatcode(i,11) -- letter
1081 texsetlccode(i,i,i) -- self self
1084 local lc, uc = chr.lccode, chr.uccode
1086 chr.lccode, lc = code, code
1087 elseif type(lc) == "table" then
1091 chr.uccode, uc = code, code
1092 elseif type(uc) == "table" then
1095 texsetcatcode(code,11) -- letter
1096 texsetlccode(code,lc,uc)
1097 if traditional and cc == "lu" then
1098 texsetsfcode(code,999)
1101 elseif is_mark[cc] then
1102 texsetlccode(code,code,code) -- for hyphenation
1106 sfstate = "traditional"
1110 -- If this is something that is not documentwide and used a lot, then we
1111 -- need a more clever approach (trivial but not now).
1113 local function setuppersfcodes(v,n)
1114 if sfstate ~= "unset" then
1115 report_defining("setting uppercase sf codes to %a",n)
1116 for code, chr in next, data do
1117 if chr.category == "lu" then
1118 texsetsfcode(code,n)
1125 directives.register("characters.spaceafteruppercase",function(v)
1126 if v == "traditional" then
1127 setuppersfcodes(v,999)
1128 elseif v == "normal" then
1129 setuppersfcodes(v,1000)
1135 characters.activeoffset = 0x10000 -- there will be remapped in that byte range
1137 function commands.remapentity(chr,slot)
1138 contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
1141 -- xml.entities = xml.entities or { }
1143 -- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
1145 -- function characters.setmkiventities()
1146 -- local entities = xml.entities
1147 -- entities.lt = "<"
1148 -- entities.amp = "&"
1149 -- entities.gt = ">"
1152 -- function characters.setmkiientities()
1153 -- local entities = xml.entities
1154 -- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
1155 -- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
1156 -- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))