beta 2013.05.22 19:28
[context.git] / tex / context / base / char-ini.lua
1 if not modules then modules = { } end modules ['char-ini'] = {
2     version   = 1.001,
3     comment   = "companion to char-ini.mkiv",
4     author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5     copyright = "PRAGMA ADE / ConTeXt Development Team",
6     license   = "see context related readme files"
7 }
8
9 -- todo: make two files, one for format generation, one for format use
10
11 -- we can remove the tag range starting at 0xE0000 (special applications)
12
13 local utfchar, utfbyte, utfvalues, ustring = utf.char, utf.byte, utf.values, utf.ustring
14 local concat, unpack, tohash = table.concat, table.unpack, table.tohash
15 local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-- string library shortcuts: exactly one value per name, so gmatch is string.gmatch
local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.gmatch
17 local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns
18
19 local utf8byte          = patterns.utf8byte
20 local utf8char          = patterns.utf8char
21
22 local allocate          = utilities.storage.allocate
23 local mark              = utilities.storage.mark
24
25 local setmetatableindex = table.setmetatableindex
26
-- Flag driven by the "characters.defining" tracker; the callback updates this
-- local instead of creating a global.
local trace_defining    = false  trackers.register("characters.defining", function(v) trace_defining = v end)
28
29 local report_defining   = logs.reporter("characters")
30
31 --[[ldx--
<p>This module implements some methods and creates additional data structures
from the big character table that we use for all kinds of purposes:
34 <type>char-def.lua</type>.</p>
35
36 <p>We assume that at this point <type>characters.data</type> is already
37 loaded!</p>
38 --ldx]]--
39
-- Make sure the global namespace exists and keep local handles to it and to
-- the character data table.
characters       = characters or { }
local characters = characters
local data       = characters.data

-- Without the character data nothing in this file can work, so we report
-- and quit right away.
if not data then
    report_defining("fatal error: 'char-def.lua' is not loaded")
    os.exit()
else
    mark(data) -- author's note: why does this fail
end
50
51 --[[ldx--
52 <p>This converts a string (if given) into a number.</p>
53 --ldx]]--
54
-- Matches a complete string of the form "0x<hex>" or "U+<hex>" (digits and
-- uppercase A-F only, up to the end of the string) and yields the value of
-- the hexadecimal part as a number.
local pattern do
    local prefix    = P("0x") + P("U+")
    local hexnumber = R("09","AF")^1 * P(-1)
    local function tovalue(s)
        return tonumber(s,16)
    end
    pattern = prefix * (hexnumber / tovalue)
end

patterns.chartonumber = pattern
58
--- Converts a character specification into a number.
-- A string of the form "0x<hex>" or "U+<hex>" yields the value of the hex
-- part; any other string yields what utfbyte returns for it (0 when that is
-- nil). A non-string argument is passed through, with nil or false becoming 0.
local function chartonumber(k)
    if type(k) ~= "string" then
        return k or 0
    end
    -- the pattern already delivers a number, so it must not go through utfbyte
    local u = lpegmatch(pattern,k)
    if u then
        return u
    end
    return utfbyte(k) or 0
end
71
--- Converts a number or a "0x<hex>" / "U+<hex>" string into a utf string.
-- A number is handed to utfchar (an empty string is returned when that gives
-- nothing). Any other value is matched against the hex pattern; when it does
-- not match, the value is returned as it came in.
local function charfromnumber(k)
    if type(k) ~= "number" then
        local code = lpegmatch(pattern,k)
        if not code then
            return k
        end
        return utfchar(code)
    end
    return utfchar(k) or ""
end
84
85 --~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
86
-- public names for the two converters
characters.fromnumber = charfromnumber
characters.tonumber   = chartonumber

-- record with only a description, used for private slots
local private = { description = "PRIVATE SLOT" }

-- storage for ranges, also reachable as characters.ranges
local ranges = allocate()

characters.ranges = ranges
96
-- Fallback lookup for missing keys in the character data table.
--
-- A string key is first turned into a number (via the hex pattern or utfbyte);
-- when that fails the private record is returned, and when the number is
-- already present its entry is returned. Numbers below 0xF0000 are then
-- compared with the registered ranges: the first range that contains the
-- number and has an extender produces the entry, which is stored in the table
-- and returned. In all other cases the private record is returned.
setmetatableindex(data, function(t,k)
    local tk = type(k)
    if tk == "string" then
        k = lpegmatch(pattern,k) or utfbyte(k)
        if not k then
            return private
        end
        local v = rawget(t,k)
        if v then
            return v
        end
        tk = "number" -- fall through to range
    end
    if tk == "number" and k < 0xF0000 then
        for r=1,#ranges do
            local rr = ranges[r]
            if k >= rr.first and k <= rr.last then
                local extender = rr.extender
                if extender then
                    -- only the number is passed on: no other value is in scope here
                    local v = extender(k)
                    t[k] = v
                    return v
                end
            end
        end
    end
    return private -- handy for when we loop over characters in fonts and check for a property
end)
127
128 local blocks = allocate {
129     ["aegeannumbers"]                              = { first = 0x10100, last = 0x1013F,             description = "Aegean Numbers" },
130     ["alchemicalsymbols"]                          = { first = 0x1F700, last = 0x1F77F,             description = "Alchemical Symbols" },
131     ["alphabeticpresentationforms"]                = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" },
132     ["ancientgreekmusicalnotation"]                = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" },
133     ["ancientgreeknumbers"]                        = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" },
134     ["ancientsymbols"]                             = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" },
135     ["arabic"]                                     = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" },
136     ["arabicextendeda"]                            = { first = 0x008A0, last = 0x008FF,             description = "Arabic Extended-A" },
137     ["arabicmathematicalalphabeticsymbols"]        = { first = 0x1EE00, last = 0x1EEFF,             description = "Arabic Mathematical Alphabetic Symbols" },
138     ["arabicpresentationformsa"]                   = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" },
139     ["arabicpresentationformsb"]                   = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" },
140     ["arabicsupplement"]                           = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" },
141     ["armenian"]                                   = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" },
142     ["arrows"]                                     = { first = 0x02190, last = 0x021FF,             description = "Arrows" },
143     ["avestan"]                                    = { first = 0x10B00, last = 0x10B3F,             description = "Avestan" },
144     ["balinese"]                                   = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" },
145     ["bamum"]                                      = { first = 0x0A6A0, last = 0x0A6FF,             description = "Bamum" },
146     ["bamumsupplement"]                            = { first = 0x16800, last = 0x16A3F,             description = "Bamum Supplement" },
147     ["basiclatin"]                                 = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
148     ["batak"]                                      = { first = 0x01BC0, last = 0x01BFF,             description = "Batak" },
149     ["bengali"]                                    = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
150     ["blockelements"]                              = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
151     ["bopomofo"]                                   = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
152     ["bopomofoextended"]                           = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
153     ["boxdrawing"]                                 = { first = 0x02500, last = 0x0257F,             description = "Box Drawing" },
154     ["brahmi"]                                     = { first = 0x11000, last = 0x1107F,             description = "Brahmi" },
155     ["braillepatterns"]                            = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" },
156     ["buginese"]                                   = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
157     ["buhid"]                                      = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
158     ["byzantinemusicalsymbols"]                    = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
159     ["commonindicnumberforms"]                     = { first = 0x0A830, last = 0x0A83F,             description = "Common Indic Number Forms" },
160     ["carian"]                                     = { first = 0x102A0, last = 0x102DF,             description = "Carian" },
161     ["cham"]                                       = { first = 0x0AA00, last = 0x0AA5F,             description = "Cham" },
162     ["cherokee"]                                   = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
163     ["cjkcompatibility"]                           = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
164     ["cjkcompatibilityforms"]                      = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" },
165     ["cjkcompatibilityideographs"]                 = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" },
166     ["cjkcompatibilityideographssupplement"]       = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" },
167     ["cjkradicalssupplement"]                      = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" },
168     ["cjkstrokes"]                                 = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" },
169     ["cjksymbolsandpunctuation"]                   = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" },
170     ["cjkunifiedideographs"]                       = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs" },
171     ["cjkunifiedideographsextensiona"]             = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
172     ["cjkunifiedideographsextensionb"]             = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
173     ["combiningdiacriticalmarks"]                  = { first = 0x00300, last = 0x0036F,             description = "Combining Diacritical Marks" },
174     ["combiningdiacriticalmarksforsymbols"]        = { first = 0x020D0, last = 0x020FF,             description = "Combining Diacritical Marks for Symbols" },
175     ["combiningdiacriticalmarkssupplement"]        = { first = 0x01DC0, last = 0x01DFF,             description = "Combining Diacritical Marks Supplement" },
176     ["combininghalfmarks"]                         = { first = 0x0FE20, last = 0x0FE2F,             description = "Combining Half Marks" },
177     ["controlpictures"]                            = { first = 0x02400, last = 0x0243F,             description = "Control Pictures" },
178     ["coptic"]                                     = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
179     ["countingrodnumerals"]                        = { first = 0x1D360, last = 0x1D37F,             description = "Counting Rod Numerals" },
180     ["cuneiform"]                                  = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
181     ["cuneiformnumbersandpunctuation"]             = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
182     ["currencysymbols"]                            = { first = 0x020A0, last = 0x020CF,             description = "Currency Symbols" },
183     ["cypriotsyllabary"]                           = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" },
184     ["cyrillic"]                                   = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
185     ["cyrillicextendeda"]                          = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
186     ["cyrillicextendedb"]                          = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
187     ["cyrillicsupplement"]                         = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
188     ["deseret"]                                    = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
189     ["devanagari"]                                 = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
190     ["devanagariextended"]                         = { first = 0x0A8E0, last = 0x0A8FF,             description = "Devanagari Extended" },
191     ["dingbats"]                                   = { first = 0x02700, last = 0x027BF,             description = "Dingbats" },
192     ["dominotiles"]                                = { first = 0x1F030, last = 0x1F09F,             description = "Domino Tiles" },
193     ["egyptianhieroglyphs"]                        = { first = 0x13000, last = 0x1342F,             description = "Egyptian Hieroglyphs" },
194     ["emoticons"]                                  = { first = 0x1F600, last = 0x1F64F,             description = "Emoticons" },
195     ["enclosedalphanumericsupplement"]             = { first = 0x1F100, last = 0x1F1FF,             description = "Enclosed Alphanumeric Supplement" },
196     ["enclosedalphanumerics"]                      = { first = 0x02460, last = 0x024FF,             description = "Enclosed Alphanumerics" },
197     ["enclosedcjklettersandmonths"]                = { first = 0x03200, last = 0x032FF,             description = "Enclosed CJK Letters and Months" },
198     ["enclosedideographicsupplement"]              = { first = 0x1F200, last = 0x1F2FF,             description = "Enclosed Ideographic Supplement" },
199     ["ethiopic"]                                   = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
200     ["ethiopicextended"]                           = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
201     ["ethiopicextendeda"]                          = { first = 0x0AB00, last = 0x0AB2F,             description = "Ethiopic Extended-A" },
202     ["ethiopicsupplement"]                         = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
203     ["generalpunctuation"]                         = { first = 0x02000, last = 0x0206F,             description = "General Punctuation" },
204     ["geometricshapes"]                            = { first = 0x025A0, last = 0x025FF,             description = "Geometric Shapes" },
205     ["georgian"]                                   = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
206     ["georgiansupplement"]                         = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
207     ["glagolitic"]                                 = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
208     ["gothic"]                                     = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
209     ["greekandcoptic"]                             = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
210     ["greekextended"]                              = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
211     ["gujarati"]                                   = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
212     ["gurmukhi"]                                   = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
213     ["halfwidthandfullwidthforms"]                 = { first = 0x0FF00, last = 0x0FFEF,             description = "Halfwidth and Fullwidth Forms" },
214     ["hangulcompatibilityjamo"]                    = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
215     ["hanguljamo"]                                 = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
216     ["hanguljamoextendeda"]                        = { first = 0x0A960, last = 0x0A97F,             description = "Hangul Jamo Extended-A" },
217     ["hanguljamoextendedb"]                        = { first = 0x0D7B0, last = 0x0D7FF,             description = "Hangul Jamo Extended-B" },
218     ["hangulsyllables"]                            = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
219     ["hanunoo"]                                    = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
220     ["hebrew"]                                     = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
221     ["highprivateusesurrogates"]                   = { first = 0x0DB80, last = 0x0DBFF,             description = "High Private Use Surrogates" },
222     ["highsurrogates"]                             = { first = 0x0D800, last = 0x0DB7F,             description = "High Surrogates" },
223     ["hiragana"]                                   = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
224     ["ideographicdescriptioncharacters"]           = { first = 0x02FF0, last = 0x02FFF,             description = "Ideographic Description Characters" },
225     ["imperialaramaic"]                            = { first = 0x10840, last = 0x1085F,             description = "Imperial Aramaic" },
226     ["inscriptionalpahlavi"]                       = { first = 0x10B60, last = 0x10B7F,             description = "Inscriptional Pahlavi" },
227     ["inscriptionalparthian"]                      = { first = 0x10B40, last = 0x10B5F,             description = "Inscriptional Parthian" },
228     ["ipaextensions"]                              = { first = 0x00250, last = 0x002AF,             description = "IPA Extensions" },
229     ["javanese"]                                   = { first = 0x0A980, last = 0x0A9DF,             description = "Javanese" },
230     ["kaithi"]                                     = { first = 0x11080, last = 0x110CF,             description = "Kaithi" },
231     ["kanasupplement"]                             = { first = 0x1B000, last = 0x1B0FF,             description = "Kana Supplement" },
232     ["kanbun"]                                     = { first = 0x03190, last = 0x0319F,             description = "Kanbun" },
233     ["kangxiradicals"]                             = { first = 0x02F00, last = 0x02FDF,             description = "Kangxi Radicals" },
234     ["kannada"]                                    = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
235     ["katakana"]                                   = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
236     ["katakanaphoneticextensions"]                 = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
237     ["kayahli"]                                    = { first = 0x0A900, last = 0x0A92F,             description = "Kayah Li" },
238     ["kharoshthi"]                                 = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
239     ["khmer"]                                      = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
240     ["khmersymbols"]                               = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
241     ["lao"]                                        = { first = 0x00E80, last = 0x00EFF, otf="lao",  description = "Lao" },
242     ["latinextendeda"]                             = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
243     ["latinextendedadditional"]                    = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
244     ["latinextendedb"]                             = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
245     ["latinextendedc"]                             = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
246     ["latinextendedd"]                             = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
247     ["latinsupplement"]                            = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
248     ["lepcha"]                                     = { first = 0x01C00, last = 0x01C4F,             description = "Lepcha" },
249     ["letterlikesymbols"]                          = { first = 0x02100, last = 0x0214F,             description = "Letterlike Symbols" },
250     ["limbu"]                                      = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
251     ["linearbideograms"]                           = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
252     ["linearbsyllabary"]                           = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
253     ["lisu"]                                       = { first = 0x0A4D0, last = 0x0A4FF,             description = "Lisu" },
254     ["lowsurrogates"]                              = { first = 0x0DC00, last = 0x0DFFF,             description = "Low Surrogates" },
255     ["lycian"]                                     = { first = 0x10280, last = 0x1029F,             description = "Lycian" },
256     ["lydian"]                                     = { first = 0x10920, last = 0x1093F,             description = "Lydian" },
257     ["mahjongtiles"]                               = { first = 0x1F000, last = 0x1F02F,             description = "Mahjong Tiles" },
258     ["malayalam"]                                  = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
259     ["mandiac"]                                    = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
260     ["mathematicalalphanumericsymbols"]            = { first = 0x1D400, last = 0x1D7FF,             description = "Mathematical Alphanumeric Symbols" },
261     ["mathematicaloperators"]                      = { first = 0x02200, last = 0x022FF,             description = "Mathematical Operators" },
262     ["meeteimayek"]                                = { first = 0x0ABC0, last = 0x0ABFF,             description = "Meetei Mayek" },
263     ["meeteimayekextensions"]                      = { first = 0x0AAE0, last = 0x0AAFF,             description = "Meetei Mayek Extensions" },
264     ["meroiticcursive"]                            = { first = 0x109A0, last = 0x109FF,             description = "Meroitic Cursive" },
265     ["meroitichieroglyphs"]                        = { first = 0x10980, last = 0x1099F,             description = "Meroitic Hieroglyphs" },
266     ["miao"]                                       = { first = 0x16F00, last = 0x16F9F,             description = "Miao" },
267     ["miscellaneousmathematicalsymbolsa"]          = { first = 0x027C0, last = 0x027EF,             description = "Miscellaneous Mathematical Symbols-A" },
268     ["miscellaneousmathematicalsymbolsb"]          = { first = 0x02980, last = 0x029FF,             description = "Miscellaneous Mathematical Symbols-B" },
269     ["miscellaneoussymbols"]                       = { first = 0x02600, last = 0x026FF,             description = "Miscellaneous Symbols" },
270     ["miscellaneoussymbolsandarrows"]              = { first = 0x02B00, last = 0x02BFF,             description = "Miscellaneous Symbols and Arrows" },
271     ["miscellaneoussymbolsandpictographs"]         = { first = 0x1F300, last = 0x1F5FF,             description = "Miscellaneous Symbols And Pictographs" },
272     ["miscellaneoustechnical"]                     = { first = 0x02300, last = 0x023FF,             description = "Miscellaneous Technical" },
273     ["modifiertoneletters"]                        = { first = 0x0A700, last = 0x0A71F,             description = "Modifier Tone Letters" },
274     ["mongolian"]                                  = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
275     ["musicalsymbols"]                             = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
276     ["myanmar"]                                    = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
277     ["myanmarextendeda"]                           = { first = 0x0AA60, last = 0x0AA7F,             description = "Myanmar Extended-A" },
278     ["newtailue"]                                  = { first = 0x01980, last = 0x019DF,             description = "New Tai Lue" },
279     ["nko"]                                        = { first = 0x007C0, last = 0x007FF, otf="nko",  description = "NKo" },
280     ["numberforms"]                                = { first = 0x02150, last = 0x0218F,             description = "Number Forms" },
281     ["ogham"]                                      = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
282     ["olchiki"]                                    = { first = 0x01C50, last = 0x01C7F,             description = "Ol Chiki" },
283     ["olditalic"]                                  = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
284     ["oldpersian"]                                 = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
285     ["oldsoutharabian"]                            = { first = 0x10A60, last = 0x10A7F,             description = "Old South Arabian" },
286     ["odlturkic"]                                  = { first = 0x10C00, last = 0x10C4F,             description = "Old Turkic" },
287     ["opticalcharacterrecognition"]                = { first = 0x02440, last = 0x0245F,             description = "Optical Character Recognition" },
288     ["oriya"]                                      = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
289     ["osmanya"]                                    = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
290     ["phagspa"]                                    = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
291     ["phaistosdisc"]                               = { first = 0x101D0, last = 0x101FF,             description = "Phaistos Disc" },
292     ["phoenician"]                                 = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
293     ["phoneticextensions"]                         = { first = 0x01D00, last = 0x01D7F,             description = "Phonetic Extensions" },
294     ["phoneticextensionssupplement"]               = { first = 0x01D80, last = 0x01DBF,             description = "Phonetic Extensions Supplement" },
295     ["playingcards"]                               = { first = 0x1F0A0, last = 0x1F0FF,             description = "Playing Cards" },
296     ["privateusearea"]                             = { first = 0x0E000, last = 0x0F8FF,             description = "Private Use Area" },
297     ["rejang"]                                     = { first = 0x0A930, last = 0x0A95F,             description = "Rejang" },
298     ["ruminumeralsymbols"]                         = { first = 0x10E60, last = 0x10E7F,             description = "Rumi Numeral Symbols" },
299     ["runic"]                                      = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
300     ["samaritan"]                                  = { first = 0x00800, last = 0x0083F,             description = "Samaritan" },
301     ["saurashtra"]                                 = { first = 0x0A880, last = 0x0A8DF,             description = "Saurashtra" },
302     ["sharada"]                                    = { first = 0x11180, last = 0x111DF,             description = "Sharada" },
303     ["shavian"]                                    = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
304     ["sinhala"]                                    = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
305     ["smallformvariants"]                          = { first = 0x0FE50, last = 0x0FE6F,             description = "Small Form Variants" },
306     ["sorasompeng"]                                = { first = 0x110D0, last = 0x110FF,             description = "Sora Sompeng" },
307     ["spacingmodifierletters"]                     = { first = 0x002B0, last = 0x002FF,             description = "Spacing Modifier Letters" },
308     ["specials"]                                   = { first = 0x0FFF0, last = 0x0FFFF,             description = "Specials" },
309     ["sundanese"]                                  = { first = 0x01B80, last = 0x01BBF,             description = "Sundanese" },
310     ["sundanesesupplement"]                        = { first = 0x01CC0, last = 0x01CCF,             description = "Sundanese Supplement" },
311     ["superscriptsandsubscripts"]                  = { first = 0x02070, last = 0x0209F,             description = "Superscripts and Subscripts" },
312     ["supplementalarrowsa"]                        = { first = 0x027F0, last = 0x027FF,             description = "Supplemental Arrows-A" },
313     ["supplementalarrowsb"]                        = { first = 0x02900, last = 0x0297F,             description = "Supplemental Arrows-B" },
314     ["supplementalmathematicaloperators"]          = { first = 0x02A00, last = 0x02AFF,             description = "Supplemental Mathematical Operators" },
315     ["supplementalpunctuation"]                    = { first = 0x02E00, last = 0x02E7F,             description = "Supplemental Punctuation" },
316     ["supplementaryprivateuseareaa"]               = { first = 0xF0000, last = 0xFFFFF,             description = "Supplementary Private Use Area-A" },
317     ["supplementaryprivateuseareab"]               = { first = 0x100000,last = 0x10FFFF,            description = "Supplementary Private Use Area-B" },
318     ["sylotinagri"]                                = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
319     ["syriac"]                                     = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
320     ["tagalog"]                                    = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
321     ["tagbanwa"]                                   = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
322     ["tags"]                                       = { first = 0xE0000, last = 0xE007F,             description = "Tags" },
323     ["taile"]                                      = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
324     ["taitham"]                                    = { first = 0x01A20, last = 0x01AAF,             description = "Tai Tham" },
325     ["taiviet"]                                    = { first = 0x0AA80, last = 0x0AADF,             description = "Tai Viet" },
326     ["taixuanjingsymbols"]                         = { first = 0x1D300, last = 0x1D35F,             description = "Tai Xuan Jing Symbols" },
327     ["takri"]                                      = { first = 0x11680, last = 0x116CF,             description = "Takri" },
328     ["tamil"]                                      = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
329     ["telugu"]                                     = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
330     ["thaana"]                                     = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
331     ["thai"]                                       = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
332     ["tibetan"]                                    = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
333     ["tifinagh"]                                   = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
334     ["transportandmapsymbols"]                     = { first = 0x1F680, last = 0x1F6FF,             description = "Transport And Map Symbols" },
335     ["ugaritic"]                                   = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
336     ["unifiedcanadianaboriginalsyllabics"]         = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
337     ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF,             description = "Unified Canadian Aboriginal Syllabics Extended" },
338     ["vai"]                                        = { first = 0x0A500, last = 0x0A63F,             description = "Vai" },
339     ["variationselectors"]                         = { first = 0x0FE00, last = 0x0FE0F,             description = "Variation Selectors" },
340     ["variationselectorssupplement"]               = { first = 0xE0100, last = 0xE01EF,             description = "Variation Selectors Supplement" },
341     ["vedicextensions"]                            = { first = 0x01CD0, last = 0x01CFF,             description = "Vedic Extensions" },
342     ["verticalforms"]                              = { first = 0x0FE10, last = 0x0FE1F,             description = "Vertical Forms" },
343     ["yijinghexagramsymbols"]                      = { first = 0x04DC0, last = 0x04DFF, otf="yi",   description = "Yijing Hexagram Symbols" },
344     ["yiradicals"]                                 = { first = 0x0A490, last = 0x0A4CF, otf="yi",   description = "Yi Radicals" },
345     ["yisyllables"]                                = { first = 0x0A000, last = 0x0A48F, otf="yi",   description = "Yi Syllables" },
346 }
347
characters.blocks = blocks

-- Return the first and last code point of the unicode block with the given
-- name. When the block is not known the pair 0, 0 is returned instead.
function characters.blockrange(name)
    local block = blocks[name]
    if not block then
        return 0, 0
    end
    return block.first, block.last
end
358
-- Fallback for names that are not a literal key: everything except ascii
-- letters is stripped and the rest is lowercased, so "Tai Le" ends up at the
-- "taile" entry. We could use an intermediate table if this is called often.
setmetatableindex(blocks, function(t,name)
    if not name then
        return name
    end
    local stripped = gsub(name,"[^a-zA-Z]","")
    return rawget(t,lower(stripped))
end)
362
local otfscripts      = utilities.storage.allocate()
characters.otfscripts = otfscripts

-- Lazy mapping from a code point to the otf script tag of the block it lives
-- in. On a miss the whole block is filled in one go, so later lookups in the
-- same block are plain table accesses. Blocks without an otf tag, as well as
-- code points outside any block, resolve to "dflt".
setmetatableindex(otfscripts,function(t,unicode)
    for _, block in next, blocks do
        local first = block.first
        local last  = block.last
        if first <= unicode and last >= unicode then
            local script = block.otf or "dflt"
            for slot=first,last do
                t[slot] = script
            end
            return script
        end
    end
    -- pretty slow when we're here (all blocks were checked)
    local script = "dflt"
    t[unicode] = script
    return script
end)
381
-- Resolve a block name or a textual range into: first, last, description.
-- This is used in font fallback definitions (name or range). Accepted are:
--
--   * a block name (looked up in 'blocks'), which also gives a description
--   * a range "first-last" or "first:last"
--   * a single slot
--
-- Numbers are tried as hexadecimal first and then as a regular number. A
-- double quote is replaced by "0x" (tex hex notation). For a range or a single
-- slot the description is nil.
function characters.getrange(name)
    local block = blocks[name]
    if block then
        return block.first, block.last, block.description
    end
    local spec = gsub(name,'"',"0x") -- goodie: tex hex notation
    local from, upto = match(spec,"^(.-)[%-%:](.-)$")
    if from and upto then
        local first = tonumber(from,16) or tonumber(from)
        local last  = tonumber(upto,16) or tonumber(upto)
        if first and last then
            return first, last, nil
        end
    end
    local slot = tonumber(spec,16) or tonumber(spec)
    return slot, slot, nil
end
398
-- Verbose names for the two letter (lowercase) category tags that are found
-- in the 'category' field of the character data.
local categorytags = allocate {
    -- letters
    lu = "Letter Uppercase",
    ll = "Letter Lowercase",
    lt = "Letter Titlecase",
    lm = "Letter Modifier",
    lo = "Letter Other",
    -- marks
    mn = "Mark Nonspacing",
    mc = "Mark Spacing Combining",
    me = "Mark Enclosing",
    -- numbers
    nd = "Number Decimal Digit",
    nl = "Number Letter",
    no = "Number Other",
    -- punctuation
    pc = "Punctuation Connector",
    pd = "Punctuation Dash",
    ps = "Punctuation Open",
    pe = "Punctuation Close",
    pi = "Punctuation Initial Quote",
    pf = "Punctuation Final Quote",
    po = "Punctuation Other",
    -- symbols
    sm = "Symbol Math",
    sc = "Symbol Currency",
    sk = "Symbol Modifier",
    so = "Symbol Other",
    -- separators
    zs = "Separator Space",
    zl = "Separator Line",
    zp = "Separator Paragraph",
    -- others
    cc = "Other Control",
    cf = "Other Format",
    cs = "Other Surrogate",
    co = "Other Private Use",
    cn = "Other Not Assigned",
}

characters.categorytags = categorytags
433
434 --~ special   : cf (softhyphen) zs (emspace)
435 --~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
436
-- Sets of category tags, stored as hashes so that membership is one lookup.

-- categories that are handled as characters
local is_character = allocate ( tohash {
    "lu", "ll", "lt", "lm", "lo",             -- letters
    "mn",                                     -- nonspacing marks
    "nd", "nl", "no",                         -- numbers
    "pc", "pd", "ps", "pe", "pi", "pf", "po", -- punctuation
    "sm", "sc", "sk", "so",                   -- symbols
} )

-- categories that are letters
local is_letter = allocate ( tohash {
    "lu", "ll", "lt", "lm", "lo",
} )

-- categories that become commands: cf (softhyphen) zs (emspace)
local is_command = allocate ( tohash {
    "cf", "zs",
} )

-- categories that are separators
local is_spacing = allocate ( tohash {
    "zs", "zl", "zp",
} )
457
-- Categories that are marks. The tag for spacing combining marks is "mc" (see
-- the category tags table in this file); the original list had "ms" instead,
-- which is not a category tag, so spacing marks were never seen as marks. The
-- "ms" entry is kept only so that code that tests for it keeps working.
local is_mark = allocate ( tohash {
    "mn", "mc", "ms",
} )
461
462 -- to be redone: store checked characters
463
characters.is_character = is_character
characters.is_letter    = is_letter
characters.is_command   = is_command
characters.is_spacing   = is_spacing
characters.is_mark      = is_mark

-- Index fallback for the category sets: when a set is accessed with a number,
-- that number is taken as a code point and the category of that character is
-- looked up in the set, so is_letter[0x41] works as well as is_letter["lu"].
--
-- The original passed a table { __index = function } to setmetatableindex,
-- which expects the index function itself, so the function was never called
-- and numeric lookups always gave nil. Here the function is passed directly.
-- An unknown code point gives a false value instead of an error.
local function categoryindex(t,k)
    if type(k) == "number" then
        local chr = data[k]
        local category = chr and chr.category
        return category and rawget(t,category)
    else
        -- avoid auto conversion in data.characters lookups
    end
end

setmetatableindex(characters.is_character, categoryindex)
setmetatableindex(characters.is_letter,    categoryindex)
setmetatableindex(characters.is_command,   categoryindex)
setmetatableindex(characters.is_spacing,   categoryindex)
485
486 -- linebreak: todo: hash
487 --
488 -- normative   : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
489 -- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
490
491 -- east asian width:
492 --
493 -- N A H W F Na
494
-- Verbose names for the (lowercase) bidirectional class tags.
characters.bidi = allocate {
    -- strong
    l   = "Left-to-Right",
    r   = "Right-to-Left",
    al  = "Right-to-Left Arabic",
    -- embeddings and overrides
    lre = "Left-to-Right Embedding",
    lro = "Left-to-Right Override",
    rle = "Right-to-Left Embedding",
    rlo = "Right-to-Left Override",
    pdf = "Pop Directional Format",
    -- numbers
    en  = "European Number",
    es  = "European Number Separator",
    et  = "European Number Terminator",
    an  = "Arabic Number",
    cs  = "Common Number Separator",
    -- marks, separators and neutrals
    nsm = "Non-Spacing Mark",
    bn  = "Boundary Neutral",
    b   = "Paragraph Separator",
    s   = "Segment Separator",
    ws  = "Whitespace",
    on  = "Other Neutrals",
}
516
517 --[[ldx--
518 <p>At this point we assume that the big data table is loaded. From this
519 table we derive a few more.</p>
520 --ldx]]--
521
-- The fallbacks table is only built when it is not there yet. Each character
-- whose specials are { "compat", 0x0020, s } gets linked to s and the other
-- way around.

if not characters.fallbacks then

    local fallbacks = { } -- not that many

    characters.fallbacks = fallbacks

    for unicode, chr in next, data do
        local specials = chr.specials
        if specials then
            local tag, base, other = specials[1], specials[2], specials[3]
            if tag == "compat" and base == 0x0020 and other then
                fallbacks[unicode] = other
                fallbacks[other]   = unicode
            end
        end
    end

end

if storage then
    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
end
544
characters.directions  = { }

-- Lazy table that maps a code point onto its direction tag. The result is
-- cached; when there is no character or no direction, false is cached and
-- returned. (The original returned an out-of-scope 'v' here, i.e. a global
-- that is normally nil, so the first and later lookups could disagree.)
setmetatableindex(characters.directions,function(t,k)
    local d = data[k]
    local v = d and d.direction
    if v then
        t[k] = v
        return v
    end
    t[k] = false -- maybe 'l'
    return false
end)
559
560 --[[ldx--
561 <p>Next comes a whole series of helper methods. These are (will be) part
562 of the official <l n='api'/>.</p>
563 --ldx]]--
564
565 -- we could make them virtual: characters.contextnames[n]
566
-- Simple accessors: they return the given field of character n or an empty
-- string when the field is missing. A character that is not in the data also
-- gives an empty string (same guard as characters.safechar uses) instead of
-- an error on indexing a non-table.

function characters.contextname(n) local c = data[n] return c and c.contextname or "" end
function characters.adobename  (n) local c = data[n] return c and c.adobename   or "" end
function characters.description(n) local c = data[n] return c and c.description or "" end
570 -------- characters.category   (n) return data[n].category    or "" end
571
-- Return the category tag of character n, or with 'verbose' set the long name
-- from the category tags table. A character without category (or one that is
-- not in the data at all) gives an empty string.
function characters.category(n,verbose)
    local chr = data[n]
    local c = chr and chr.category
    if not c then
        return ""
    elseif verbose then
        return categorytags[c]
    else
        return c
    end
end
582
583 -- -- some day we will make a table .. not that many calls to utfchar
584 --
585 -- local utfchar = utf.char
586 -- local utfbyte = utf.byte
587 -- local utfbytes = { }
588 -- local utfchars = { }
589 --
590 -- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
591 -- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
592
-- Convert a code point, or a table with code points, into an utf string.
local function toutfstring(s)
    if type(s) ~= "table" then
        return utfchar(s)
    end
    return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
end

utf.tostring = toutfstring
602
local categories = allocate()  characters.categories = categories -- lazy table

-- Resolve and cache the category of u; when there is no character or no
-- category, the key itself is stored and returned. A false or nil key is
-- ignored.
setmetatableindex(categories, function(t,u)
    if not u then
        return
    end
    local chr = data[u]
    local category = chr and chr.category or u
    t[u] = category
    return category
end)
606
local lccodes = allocate()  characters.lccodes = lccodes -- lazy table
local uccodes = allocate()  characters.uccodes = uccodes -- lazy table
local shcodes = allocate()  characters.shcodes = shcodes -- lazy table
local fscodes = allocate()  characters.fscodes = fscodes -- lazy table

-- Make an index function that resolves and caches the given code field of a
-- character. When the field is not there the fallback is: the code point of
-- the key when it is a string, else the key itself. A false or nil key is
-- ignored.
local function codeindex(field)
    return function(t,u)
        if not u then
            return
        end
        local chr  = data[u]
        local code = chr and chr[field]
        if not code then
            code = (type(u) == "string" and utfbyte(u)) or u
        end
        t[u] = code
        return code
    end
end

setmetatableindex(lccodes, codeindex("lccode"))
setmetatableindex(uccodes, codeindex("uccode"))
setmetatableindex(shcodes, codeindex("shcode"))
setmetatableindex(fscodes, codeindex("fscode"))
616
local lcchars = allocate()  characters.lcchars = lcchars -- lazy table
local ucchars = allocate()  characters.ucchars = ucchars -- lazy table
local shchars = allocate()  characters.shchars = shchars -- lazy table
local fschars = allocate()  characters.fschars = fschars -- lazy table

-- Make an index function that resolves and caches the given code field of a
-- character as utf string. When the field is not there the fallback is: the
-- utf character of the key when it is a number, else the key itself. A false
-- or nil key is ignored.
local function charindex(field)
    return function(t,u)
        if not u then
            return
        end
        local chr  = data[u]
        local code = chr and chr[field]
        local str  = code and toutfstring(code)
        if not str then
            str = (type(u) == "number" and utfchar(u)) or u
        end
        t[u] = str
        return str
    end
end

setmetatableindex(lcchars, charindex("lccode"))
setmetatableindex(ucchars, charindex("uccode"))
setmetatableindex(shchars, charindex("shcode"))
setmetatableindex(fschars, charindex("fscode"))
626
local decomposed = allocate()  characters.decomposed = decomposed   -- lazy table
local specials   = allocate()  characters.specials   = specials     -- lazy table

-- Both tables cache the field with the same name from the character data, or
-- false when the character or the field is not there. A false or nil key is
-- ignored.

setmetatableindex(decomposed, function(t,u) -- either a table or false
    if not u then
        return
    end
    local chr   = data[u]
    local found = chr and chr.decomposed or false -- could fall back to specials
    t[u] = found
    return found
end)

setmetatableindex(specials, function(t,u) -- either a table or false
    if not u then
        return
    end
    local chr   = data[u]
    local found = chr and chr.specials or false
    t[u] = found
    return found
end)
647
local specialchars = allocate()  characters.specialchars = specialchars -- lazy table
local descriptions = allocate()  characters.descriptions = descriptions -- lazy table

-- Lazy table that gives the letters a character is composed of. When the
-- character has specials, the entries after the first one (the tag) that are
-- letters are concatenated into an utf string. Otherwise the character itself
-- is returned as utf string.
--
-- Compared to the original: components that are not in the data are skipped
-- instead of raising an error, and a numeric key without specials is now also
-- cached under the number (before only the utf string got an entry, so each
-- numeric lookup ended up here again).
setmetatableindex(specialchars, function(t,u)
    if not u then
        return
    end
    local chr = data[u]
    local s = chr and chr.specials
    if s then
        local tt, ttn = { }, 0
        for i=2,#s do
            local si = s[i]
            local sc = data[si]
            if sc and is_letter[sc.category] then
                ttn = ttn + 1
                tt[ttn] = utfchar(si)
            end
        end
        local str = concat(tt)
        t[u] = str
        return str
    end
    local str = u
    if type(u) == "number" then
        str = utfchar(u)
        t[u] = str -- so that the numeric key hits the cache next time
    end
    t[str] = str
    return str
end)
677
-- On a miss all descriptions in the character data are stored (spaces removed,
-- lowercased) with the code point as value, and then the key is looked up
-- again. When it still is not there, the key is stored as its own value but
-- the (false) result of this lookup is returned.
setmetatableindex(descriptions, function(t,k)
    -- 0.05 - 0.10 sec
    for unicode, chr in next, data do
        local description = chr.description
        if description then
            local compact = gsub(description," ","")
            t[lower(compact)] = unicode
        end
    end
    local found = rawget(t,k)
    if found then
        return found
    end
    t[k] = k
    return found
end)
694
-- Turn what is asked into a code point: something that converts to a number
-- is returned as number, a string is looked up in the descriptions table,
-- first as given and then with the spaces removed. Other values give nothing.
function characters.unicodechar(asked)
    local n = tonumber(asked)
    if n then
        return n
    end
    if type(asked) ~= "string" then
        return
    end
    local found = descriptions[asked]
    if found then
        return found
    end
    local compact = gsub(asked," ","")
    return descriptions[compact]
end
703
704 -- function characters.lower(str)
705 --     local new, n = { }, 0
706 --     for u in utfvalues(str) do
707 --         n = n + 1
708 --         new[n] = lcchars[u]
709 --     end
710 --     return concat(new)
711 -- end
712 --
713 -- function characters.upper(str)
714 --     local new, n = { }, 0
715 --     for u in utfvalues(str) do
716 --         n = n + 1
717 --         new[n] = ucchars[u]
718 --     end
719 --     return concat(new)
720 -- end
721 --
722 -- function characters.shaped(str)
723 --     local new, n = { }, 0
724 --     for u in utfvalues(str) do
725 --         n = n + 1
726 --         new[n] = shchars[u]
727 --     end
728 --     return concat(new)
729 -- end
730
731 ----- tolower = Cs((utf8byte/lcchars)^0)
732 ----- toupper = Cs((utf8byte/ucchars)^0)
733 ----- toshape = Cs((utf8byte/shchars)^0)
734
-- Build a pattern that replaces each utf character by its entry in the given
-- (lazy) mapping table.
local function remapper(mapping)
    return Cs((utf8char/mapping)^0)
end

local tolower = remapper(lcchars)
local toupper = remapper(ucchars)
local toshape = remapper(shchars)

patterns.tolower = tolower
patterns.toupper = toupper
patterns.toshape = toshape

-- String converters that apply the patterns above.

function characters.lower(str)
    return lpegmatch(tolower,str)
end

function characters.upper(str)
    return lpegmatch(toupper,str)
end

function characters.shaped(str)
    return lpegmatch(toshape,str)
end
746
-- Return the letters of the given utf string. Everything that is not a letter
-- is dropped. When 'spacing' is set, a run of non letters that has at least
-- one spacing character becomes a single space between the letters around it;
-- there is never a space before the first letter.
--
-- The original only reset its 'spacing seen' flag when a space was actually
-- inserted, and only inserted one after two letters, so "a bc" gave "ab c" and
-- leading spacing showed up later on. Here the flag is reset at each letter.
-- Characters that are not in the data are skipped instead of raising an error.
function characters.lettered(str,spacing)
    local new, n = { }, 0
    local pending = false -- spacing seen since the last letter
    for u in utfvalues(str) do
        local chr = data[u]
        local c = chr and chr.category
        if is_letter[c] then
            if pending then
                if n > 0 then
                    n = n + 1
                    new[n] = " "
                end
                pending = false
            end
            n = n + 1
            new[n] = utfchar(u)
        elseif spacing and is_spacing[c] then
            pending = true
        end
    end
    return concat(new)
end
775
776 --[[ldx--
777 <p>Requesting lower and uppercase codes:</p>
778 --ldx]]--
779
-- obsolete: use the uccodes and lccodes tables directly

function characters.uccode(n)
    return uccodes[n]
end

function characters.lccode(n)
    return lccodes[n]
end
782
-- Return the character as a command (backslash plus context name) when it has
-- a context name, and as utf character otherwise.
function characters.safechar(n)
    local chr = data[n]
    local contextname = chr and chr.contextname
    if contextname then
        return "\\" .. contextname
    end
    return utfchar(n)
end
791
-- Return the shape code(s) of n as two values: for a table the first and last
-- entry, for a single code that code and nil. Without shape code n itself is
-- returned (and nil).
function characters.shape(n)
    local shcode = shcodes[n]
    if type(shcode) == "table" then
        return shcode[1], shcode[#shcode]
    elseif shcode then
        return shcode, nil
    end
    return n, nil
end
802
803 -- -- some day we might go this route, but it does not really save that much
804 -- -- so not now (we can generate a lot using mtx-unicode that operates on the
805 -- -- database)
806 --
807 -- -- category cjkwd direction linebreak
808 --
809 -- -- adobename comment contextcommand contextname description fallback lccode
810 -- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
811 -- -- range shcode specials uccode uccodes unicodeslot
812 --
813 -- local data = {
814 --     ['one']={
815 --         common = {
816 --             category="cc",
817 --             direction="bn",
818 --             linebreak="cm",
819 --         },
820 --         vector = {
821 --             [0x0000] = {
822 --                 description="NULL",
823 --                 group='one',
824 --                 unicodeslot=0x0000,
825 --             },
826 --             {
827 --                 description="START OF HEADING",
828 --                 group='one',
829 --                 unicodeslot=0x0001,
830 --             },
831 --         }
832 --     }
833 -- }
834 --
835 -- local chardata, groupdata = { }, { }
836 --
837 -- for group, gdata in next, data do
838 --     local common, vector = { __index = gdata.common }, gdata.vector
839 --     for character, cdata in next, vector do
840 --         chardata[character] = cdata
841 --         setmetatable(cdata,common)
842 --     end
843 --     groupdata[group] = gdata
844 -- end
845
846 --~ characters.data, characters.groups = chardata, groupdata
847
848 --~  [0xF0000]={
849 --~   category="co",
850 --~   cjkwd="a",
851 --~   description="<Plane 0x000F Private Use, First>",
852 --~   direction="l",
853 --~   unicodeslot=0xF0000,
854 --~  },
855 --~  [0xFFFFD]={
856 --~   category="co",
857 --~   cjkwd="a",
858 --~   description="<Plane 0x000F Private Use, Last>",
859 --~   direction="l",
860 --~   unicodeslot=0xFFFFD,
861 --~  },
862 --~  [0x100000]={
863 --~   category="co",
864 --~   cjkwd="a",
865 --~   description="<Plane 0x0010 Private Use, First>",
866 --~   direction="l",
867 --~   unicodeslot=0x100000,
868 --~  },
869 --~  [0x10FFFD]={
870 --~   category="co",
871 --~   cjkwd="a",
872 --~   description="<Plane 0x0010 Private Use, Last>",
873 --~   direction="l",
874 --~   unicodeslot=0x10FFFD,
875 --~  },
876
-- The super- and subscript tables are only built when they are not there yet.
-- A character qualifies when its specials are { "super", x } or { "sub", x };
-- specials with more entries are reported and ignored.

if not characters.superscripts then

    local superscripts = allocate()   characters.superscripts = superscripts
    local subscripts   = allocate()   characters.subscripts   = subscripts

    -- skipping U+02120 (service mark) U+02122 (trademark)

    local targets = { super = superscripts, sub = subscripts    }
    local labels  = { super = "superscript", sub = "subscript" }

    for unicode, chr in next, data do
        local specials = chr.specials
        local what     = specials and specials[1]
        local target   = what and targets[what]
        if target then
            if #specials == 2 then
                target[unicode] = specials[2]
            else
                report_defining("ignoring %s %a, char %c, description %a",labels[what],ustring(unicode),unicode,chr.description)
            end
        end
    end

 -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
 -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))

    if storage then
        storage.register("characters/superscripts", superscripts, "characters.superscripts")
        storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
    end

end
913
914 -- for the moment only a few
915
-- Symbolic names for a few characters, added to the string tracers table.

local tracedchars = utilities.strings.tracers

local tracednames = {
    [0x00] = "[signal]",
    [0x20] = "[space]",
}

for unicode, name in next, tracednames do
    tracedchars[unicode] = name
end
921 -- the following code will move to char-tex.lua
922
923 -- tex
924
925 if not tex or not context or not commands then return characters end
926
927 local tex           = tex
928 local texsetlccode  = tex.setlccode
929 local texsetuccode  = tex.setuccode
930 local texsetsfcode  = tex.setsfcode
931 local texsetcatcode = tex.setcatcode
932
933 local contextsprint = context.sprint
934 local ctxcatcodes   = catcodes.numbers.ctxcatcodes
935
936 --[[ldx--
937 <p>Instead of using a <l n='tex'/> file to define the named glyphs, we
938 use the table. After all, we have this information available anyway.</p>
939 --ldx]]--
940
-- Make character n active (catcode 13) and define it as the command with the
-- given name; the definition is piped to tex under the ctx catcodes.
function commands.makeactive(n,name)
    local definition = format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
    contextsprint(ctxcatcodes,definition)
 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end
945
-- Pipe an utf character to tex. With two arguments the first one is passed
-- on as first argument of the print function and the second is the code
-- point; with one argument that argument is the code point.
function commands.utfchar(c,n)
    if not n then
     -- contextsprint(charfromnumber(c))
        contextsprint(utfchar(c))
    else
     -- contextsprint(c,charfromnumber(n))
        contextsprint(c,utfchar(n))
    end
end
955
-- Pipe character n to tex: as command when it has a context name, as utf
-- character otherwise.
function commands.safechar(n)
    local chr = data[n]
    local contextname = chr and chr.contextname
    if contextname then
        contextsprint("\\" .. contextname) -- context[c.contextname]()
    else
        contextsprint(utfchar(n))
    end
end
964
tex.uprint = commands.utfchar

-- Code points that are not made active in characters.define (at least now):
-- 0x00A0, the range 0x2000 upto 0x200D, 0x202F and 0x205F (0xFEFF is not in
-- the list).

local forbiddenslots = { 0x00A0, 0x202F, 0x205F }

for unicode=0x2000,0x200D do
    forbiddenslots[#forbiddenslots+1] = unicode
end

local forbidden = tohash(forbiddenslots)
974
-- Define the character commands and set up catcodes from the character data.
--
--   tobelettered  : list of catcode tables in which letters get catcode 11
--   tobeactivated : list of catcode tables in which the characters that were
--                   made active here get catcode 13
--
-- Both arguments are optional. Three things happen:
--
--   1. Characters with a fallback, and command like characters (cf, zs) that
--      have a context name and are not forbidden, become active characters.
--      Other characters with a context name get a \def or \chardef.
--   2. In each 'tobelettered' table the letters from 128 upto 65536 (that
--      have no fallback) get catcode 11, as do the joiner and non-joiner.
--   3. In each 'tobeactivated' table the activated characters get catcode 13.
--
-- Fixes compared to the original:
--
--   * The range loop was 'for i=1,range.first,range.last', so the range
--     bounds were used as limit and step and only slot 1 was touched. Now the
--     slots first upto last are set, but only for ranges of letters (the same
--     condition that characters.setcodes uses), as not all ranges are letters.
--   * The number of activated characters was taken from #tobeactivated (which
--     also failed when that argument was nil), so only as many characters as
--     there are catcode tables were reported and activated. Now the real count
--     is used.
function characters.define(tobelettered, tobeactivated) -- catcodetables

    if trace_defining then
        report_defining("defining active character commands")
    end

    local activated, a = { }, 0

    for u, chr in next, data do -- these will be commands
        local fallback = chr.fallback
        if fallback then
            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
            a = a + 1
            activated[a] = u
        else
            local contextname = chr.contextname
            if contextname then
                local category = chr.category
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if is_letter[category] then
                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                        else
                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
                        end
                    else
                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                    end
                elseif is_command[category] and not forbidden[u] then
                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
                    a = a + 1
                    activated[a] = u
                end
            end
        end
    end

    if tobelettered then -- shared
        local saved = tex.catcodetable
        for i=1,#tobelettered do
            tex.catcodetable = tobelettered[i]
            if trace_defining then
                report_defining("defining letters (global, shared)")
            end
            for u, chr in next, data do
                local letter = is_letter[chr.category]
                if letter and not chr.fallback and u >= 128 and u <= 65536 then
                    texsetcatcode(u,11)
                end
                local range = chr.range
                if range and letter then
                    for slot=range.first,range.last do
                        texsetcatcode(slot,11)
                    end
                end
            end
            texsetcatcode(0x200C,11) -- non-joiner
            texsetcatcode(0x200D,11) -- joiner
        end
        tex.catcodetable = saved
    end

    local nofactivated = a -- number of characters made active above
    if tobeactivated and #tobeactivated > 0 and nofactivated > 0 then
        for i=1,nofactivated do
            local u = activated[i]
            report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
        end
        local saved = tex.catcodetable
        for i=1,#tobeactivated do
            local vector = tobeactivated[i]
            if trace_defining then
                report_defining("defining %a active characters in vector %a",nofactivated,vector)
            end
            tex.catcodetable = vector
            for j=1,nofactivated do
                texsetcatcode(activated[j],13)
            end
        end
        tex.catcodetable = saved
    end

end
1062
1063 --[[ldx--
1064 <p>Setting the lccodes is also done in a loop over the data table.</p>
1065 --ldx]]--
1066
-- State shared by characters.setcodes and setuppersfcodes. The original
-- declared this local as 'sfmode' while all code used 'sfstate', which
-- therefore was an (initially nil) global: the "unset" state never applied.

local sfstate = "unset" -- unset, traditional, normal

-- Set the catcode and lc/uc codes of all letters in the character data:
--
--   * letters in a range get catcode 11 and map onto themselves
--   * other letters get catcode 11 and their lc/uc code; a missing code is
--     set to the character itself (also in the data), a table is replaced by
--     the character itself
--   * in traditional (or unset) mode uppercase letters get space factor 999
--   * marks map onto themselves (for hyphenation)
--
-- Afterwards an unset state has become "traditional".
function characters.setcodes()
    if trace_defining then
        report_defining("defining lc and uc codes")
    end
    local traditional = sfstate == "traditional" or sfstate == "unset"
    for code, chr in next, data do
        local cc = chr.category
        if is_letter[cc] then
            local range = chr.range
            if range then
                for i=range.first,range.last do
                    texsetcatcode(i,11) -- letter
                    texsetlccode(i,i,i) -- self self
                end
            else
                local lc, uc = chr.lccode, chr.uccode
                if not lc then
                    chr.lccode, lc = code, code
                elseif type(lc) == "table" then
                    lc = code
                end
                if not uc then
                    chr.uccode, uc = code, code
                elseif type(uc) == "table" then
                    uc = code
                end
                texsetcatcode(code,11)   -- letter
                texsetlccode(code,lc,uc)
                if traditional and cc == "lu" then
                    texsetsfcode(code,999)
                end
            end
        elseif is_mark[cc] then
            texsetlccode(code,code,code) -- for hyphenation
        end
    end
    if traditional then
        sfstate = "traditional"
    end
end

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Switch to state v and give all uppercase letters space factor n. While the
-- state is still "unset" (setcodes has not run yet) only the state is stored,
-- and setcodes only applies 999 when that state is "traditional".
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for code, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(code,n)
            end
        end
    end
    sfstate = v
end
1124
-- The directive accepts "traditional" (space factor 999 after uppercase) and
-- "normal" (space factor 1000); other values are ignored.

local uppersfcodes = {
    traditional = 999,
    normal      = 1000,
}

directives.register("characters.spaceafteruppercase",function(v)
    local n = uppersfcodes[v]
    if n then
        setuppersfcodes(v,n)
    end
end)
1132
1133 -- xml
1134
characters.activeoffset = 0x10000 -- entities will be remapped into that range

-- Make the character in the given slot active and let it expand (globally)
-- to the stringified chr.
function commands.remapentity(chr,slot)
    local definition = format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)
    contextsprint(definition)
end
1140
1141 -- xml.entities = xml.entities or { }
1142 --
1143 -- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
1144 --
1145 -- function characters.setmkiventities()
1146 --     local entities = xml.entities
1147 --     entities.lt  = "<"
1148 --     entities.amp = "&"
1149 --     entities.gt  = ">"
1150 -- end
1151 --
1152 -- function characters.setmkiientities()
1153 --     local entities = xml.entities
1154 --     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
1155 --     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
1156 --     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
1157 -- end
1158