-- beta 2014.02.14 17:07
-- [context.git] / tex / context / base / char-ini.lua
-- ConTeXt module metadata: the guard creates the global 'modules' registry on
-- first load; each module then registers its own descriptor table.
if not modules then modules = { } end modules ['char-ini'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9 -- todo: make two files, one for format generation, one for format use
10
11 -- we can remove the tag range starting at 0xE0000 (special applications)
12
-- Localize frequently used functions (globals are table lookups in Lua).
local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
local concat, unpack, tohash = table.concat, table.unpack, table.tohash
local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
-- FIX: the value list contained string.match twice, so 'gmatch' silently
-- received string.match while string.gmatch was discarded.
local format, lower, gsub, match, gmatch = string.format, string.lower, string.gsub, string.match, string.gmatch
local P, R, Cs, lpegmatch, patterns = lpeg.P, lpeg.R, lpeg.Cs, lpeg.match, lpeg.patterns

local utf8byte = patterns.utf8byte
local utf8char = patterns.utf8char

local allocate          = utilities.storage.allocate
local mark              = utilities.storage.mark

local setmetatableindex = table.setmetatableindex

-- FIX: the tracker callback assigned the (undeclared) global
-- 'characters_defining'; it now updates the local flag it controls.
local trace_defining = false  trackers.register("characters.defining", function(v) trace_defining = v end)

local report_defining = logs.reporter("characters")
30
31 --[[ldx--
32 <p>This module implements some methods and creates additional datastructured
33 from the big character table that we use for all kind of purposes:
34 <type>char-def.lua</type>.</p>
35
36 <p>We assume that at this point <type>characters.data</type> is already
37 loaded!</p>
38 --ldx]]--
39
characters       = characters or { }
local characters = characters
local data       = characters.data

-- The big character table from char-def.lua must already be loaded here;
-- without it this module cannot work at all, so bail out hard.
if data then
    mark(data) -- why does this fail
else
    report_defining("fatal error: 'char-def.lua' is not loaded")
    os.exit()
end
50
51 --[[ldx--
52 <p>This converts a string (if given) into a number.</p>
53 --ldx]]--
54
-- Matches "0x…" / "U+…" hexadecimal character specifications and produces the
-- corresponding number. Generalization: lowercase hex digits are accepted as
-- well (tonumber(s,16) already handled both cases); previously only
-- uppercase A-F matched, so e.g. "0x61" fell through to the utf fallback.
local pattern = (P("0x") + P("U+")) * ((R("09","AF","af")^1 * P(-1)) / function(s) return tonumber(s,16) end)

patterns.chartonumber = pattern
58
-- Convert a character specification to a unicode number: numbers pass
-- through, "0x…"/"U+…" strings are parsed as hex, any other string is taken
-- as a utf character. Returns 0 when nothing sensible can be derived.
local function chartonumber(k)
    if type(k) == "string" then
        local u = lpegmatch(pattern,k)
        if u then
            -- FIX: the pattern already yields a number (tonumber(s,16));
            -- feeding that number to utfbyte again was wrong.
            return u
        else
            return utfbyte(k) or 0
        end
    else
        return k or 0
    end
end
71
-- Convert a unicode number (or a "0x…"/"U+…" string spec) into a utf
-- character string; unparsable strings are returned unchanged.
local function charfromnumber(k)
    if type(k) ~= "number" then
        local n = lpegmatch(pattern,k)
        if n then
            return utfchar(n)
        end
        return k
    end
    return utfchar(k) or ""
end
84
--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))

characters.tonumber   = chartonumber
characters.fromnumber = charfromnumber

-- Shared placeholder entry handed out for unknown or private slots so that
-- lookups in characters.data never yield nil.
local private = {
    description = "PRIVATE SLOT",
}

-- Registered ranges with optional 'extender' callbacks that fabricate data
-- entries on demand (populated elsewhere).
local ranges = allocate()
characters.ranges = ranges
96
-- Lazy lookup into the big character table: string keys are first resolved to
-- numbers (hex spec or utf character); numbers below the private use area are
-- then checked against the registered ranges, whose extender can fabricate
-- (and cache) an entry. Anything else resolves to the shared private slot.
setmetatableindex(data, function(t,k)
    local tk = type(k)
    if tk == "string" then
        k = lpegmatch(pattern,k) or utfbyte(k)
        if k then
            local v = rawget(t,k)
            if v then
                return v
            else
                tk = "number" -- fall through to range checking
            end
        else
            return private
        end
    end
    if tk == "number" and k < 0xF0000 then
        for r=1,#ranges do
            local rr = ranges[r]
            if k >= rr.first and k <= rr.last then
                local extender = rr.extender
                if extender then
                    -- FIX: the second argument previously passed to the
                    -- extender was an out-of-scope (always nil) 'v', so it
                    -- has been dropped.
                    local v = extender(k)
                    t[k] = v
                    return v
                end
            end
        end
    end
    return private -- handy for when we loop over characters in fonts and check for a property
end)
127
-- Unicode block ranges, keyed by normalized (lowercased, letters-only) block
-- name; 'otf' is the opentype script tag where one applies. The metatable
-- installed further down also accepts descriptive names like
-- "Latin Extended-A". FIX: the keys "mandiac" and "odlturkic" were typos that
-- the normalizing lookup could never match; they are now the correct
-- "mandaic" and "oldturkic".
local blocks = allocate {
    ["aegeannumbers"]                              = { first = 0x10100, last = 0x1013F,             description = "Aegean Numbers" },
    ["alchemicalsymbols"]                          = { first = 0x1F700, last = 0x1F77F,             description = "Alchemical Symbols" },
    ["alphabeticpresentationforms"]                = { first = 0x0FB00, last = 0x0FB4F, otf="latn", description = "Alphabetic Presentation Forms" },
    ["ancientgreekmusicalnotation"]                = { first = 0x1D200, last = 0x1D24F, otf="grek", description = "Ancient Greek Musical Notation" },
    ["ancientgreeknumbers"]                        = { first = 0x10140, last = 0x1018F, otf="grek", description = "Ancient Greek Numbers" },
    ["ancientsymbols"]                             = { first = 0x10190, last = 0x101CF, otf="grek", description = "Ancient Symbols" },
    ["arabic"]                                     = { first = 0x00600, last = 0x006FF, otf="arab", description = "Arabic" },
    ["arabicextendeda"]                            = { first = 0x008A0, last = 0x008FF,             description = "Arabic Extended-A" },
    ["arabicmathematicalalphabeticsymbols"]        = { first = 0x1EE00, last = 0x1EEFF,             description = "Arabic Mathematical Alphabetic Symbols" },
    ["arabicpresentationformsa"]                   = { first = 0x0FB50, last = 0x0FDFF, otf="arab", description = "Arabic Presentation Forms-A" },
    ["arabicpresentationformsb"]                   = { first = 0x0FE70, last = 0x0FEFF, otf="arab", description = "Arabic Presentation Forms-B" },
    ["arabicsupplement"]                           = { first = 0x00750, last = 0x0077F, otf="arab", description = "Arabic Supplement" },
    ["armenian"]                                   = { first = 0x00530, last = 0x0058F, otf="armn", description = "Armenian" },
    ["arrows"]                                     = { first = 0x02190, last = 0x021FF,             description = "Arrows" },
    ["avestan"]                                    = { first = 0x10B00, last = 0x10B3F,             description = "Avestan" },
    ["balinese"]                                   = { first = 0x01B00, last = 0x01B7F, otf="bali", description = "Balinese" },
    ["bamum"]                                      = { first = 0x0A6A0, last = 0x0A6FF,             description = "Bamum" },
    ["bamumsupplement"]                            = { first = 0x16800, last = 0x16A3F,             description = "Bamum Supplement" },
    ["basiclatin"]                                 = { first = 0x00000, last = 0x0007F, otf="latn", description = "Basic Latin" },
    ["batak"]                                      = { first = 0x01BC0, last = 0x01BFF,             description = "Batak" },
    ["bengali"]                                    = { first = 0x00980, last = 0x009FF, otf="beng", description = "Bengali" },
    ["blockelements"]                              = { first = 0x02580, last = 0x0259F, otf="bopo", description = "Block Elements" },
    ["bopomofo"]                                   = { first = 0x03100, last = 0x0312F, otf="bopo", description = "Bopomofo" },
    ["bopomofoextended"]                           = { first = 0x031A0, last = 0x031BF, otf="bopo", description = "Bopomofo Extended" },
    ["boxdrawing"]                                 = { first = 0x02500, last = 0x0257F,             description = "Box Drawing" },
    ["brahmi"]                                     = { first = 0x11000, last = 0x1107F,             description = "Brahmi" },
    ["braillepatterns"]                            = { first = 0x02800, last = 0x028FF, otf="brai", description = "Braille Patterns" },
    ["buginese"]                                   = { first = 0x01A00, last = 0x01A1F, otf="bugi", description = "Buginese" },
    ["buhid"]                                      = { first = 0x01740, last = 0x0175F, otf="buhd", description = "Buhid" },
    ["byzantinemusicalsymbols"]                    = { first = 0x1D000, last = 0x1D0FF, otf="byzm", description = "Byzantine Musical Symbols" },
    ["commonindicnumberforms"]                     = { first = 0x0A830, last = 0x0A83F,             description = "Common Indic Number Forms" },
    ["carian"]                                     = { first = 0x102A0, last = 0x102DF,             description = "Carian" },
    ["cham"]                                       = { first = 0x0AA00, last = 0x0AA5F,             description = "Cham" },
    ["cherokee"]                                   = { first = 0x013A0, last = 0x013FF, otf="cher", description = "Cherokee" },
    ["cjkcompatibility"]                           = { first = 0x03300, last = 0x033FF, otf="hang", description = "CJK Compatibility" },
    ["cjkcompatibilityforms"]                      = { first = 0x0FE30, last = 0x0FE4F, otf="hang", description = "CJK Compatibility Forms" },
    ["cjkcompatibilityideographs"]                 = { first = 0x0F900, last = 0x0FAFF, otf="hang", description = "CJK Compatibility Ideographs" },
    ["cjkcompatibilityideographssupplement"]       = { first = 0x2F800, last = 0x2FA1F, otf="hang", description = "CJK Compatibility Ideographs Supplement" },
    ["cjkradicalssupplement"]                      = { first = 0x02E80, last = 0x02EFF, otf="hang", description = "CJK Radicals Supplement" },
    ["cjkstrokes"]                                 = { first = 0x031C0, last = 0x031EF, otf="hang", description = "CJK Strokes" },
    ["cjksymbolsandpunctuation"]                   = { first = 0x03000, last = 0x0303F, otf="hang", description = "CJK Symbols and Punctuation" },
    ["cjkunifiedideographs"]                       = { first = 0x04E00, last = 0x09FFF, otf="hang", description = "CJK Unified Ideographs", catcode = "letter" },
    ["cjkunifiedideographsextensiona"]             = { first = 0x03400, last = 0x04DBF, otf="hang", description = "CJK Unified Ideographs Extension A" },
    ["cjkunifiedideographsextensionb"]             = { first = 0x20000, last = 0x2A6DF, otf="hang", description = "CJK Unified Ideographs Extension B" },
    ["combiningdiacriticalmarks"]                  = { first = 0x00300, last = 0x0036F,             description = "Combining Diacritical Marks" },
    ["combiningdiacriticalmarksforsymbols"]        = { first = 0x020D0, last = 0x020FF,             description = "Combining Diacritical Marks for Symbols" },
    ["combiningdiacriticalmarkssupplement"]        = { first = 0x01DC0, last = 0x01DFF,             description = "Combining Diacritical Marks Supplement" },
    ["combininghalfmarks"]                         = { first = 0x0FE20, last = 0x0FE2F,             description = "Combining Half Marks" },
    ["controlpictures"]                            = { first = 0x02400, last = 0x0243F,             description = "Control Pictures" },
    ["coptic"]                                     = { first = 0x02C80, last = 0x02CFF, otf="copt", description = "Coptic" },
    ["countingrodnumerals"]                        = { first = 0x1D360, last = 0x1D37F,             description = "Counting Rod Numerals" },
    ["cuneiform"]                                  = { first = 0x12000, last = 0x123FF, otf="xsux", description = "Cuneiform" },
    ["cuneiformnumbersandpunctuation"]             = { first = 0x12400, last = 0x1247F, otf="xsux", description = "Cuneiform Numbers and Punctuation" },
    ["currencysymbols"]                            = { first = 0x020A0, last = 0x020CF,             description = "Currency Symbols" },
    ["cypriotsyllabary"]                           = { first = 0x10800, last = 0x1083F, otf="cprt", description = "Cypriot Syllabary" },
    ["cyrillic"]                                   = { first = 0x00400, last = 0x004FF, otf="cyrl", description = "Cyrillic" },
    ["cyrillicextendeda"]                          = { first = 0x02DE0, last = 0x02DFF, otf="cyrl", description = "Cyrillic Extended-A" },
    ["cyrillicextendedb"]                          = { first = 0x0A640, last = 0x0A69F, otf="cyrl", description = "Cyrillic Extended-B" },
    ["cyrillicsupplement"]                         = { first = 0x00500, last = 0x0052F, otf="cyrl", description = "Cyrillic Supplement" },
    ["deseret"]                                    = { first = 0x10400, last = 0x1044F, otf="dsrt", description = "Deseret" },
    ["devanagari"]                                 = { first = 0x00900, last = 0x0097F, otf="deva", description = "Devanagari" },
    ["devanagariextended"]                         = { first = 0x0A8E0, last = 0x0A8FF,             description = "Devanagari Extended" },
    ["dingbats"]                                   = { first = 0x02700, last = 0x027BF,             description = "Dingbats" },
    ["dominotiles"]                                = { first = 0x1F030, last = 0x1F09F,             description = "Domino Tiles" },
    ["egyptianhieroglyphs"]                        = { first = 0x13000, last = 0x1342F,             description = "Egyptian Hieroglyphs" },
    ["emoticons"]                                  = { first = 0x1F600, last = 0x1F64F,             description = "Emoticons" },
    ["enclosedalphanumericsupplement"]             = { first = 0x1F100, last = 0x1F1FF,             description = "Enclosed Alphanumeric Supplement" },
    ["enclosedalphanumerics"]                      = { first = 0x02460, last = 0x024FF,             description = "Enclosed Alphanumerics" },
    ["enclosedcjklettersandmonths"]                = { first = 0x03200, last = 0x032FF,             description = "Enclosed CJK Letters and Months" },
    ["enclosedideographicsupplement"]              = { first = 0x1F200, last = 0x1F2FF,             description = "Enclosed Ideographic Supplement" },
    ["ethiopic"]                                   = { first = 0x01200, last = 0x0137F, otf="ethi", description = "Ethiopic" },
    ["ethiopicextended"]                           = { first = 0x02D80, last = 0x02DDF, otf="ethi", description = "Ethiopic Extended" },
    ["ethiopicextendeda"]                          = { first = 0x0AB00, last = 0x0AB2F,             description = "Ethiopic Extended-A" },
    ["ethiopicsupplement"]                         = { first = 0x01380, last = 0x0139F, otf="ethi", description = "Ethiopic Supplement" },
    ["generalpunctuation"]                         = { first = 0x02000, last = 0x0206F,             description = "General Punctuation" },
    ["geometricshapes"]                            = { first = 0x025A0, last = 0x025FF,             description = "Geometric Shapes" },
    ["georgian"]                                   = { first = 0x010A0, last = 0x010FF, otf="geor", description = "Georgian" },
    ["georgiansupplement"]                         = { first = 0x02D00, last = 0x02D2F, otf="geor", description = "Georgian Supplement" },
    ["glagolitic"]                                 = { first = 0x02C00, last = 0x02C5F, otf="glag", description = "Glagolitic" },
    ["gothic"]                                     = { first = 0x10330, last = 0x1034F, otf="goth", description = "Gothic" },
    ["greekandcoptic"]                             = { first = 0x00370, last = 0x003FF, otf="grek", description = "Greek and Coptic" },
    ["greekextended"]                              = { first = 0x01F00, last = 0x01FFF, otf="grek", description = "Greek Extended" },
    ["gujarati"]                                   = { first = 0x00A80, last = 0x00AFF, otf="gujr", description = "Gujarati" },
    ["gurmukhi"]                                   = { first = 0x00A00, last = 0x00A7F, otf="guru", description = "Gurmukhi" },
    ["halfwidthandfullwidthforms"]                 = { first = 0x0FF00, last = 0x0FFEF,             description = "Halfwidth and Fullwidth Forms" },
    ["hangulcompatibilityjamo"]                    = { first = 0x03130, last = 0x0318F, otf="jamo", description = "Hangul Compatibility Jamo" },
    ["hanguljamo"]                                 = { first = 0x01100, last = 0x011FF, otf="jamo", description = "Hangul Jamo" },
    ["hanguljamoextendeda"]                        = { first = 0x0A960, last = 0x0A97F,             description = "Hangul Jamo Extended-A" },
    ["hanguljamoextendedb"]                        = { first = 0x0D7B0, last = 0x0D7FF,             description = "Hangul Jamo Extended-B" },
    ["hangulsyllables"]                            = { first = 0x0AC00, last = 0x0D7AF, otf="hang", description = "Hangul Syllables" },
    ["hanunoo"]                                    = { first = 0x01720, last = 0x0173F, otf="hano", description = "Hanunoo" },
    ["hebrew"]                                     = { first = 0x00590, last = 0x005FF, otf="hebr", description = "Hebrew" },
    ["highprivateusesurrogates"]                   = { first = 0x0DB80, last = 0x0DBFF,             description = "High Private Use Surrogates" },
    ["highsurrogates"]                             = { first = 0x0D800, last = 0x0DB7F,             description = "High Surrogates" },
    ["hiragana"]                                   = { first = 0x03040, last = 0x0309F, otf="kana", description = "Hiragana" },
    ["ideographicdescriptioncharacters"]           = { first = 0x02FF0, last = 0x02FFF,             description = "Ideographic Description Characters" },
    ["imperialaramaic"]                            = { first = 0x10840, last = 0x1085F,             description = "Imperial Aramaic" },
    ["inscriptionalpahlavi"]                       = { first = 0x10B60, last = 0x10B7F,             description = "Inscriptional Pahlavi" },
    ["inscriptionalparthian"]                      = { first = 0x10B40, last = 0x10B5F,             description = "Inscriptional Parthian" },
    ["ipaextensions"]                              = { first = 0x00250, last = 0x002AF,             description = "IPA Extensions" },
    ["javanese"]                                   = { first = 0x0A980, last = 0x0A9DF,             description = "Javanese" },
    ["kaithi"]                                     = { first = 0x11080, last = 0x110CF,             description = "Kaithi" },
    ["kanasupplement"]                             = { first = 0x1B000, last = 0x1B0FF,             description = "Kana Supplement" },
    ["kanbun"]                                     = { first = 0x03190, last = 0x0319F,             description = "Kanbun" },
    ["kangxiradicals"]                             = { first = 0x02F00, last = 0x02FDF,             description = "Kangxi Radicals" },
    ["kannada"]                                    = { first = 0x00C80, last = 0x00CFF, otf="knda", description = "Kannada" },
    ["katakana"]                                   = { first = 0x030A0, last = 0x030FF, otf="kana", description = "Katakana" },
    ["katakanaphoneticextensions"]                 = { first = 0x031F0, last = 0x031FF, otf="kana", description = "Katakana Phonetic Extensions" },
    ["kayahli"]                                    = { first = 0x0A900, last = 0x0A92F,             description = "Kayah Li" },
    ["kharoshthi"]                                 = { first = 0x10A00, last = 0x10A5F, otf="khar", description = "Kharoshthi" },
    ["khmer"]                                      = { first = 0x01780, last = 0x017FF, otf="khmr", description = "Khmer" },
    ["khmersymbols"]                               = { first = 0x019E0, last = 0x019FF, otf="khmr", description = "Khmer Symbols" },
    ["lao"]                                        = { first = 0x00E80, last = 0x00EFF, otf="lao",  description = "Lao" },
    ["latinextendeda"]                             = { first = 0x00100, last = 0x0017F, otf="latn", description = "Latin Extended-A" },
    ["latinextendedadditional"]                    = { first = 0x01E00, last = 0x01EFF, otf="latn", description = "Latin Extended Additional" },
    ["latinextendedb"]                             = { first = 0x00180, last = 0x0024F, otf="latn", description = "Latin Extended-B" },
    ["latinextendedc"]                             = { first = 0x02C60, last = 0x02C7F, otf="latn", description = "Latin Extended-C" },
    ["latinextendedd"]                             = { first = 0x0A720, last = 0x0A7FF, otf="latn", description = "Latin Extended-D" },
    ["latinsupplement"]                            = { first = 0x00080, last = 0x000FF, otf="latn", description = "Latin-1 Supplement" },
    ["lepcha"]                                     = { first = 0x01C00, last = 0x01C4F,             description = "Lepcha" },
    ["letterlikesymbols"]                          = { first = 0x02100, last = 0x0214F,             description = "Letterlike Symbols" },
    ["limbu"]                                      = { first = 0x01900, last = 0x0194F, otf="limb", description = "Limbu" },
    ["linearbideograms"]                           = { first = 0x10080, last = 0x100FF, otf="linb", description = "Linear B Ideograms" },
    ["linearbsyllabary"]                           = { first = 0x10000, last = 0x1007F, otf="linb", description = "Linear B Syllabary" },
    ["lisu"]                                       = { first = 0x0A4D0, last = 0x0A4FF,             description = "Lisu" },
    ["lowsurrogates"]                              = { first = 0x0DC00, last = 0x0DFFF,             description = "Low Surrogates" },
    ["lycian"]                                     = { first = 0x10280, last = 0x1029F,             description = "Lycian" },
    ["lydian"]                                     = { first = 0x10920, last = 0x1093F,             description = "Lydian" },
    ["mahjongtiles"]                               = { first = 0x1F000, last = 0x1F02F,             description = "Mahjong Tiles" },
    ["malayalam"]                                  = { first = 0x00D00, last = 0x00D7F, otf="mlym", description = "Malayalam" },
    ["mandaic"]                                    = { first = 0x00840, last = 0x0085F, otf="mand", description = "Mandaic" },
    ["mathematicalalphanumericsymbols"]            = { first = 0x1D400, last = 0x1D7FF,             description = "Mathematical Alphanumeric Symbols" },
    ["mathematicaloperators"]                      = { first = 0x02200, last = 0x022FF,             description = "Mathematical Operators" },
    ["meeteimayek"]                                = { first = 0x0ABC0, last = 0x0ABFF,             description = "Meetei Mayek" },
    ["meeteimayekextensions"]                      = { first = 0x0AAE0, last = 0x0AAFF,             description = "Meetei Mayek Extensions" },
    ["meroiticcursive"]                            = { first = 0x109A0, last = 0x109FF,             description = "Meroitic Cursive" },
    ["meroitichieroglyphs"]                        = { first = 0x10980, last = 0x1099F,             description = "Meroitic Hieroglyphs" },
    ["miao"]                                       = { first = 0x16F00, last = 0x16F9F,             description = "Miao" },
    ["miscellaneousmathematicalsymbolsa"]          = { first = 0x027C0, last = 0x027EF,             description = "Miscellaneous Mathematical Symbols-A" },
    ["miscellaneousmathematicalsymbolsb"]          = { first = 0x02980, last = 0x029FF,             description = "Miscellaneous Mathematical Symbols-B" },
    ["miscellaneoussymbols"]                       = { first = 0x02600, last = 0x026FF,             description = "Miscellaneous Symbols" },
    ["miscellaneoussymbolsandarrows"]              = { first = 0x02B00, last = 0x02BFF,             description = "Miscellaneous Symbols and Arrows" },
    ["miscellaneoussymbolsandpictographs"]         = { first = 0x1F300, last = 0x1F5FF,             description = "Miscellaneous Symbols And Pictographs" },
    ["miscellaneoustechnical"]                     = { first = 0x02300, last = 0x023FF,             description = "Miscellaneous Technical" },
    ["modifiertoneletters"]                        = { first = 0x0A700, last = 0x0A71F,             description = "Modifier Tone Letters" },
    ["mongolian"]                                  = { first = 0x01800, last = 0x018AF, otf="mong", description = "Mongolian" },
    ["musicalsymbols"]                             = { first = 0x1D100, last = 0x1D1FF, otf="musc", description = "Musical Symbols" },
    ["myanmar"]                                    = { first = 0x01000, last = 0x0109F, otf="mymr", description = "Myanmar" },
    ["myanmarextendeda"]                           = { first = 0x0AA60, last = 0x0AA7F,             description = "Myanmar Extended-A" },
    ["newtailue"]                                  = { first = 0x01980, last = 0x019DF,             description = "New Tai Lue" },
    ["nko"]                                        = { first = 0x007C0, last = 0x007FF, otf="nko",  description = "NKo" },
    ["numberforms"]                                = { first = 0x02150, last = 0x0218F,             description = "Number Forms" },
    ["ogham"]                                      = { first = 0x01680, last = 0x0169F, otf="ogam", description = "Ogham" },
    ["olchiki"]                                    = { first = 0x01C50, last = 0x01C7F,             description = "Ol Chiki" },
    ["olditalic"]                                  = { first = 0x10300, last = 0x1032F, otf="ital", description = "Old Italic" },
    ["oldpersian"]                                 = { first = 0x103A0, last = 0x103DF, otf="xpeo", description = "Old Persian" },
    ["oldsoutharabian"]                            = { first = 0x10A60, last = 0x10A7F,             description = "Old South Arabian" },
    ["oldturkic"]                                  = { first = 0x10C00, last = 0x10C4F,             description = "Old Turkic" },
    ["opticalcharacterrecognition"]                = { first = 0x02440, last = 0x0245F,             description = "Optical Character Recognition" },
    ["oriya"]                                      = { first = 0x00B00, last = 0x00B7F, otf="orya", description = "Oriya" },
    ["osmanya"]                                    = { first = 0x10480, last = 0x104AF, otf="osma", description = "Osmanya" },
    ["phagspa"]                                    = { first = 0x0A840, last = 0x0A87F, otf="phag", description = "Phags-pa" },
    ["phaistosdisc"]                               = { first = 0x101D0, last = 0x101FF,             description = "Phaistos Disc" },
    ["phoenician"]                                 = { first = 0x10900, last = 0x1091F, otf="phnx", description = "Phoenician" },
    ["phoneticextensions"]                         = { first = 0x01D00, last = 0x01D7F,             description = "Phonetic Extensions" },
    ["phoneticextensionssupplement"]               = { first = 0x01D80, last = 0x01DBF,             description = "Phonetic Extensions Supplement" },
    ["playingcards"]                               = { first = 0x1F0A0, last = 0x1F0FF,             description = "Playing Cards" },
    ["privateusearea"]                             = { first = 0x0E000, last = 0x0F8FF,             description = "Private Use Area" },
    ["rejang"]                                     = { first = 0x0A930, last = 0x0A95F,             description = "Rejang" },
    ["ruminumeralsymbols"]                         = { first = 0x10E60, last = 0x10E7F,             description = "Rumi Numeral Symbols" },
    ["runic"]                                      = { first = 0x016A0, last = 0x016FF, otf="runr", description = "Runic" },
    ["samaritan"]                                  = { first = 0x00800, last = 0x0083F,             description = "Samaritan" },
    ["saurashtra"]                                 = { first = 0x0A880, last = 0x0A8DF,             description = "Saurashtra" },
    ["sharada"]                                    = { first = 0x11180, last = 0x111DF,             description = "Sharada" },
    ["shavian"]                                    = { first = 0x10450, last = 0x1047F, otf="shaw", description = "Shavian" },
    ["sinhala"]                                    = { first = 0x00D80, last = 0x00DFF, otf="sinh", description = "Sinhala" },
    ["smallformvariants"]                          = { first = 0x0FE50, last = 0x0FE6F,             description = "Small Form Variants" },
    ["sorasompeng"]                                = { first = 0x110D0, last = 0x110FF,             description = "Sora Sompeng" },
    ["spacingmodifierletters"]                     = { first = 0x002B0, last = 0x002FF,             description = "Spacing Modifier Letters" },
    ["specials"]                                   = { first = 0x0FFF0, last = 0x0FFFF,             description = "Specials" },
    ["sundanese"]                                  = { first = 0x01B80, last = 0x01BBF,             description = "Sundanese" },
    ["sundanesesupplement"]                        = { first = 0x01CC0, last = 0x01CCF,             description = "Sundanese Supplement" },
    ["superscriptsandsubscripts"]                  = { first = 0x02070, last = 0x0209F,             description = "Superscripts and Subscripts" },
    ["supplementalarrowsa"]                        = { first = 0x027F0, last = 0x027FF,             description = "Supplemental Arrows-A" },
    ["supplementalarrowsb"]                        = { first = 0x02900, last = 0x0297F,             description = "Supplemental Arrows-B" },
    ["supplementalmathematicaloperators"]          = { first = 0x02A00, last = 0x02AFF,             description = "Supplemental Mathematical Operators" },
    ["supplementalpunctuation"]                    = { first = 0x02E00, last = 0x02E7F,             description = "Supplemental Punctuation" },
    ["supplementaryprivateuseareaa"]               = { first = 0xF0000, last = 0xFFFFF,             description = "Supplementary Private Use Area-A" },
    ["supplementaryprivateuseareab"]               = { first = 0x100000,last = 0x10FFFF,            description = "Supplementary Private Use Area-B" },
    ["sylotinagri"]                                = { first = 0x0A800, last = 0x0A82F, otf="sylo", description = "Syloti Nagri" },
    ["syriac"]                                     = { first = 0x00700, last = 0x0074F, otf="syrc", description = "Syriac" },
    ["tagalog"]                                    = { first = 0x01700, last = 0x0171F, otf="tglg", description = "Tagalog" },
    ["tagbanwa"]                                   = { first = 0x01760, last = 0x0177F, otf="tagb", description = "Tagbanwa" },
    ["tags"]                                       = { first = 0xE0000, last = 0xE007F,             description = "Tags" },
    ["taile"]                                      = { first = 0x01950, last = 0x0197F, otf="tale", description = "Tai Le" },
    ["taitham"]                                    = { first = 0x01A20, last = 0x01AAF,             description = "Tai Tham" },
    ["taiviet"]                                    = { first = 0x0AA80, last = 0x0AADF,             description = "Tai Viet" },
    ["taixuanjingsymbols"]                         = { first = 0x1D300, last = 0x1D35F,             description = "Tai Xuan Jing Symbols" },
    ["takri"]                                      = { first = 0x11680, last = 0x116CF,             description = "Takri" },
    ["tamil"]                                      = { first = 0x00B80, last = 0x00BFF, otf="taml", description = "Tamil" },
    ["telugu"]                                     = { first = 0x00C00, last = 0x00C7F, otf="telu", description = "Telugu" },
    ["thaana"]                                     = { first = 0x00780, last = 0x007BF, otf="thaa", description = "Thaana" },
    ["thai"]                                       = { first = 0x00E00, last = 0x00E7F, otf="thai", description = "Thai" },
    ["tibetan"]                                    = { first = 0x00F00, last = 0x00FFF, otf="tibt", description = "Tibetan" },
    ["tifinagh"]                                   = { first = 0x02D30, last = 0x02D7F, otf="tfng", description = "Tifinagh" },
    ["transportandmapsymbols"]                     = { first = 0x1F680, last = 0x1F6FF,             description = "Transport And Map Symbols" },
    ["ugaritic"]                                   = { first = 0x10380, last = 0x1039F, otf="ugar", description = "Ugaritic" },
    ["unifiedcanadianaboriginalsyllabics"]         = { first = 0x01400, last = 0x0167F, otf="cans", description = "Unified Canadian Aboriginal Syllabics" },
    ["unifiedcanadianaboriginalsyllabicsextended"] = { first = 0x018B0, last = 0x018FF,             description = "Unified Canadian Aboriginal Syllabics Extended" },
    ["vai"]                                        = { first = 0x0A500, last = 0x0A63F,             description = "Vai" },
    ["variationselectors"]                         = { first = 0x0FE00, last = 0x0FE0F,             description = "Variation Selectors" },
    ["variationselectorssupplement"]               = { first = 0xE0100, last = 0xE01EF,             description = "Variation Selectors Supplement" },
    ["vedicextensions"]                            = { first = 0x01CD0, last = 0x01CFF,             description = "Vedic Extensions" },
    ["verticalforms"]                              = { first = 0x0FE10, last = 0x0FE1F,             description = "Vertical Forms" },
    ["yijinghexagramsymbols"]                      = { first = 0x04DC0, last = 0x04DFF, otf="yi",   description = "Yijing Hexagram Symbols" },
    ["yiradicals"]                                 = { first = 0x0A490, last = 0x0A4CF, otf="yi",   description = "Yi Radicals" },
    ["yisyllables"]                                = { first = 0x0A000, last = 0x0A48F, otf="yi",   description = "Yi Syllables" },
}
347
348 characters.blocks = blocks
349
-- Return the first and last code points of a named block, or 0,0 when the
-- block is unknown.
function characters.blockrange(name)
    local range = blocks[name]
    if not range then
        return 0, 0
    end
    return range.first, range.last
end
358
-- Permissive block lookup: descriptive names like "Latin Extended-A" are
-- lowercased and stripped of non-letters so they hit the normalized keys.
setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
    if not k then
        return k
    end
    local normalized = lower(gsub(k,"[^a-zA-Z]",""))
    return rawget(t,normalized)
end)
362
-- Lazy map: unicode -> opentype script tag. On a miss the blocks are scanned
-- once and the whole matching block is cached in one sweep, so further
-- lookups in that block are direct hits.
local otfscripts          = utilities.storage.allocate()
characters.otfscripts     = otfscripts

setmetatableindex(otfscripts,function(t,unicode)
    for k, v in next, blocks do
        local first, last = v.first, v.last
        if unicode >= first and unicode <= last then
            local script = v.otf or "dflt"
            -- cache the script for the entire block at once
            for u=first,last do
                t[u] = script
            end
            return script
        end
    end
    -- pretty slow when we're here
    t[unicode] = "dflt"
    return "dflt"
end)
381
-- Resolve a font range specification. A known block name yields its bounds
-- plus description and gaps; otherwise "start-stop" or "start:stop" pairs and
-- single slots are parsed (hex first, then decimal), with '"' mapped to "0x"
-- as a goodie for tex hex notation.
function characters.getrange(name) -- used in font fallback definitions (name or range)
    local range = blocks[name]
    if range then
        return range.first, range.last, range.description, range.gaps
    end
    name = gsub(name,'"',"0x") -- goodie: tex hex notation
    local first, second = match(name,"^(.-)[%-%:](.-)$")
    if first and second then
        local start = tonumber(first,16)  or tonumber(first)
        local stop  = tonumber(second,16) or tonumber(second)
        if start and stop then
            return start, stop, nil
        end
    end
    local slot = tonumber(name,16) or tonumber(name)
    return slot, slot, nil
end
398
-- Unicode general category codes mapped to their descriptive names (used by
-- characters.category when called with a true 'verbose' argument).
local categorytags = allocate {
    lu = "Letter Uppercase",
    ll = "Letter Lowercase",
    lt = "Letter Titlecase",
    lm = "Letter Modifier",
    lo = "Letter Other",
    mn = "Mark Nonspacing",
    mc = "Mark Spacing Combining",
    me = "Mark Enclosing",
    nd = "Number Decimal Digit",
    nl = "Number Letter",
    no = "Number Other",
    pc = "Punctuation Connector",
    pd = "Punctuation Dash",
    ps = "Punctuation Open",
    pe = "Punctuation Close",
    pi = "Punctuation Initial Quote",
    pf = "Punctuation Final Quote",
    po = "Punctuation Other",
    sm = "Symbol Math",
    sc = "Symbol Currency",
    sk = "Symbol Modifier",
    so = "Symbol Other",
    zs = "Separator Space",
    zl = "Separator Line",
    zp = "Separator Paragraph",
    cc = "Other Control",
    cf = "Other Format",
    cs = "Other Surrogate",
    co = "Other Private Use",
    cn = "Other Not Assigned",
}

characters.categorytags = categorytags
431
432 characters.categorytags = categorytags
433
434 --~ special : cf (softhyphen) zs (emspace)
435 --~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
436
-- Categories considered real characters. FIX: the list contained "nl" and
-- "no" twice; tohash made the duplication harmless but it is now removed.
-- NOTE(review): only "mn" of the mark categories is included ("mc"/"me" are
-- not) -- presumably intentional, but worth confirming.
local is_character = allocate ( tohash {
    "lu","ll","lt","lm","lo",
    "nd","nl","no",
    "mn",
    "pc","pd","ps","pe","pi","pf","po",
    "sm","sc","sk","so"
} )
445
-- Categories that count as letters.
local is_letter = allocate ( tohash {
    "ll","lm","lo","lt","lu"
} )

-- Categories handled as commands (e.g. soft hyphen "cf", spaces "zs").
local is_command = allocate ( tohash {
    "cf","zs"
} )

-- Separator categories: space, line, paragraph.
local is_spacing = allocate ( tohash {
    "zs", "zl","zp",
} )

-- NOTE(review): "ms" is not a unicode general category (probably "mc",
-- spacing combining marks, was intended) -- confirm before relying on it.
local is_mark = allocate ( tohash {
    "mn", "ms",
} )

-- to be redone: store checked characters

characters.is_character = is_character
characters.is_letter    = is_letter
characters.is_command   = is_command
characters.is_spacing   = is_spacing
characters.is_mark      = is_mark
469
-- Shared metatable: indexing one of the category sets with a unicode number
-- resolves the character's category in 'data' and answers the membership
-- question; non-number keys deliberately yield nothing.
local mt = { -- yes or no ?
    __index = function(t,k)
        if type(k) == "number" then
            local c = data[k].category
            return c and rawget(t,c)
        else
            -- avoid auto conversion in data.characters lookups
        end
    end
}

setmetatableindex(characters.is_character, mt)
setmetatableindex(characters.is_letter,    mt)
setmetatableindex(characters.is_command,   mt)
setmetatableindex(characters.is_spacing,   mt)
-- FIX: is_mark is defined and exported above but never received the lookup
-- metatable, so is_mark[unicode] never worked; it is now included.
setmetatableindex(characters.is_mark,      mt)
485
486 -- todo: also define callers for the above
487
488 -- linebreak: todo: hash
489 --
490 -- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
491 -- informative : XX OP CL QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 new:CP
492
493 -- east asian width:
494 --
495 -- N A H W F Na
496
-- Unicode bidirectional character classes mapped to their descriptive names.
characters.bidi = allocate {
    l   = "Left-to-Right",
    lre = "Left-to-Right Embedding",
    lro = "Left-to-Right Override",
    r   = "Right-to-Left",
    al  = "Right-to-Left Arabic",
    rle = "Right-to-Left Embedding",
    rlo = "Right-to-Left Override",
    pdf = "Pop Directional Format",
    en  = "European Number",
    es  = "European Number Separator",
    et  = "European Number Terminator",
    an  = "Arabic Number",
    cs  = "Common Number Separator",
    nsm = "Non-Spacing Mark",
    bn  = "Boundary Neutral",
    b   = "Paragraph Separator",
    s   = "Segment Separator",
    ws  = "Whitespace",
    on  = "Other Neutrals",
}
518
519 --[[ldx--
520 <p>At this point we assume that the big data table is loaded. From this
521 table we derive a few more.</p>
522 --ldx]]--
523
if not characters.fallbacks then

    characters.fallbacks = { } -- not that many

    local fallbacks = characters.fallbacks

    -- Characters whose 'specials' decomposition is "compat" <space> <mark>
    -- get a two-way mapping between the composed character and the bare mark
    -- (accents and such).
    for k, d in next, data do
        local specials = d.specials
        if specials and specials[1] == "compat" and specials[2] == 0x0020 then
            local s = specials[3]
            if s then
                fallbacks[k] = s
                fallbacks[s] = k
            end
        end
    end

end

if storage then
    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
end
546
characters.directions = { }

-- Lazy map: unicode -> bidi direction class, false (cached) when none known.
setmetatableindex(characters.directions,function(t,k)
    local d = data[k]
    if d then
        local v = d.direction
        if v then
            t[k] = v
            return v
        end
    end
    t[k] = false -- maybe 'l'
    -- FIX: the tail returned an out-of-scope (nil) 'v' although false was
    -- cached; the cached value is now returned consistently.
    return false
end)
561
characters.mirrors = { }

-- Lazy map: unicode -> mirrored counterpart, false (cached) when none known.
setmetatableindex(characters.mirrors,function(t,k)
    local d = data[k]
    if d then
        local v = d.mirror
        if v then
            t[k] = v
            return v
        end
    end
    t[k] = false
    -- FIX: the tail returned an out-of-scope (nil) 'v' although false was
    -- cached; the cached value is now returned consistently.
    return false
end)
576
characters.textclasses = { }

-- Lazy map: unicode -> textclass, false (cached) when none known.
setmetatableindex(characters.textclasses,function(t,k)
    local d = data[k]
    if d then
        local v = d.textclass
        if v then
            t[k] = v
            return v
        end
    end
    t[k] = false
    -- FIX: the tail returned an out-of-scope (nil) 'v' although false was
    -- cached; the cached value is now returned consistently.
    return false
end)
591
592 --[[ldx--
593 <p>Next comes a whole series of helper methods. These are (will be) part
594 of the official <l n='api'/>.</p>
595 --ldx]]--
596
597 -- we could make them virtual: characters.contextnames[n]
598
-- Field accessors with an empty-string default. They index data[n] directly,
-- so behavior for unknown n depends on the data table itself (which may have
-- a metatable handling ranges -- not visible here).
function characters.contextname(n) return data[n].contextname or "" end
function characters.adobename  (n) return data[n].adobename   or "" end
function characters.description(n) return data[n].description or "" end
-------- characters.category   (n) return data[n].category    or "" end
603
-- Return the (two letter) category code of slot n, or its verbose tag from
-- categorytags when the second argument is set; "" when there is no category.
function characters.category(n,verbose)
    local category = data[n].category
    if not category then
        return ""
    end
    if verbose then
        return categorytags[category]
    end
    return category
end
614
615 -- -- some day we will make a table .. not that many calls to utfchar
616 --
617 -- local utfchar = utf.char
618 -- local utfbyte = utf.byte
619 -- local utfbytes = { }
620 -- local utfchars = { }
621 --
622 -- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
623 -- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
624
-- Convert a code point, or a table of code points, into a utf string.
local function toutfstring(s)
    if type(s) ~= "table" then
        return utfchar(s)
    end
    return utfchar(unpack(s)) -- utfchar accepts multiple code points
end

utf.tostring = toutfstring
634
local categories = allocate() characters.categories = categories -- lazy table

-- Lazy: slot -> category code; keys without data map onto themselves.
setmetatableindex(categories, function(t,u)
    if u then
        local chardata = data[u]
        local category = chardata and chardata.category or u
        t[u] = category
        return category
    end
end)
638
local lccodes = allocate() characters.lccodes = lccodes -- lazy table
local uccodes = allocate() characters.uccodes = uccodes -- lazy table
local shcodes = allocate() characters.shcodes = shcodes -- lazy table
local fscodes = allocate() characters.fscodes = fscodes -- lazy table

-- All four behave alike: look up the requested code field, fall back to the
-- byte value of a string key, and finally to the key itself.
local function makecodelookup(field)
    return function(t,u)
        if u then
            local chardata = data[u]
            local code = chardata and chardata[field] or (type(u) == "string" and utfbyte(u)) or u
            t[u] = code
            return code
        end
    end
end

setmetatableindex(lccodes, makecodelookup("lccode"))
setmetatableindex(uccodes, makecodelookup("uccode"))
setmetatableindex(shcodes, makecodelookup("shcode"))
setmetatableindex(fscodes, makecodelookup("fscode"))
648
local lcchars = allocate() characters.lcchars = lcchars -- lazy table
local ucchars = allocate() characters.ucchars = ucchars -- lazy table
local shchars = allocate() characters.shchars = shchars -- lazy table
local fschars = allocate() characters.fschars = fschars -- lazy table

-- All four behave alike: the utf string for the requested code field, else the
-- key itself (numeric keys become utf strings, string keys are kept).
local function makecharlookup(field)
    return function(t,u)
        if u then
            local c = data[u]
            c = c and c[field]
            c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u
            t[u] = c
            return c
        end
    end
end

setmetatableindex(lcchars, makecharlookup("lccode"))
setmetatableindex(ucchars, makecharlookup("uccode"))
setmetatableindex(shchars, makecharlookup("shcode"))
setmetatableindex(fschars, makecharlookup("fscode"))
658
local decomposed = allocate() characters.decomposed = decomposed -- lazy table
local specials = allocate() characters.specials = specials -- lazy table

-- Both cache either the table from the data entry or false, so a miss is
-- remembered and the metamethod runs only once per key.

setmetatableindex(decomposed, function(t,u) -- either a table or false
    if u then
        local chardata = data[u]
        local found = chardata and chardata.decomposed or false -- could fall back to specials
        t[u] = found
        return found
    end
end)

setmetatableindex(specials, function(t,u) -- either a table or false
    if u then
        local chardata = data[u]
        local found = chardata and chardata.specials or false
        t[u] = found
        return found
    end
end)
679
local specialchars = allocate() characters.specialchars = specialchars -- lazy table
local descriptions = allocate() characters.descriptions = descriptions -- lazy table

-- Lazy: maps a character (slot) onto the concatenation of the letter components
-- of its special decomposition; non-letter components are dropped. Characters
-- without specials map onto themselves (numbers become utf strings).
setmetatableindex(specialchars, function(t,u)
    if u then
        local c = data[u]
        local s = c and c.specials
        if s then
            local tt, ttn = { }, 0
            for i=2,#s do -- entry 1 is the kind tag ("compat" etc), so start at 2
                local si = s[i]
                local c = data[si] -- NOTE(review): assumes data[si] resolves; a missing entry would error on c.category -- verify
                if is_letter[c.category] then
                    ttn = ttn + 1
                    tt[ttn] = utfchar(si)
                end
            end
            c = concat(tt)
            t[u] = c
            return c
        else
            if type(u) == "number" then
                u = utfchar(u)
            end
            t[u] = u
            return u
        end
    end
end)
709
-- Lazy reverse map: space-stripped, lowercased description -> unicode slot.
-- The whole map is built on first access. Quirk: on a miss the key is cached
-- as itself, but THIS call still returns nil; later lookups return the key.
setmetatableindex(descriptions, function(t,k)
    -- 0.05 - 0.10 sec
    for u, c in next, data do
        local d = c.description
        if d then
            d = gsub(d," ","") -- keys are stored without spaces ...
            d = lower(d)       -- ... and lowercased, so callers must match that
            t[d] = u
        end
    end
    local d = rawget(t,k)
    if not d then
        t[k] = k
    end
    return d
end)
726
-- Resolve a number, a numeric string, or a (lowercased) description into a
-- unicode slot; nil when nothing matches.
function characters.unicodechar(asked)
    local n = tonumber(asked)
    if n then
        return n
    end
    if type(asked) == "string" then
        -- try the description as given, then with spaces stripped
        return descriptions[asked] or descriptions[gsub(asked," ","")]
    end
end
735
736 -- function characters.lower(str)
737 -- local new, n = { }, 0
738 -- for u in utfvalues(str) do
739 -- n = n + 1
740 -- new[n] = lcchars[u]
741 -- end
742 -- return concat(new)
743 -- end
744 --
745 -- function characters.upper(str)
746 -- local new, n = { }, 0
747 -- for u in utfvalues(str) do
748 -- n = n + 1
749 -- new[n] = ucchars[u]
750 -- end
751 -- return concat(new)
752 -- end
753 --
754 -- function characters.shaped(str)
755 -- local new, n = { }, 0
756 -- for u in utfvalues(str) do
757 -- n = n + 1
758 -- new[n] = shchars[u]
759 -- end
760 -- return concat(new)
761 -- end
762
----- tolower = Cs((utf8byte/lcchars)^0)
----- toupper = Cs((utf8byte/ucchars)^0)
----- toshape = Cs((utf8byte/shchars)^0)

-- Substitution captures: each utf character is replaced via the lazy tables
-- above, which map unhandled string keys onto themselves, so every character
-- produces a replacement value.
local tolower = Cs((utf8char/lcchars)^0)
local toupper = Cs((utf8char/ucchars)^0)
local toshape = Cs((utf8char/shchars)^0)

patterns.tolower = tolower
patterns.toupper = toupper
patterns.toshape = toshape

-- String-level case and shape conversion (utf aware, table driven).
function characters.lower (str) return lpegmatch(tolower,str) end
function characters.upper (str) return lpegmatch(toupper,str) end
function characters.shaped(str) return lpegmatch(toshape,str) end
778
-- Return only the letters of str; when spacing is set, runs of spacing
-- characters between letters are collapsed into a single space.
function characters.lettered(str,spacing)
    local result, n = { }, 0
    if spacing then
        local pending = false
        for u in utfvalues(str) do
            local category = data[u].category
            if is_letter[category] then
                if pending and n > 1 then
                    n = n + 1
                    result[n] = " "
                    pending = false
                end
                n = n + 1
                result[n] = utfchar(u)
            elseif is_spacing[category] then
                pending = true
            end
        end
    else
        for u in utfvalues(str) do
            if is_letter[data[u].category] then
                n = n + 1
                result[n] = utfchar(u)
            end
        end
    end
    return concat(result)
end
807
808 --[[ldx--
809 <p>Requesting lower and uppercase codes:</p>
810 --ldx]]--
811
-- Kept for backward compatibility; use the uccodes / lccodes lazy tables.
function characters.uccode(n) return uccodes[n] end -- obsolete
function characters.lccode(n) return lccodes[n] end -- obsolete
814
-- Return a robust representation of slot n: the \contextname command when one
-- exists, the plain utf character otherwise.
function characters.safechar(n)
    local chardata = data[n]
    local contextname = chardata and chardata.contextname
    if contextname then
        return "\\" .. contextname
    end
    return utfchar(n)
end
823
-- Return the first and last shape code of n (second value nil when there is a
-- single code, and n itself when there is no shape code at all).
function characters.shape(n)
    local shcode = shcodes[n]
    if not shcode then
        return n, nil
    end
    if type(shcode) == "table" then
        return shcode[1], shcode[#shcode]
    end
    return shcode, nil
end
834
835 -- -- some day we might go this route, but it does not really save that much
836 -- -- so not now (we can generate a lot using mtx-unicode that operates on the
837 -- -- database)
838 --
839 -- -- category cjkwd direction linebreak
840 --
841 -- -- adobename comment contextcommand contextname description fallback lccode
842 -- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
843 -- -- range shcode specials uccode uccodes unicodeslot
844 --
845 -- local data = {
846 -- ['one']={
847 -- common = {
848 -- category="cc",
849 -- direction="bn",
850 -- linebreak="cm",
851 -- },
852 -- vector = {
853 -- [0x0000] = {
854 -- description="NULL",
855 -- group='one',
856 -- unicodeslot=0x0000,
857 -- },
858 -- {
859 -- description="START OF HEADING",
860 -- group='one',
861 -- unicodeslot=0x0001,
862 -- },
863 -- }
864 -- }
865 -- }
866 --
867 -- local chardata, groupdata = { }, { }
868 --
869 -- for group, gdata in next, data do
870 -- local common, vector = { __index = gdata.common }, gdata.vector
871 -- for character, cdata in next, vector do
872 -- chardata[character] = cdata
873 -- setmetatable(cdata,common)
874 -- end
875 -- groupdata[group] = gdata
876 -- end
877
878 --~ characters.data, characters.groups = chardata, groupdata
879
880 --~ [0xF0000]={
881 --~ category="co",
882 --~ cjkwd="a",
883 --~ description="<Plane 0x000F Private Use, First>",
884 --~ direction="l",
885 --~ unicodeslot=0xF0000,
886 --~ },
887 --~ [0xFFFFD]={
888 --~ category="co",
889 --~ cjkwd="a",
890 --~ description="<Plane 0x000F Private Use, Last>",
891 --~ direction="l",
892 --~ unicodeslot=0xFFFFD,
893 --~ },
894 --~ [0x100000]={
895 --~ category="co",
896 --~ cjkwd="a",
897 --~ description="<Plane 0x0010 Private Use, First>",
898 --~ direction="l",
899 --~ unicodeslot=0x100000,
900 --~ },
901 --~ [0x10FFFD]={
902 --~ category="co",
903 --~ cjkwd="a",
904 --~ description="<Plane 0x0010 Private Use, Last>",
905 --~ direction="l",
906 --~ unicodeslot=0x10FFFD,
907 --~ },
908
if not characters.superscripts then

    -- Collect single-target super/sub decompositions into two lookup tables;
    -- multi-character specials are skipped (and reported when tracing).

    local superscripts = allocate() characters.superscripts = superscripts
    local subscripts   = allocate() characters.subscripts   = subscripts

    -- skipping U+02120 (service mark) U+02122 (trademark)

    for unicode, chardata in next, data do
        local specials = chardata.specials
        if specials then
            local what = specials[1]
            if what == "super" then
                if #specials == 2 then
                    superscripts[unicode] = specials[2]
                elseif trace_defining then
                    report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(unicode),unicode,chardata.description)
                end
            elseif what == "sub" then
                if #specials == 2 then
                    subscripts[unicode] = specials[2]
                elseif trace_defining then
                    report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(unicode),unicode,chardata.description)
                end
            end
        end
    end

    if storage then
        storage.register("characters/superscripts", superscripts, "characters.superscripts")
        storage.register("characters/subscripts", subscripts, "characters.subscripts")
    end

end
945
-- for the moment only a few

-- Human readable names for control characters in tracing output.

local tracedchars = utilities.strings.tracers

tracedchars[0x00] = "[signal]"
tracedchars[0x09] = "[tab]"      -- 0x09 is the (horizontal) tab
tracedchars[0x0A] = "[linefeed]"
tracedchars[0x0B] = "[vtab]"     -- was labeled "[tab]", but 0x0B is the vertical tab
tracedchars[0x0C] = "[formfeed]"
tracedchars[0x0D] = "[return]"
tracedchars[0x20] = "[space]"
956
-- Report each utf character of str on its own numbered line (for tracing).
function characters.showstring(str)
    local chars = utotable(str)
    local nofchars = #chars
    for i=1,nofchars do
        report_defining("split % 3i : %C",i,chars[i])
    end
end
963
-- the following code will move to char-tex.lua

-- tex

-- Outside a TeX run (no tex/context/commands available) the remainder of this
-- file does not apply, so we stop here and return the module table.
if not tex or not context or not commands then return characters end

local tex = tex
local texsetlccode = tex.setlccode
local texsetuccode = tex.setuccode
local texsetsfcode = tex.setsfcode
local texsetcatcode = tex.setcatcode

local contextsprint = context.sprint
local ctxcatcodes = catcodes.numbers.ctxcatcodes
978
979 --[[ldx--
980 <p>Instead of using a <l n='tex'/> file to define the named glyphs, we
981 use the table. After all, we have this information available anyway.</p>
982 --ldx]]--
983
-- Make character n active (catcode 13) and let it expand to the command \name.
-- The exact token string matters here, so it is left untouched.
function commands.makeactive(n,name) --
    contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
    -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end
988
-- Print a utf character: with one argument c is the code point, with two the
-- first argument is the catcode (table) and n is the code point.
function commands.utfchar(c,n)
    if not n then
        -- contextsprint(charfromnumber(c))
        contextsprint(utfchar(c))
    else
        -- contextsprint(c,charfromnumber(n))
        contextsprint(c,utfchar(n))
    end
end
998
-- Print a robust representation of slot n: the \contextname command when one
-- exists, the plain utf character otherwise.
function commands.safechar(n)
    local chardata = data[n]
    local contextname = chardata and chardata.contextname
    if contextname then
        contextsprint("\\" .. contextname) -- context[contextname]()
    else
        contextsprint(utfchar(n))
    end
end

tex.uprint = commands.utfchar
1009
-- Characters that must not become active even when their category would
-- otherwise allow it.
local forbidden = tohash { -- at least now
    0x00A0, -- no-break space
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x200B, 0x200C, 0x200D, -- en quad .. zero width joiner
    0x202F, -- narrow no-break space
    0x205F, -- medium mathematical space
    -- 0xFEFF, -- zero width no-break space (byte order mark)
}
1017
-- Define active characters and letters in the given catcode tables.
-- tobelettered / tobeactivated are lists of catcode table numbers.
function characters.define(tobelettered, tobeactivated) -- catcodetables

    if trace_defining then
        report_defining("defining active character commands")
    end

    local activated, a = { }, 0

    for u, chr in next, data do -- these will be commands
        local fallback = chr.fallback
        if fallback then
            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
            a = a + 1
            activated[a] = u
        else
            local contextname = chr.contextname
            if contextname then
                local category = chr.category
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if is_letter[category] then
                            contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                        else
                            contextsprint(ctxcatcodes,format("\\chardef\\%s=%s",contextname,u)) -- has no s
                        end
                    else
                        contextsprint(ctxcatcodes,format("\\def\\%s{%s}",contextname,utfchar(u))) -- has no s
                    end
                elseif is_command[category] and not forbidden[u] then
                    contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\"..contextname,"}}")
                    a = a + 1
                    activated[a] = u
                end
            end
        end
    end

    if tobelettered then -- shared
        local saved = tex.catcodetable
        for i=1,#tobelettered do
            tex.catcodetable = tobelettered[i]
            if trace_defining then
                report_defining("defining letters (global, shared)")
            end
            for u, chr in next, data do
                if not chr.fallback and is_letter[chr.category] and u >= 128 and u <= 65536 then
                    texsetcatcode(u,11)
                end
                local range = chr.range
                if range then
                    -- was: for i=1,range.first,range.last (start 1, limit
                    -- range.first, step range.last) which walks the wrong span
                    for i=range.first,range.last do -- tricky as not all are letters
                        texsetcatcode(i,11)
                    end
                end
            end
            texsetcatcode(0x200C,11) -- non-joiner
            texsetcatcode(0x200D,11) -- joiner
            for k, v in next, blocks do
                if v.catcode == "letter" then
                    for i=v.first,v.last do
                        texsetcatcode(i,11)
                    end
                end
            end
        end
        tex.catcodetable = saved
    end

    -- was: local nofactivated = #tobeactivated, which crashed when
    -- tobeactivated was nil and counted the wrong list: the loops below index
    -- 'activated' (the collected characters), not the vector list
    local nofactivated = tobeactivated and #activated or 0
    if nofactivated > 0 then
        for i=1,nofactivated do
            local u = activated[i]
            if u then
                report_defining("character %U is active in set %a, containing %a",u,data[u].description,tobeactivated)
            end
        end
        local saved = tex.catcodetable
        for i=1,#tobeactivated do
            local vector = tobeactivated[i]
            if trace_defining then
                report_defining("defining %a active characters in vector %a",nofactivated,vector)
            end
            tex.catcodetable = vector
            for i=1,nofactivated do
                local u = activated[i]
                if u then
                    texsetcatcode(u,13)
                end
            end
        end
        tex.catcodetable = saved
    end

end
1112
1113 --[[ldx--
1114 <p>Setting the lccodes is also done in a loop over the data table.</p>
1115 --ldx]]--
1116
1117 local sfmode = "unset" -- unset, traditional, normal
1118
-- Push lc/uc codes (and catcodes, sfcodes) from the data table into TeX.
-- Letters become catcode 11; marks get a self lccode so hyphenation works.
function characters.setcodes()
    if trace_defining then
        report_defining("defining lc and uc codes")
    end
    -- in the traditional model uppercase letters get sfcode 999 (affects
    -- spacing after e.g. "A." sequences)
    local traditional = sfstate == "traditional" or sfstate == "unset"
    for code, chr in next, data do
        local cc = chr.category
        if is_letter[cc] then
            local range = chr.range
            if range then
                for i=range.first,range.last do
                    texsetcatcode(i,11) -- letter
                    texsetlccode(i,i,i) -- self self
                end
            else
                local lc, uc = chr.lccode, chr.uccode
                if not lc then
                    chr.lccode, lc = code, code
                elseif type(lc) == "table" then
                    lc = code -- multi-character casing: fall back to the character itself
                end
                if not uc then
                    chr.uccode, uc = code, code
                elseif type(uc) == "table" then
                    uc = code
                end
                texsetcatcode(code,11) -- letter
                texsetlccode(code,lc,uc) -- tex.setlccode takes an optional uccode as well
                if traditional and cc == "lu" then
                    texsetsfcode(code,999)
                end
            end
        elseif is_mark[cc] then
            texsetlccode(code,code,code) -- for hyphenation
        end
    end
    if traditional then
        sfstate = "traditional"
    end
end
1159
-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Set the sfcode of every uppercase letter ("lu") to n and remember mode v
-- in sfstate; the loop is skipped as long as sfstate is still "unset".
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for code, chardata in next, data do
            if chardata.category == "lu" then
                texsetsfcode(code,n)
            end
        end
    end
    sfstate = v
end
1174
-- Directive hook: map the mode name onto its sfcode value; anything else is
-- silently ignored (matching the original if/elseif chain).
local sfvalues = { traditional = 999, normal = 1000 }

directives.register("characters.spaceafteruppercase",function(v)
    local n = sfvalues[v]
    if n then
        setuppersfcodes(v,n)
    end
end)
1182
1183 -- tex
1184
-- Typeset the description of the character in the given slot, if known.
function commands.chardescription(slot)
    local chardata = data[slot]
    if chardata then
        context(chardata.description)
    end
end
1191
1192 -- xml
1193
characters.activeoffset = 0x10000 -- entities get remapped into this byte range

-- Make slot active and define it to expand to the string representation of chr.
function commands.remapentity(chr,slot)
    local mapping = format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)
    contextsprint(mapping)
end
1199
1200 -- xml.entities = xml.entities or { }
1201 --
1202 -- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
1203 --
1204 -- function characters.setmkiventities()
1205 -- local entities = xml.entities
1206 -- entities.lt = "<"
1207 -- entities.amp = "&"
1208 -- entities.gt = ">"
1209 -- end
1210 --
1211 -- function characters.setmkiientities()
1212 -- local entities = xml.entities
1213 -- entities.lt = utfchar(characters.activeoffset + utfbyte("<"))
1214 -- entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
1215 -- entities.gt = utfchar(characters.activeoffset + utfbyte(">"))
1216 -- end