Bumping manifests a=b2g-bump
[gecko.git] / gfx / thebes / nsUnicodeRange.cpp
blob833f6b539920964d110c829ff59f2ed4c0276754
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsUnicodeRange.h"
7 #include "nsGkAtoms.h"
8 #include "mozilla/NullPtr.h"
10 // This table depends on unicode range definitions.
11 // Each item's index must correspond unicode range value
12 // eg. x-cyrillic = LangGroupTable[kRangeCyrillic]
13 static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] =
15 &nsGkAtoms::x_cyrillic,
16 &nsGkAtoms::el_,
17 &nsGkAtoms::he,
18 &nsGkAtoms::ar,
19 &nsGkAtoms::th,
20 &nsGkAtoms::ko,
21 &nsGkAtoms::Japanese,
22 &nsGkAtoms::zh_cn,
23 &nsGkAtoms::zh_tw,
24 &nsGkAtoms::x_devanagari,
25 &nsGkAtoms::x_tamil,
26 &nsGkAtoms::x_armn,
27 &nsGkAtoms::x_beng,
28 &nsGkAtoms::x_cans,
29 &nsGkAtoms::x_ethi,
30 &nsGkAtoms::x_geor,
31 &nsGkAtoms::x_gujr,
32 &nsGkAtoms::x_guru,
33 &nsGkAtoms::x_khmr,
34 &nsGkAtoms::x_mlym,
35 &nsGkAtoms::x_orya,
36 &nsGkAtoms::x_telu,
37 &nsGkAtoms::x_knda,
38 &nsGkAtoms::x_sinh,
39 &nsGkAtoms::x_tibt
42 /**********************************************************************
43 * Unicode subranges as defined in unicode 3.0
44 * x-western -> latin
45 * 0000 - 036f
46 * 1e00 - 1eff
47 * 2000 - 206f (general punctuation)
48 * 20a0 - 20cf (currency symbols)
49 * 2100 - 214f (letterlike symbols)
50 * 2150 - 218f (Number Forms)
51 * el -> greek
52 * 0370 - 03ff
53 * 1f00 - 1fff
54 * x-cyrillic -> cyrillic
55 * 0400 - 04ff
56 * he -> hebrew
57 * 0590 - 05ff
58 * ar -> arabic
59 * 0600 - 06ff
60 * fb50 - fdff (arabic presentation forms)
61 * fe70 - feff (arabic presentation forms b)
62 * th -> thai
63 * 0e00 - 0e7f
64 * ko -> korean
65 * ac00 - d7af (hangul Syllables)
66 * 1100 - 11ff (jamo)
67 * 3130 - 318f (hangul compatibility jamo)
68 * ja
69 * 3040 - 309f (hiragana)
70 * 30a0 - 30ff (katakana)
71 * zh-CN
72 * zh-TW
74 * CJK
75 * 3100 - 312f (bopomofo)
76 * 31a0 - 31bf (bopomofo extended)
77 * 3000 - 303f (CJK Symbols and Punctuation)
78 * 2e80 - 2eff (CJK radicals supplement)
79 * 2f00 - 2fdf (Kangxi Radicals)
80 * 2ff0 - 2fff (Ideographic Description Characters)
81 * 3190 - 319f (kanbun)
82 * 3200 - 32ff (Enclosed CJK letters and Months)
83 * 3300 - 33ff (CJK compatibility)
84 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
85 * 4e00 - 9faf (CJK Unified Ideographs)
86 * f900 - fa5f (CJK Compatibility Ideographs)
87 * fe30 - fe4f (CJK compatibility Forms)
88 * ff00 - ffef (halfwidth and fullwidth forms)
90 * Armenian
91 * 0530 - 058f
92 * Syriac
93 * 0700 - 074f
94 * Thaana
95 * 0780 - 07bf
96 * Devanagari
97 * 0900 - 097f
98 * Bengali
99 * 0980 - 09ff
100 * Gurmukhi
101 * 0a00 - 0a7f
102 * Gujarati
103 * 0a80 - 0aff
104 * Oriya
105 * 0b00 - 0b7f
106 * Tamil
107 * 0b80 - 0bff
108 * Telugu
109 * 0c00 - 0c7f
110 * Kannada
111 * 0c80 - 0cff
112 * Malayalam
113 * 0d00 - 0d7f
114 * Sinhala
115 * 0d80 - 0def
116 * Lao
117 * 0e80 - 0eff
118 * Tibetan
119 * 0f00 - 0fbf
120 * Myanmar
121 * 1000 - 109f
122 * Georgian
123 * 10a0 - 10ff
124 * Ethiopic
125 * 1200 - 137f
126 * Cherokee
127 * 13a0 - 13ff
128 * Canadian Aboriginal Syllabics
129 * 1400 - 167f
130 * Ogham
131 * 1680 - 169f
132 * Runic
133 * 16a0 - 16ff
134 * Khmer
135 * 1780 - 17ff
136 * Mongolian
137 * 1800 - 18af
138 * Misc - superscripts and subscripts
139 * 2070 - 209f
140 * Misc - Combining Diacritical Marks for Symbols
141 * 20d0 - 20ff
142 * Misc - Arrows
143 * 2190 - 21ff
144 * Misc - Mathematical Operators
145 * 2200 - 22ff
146 * Misc - Miscellaneous Technical
147 * 2300 - 23ff
148 * Misc - Control picture
149 * 2400 - 243f
150 * Misc - Optical character recognition
151 * 2440 - 2450
152 * Misc - Enclosed Alphanumerics
153 * 2460 - 24ff
154 * Misc - Box Drawing
155 * 2500 - 257f
156 * Misc - Block Elements
157 * 2580 - 259f
158 * Misc - Geometric Shapes
159 * 25a0 - 25ff
160 * Misc - Miscellaneous Symbols
161 * 2600 - 267f
162 * Misc - Dingbats
163 * 2700 - 27bf
164 * Misc - Braille Patterns
165 * 2800 - 28ff
166 * Yi Syllables
167 * a000 - a48f
168 * Yi radicals
169 * a490 - a4cf
170 * Alphabetic Presentation Forms
171 * fb00 - fb4f
172 * Misc - Combining half Marks
173 * fe20 - fe2f
174 * Misc - small form variants
175 * fe50 - fe6f
176 * Misc - Specials
177 * fff0 - ffff
178 *********************************************************************/
182 #define NUM_OF_SUBTABLES 10
183 #define SUBTABLE_SIZE 16
185 static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] =
187 { // table for X---
188 kRangeTableBase+1, //u0xxx
189 kRangeTableBase+2, //u1xxx
190 kRangeTableBase+3, //u2xxx
191 kRangeSetCJK, //u3xxx
192 kRangeSetCJK, //u4xxx
193 kRangeSetCJK, //u5xxx
194 kRangeSetCJK, //u6xxx
195 kRangeSetCJK, //u7xxx
196 kRangeSetCJK, //u8xxx
197 kRangeSetCJK, //u9xxx
198 kRangeTableBase+4, //uaxxx
199 kRangeKorean, //ubxxx
200 kRangeKorean, //ucxxx
201 kRangeTableBase+5, //udxxx
202 kRangePrivate, //uexxx
203 kRangeTableBase+6 //ufxxx
205 { //table for 0X--
206 kRangeSetLatin, //u00xx
207 kRangeSetLatin, //u01xx
208 kRangeSetLatin, //u02xx
209 kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
210 kRangeCyrillic, //u04xx
211 kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
212 kRangeArabic, //u06xx
213 kRangeTertiaryTable, //u07xx
214 kRangeUnassigned, //u08xx
215 kRangeTertiaryTable, //u09xx
216 kRangeTertiaryTable, //u0axx
217 kRangeTertiaryTable, //u0bxx
218 kRangeTertiaryTable, //u0cxx
219 kRangeTertiaryTable, //u0dxx
220 kRangeTertiaryTable, //u0exx
221 kRangeTibetan //u0fxx
223 { //table for 1x--
224 kRangeTertiaryTable, //u10xx
225 kRangeKorean, //u11xx
226 kRangeEthiopic, //u12xx
227 kRangeTertiaryTable, //u13xx
228 kRangeCanadian, //u14xx
229 kRangeCanadian, //u15xx
230 kRangeTertiaryTable, //u16xx
231 kRangeKhmer, //u17xx
232 kRangeMongolian, //u18xx
233 kRangeUnassigned, //u19xx
234 kRangeUnassigned, //u1axx
235 kRangeUnassigned, //u1bxx
236 kRangeUnassigned, //u1cxx
237 kRangeUnassigned, //u1dxx
238 kRangeSetLatin, //u1exx
239 kRangeGreek //u1fxx
241 { //table for 2x--
242 kRangeSetLatin, //u20xx
243 kRangeSetLatin, //u21xx
244 kRangeMathOperators, //u22xx
245 kRangeMiscTechnical, //u23xx
246 kRangeControlOpticalEnclose, //u24xx
247 kRangeBoxBlockGeometrics, //u25xx
248 kRangeMiscSymbols, //u26xx
249 kRangeDingbats, //u27xx
250 kRangeBraillePattern, //u28xx
251 kRangeUnassigned, //u29xx
252 kRangeUnassigned, //u2axx
253 kRangeUnassigned, //u2bxx
254 kRangeUnassigned, //u2cxx
255 kRangeUnassigned, //u2dxx
256 kRangeSetCJK, //u2exx
257 kRangeSetCJK //u2fxx
259 { //table for ax--
260 kRangeYi, //ua0xx
261 kRangeYi, //ua1xx
262 kRangeYi, //ua2xx
263 kRangeYi, //ua3xx
264 kRangeYi, //ua4xx
265 kRangeUnassigned, //ua5xx
266 kRangeUnassigned, //ua6xx
267 kRangeUnassigned, //ua7xx
268 kRangeUnassigned, //ua8xx
269 kRangeUnassigned, //ua9xx
270 kRangeUnassigned, //uaaxx
271 kRangeUnassigned, //uabxx
272 kRangeKorean, //uacxx
273 kRangeKorean, //uadxx
274 kRangeKorean, //uaexx
275 kRangeKorean //uafxx
277 { //table for dx--
278 kRangeKorean, //ud0xx
279 kRangeKorean, //ud1xx
280 kRangeKorean, //ud2xx
281 kRangeKorean, //ud3xx
282 kRangeKorean, //ud4xx
283 kRangeKorean, //ud5xx
284 kRangeKorean, //ud6xx
285 kRangeKorean, //ud7xx
286 kRangeSurrogate, //ud8xx
287 kRangeSurrogate, //ud9xx
288 kRangeSurrogate, //udaxx
289 kRangeSurrogate, //udbxx
290 kRangeSurrogate, //udcxx
291 kRangeSurrogate, //uddxx
292 kRangeSurrogate, //udexx
293 kRangeSurrogate //udfxx
295 { // table for fx--
296 kRangePrivate, //uf0xx
297 kRangePrivate, //uf1xx
298 kRangePrivate, //uf2xx
299 kRangePrivate, //uf3xx
300 kRangePrivate, //uf4xx
301 kRangePrivate, //uf5xx
302 kRangePrivate, //uf6xx
303 kRangePrivate, //uf7xx
304 kRangePrivate, //uf8xx
305 kRangeSetCJK, //uf9xx
306 kRangeSetCJK, //ufaxx
307 kRangeArabic, //ufbxx, includes alphabic presentation form
308 kRangeArabic, //ufcxx
309 kRangeArabic, //ufdxx
310 kRangeTableBase+8, //ufexx
311 kRangeTableBase+9 //uffxx, halfwidth and fullwidth forms, includes Specials
313 { //table for 0x0500 - 0x05ff
314 kRangeCyrillic, //u050x
315 kRangeCyrillic, //u051x
316 kRangeCyrillic, //u052x
317 kRangeArmenian, //u053x
318 kRangeArmenian, //u054x
319 kRangeArmenian, //u055x
320 kRangeArmenian, //u056x
321 kRangeArmenian, //u057x
322 kRangeArmenian, //u058x
323 kRangeHebrew, //u059x
324 kRangeHebrew, //u05ax
325 kRangeHebrew, //u05bx
326 kRangeHebrew, //u05cx
327 kRangeHebrew, //u05dx
328 kRangeHebrew, //u05ex
329 kRangeHebrew //u05fx
331 { //table for 0xfe00 - 0xfeff
332 kRangeSetCJK, //ufe0x
333 kRangeSetCJK, //ufe1x
334 kRangeSetCJK, //ufe2x
335 kRangeSetCJK, //ufe3x
336 kRangeSetCJK, //ufe4x
337 kRangeSetCJK, //ufe5x
338 kRangeSetCJK, //ufe6x
339 kRangeArabic, //ufe7x
340 kRangeArabic, //ufe8x
341 kRangeArabic, //ufe9x
342 kRangeArabic, //ufeax
343 kRangeArabic, //ufebx
344 kRangeArabic, //ufecx
345 kRangeArabic, //ufedx
346 kRangeArabic, //ufeex
347 kRangeArabic //ufefx
349 { //table for 0xff00 - 0xffff
350 kRangeSetCJK, //uff0x, fullwidth latin
351 kRangeSetCJK, //uff1x, fullwidth latin
352 kRangeSetCJK, //uff2x, fullwidth latin
353 kRangeSetCJK, //uff3x, fullwidth latin
354 kRangeSetCJK, //uff4x, fullwidth latin
355 kRangeSetCJK, //uff5x, fullwidth latin
356 kRangeSetCJK, //uff6x, halfwidth katakana
357 kRangeSetCJK, //uff7x, halfwidth katakana
358 kRangeSetCJK, //uff8x, halfwidth katakana
359 kRangeSetCJK, //uff9x, halfwidth katakana
360 kRangeSetCJK, //uffax, halfwidth hangul jamo
361 kRangeSetCJK, //uffbx, halfwidth hangul jamo
362 kRangeSetCJK, //uffcx, halfwidth hangul jamo
363 kRangeSetCJK, //uffdx, halfwidth hangul jamo
364 kRangeSetCJK, //uffex, fullwidth symbols
365 kRangeSpecials, //ufffx, Specials
369 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
370 // code points so that the number of entries in the tertiary range
371 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
372 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
373 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
374 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
376 static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
377 { //table for 0x0700 - 0x1600
378 kRangeSyriac, //u070x
379 kRangeThaana, //u078x
380 kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.)
381 kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.)
382 kRangeDevanagari, //u090x
383 kRangeBengali, //u098x
384 kRangeGurmukhi, //u0a0x
385 kRangeGujarati, //u0a8x
386 kRangeOriya, //u0b0x
387 kRangeTamil, //u0b8x
388 kRangeTelugu, //u0c0x
389 kRangeKannada, //u0c8x
390 kRangeMalayalam, //u0d0x
391 kRangeSinhala, //u0d8x
392 kRangeThai, //u0e0x
393 kRangeLao, //u0e8x
394 kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.)
395 kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.)
396 kRangeMyanmar, //u100x
397 kRangeGeorgian, //u108x
398 kRangeKorean, //u110x place holder(resolved in the 2ndary tab.)
399 kRangeKorean, //u118x place holder(resolved in the 2ndary tab.)
400 kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.)
401 kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.)
402 kRangeEthiopic, //u130x
403 kRangeCherokee, //u138x
404 kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.)
405 kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.)
406 kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.)
407 kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.)
408 kRangeCanadian, //u160x
409 kRangeOghamRunic //u168x this contains two scripts, Ogham & Runic
412 // A two level index is almost enough for locating a range, with the
413 // exception of u03xx and u05xx. Since we don't really care about range for
414 // combining diacritical marks in our font application, they are
415 // not discriminated further. But future adoption of this module for other use
416 // should be aware of this limitation. The implementation can be extended if
417 // there is such a need.
418 // For Indic, Southeast Asian scripts and some other scripts between
419 // U+0700 and U+16FF, it's extended to the third level.
420 uint32_t FindCharUnicodeRange(uint32_t ch)
422 uint32_t range;
424 // aggregate ranges for non-BMP codepoints
425 if (ch > 0xFFFF) {
426 uint32_t p = (ch >> 16);
427 if (p == 1) {
428 return kRangeSMP;
429 } else if (p == 2) {
430 return kRangeSetCJK;
432 return kRangeHigherPlanes;
435 // lookup explicit range for BMP codepoints
436 // first general range
437 range = gUnicodeSubrangeTable[0][ch >> 12];
439 // if general range is good enough, return that
440 if (range < kRangeTableBase)
441 // we try to get a specific range
442 return range;
444 // otherwise, use subrange tables
445 range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
446 if (range < kRangeTableBase)
447 return range;
448 if (range < kRangeTertiaryTable)
449 return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
451 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
452 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
455 nsIAtom *LangGroupFromUnicodeRange(uint8_t unicodeRange)
457 if (kRangeSpecificItemNum > unicodeRange) {
458 nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange];
459 return *atom;
461 return nullptr;