1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsUnicodeRange.h"
8 #include "mozilla/NullPtr.h"
10 // This table depends on the Unicode range definitions.
11 // Each item's index must correspond to its Unicode range value,
12 // e.g. x-cyrillic = LangGroupTable[kRangeCyrillic]
// Each entry is the address of an nsGkAtoms atom pointer. The table is read
// by LangGroupFromUnicodeRange(), which indexes it with the range value after
// checking that the value is below kRangeSpecificItemNum.
// NOTE(review): only the x_cyrillic and x_devanagari initializers (and no
// brace delimiters) are visible here; confirm that the complete list has one
// entry per range value below kRangeSpecificItemNum, in the same order as the
// range constants (presumably declared in nsUnicodeRange.h).
13 static nsIAtom
**gUnicodeRangeToLangGroupAtomTable
[] =
15 &nsGkAtoms::x_cyrillic
,
24 &nsGkAtoms::x_devanagari
,
42 /**********************************************************************
43 * Unicode subranges as defined in unicode 3.0
47 * 2000 - 206f (general punctuation)
48 * 20a0 - 20cf (currency symbols)
49 * 2100 - 214f (letterlike symbols)
50 * 2150 - 218f (Number Forms)
54 * x-cyrillic -> cyrillic
60 * fb50 - fdff (arabic presentation forms)
61 * fe70 - feff (arabic presentation forms b)
65 * ac00 - d7af (hangul Syllables)
67 * 3130 - 318f (hangul compatibility jamo)
69 * 3040 - 309f (hiragana)
70 * 30a0 - 30ff (katakana)
75 * 3100 - 312f (bopomofo)
76 * 31a0 - 31bf (bopomofo extended)
77 * 3000 - 303f (CJK Symbols and Punctuation)
78 * 2e80 - 2eff (CJK radicals supplement)
79 * 2f00 - 2fdf (Kangxi Radicals)
80 * 2ff0 - 2fff (Ideographic Description Characters)
81 * 3190 - 319f (kanbun)
82 * 3200 - 32ff (Enclosed CJK letters and Months)
83 * 3300 - 33ff (CJK compatibility)
84 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
85 * 4e00 - 9faf (CJK Unified Ideographs)
86 * f900 - fa5f (CJK Compatibility Ideographs)
87 * fe30 - fe4f (CJK compatibility Forms)
88 * ff00 - ffef (halfwidth and fullwidth forms)
128 * Canadian Aboriginal Syllabics
138 * Misc - superscripts and subscripts
140 * Misc - Combining Diacritical Marks for Symbols
144 * Misc - Mathematical Operators
146 * Misc - Miscellaneous Technical
148 * Misc - Control Pictures
150 * Misc - Optical Character Recognition
152 * Misc - Enclosed Alphanumerics
156 * Misc - Block Elements
158 * Misc - Geometric Shapes
160 * Misc - Miscellaneous Symbols
164 * Misc - Braille Patterns
170 * Alphabetic Presentation Forms
172 * Misc - Combining half Marks
174 * Misc - small form variants
178 *********************************************************************/
// Dimensions of gUnicodeSubrangeTable: NUM_OF_SUBTABLES sub-tables of
// SUBTABLE_SIZE entries each. FindCharUnicodeRange() selects an entry with one
// 4-bit digit of the code point (e.g. (ch & 0x0f00) >> 8 or
// (ch & 0x00f0) >> 4), which is why a sub-table has 16 entries.
182 #define NUM_OF_SUBTABLES 10
183 #define SUBTABLE_SIZE 16
// Multi-level lookup table used by FindCharUnicodeRange().
//
// Sub-table 0 is indexed by the top hex digit of a BMP code point (ch >> 12).
// An entry below kRangeTableBase is a final range value. An entry of the form
// kRangeTableBase+N redirects to sub-table N, which is indexed by the next hex
// digit of the code point. kRangeTertiaryTable entries redirect to
// gUnicodeTertiaryRangeTable instead.
//
// Lookups are purely positional, so every sub-table needs exactly
// SUBTABLE_SIZE entries in code point order.
// NOTE(review): the brace delimiters and several rows (u17xx, u1fxx, u2fxx,
// ua0xx-ua4xx, uafxx, u05fx, ufefx) are not present in this text; confirm
// that each sub-table is complete before relying on it.
185 static const uint8_t gUnicodeSubrangeTable
[NUM_OF_SUBTABLES
][SUBTABLE_SIZE
] =
// sub-table 0: first level, one entry per 0x1000 code points
188 kRangeTableBase
+1, //u0xxx
189 kRangeTableBase
+2, //u1xxx
190 kRangeTableBase
+3, //u2xxx
191 kRangeSetCJK
, //u3xxx
192 kRangeSetCJK
, //u4xxx
193 kRangeSetCJK
, //u5xxx
194 kRangeSetCJK
, //u6xxx
195 kRangeSetCJK
, //u7xxx
196 kRangeSetCJK
, //u8xxx
197 kRangeSetCJK
, //u9xxx
198 kRangeTableBase
+4, //uaxxx
199 kRangeKorean
, //ubxxx
200 kRangeKorean
, //ucxxx
201 kRangeTableBase
+5, //udxxx
202 kRangePrivate
, //uexxx
203 kRangeTableBase
+6 //ufxxx
// sub-table 1 (kRangeTableBase+1): U+0000 - U+0FFF, one entry per 0x100
206 kRangeSetLatin
, //u00xx
207 kRangeSetLatin
, //u01xx
208 kRangeSetLatin
, //u02xx
209 kRangeGreek
, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
210 kRangeCyrillic
, //u04xx
211 kRangeTableBase
+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
212 kRangeArabic
, //u06xx
213 kRangeTertiaryTable
, //u07xx
214 kRangeUnassigned
, //u08xx
215 kRangeTertiaryTable
, //u09xx
216 kRangeTertiaryTable
, //u0axx
217 kRangeTertiaryTable
, //u0bxx
218 kRangeTertiaryTable
, //u0cxx
219 kRangeTertiaryTable
, //u0dxx
220 kRangeTertiaryTable
, //u0exx
221 kRangeTibetan
//u0fxx
// sub-table 2 (kRangeTableBase+2): U+1000 - U+1FFF, one entry per 0x100
224 kRangeTertiaryTable
, //u10xx
225 kRangeKorean
, //u11xx
226 kRangeEthiopic
, //u12xx
227 kRangeTertiaryTable
, //u13xx
228 kRangeCanadian
, //u14xx
229 kRangeCanadian
, //u15xx
230 kRangeTertiaryTable
, //u16xx
232 kRangeMongolian
, //u18xx
233 kRangeUnassigned
, //u19xx
234 kRangeUnassigned
, //u1axx
235 kRangeUnassigned
, //u1bxx
236 kRangeUnassigned
, //u1cxx
237 kRangeUnassigned
, //u1dxx
238 kRangeSetLatin
, //u1exx
// sub-table 3 (kRangeTableBase+3): U+2000 - U+2FFF, one entry per 0x100
242 kRangeSetLatin
, //u20xx
243 kRangeSetLatin
, //u21xx
244 kRangeMathOperators
, //u22xx
245 kRangeMiscTechnical
, //u23xx
246 kRangeControlOpticalEnclose
, //u24xx
247 kRangeBoxBlockGeometrics
, //u25xx
248 kRangeMiscSymbols
, //u26xx
249 kRangeDingbats
, //u27xx
250 kRangeBraillePattern
, //u28xx
251 kRangeUnassigned
, //u29xx
252 kRangeUnassigned
, //u2axx
253 kRangeUnassigned
, //u2bxx
254 kRangeUnassigned
, //u2cxx
255 kRangeUnassigned
, //u2dxx
256 kRangeSetCJK
, //u2exx
// sub-table 4 (kRangeTableBase+4): U+A000 - U+AFFF, one entry per 0x100
265 kRangeUnassigned
, //ua5xx
266 kRangeUnassigned
, //ua6xx
267 kRangeUnassigned
, //ua7xx
268 kRangeUnassigned
, //ua8xx
269 kRangeUnassigned
, //ua9xx
270 kRangeUnassigned
, //uaaxx
271 kRangeUnassigned
, //uabxx
272 kRangeKorean
, //uacxx
273 kRangeKorean
, //uadxx
274 kRangeKorean
, //uaexx
// sub-table 5 (kRangeTableBase+5): U+D000 - U+DFFF, one entry per 0x100
278 kRangeKorean
, //ud0xx
279 kRangeKorean
, //ud1xx
280 kRangeKorean
, //ud2xx
281 kRangeKorean
, //ud3xx
282 kRangeKorean
, //ud4xx
283 kRangeKorean
, //ud5xx
284 kRangeKorean
, //ud6xx
285 kRangeKorean
, //ud7xx
286 kRangeSurrogate
, //ud8xx
287 kRangeSurrogate
, //ud9xx
288 kRangeSurrogate
, //udaxx
289 kRangeSurrogate
, //udbxx
290 kRangeSurrogate
, //udcxx
291 kRangeSurrogate
, //uddxx
292 kRangeSurrogate
, //udexx
293 kRangeSurrogate
//udfxx
// sub-table 6 (kRangeTableBase+6): U+F000 - U+FFFF, one entry per 0x100
296 kRangePrivate
, //uf0xx
297 kRangePrivate
, //uf1xx
298 kRangePrivate
, //uf2xx
299 kRangePrivate
, //uf3xx
300 kRangePrivate
, //uf4xx
301 kRangePrivate
, //uf5xx
302 kRangePrivate
, //uf6xx
303 kRangePrivate
, //uf7xx
304 kRangePrivate
, //uf8xx
305 kRangeSetCJK
, //uf9xx
306 kRangeSetCJK
, //ufaxx
307 kRangeArabic
, //ufbxx, includes alphabetic presentation forms
308 kRangeArabic
, //ufcxx
309 kRangeArabic
, //ufdxx
310 kRangeTableBase
+8, //ufexx
311 kRangeTableBase
+9 //uffxx, halfwidth and fullwidth forms, includes Specials
// sub-table 7 (kRangeTableBase+7): third level, one entry per 0x10
313 { //table for 0x0500 - 0x05ff
314 kRangeCyrillic
, //u050x
315 kRangeCyrillic
, //u051x
316 kRangeCyrillic
, //u052x
317 kRangeArmenian
, //u053x
318 kRangeArmenian
, //u054x
319 kRangeArmenian
, //u055x
320 kRangeArmenian
, //u056x
321 kRangeArmenian
, //u057x
322 kRangeArmenian
, //u058x
323 kRangeHebrew
, //u059x
324 kRangeHebrew
, //u05ax
325 kRangeHebrew
, //u05bx
326 kRangeHebrew
, //u05cx
327 kRangeHebrew
, //u05dx
328 kRangeHebrew
, //u05ex
// sub-table 8 (kRangeTableBase+8): third level, one entry per 0x10
331 { //table for 0xfe00 - 0xfeff
332 kRangeSetCJK
, //ufe0x
333 kRangeSetCJK
, //ufe1x
334 kRangeSetCJK
, //ufe2x
335 kRangeSetCJK
, //ufe3x
336 kRangeSetCJK
, //ufe4x
337 kRangeSetCJK
, //ufe5x
338 kRangeSetCJK
, //ufe6x
339 kRangeArabic
, //ufe7x
340 kRangeArabic
, //ufe8x
341 kRangeArabic
, //ufe9x
342 kRangeArabic
, //ufeax
343 kRangeArabic
, //ufebx
344 kRangeArabic
, //ufecx
345 kRangeArabic
, //ufedx
346 kRangeArabic
, //ufeex
// sub-table 9 (kRangeTableBase+9): third level, one entry per 0x10
349 { //table for 0xff00 - 0xffff
350 kRangeSetCJK
, //uff0x, fullwidth latin
351 kRangeSetCJK
, //uff1x, fullwidth latin
352 kRangeSetCJK
, //uff2x, fullwidth latin
353 kRangeSetCJK
, //uff3x, fullwidth latin
354 kRangeSetCJK
, //uff4x, fullwidth latin
355 kRangeSetCJK
, //uff5x, fullwidth latin
356 kRangeSetCJK
, //uff6x, halfwidth katakana
357 kRangeSetCJK
, //uff7x, halfwidth katakana
358 kRangeSetCJK
, //uff8x, halfwidth katakana
359 kRangeSetCJK
, //uff9x, halfwidth katakana
360 kRangeSetCJK
, //uffax, halfwidth hangul jamo
361 kRangeSetCJK
, //uffbx, halfwidth hangul jamo
362 kRangeSetCJK
, //uffcx, halfwidth hangul jamo
363 kRangeSetCJK
, //uffdx, halfwidth hangul jamo
364 kRangeSetCJK
, //uffex, fullwidth symbols
365 kRangeSpecials
, //ufffx, Specials
369 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
370 // code points, so the number of entries in the tertiary range table is
371 // (0x1700 - 0x0700) / 128 = 32.
372 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian Aboriginal
373 // Syllabics take multiple chunks, while Ogham and Runic share a single chunk.
374 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
// Third-level table for U+0700 - U+16FF, one entry per 128 code points.
// FindCharUnicodeRange() indexes it with (ch - 0x0700) >> 7 when the
// second-level entry is kRangeTertiaryTable.
//
// Rows marked "place holder" cover 0x100 blocks whose second-level entry in
// gUnicodeSubrangeTable is already a final range (e.g. u08xx, u0fxx, u11xx),
// so they only keep the positional indexing aligned.
// NOTE(review): 28 initializers are visible here but TERTIARY_TABLE_SIZE is
// 32; rows for u0b0x/u0b8x and u0e0x/u0e8x appear to be absent. Because the
// index is positional, confirm the full table before relying on it.
376 static const uint8_t gUnicodeTertiaryRangeTable
[TERTIARY_TABLE_SIZE
] =
377 { //table for 0x0700 - 0x16ff
378 kRangeSyriac
, //u070x
379 kRangeThaana
, //u078x
380 kRangeUnassigned
, //u080x place holder(resolved in the 2ndary tab.)
381 kRangeUnassigned
, //u088x place holder(resolved in the 2ndary tab.)
382 kRangeDevanagari
, //u090x
383 kRangeBengali
, //u098x
384 kRangeGurmukhi
, //u0a0x
385 kRangeGujarati
, //u0a8x
388 kRangeTelugu
, //u0c0x
389 kRangeKannada
, //u0c8x
390 kRangeMalayalam
, //u0d0x
391 kRangeSinhala
, //u0d8x
394 kRangeTibetan
, //u0f0x place holder(resolved in the 2ndary tab.)
395 kRangeTibetan
, //u0f8x place holder(resolved in the 2ndary tab.)
396 kRangeMyanmar
, //u100x
397 kRangeGeorgian
, //u108x
398 kRangeKorean
, //u110x place holder(resolved in the 2ndary tab.)
399 kRangeKorean
, //u118x place holder(resolved in the 2ndary tab.)
400 kRangeEthiopic
, //u120x place holder(resolved in the 2ndary tab.)
401 kRangeEthiopic
, //u128x place holder(resolved in the 2ndary tab.)
402 kRangeEthiopic
, //u130x
403 kRangeCherokee
, //u138x
404 kRangeCanadian
, //u140x place holder(resolved in the 2ndary tab.)
405 kRangeCanadian
, //u148x place holder(resolved in the 2ndary tab.)
406 kRangeCanadian
, //u150x place holder(resolved in the 2ndary tab.)
407 kRangeCanadian
, //u158x place holder(resolved in the 2ndary tab.)
408 kRangeCanadian
, //u160x
409 kRangeOghamRunic
//u168x this contains two scripts, Ogham & Runic
412 // A two-level index is almost enough for locating a range, with the
413 // exception of u03xx and u05xx. Since we don't really care about the range for
414 // combining diacritical marks in our font application, they are
415 // not discriminated further. Future adoption of this module for other uses
416 // should be aware of this limitation. The implementation can be extended if
417 // there is such a need.
418 // For Indic, Southeast Asian scripts and some other scripts between
419 // U+0700 and U+16FF, it's extended to the third level.
//
// Maps a code point to a range value (one of the kRange* constants).
//
// Lookup order for BMP code points, as visible below:
//   1. gUnicodeSubrangeTable[0], indexed by ch >> 12.
//   2. If that entry is >= kRangeTableBase, the sub-table it names
//      (entry - kRangeTableBase), indexed by (ch & 0x0f00) >> 8.
//   3. If the second entry is still >= kRangeTableBase: below
//      kRangeTertiaryTable it names another sub-table, indexed by
//      (ch & 0x00f0) >> 4; otherwise gUnicodeTertiaryRangeTable is used,
//      indexed by (ch - 0x0700) >> 7.
//
// NOTE(review): the declaration of `range`, the non-BMP condition and the
// statements controlled by the two `range < kRangeTableBase` checks are not
// present in this text; presumably those checks return `range` -- confirm.
//
// @param ch  the code point to classify
// @return    the range value found for ch
420 uint32_t FindCharUnicodeRange(uint32_t ch
)
424 // aggregate ranges for non-BMP codepoints
// p is the plane number of ch (the bits above the low 16)
426 uint32_t p
= (ch
>> 16);
432 return kRangeHigherPlanes
;
435 // lookup explicit range for BMP codepoints
436 // first general range
437 range
= gUnicodeSubrangeTable
[0][ch
>> 12];
439 // if general range is good enough, return that
440 if (range
< kRangeTableBase
)
441 // we try to get a specific range
444 // otherwise, use subrange tables
// range - kRangeTableBase is the index of the sub-table to consult
445 range
= gUnicodeSubrangeTable
[range
- kRangeTableBase
][(ch
& 0x0f00) >> 8];
446 if (range
< kRangeTableBase
)
// values in [kRangeTableBase, kRangeTertiaryTable) name a third-level
// sub-table indexed by the next hex digit of ch
448 if (range
< kRangeTertiaryTable
)
449 return gUnicodeSubrangeTable
[range
- kRangeTableBase
][(ch
& 0x00f0) >> 4];
451 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
452 return gUnicodeTertiaryRangeTable
[(ch
- 0x0700) >> 7];
455 nsIAtom
*LangGroupFromUnicodeRange(uint8_t unicodeRange
)
457 if (kRangeSpecificItemNum
> unicodeRange
) {
458 nsIAtom
**atom
= gUnicodeRangeToLangGroupAtomTable
[unicodeRange
];