1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 #include "nsUnicodeRange.h"
42 // This table depends on unicode range definitions.
43 // Each item's index must correspond to its unicode range value,
44 // e.g. x-cyrillic = gUnicodeRangeToLangGroupAtomTable[kRangeCyrillic]
// Table of pointers to language-group atoms (members of gfxAtoms), looked up
// by unicode range value in LangGroupFromUnicodeRange.
// NOTE(review): only the x_cyrillic and x_devanagari entries are visible in
// this listing; confirm the full entry list and its ordering against the
// range constants declared in nsUnicodeRange.h.
45 static nsIAtom
**gUnicodeRangeToLangGroupAtomTable
[] =
47 &gfxAtoms::x_cyrillic
,
58 &gfxAtoms::x_devanagari
,
76 /**********************************************************************
77 * Unicode subranges as defined in unicode 3.0
78 * x-western, x-central-euro, tr, x-baltic -> latin
81 * 2000 - 206f (general punctuation)
82 * 20a0 - 20cf (currency symbols)
83 * 2100 - 214f (letterlike symbols)
84 * 2150 - 218f (Number Forms)
88 * x-cyrillic -> cyrillic
94 * fb50 - fdff (arabic presentation forms)
95 * fe70 - feff (arabic presentation forms b)
99 * ac00 - d7af (hangul Syllables)
101 * 3130 - 318f (hangul compatibility jamo)
103 * 3040 - 309f (hiragana)
104 * 30a0 - 30ff (katakana)
109 * 3100 - 312f (bopomofo)
110 * 31a0 - 31bf (bopomofo extended)
111 * 3000 - 303f (CJK Symbols and Punctuation)
112 * 2e80 - 2eff (CJK radicals supplement)
113 * 2f00 - 2fdf (Kangxi Radicals)
114 * 2ff0 - 2fff (Ideographic Description Characters)
115 * 3190 - 319f (kanbun)
116 * 3200 - 32ff (Enclosed CJK letters and Months)
117 * 3300 - 33ff (CJK compatibility)
118 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
119 * 4e00 - 9faf (CJK Unified Ideographs)
120 * f900 - fa5f (CJK Compatibility Ideographs)
121 * fe30 - fe4f (CJK compatibility Forms)
122 * ff00 - ffef (halfwidth and fullwidth forms)
162 * Canadian Aboriginal Syllabics
172 * Misc - superscripts and subscripts
174 * Misc - Combining Diacritical Marks for Symbols
178 * Misc - Mathematical Operators
180 * Misc - Miscellaneous Technical
182 * Misc - Control Pictures
184 * Misc - Optical character recognition
186 * Misc - Enclosed Alphanumerics
190 * Misc - Block Elements
192 * Misc - Geometric Shapes
194 * Misc - Miscellaneous Symbols
198 * Misc - Braille Patterns
204 * Alphabetic Presentation Forms
206 * Misc - Combining half Marks
208 * Misc - small form variants
212 *********************************************************************/
// Dimensions of gUnicodeSubrangeTable: NUM_OF_SUBTABLES rows, each holding
// SUBTABLE_SIZE entries (one entry per value of a 4-bit nibble).
216 #define NUM_OF_SUBTABLES 9
217 #define SUBTABLE_SIZE 16
// Lookup rows consulted by FindCharUnicodeRange.
//  - Row 0 is indexed by (ch >> 12).
//  - An entry of the form kRangeTableBase+N redirects the lookup to row N of
//    this same table, indexed by the next lower nibble of ch.
//  - An entry of kRangeTertiaryTable redirects the lookup to
//    gUnicodeTertiaryRangeTable.
//  - Any other entry is the range value itself.
// NOTE(review): in this listing some rows show fewer than SUBTABLE_SIZE
// entries and the row braces are not visible; confirm against the full table.
219 static const PRUint8 gUnicodeSubrangeTable
[NUM_OF_SUBTABLES
][SUBTABLE_SIZE
] =
// Row 0: indexed by (ch >> 12), covering u0xxx - ufxxx.
222 kRangeTableBase
+1, //u0xxx
223 kRangeTableBase
+2, //u1xxx
224 kRangeTableBase
+3, //u2xxx
225 kRangeSetCJK
, //u3xxx
226 kRangeSetCJK
, //u4xxx
227 kRangeSetCJK
, //u5xxx
228 kRangeSetCJK
, //u6xxx
229 kRangeSetCJK
, //u7xxx
230 kRangeSetCJK
, //u8xxx
231 kRangeSetCJK
, //u9xxx
232 kRangeTableBase
+4, //uaxxx
233 kRangeKorean
, //ubxxx
234 kRangeKorean
, //ucxxx
235 kRangeTableBase
+5, //udxxx
236 kRangePrivate
, //uexxx
237 kRangeTableBase
+6 //ufxxx
// Row 1 (kRangeTableBase+1): u0xxx, indexed by (ch & 0x0f00) >> 8.
240 kRangeSetLatin
, //u00xx
241 kRangeSetLatin
, //u01xx
242 kRangeSetLatin
, //u02xx
243 kRangeGreek
, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
244 kRangeCyrillic
, //u04xx
245 kRangeTableBase
+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
246 kRangeArabic
, //u06xx
247 kRangeTertiaryTable
, //u07xx
248 kRangeUnassigned
, //u08xx
249 kRangeTertiaryTable
, //u09xx
250 kRangeTertiaryTable
, //u0axx
251 kRangeTertiaryTable
, //u0bxx
252 kRangeTertiaryTable
, //u0cxx
253 kRangeTertiaryTable
, //u0dxx
254 kRangeTertiaryTable
, //u0exx
255 kRangeTibetan
, //u0fxx
// Row 2 (kRangeTableBase+2): u1xxx, indexed by (ch & 0x0f00) >> 8.
258 kRangeTertiaryTable
, //u10xx
259 kRangeKorean
, //u11xx
260 kRangeEthiopic
, //u12xx
261 kRangeTertiaryTable
, //u13xx
262 kRangeCanadian
, //u14xx
263 kRangeCanadian
, //u15xx
264 kRangeTertiaryTable
, //u16xx
266 kRangeMongolian
, //u18xx
267 kRangeUnassigned
, //u19xx
268 kRangeUnassigned
, //u1axx
269 kRangeUnassigned
, //u1bxx
270 kRangeUnassigned
, //u1cxx
271 kRangeUnassigned
, //u1dxx
272 kRangeSetLatin
, //u1exx
// Row 3 (kRangeTableBase+3): u2xxx, indexed by (ch & 0x0f00) >> 8.
276 kRangeSetLatin
, //u20xx
277 kRangeSetLatin
, //u21xx
278 kRangeMathOperators
, //u22xx
279 kRangeMiscTechnical
, //u23xx
280 kRangeControlOpticalEnclose
, //u24xx
281 kRangeBoxBlockGeometrics
, //u25xx
282 kRangeMiscSymbols
, //u26xx
283 kRangeDingbats
, //u27xx
284 kRangeBraillePattern
, //u28xx
285 kRangeUnassigned
, //u29xx
286 kRangeUnassigned
, //u2axx
287 kRangeUnassigned
, //u2bxx
288 kRangeUnassigned
, //u2cxx
289 kRangeUnassigned
, //u2dxx
290 kRangeSetCJK
, //u2exx
291 kRangeSetCJK
, //u2fxx
// Row 4 (kRangeTableBase+4): uaxxx, indexed by (ch & 0x0f00) >> 8.
299 kRangeUnassigned
, //ua5xx
300 kRangeUnassigned
, //ua6xx
301 kRangeUnassigned
, //ua7xx
302 kRangeUnassigned
, //ua8xx
303 kRangeUnassigned
, //ua9xx
304 kRangeUnassigned
, //uaaxx
305 kRangeUnassigned
, //uabxx
306 kRangeKorean
, //uacxx
307 kRangeKorean
, //uadxx
308 kRangeKorean
, //uaexx
309 kRangeKorean
, //uafxx
// Row 5 (kRangeTableBase+5): udxxx, indexed by (ch & 0x0f00) >> 8.
312 kRangeKorean
, //ud0xx
313 kRangeKorean
, //ud1xx
314 kRangeKorean
, //ud2xx
315 kRangeKorean
, //ud3xx
316 kRangeKorean
, //ud4xx
317 kRangeKorean
, //ud5xx
318 kRangeKorean
, //ud6xx
319 kRangeKorean
, //ud7xx
320 kRangeSurrogate
, //ud8xx
321 kRangeSurrogate
, //ud9xx
322 kRangeSurrogate
, //udaxx
323 kRangeSurrogate
, //udbxx
324 kRangeSurrogate
, //udcxx
325 kRangeSurrogate
, //uddxx
326 kRangeSurrogate
, //udexx
327 kRangeSurrogate
, //udfxx
// Row 6 (kRangeTableBase+6): ufxxx, indexed by (ch & 0x0f00) >> 8.
330 kRangePrivate
, //uf0xx
331 kRangePrivate
, //uf1xx
332 kRangePrivate
, //uf2xx
333 kRangePrivate
, //uf3xx
334 kRangePrivate
, //uf4xx
335 kRangePrivate
, //uf5xx
336 kRangePrivate
, //uf6xx
337 kRangePrivate
, //uf7xx
338 kRangePrivate
, //uf8xx
339 kRangeSetCJK
, //uf9xx
340 kRangeSetCJK
, //ufaxx
341 kRangeArabic
, //ufbxx, includes alphabetic presentation forms
342 kRangeArabic
, //ufcxx
343 kRangeArabic
, //ufdxx
344 kRangeArabic
, //ufexx, includes Combining half marks,
345 // CJK compatibility forms,
346 // CJK compatibility forms,
347 // small form variants
348 kRangeTableBase
+8, //uffxx, halfwidth and fullwidth forms, includes Specials
// Row 7 (kRangeTableBase+7): third-level row, indexed by (ch & 0x00f0) >> 4.
350 { //table for 0x0500 - 0x05ff
351 kRangeCyrillic
, //u050x
352 kRangeCyrillic
, //u051x
353 kRangeCyrillic
, //u052x
354 kRangeArmenian
, //u053x
355 kRangeArmenian
, //u054x
356 kRangeArmenian
, //u055x
357 kRangeArmenian
, //u056x
358 kRangeArmenian
, //u057x
359 kRangeArmenian
, //u058x
360 kRangeHebrew
, //u059x
361 kRangeHebrew
, //u05ax
362 kRangeHebrew
, //u05bx
363 kRangeHebrew
, //u05cx
364 kRangeHebrew
, //u05dx
365 kRangeHebrew
, //u05ex
366 kRangeHebrew
, //u05fx
// Row 8 (kRangeTableBase+8): third-level row, indexed by (ch & 0x00f0) >> 4.
368 { //table for 0xff00 - 0xffff
369 kRangeSetCJK
, //uff0x, fullwidth latin
370 kRangeSetCJK
, //uff1x, fullwidth latin
371 kRangeSetCJK
, //uff2x, fullwidth latin
372 kRangeSetCJK
, //uff3x, fullwidth latin
373 kRangeSetCJK
, //uff4x, fullwidth latin
374 kRangeSetCJK
, //uff5x, fullwidth latin
375 kRangeSetCJK
, //uff6x, halfwidth katakana
376 kRangeSetCJK
, //uff7x, halfwidth katakana
377 kRangeSetCJK
, //uff8x, halfwidth katakana
378 kRangeSetCJK
, //uff9x, halfwidth katakana
379 kRangeSetCJK
, //uffax, halfwidth hangul jamo
380 kRangeSetCJK
, //uffbx, halfwidth hangul jamo
381 kRangeSetCJK
, //uffcx, halfwidth hangul jamo
382 kRangeSetCJK
, //uffdx, halfwidth hangul jamo
383 kRangeSetCJK
, //uffex, fullwidth symbols
384 kRangeSpecials
, //ufffx, Specials
388 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
389 // code points so that the number of entries in the tertiary range
390 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
391 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
392 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
// Number of 128-code-point (0x80) chunks between U+0700 and U+16FF.
393 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
// One entry per 128-code-point chunk; FindCharUnicodeRange indexes this table
// with (ch - 0x0700) >> 7 when the second-level lookup does not resolve to a
// range or to a further gUnicodeSubrangeTable row.
// Entries marked "place holder" belong to chunks whose second-level entry in
// gUnicodeSubrangeTable is already a concrete range.
// NOTE(review): this listing shows fewer than TERTIARY_TABLE_SIZE entries
// (e.g. no u0b0x/u0b8x or u0e0x/u0e8x lines); confirm against the full table.
395 static const PRUint8 gUnicodeTertiaryRangeTable
[TERTIARY_TABLE_SIZE
] =
396 { //table for 0x0700 - 0x16ff
397 kRangeSyriac
, //u070x
398 kRangeThaana
, //u078x
399 kRangeUnassigned
, //u080x place holder(resolved in the 2ndary tab.)
400 kRangeUnassigned
, //u088x place holder(resolved in the 2ndary tab.)
401 kRangeDevanagari
, //u090x
402 kRangeBengali
, //u098x
403 kRangeGurmukhi
, //u0a0x
404 kRangeGujarati
, //u0a8x
407 kRangeTelugu
, //u0c0x
408 kRangeKannada
, //u0c8x
409 kRangeMalayalam
, //u0d0x
410 kRangeSinhala
, //u0d8x
413 kRangeTibetan
, //u0f0x place holder(resolved in the 2ndary tab.)
414 kRangeTibetan
, //u0f8x place holder(resolved in the 2ndary tab.)
415 kRangeMyanmar
, //u100x
416 kRangeGeorgian
, //u108x
417 kRangeKorean
, //u110x place holder(resolved in the 2ndary tab.)
418 kRangeKorean
, //u118x place holder(resolved in the 2ndary tab.)
419 kRangeEthiopic
, //u120x place holder(resolved in the 2ndary tab.)
420 kRangeEthiopic
, //u128x place holder(resolved in the 2ndary tab.)
421 kRangeEthiopic
, //u130x
422 kRangeCherokee
, //u138x
423 kRangeCanadian
, //u140x place holder(resolved in the 2ndary tab.)
424 kRangeCanadian
, //u148x place holder(resolved in the 2ndary tab.)
425 kRangeCanadian
, //u150x place holder(resolved in the 2ndary tab.)
426 kRangeCanadian
, //u158x place holder(resolved in the 2ndary tab.)
427 kRangeCanadian
, //u160x
428 kRangeOghamRunic
, //u168x this contains two scripts, Ogham & Runic
431 // A two level index is almost enough for locating a range, with the
432 // exception of u03xx and u05xx. Since we don't really care about range for
433 // combining diacritical marks in our font application, they are
434 // not discriminated further. But future adoption of this module for other use
435 // should be aware of this limitation. The implementation can be extended if
436 // there is such a need.
437 // For Indic, Southeast Asian scripts and some other scripts between
438 // U+0700 and U+16FF, it's extended to the third level.
// Maps the character ch to a unicode range value by successive table lookups:
//   1. gUnicodeSubrangeTable[0], indexed by (ch >> 12);
//   2. row (range - kRangeTableBase) of the same table, indexed by
//      (ch & 0x0f00) >> 8;
//   3. if that result is below kRangeTertiaryTable, row
//      (range - kRangeTableBase) indexed by (ch & 0x00f0) >> 4; otherwise
//      gUnicodeTertiaryRangeTable, indexed by (ch - 0x0700) >> 7.
// NOTE(review): the statements guarded by the two `range < kRangeTableBase`
// tests are not visible in this listing; presumably each returns range as the
// final answer -- confirm against the full source.
// NOTE(review): assumes ch is at most 16 bits wide so that (ch >> 12) stays
// below SUBTABLE_SIZE -- confirm the width of PRUnichar.
439 PRUint32
FindCharUnicodeRange(PRUnichar ch
)
443 //search the first table
444 range
= gUnicodeSubrangeTable
[0][ch
>> 12];
446 if (range
< kRangeTableBase
)
447 // we try to get a specific range
450 // otherwise, we have one more table to look at
451 range
= gUnicodeSubrangeTable
[range
- kRangeTableBase
][(ch
& 0x0f00) >> 8];
452 if (range
< kRangeTableBase
)
// Values from kRangeTableBase up to (not including) kRangeTertiaryTable name
// a third-level row of gUnicodeSubrangeTable, indexed by bits 4-7 of ch.
454 if (range
< kRangeTertiaryTable
)
455 return gUnicodeSubrangeTable
[range
- kRangeTableBase
][(ch
& 0x00f0) >> 4];
457 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
458 return gUnicodeTertiaryRangeTable
[(ch
- 0x0700) >> 7];
461 nsIAtom
*LangGroupFromUnicodeRange(PRUint8 unicodeRange
)
463 if (kRangeSpecificItemNum
> unicodeRange
) {
464 nsIAtom
**atom
= gUnicodeRangeToLangGroupAtomTable
[unicodeRange
];