Backed out changeset b88172246b66 due to Win32 debug failures.
[mozilla-central.git] / gfx / thebes / nsUnicodeRange.cpp
blob78c5898e23d14361e63e652dc36af1a2752ecaa5
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is mozilla.org code.
17 * The Initial Developer of the Original Code is
18 * Netscape Communications Corporation.
19 * Portions created by the Initial Developer are Copyright (C) 1998
20 * the Initial Developer. All Rights Reserved.
22 * Contributor(s):
24 * Alternatively, the contents of this file may be used under the terms of
25 * either of the GNU General Public License Version 2 or later (the "GPL"),
26 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
36 * ***** END LICENSE BLOCK ***** */
38 #include "nsUnicodeRange.h"
39 #include "nsIAtom.h"
40 #include "gfxAtoms.h"
42 // This table depends on unicode range definitions.
43 // Each item's index must correspond unicode range value
44 // eg. x-cyrillic = LangGroupTable[kRangeCyrillic]
45 static nsIAtom **gUnicodeRangeToLangGroupAtomTable[] =
47 &gfxAtoms::x_cyrillic,
48 &gfxAtoms::el,
49 &gfxAtoms::tr,
50 &gfxAtoms::he,
51 &gfxAtoms::ar,
52 &gfxAtoms::x_baltic,
53 &gfxAtoms::th,
54 &gfxAtoms::ko,
55 &gfxAtoms::ja,
56 &gfxAtoms::zh_cn,
57 &gfxAtoms::zh_tw,
58 &gfxAtoms::x_devanagari,
59 &gfxAtoms::x_tamil,
60 &gfxAtoms::x_armn,
61 &gfxAtoms::x_beng,
62 &gfxAtoms::x_cans,
63 &gfxAtoms::x_ethi,
64 &gfxAtoms::x_geor,
65 &gfxAtoms::x_gujr,
66 &gfxAtoms::x_guru,
67 &gfxAtoms::x_khmr,
68 &gfxAtoms::x_mlym,
69 &gfxAtoms::x_orya,
70 &gfxAtoms::x_telu,
71 &gfxAtoms::x_knda,
72 &gfxAtoms::x_sinh,
73 &gfxAtoms::x_tibt
76 /**********************************************************************
77 * Unicode subranges as defined in unicode 3.0
78 * x-western, x-central-euro, tr, x-baltic -> latin
79 * 0000 - 036f
80 * 1e00 - 1eff
81 * 2000 - 206f (general punctuation)
82 * 20a0 - 20cf (currency symbols)
83 * 2100 - 214f (letterlike symbols)
84 * 2150 - 218f (Number Forms)
85 * el -> greek
86 * 0370 - 03ff
87 * 1f00 - 1fff
88 * x-cyrillic -> cyrillic
89 * 0400 - 04ff
90 * he -> hebrew
91 * 0590 - 05ff
92 * ar -> arabic
93 * 0600 - 06ff
94 * fb50 - fdff (arabic presentation forms)
95 * fe70 - feff (arabic presentation forms b)
96 * th - thai
97 * 0e00 - 0e7f
98 * ko -> korean
99 * ac00 - d7af (hangul Syllables)
100 * 1100 - 11ff (jamo)
101 * 3130 - 318f (hangul compatibility jamo)
102 * ja
103 * 3040 - 309f (hiragana)
104 * 30a0 - 30ff (katakana)
105 * zh-CN
106 * zh-TW
108 * CJK
109 * 3100 - 312f (bopomofo)
110 * 31a0 - 31bf (bopomofo extended)
111 * 3000 - 303f (CJK Symbols and Punctuation)
112 * 2e80 - 2eff (CJK radicals supplement)
113 * 2f00 - 2fdf (Kangxi Radicals)
114 * 2ff0 - 2fff (Ideographic Description Characters)
115 * 3190 - 319f (kanbun)
116 * 3200 - 32ff (Enclosed CJK letters and Months)
117 * 3300 - 33ff (CJK compatibility)
118 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
119 * 4e00 - 9faf (CJK Unified Ideographs)
120 * f900 - fa5f (CJK Compatibility Ideographs)
121 * fe30 - fe4f (CJK compatibility Forms)
122 * ff00 - ffef (halfwidth and fullwidth forms)
124 * Armenian
125 * 0530 - 058f
126 * Syriac
127 * 0700 - 074f
128 * Thaana
129 * 0780 - 07bf
130 * Devanagari
131 * 0900 - 097f
132 * Bengali
133 * 0980 - 09ff
134 * Gurmukhi
135 * 0a00 - 0a7f
136 * Gujarati
137 * 0a80 - 0aff
138 * Oriya
139 * 0b00 - 0b7f
140 * Tamil
141 * 0b80 - 0bff
142 * Telugu
143 * 0c00 - 0c7f
144 * Kannada
145 * 0c80 - 0cff
146 * Malayalam
147 * 0d00 - 0d7f
148 * Sinhala
149 * 0d80 - 0def
150 * Lao
151 * 0e80 - 0eff
152 * Tibetan
153 * 0f00 - 0fbf
154 * Myanmar
155 * 1000 - 109f
156 * Georgian
157 * 10a0 - 10ff
158 * Ethiopic
159 * 1200 - 137f
160 * Cherokee
161 * 13a0 - 13ff
162 * Canadian Aboriginal Syllabics
163 * 1400 - 167f
164 * Ogham
165 * 1680 - 169f
166 * Runic
167 * 16a0 - 16ff
168 * Khmer
169 * 1780 - 17ff
170 * Mongolian
171 * 1800 - 18af
172 * Misc - superscripts and subscripts
173 * 2070 - 209f
174 * Misc - Combining Diacritical Marks for Symbols
175 * 20d0 - 20ff
176 * Misc - Arrows
177 * 2190 - 21ff
178 * Misc - Mathematical Operators
179 * 2200 - 22ff
180 * Misc - Miscellaneous Technical
181 * 2300 - 23ff
182 * Misc - Control picture
183 * 2400 - 243f
184 * Misc - Optical character recognition
185 * 2440 - 245f
186 * Misc - Enclosed Alphanumerics
187 * 2460 - 24ff
188 * Misc - Box Drawing
189 * 2500 - 257f
190 * Misc - Block Elements
191 * 2580 - 259f
192 * Misc - Geometric Shapes
193 * 25a0 - 25ff
194 * Misc - Miscellaneous Symbols
195 * 2600 - 267f
196 * Misc - Dingbats
197 * 2700 - 27bf
198 * Misc - Braille Patterns
199 * 2800 - 28ff
200 * Yi Syllables
201 * a000 - a48f
202 * Yi radicals
203 * a490 - a4cf
204 * Alphabetic Presentation Forms
205 * fb00 - fb4f
206 * Misc - Combining half Marks
207 * fe20 - fe2f
208 * Misc - small form variants
209 * fe50 - fe6f
210 * Misc - Specials
211 * fff0 - ffff
212 *********************************************************************/
216 #define NUM_OF_SUBTABLES 9
217 #define SUBTABLE_SIZE 16
219 static const PRUint8 gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] =
221 { // table for X---
222 kRangeTableBase+1, //u0xxx
223 kRangeTableBase+2, //u1xxx
224 kRangeTableBase+3, //u2xxx
225 kRangeSetCJK, //u3xxx
226 kRangeSetCJK, //u4xxx
227 kRangeSetCJK, //u5xxx
228 kRangeSetCJK, //u6xxx
229 kRangeSetCJK, //u7xxx
230 kRangeSetCJK, //u8xxx
231 kRangeSetCJK, //u9xxx
232 kRangeTableBase+4, //uaxxx
233 kRangeKorean, //ubxxx
234 kRangeKorean, //ucxxx
235 kRangeTableBase+5, //udxxx
236 kRangePrivate, //uexxx
237 kRangeTableBase+6 //ufxxx
239 { //table for 0X--
240 kRangeSetLatin, //u00xx
241 kRangeSetLatin, //u01xx
242 kRangeSetLatin, //u02xx
243 kRangeGreek, //u03xx XXX 0300-036f is in fact kRangeCombiningDiacriticalMarks
244 kRangeCyrillic, //u04xx
245 kRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
246 kRangeArabic, //u06xx
247 kRangeTertiaryTable, //u07xx
248 kRangeUnassigned, //u08xx
249 kRangeTertiaryTable, //u09xx
250 kRangeTertiaryTable, //u0axx
251 kRangeTertiaryTable, //u0bxx
252 kRangeTertiaryTable, //u0cxx
253 kRangeTertiaryTable, //u0dxx
254 kRangeTertiaryTable, //u0exx
255 kRangeTibetan, //u0fxx
257 { //table for 1x--
258 kRangeTertiaryTable, //u10xx
259 kRangeKorean, //u11xx
260 kRangeEthiopic, //u12xx
261 kRangeTertiaryTable, //u13xx
262 kRangeCanadian, //u14xx
263 kRangeCanadian, //u15xx
264 kRangeTertiaryTable, //u16xx
265 kRangeKhmer, //u17xx
266 kRangeMongolian, //u18xx
267 kRangeUnassigned, //u19xx
268 kRangeUnassigned, //u1axx
269 kRangeUnassigned, //u1bxx
270 kRangeUnassigned, //u1cxx
271 kRangeUnassigned, //u1dxx
272 kRangeSetLatin, //u1exx
273 kRangeGreek, //u1fxx
275 { //table for 2x--
276 kRangeSetLatin, //u20xx
277 kRangeSetLatin, //u21xx
278 kRangeMathOperators, //u22xx
279 kRangeMiscTechnical, //u23xx
280 kRangeControlOpticalEnclose, //u24xx
281 kRangeBoxBlockGeometrics, //u25xx
282 kRangeMiscSymbols, //u26xx
283 kRangeDingbats, //u27xx
284 kRangeBraillePattern, //u28xx
285 kRangeUnassigned, //u29xx
286 kRangeUnassigned, //u2axx
287 kRangeUnassigned, //u2bxx
288 kRangeUnassigned, //u2cxx
289 kRangeUnassigned, //u2dxx
290 kRangeSetCJK, //u2exx
291 kRangeSetCJK, //u2fxx
293 { //table for ax--
294 kRangeYi, //ua0xx
295 kRangeYi, //ua1xx
296 kRangeYi, //ua2xx
297 kRangeYi, //ua3xx
298 kRangeYi, //ua4xx
299 kRangeUnassigned, //ua5xx
300 kRangeUnassigned, //ua6xx
301 kRangeUnassigned, //ua7xx
302 kRangeUnassigned, //ua8xx
303 kRangeUnassigned, //ua9xx
304 kRangeUnassigned, //uaaxx
305 kRangeUnassigned, //uabxx
306 kRangeKorean, //uacxx
307 kRangeKorean, //uadxx
308 kRangeKorean, //uaexx
309 kRangeKorean, //uafxx
311 { //table for dx--
312 kRangeKorean, //ud0xx
313 kRangeKorean, //ud1xx
314 kRangeKorean, //ud2xx
315 kRangeKorean, //ud3xx
316 kRangeKorean, //ud4xx
317 kRangeKorean, //ud5xx
318 kRangeKorean, //ud6xx
319 kRangeKorean, //ud7xx
320 kRangeSurrogate, //ud8xx
321 kRangeSurrogate, //ud9xx
322 kRangeSurrogate, //udaxx
323 kRangeSurrogate, //udbxx
324 kRangeSurrogate, //udcxx
325 kRangeSurrogate, //uddxx
326 kRangeSurrogate, //udexx
327 kRangeSurrogate, //udfxx
329 { // table for fx--
330 kRangePrivate, //uf0xx
331 kRangePrivate, //uf1xx
332 kRangePrivate, //uf2xx
333 kRangePrivate, //uf3xx
334 kRangePrivate, //uf4xx
335 kRangePrivate, //uf5xx
336 kRangePrivate, //uf6xx
337 kRangePrivate, //uf7xx
338 kRangePrivate, //uf8xx
339 kRangeSetCJK, //uf9xx
340 kRangeSetCJK, //ufaxx
341 kRangeArabic, //ufbxx, includes alphabic presentation form
342 kRangeArabic, //ufcxx
343 kRangeArabic, //ufdxx
344 kRangeArabic, //ufexx, includes Combining half marks,
345 // CJK compatibility forms,
346 // CJK compatibility forms,
347 // small form variants
348 kRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials
350 { //table for 0x0500 - 0x05ff
351 kRangeCyrillic, //u050x
352 kRangeCyrillic, //u051x
353 kRangeCyrillic, //u052x
354 kRangeArmenian, //u053x
355 kRangeArmenian, //u054x
356 kRangeArmenian, //u055x
357 kRangeArmenian, //u056x
358 kRangeArmenian, //u057x
359 kRangeArmenian, //u058x
360 kRangeHebrew, //u059x
361 kRangeHebrew, //u05ax
362 kRangeHebrew, //u05bx
363 kRangeHebrew, //u05cx
364 kRangeHebrew, //u05dx
365 kRangeHebrew, //u05ex
366 kRangeHebrew, //u05fx
368 { //table for 0xff00 - 0xffff
369 kRangeSetCJK, //uff0x, fullwidth latin
370 kRangeSetCJK, //uff1x, fullwidth latin
371 kRangeSetCJK, //uff2x, fullwidth latin
372 kRangeSetCJK, //uff3x, fullwidth latin
373 kRangeSetCJK, //uff4x, fullwidth latin
374 kRangeSetCJK, //uff5x, fullwidth latin
375 kRangeSetCJK, //uff6x, halfwidth katakana
376 kRangeSetCJK, //uff7x, halfwidth katakana
377 kRangeSetCJK, //uff8x, halfwidth katakana
378 kRangeSetCJK, //uff9x, halfwidth katakana
379 kRangeSetCJK, //uffax, halfwidth hangul jamo
380 kRangeSetCJK, //uffbx, halfwidth hangul jamo
381 kRangeSetCJK, //uffcx, halfwidth hangul jamo
382 kRangeSetCJK, //uffdx, halfwidth hangul jamo
383 kRangeSetCJK, //uffex, fullwidth symbols
384 kRangeSpecials, //ufffx, Specials
388 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
389 // code points so that the number of entries in the tertiary range
390 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
391 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
392 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
393 #define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)
395 static const PRUint8 gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] =
396 { //table for 0x0700 - 0x1600
397 kRangeSyriac, //u070x
398 kRangeThaana, //u078x
399 kRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.)
400 kRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.)
401 kRangeDevanagari, //u090x
402 kRangeBengali, //u098x
403 kRangeGurmukhi, //u0a0x
404 kRangeGujarati, //u0a8x
405 kRangeOriya, //u0b0x
406 kRangeTamil, //u0b8x
407 kRangeTelugu, //u0c0x
408 kRangeKannada, //u0c8x
409 kRangeMalayalam, //u0d0x
410 kRangeSinhala, //u0d8x
411 kRangeThai, //u0e0x
412 kRangeLao, //u0e8x
413 kRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.)
414 kRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.)
415 kRangeMyanmar, //u100x
416 kRangeGeorgian, //u108x
417 kRangeKorean, //u110x place holder(resolved in the 2ndary tab.)
418 kRangeKorean, //u118x place holder(resolved in the 2ndary tab.)
419 kRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.)
420 kRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.)
421 kRangeEthiopic, //u130x
422 kRangeCherokee, //u138x
423 kRangeCanadian, //u140x place holder(resolved in the 2ndary tab.)
424 kRangeCanadian, //u148x place holder(resolved in the 2ndary tab.)
425 kRangeCanadian, //u150x place holder(resolved in the 2ndary tab.)
426 kRangeCanadian, //u158x place holder(resolved in the 2ndary tab.)
427 kRangeCanadian, //u160x
428 kRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic
431 // A two level index is almost enough for locating a range, with the
432 // exception of u03xx and u05xx. Since we don't really care about range for
433 // combining diacritical marks in our font application, they are
434 // not discriminated further. But future adoption of this module for other use
435 // should be aware of this limitation. The implementation can be extended if
436 // there is such a need.
437 // For Indic, Southeast Asian scripts and some other scripts between
438 // U+0700 and U+16FF, it's extended to the third level.
439 PRUint32 FindCharUnicodeRange(PRUnichar ch)
441 PRUint32 range;
443 //search the first table
444 range = gUnicodeSubrangeTable[0][ch >> 12];
446 if (range < kRangeTableBase)
447 // we try to get a specific range
448 return range;
450 // otherwise, we have one more table to look at
451 range = gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x0f00) >> 8];
452 if (range < kRangeTableBase)
453 return range;
454 if (range < kRangeTertiaryTable)
455 return gUnicodeSubrangeTable[range - kRangeTableBase][(ch & 0x00f0) >> 4];
457 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
458 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
461 nsIAtom *LangGroupFromUnicodeRange(PRUint8 unicodeRange)
463 if (kRangeSpecificItemNum > unicodeRange) {
464 nsIAtom **atom = gUnicodeRangeToLangGroupAtomTable[unicodeRange];
465 return *atom;
467 return nsnull;