2010-04-06 Jb Evain <jbevain@novell.com>
[mcs.git] / class / corlib / Mono.Globalization.Unicode / Normalization.cs
blob8dd8897b43733378d224787b9f90d53cca77c9aa
1 using System;
2 using System.Globalization;
3 using System.Text;
4 using System.Runtime.CompilerServices;
6 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
8 namespace Mono.Globalization.Unicode
// Result of a per-character normalization quick check (see Normalization.QuickCheck).
internal enum NormalizationCheck
{
	// The character passes the quick check for the requested form.
	Yes = 0,
	// The character fails the quick check for the requested form.
	No = 1,
	// The quick check alone cannot decide; the caller has to look at context.
	Maybe = 2
}
16 internal unsafe class Normalization
// Bit flags tested against the value returned by PropValue ().
public const int NoNfd = 1 << 0;
public const int NoNfkd = 1 << 1;
public const int NoNfc = 1 << 2;
public const int MaybeNfc = 1 << 3;
public const int NoNfkc = 1 << 4;
public const int MaybeNfkc = 1 << 5;
public const int FullCompositionExclusion = 1 << 6;
public const int IsUnsafe = 1 << 7;
// Currently disabled flags:
// public const int ExpandOnNfd = 1 << 8;
// public const int ExpandOnNfc = 1 << 9;
// public const int ExpandOnNfkd = 1 << 10;
// public const int ExpandOnNfkc = 1 << 11;
// Returns the property flag byte stored in the props table for code point cp.
static uint PropValue (int cp)
{
	int slot = NUtil.PropIdx (cp);
	return props [slot];
}
// Returns the charMapIndex table value for code point cp; callers use it
// as an offset into mappedChars.
static int CharMapIdx (int cp)
{
	int slot = NUtil.MapIdx (cp);
	return charMapIndex [slot];
}
// Counts the non-zero mappedChars values that follow the map offset of ch,
// i.e. the length of its zero-terminated mapped sequence.
static int GetNormalizedStringLength (int ch)
{
	int first = charMapIndex [NUtil.MapIdx (ch)];
	int length = 0;
	while (mappedChars [first + length] != 0)
		length++;
	return length;
}
// Looks up the combining class table value for code point c.
static byte GetCombiningClass (int c)
{
	int slot = NUtil.Combining.ToIndex (c);
	return combiningClass [slot];
}
// Translates a mappedChars entry index into the value stored for it in
// the mapIdxToComposite table.
static int GetPrimaryCompositeFromMapIndex (int src)
{
	int slot = NUtil.Composite.ToIndex (src);
	return mapIdxToComposite [slot];
}
// Returns the helperIndex table value for code point cp. Callers treat 0
// as "no entry" and any other value as an index into mappedChars.
static int GetPrimaryCompositeHelperIndex (int cp)
{
	int slot = NUtil.Helper.ToIndex (cp);
	return helperIndex [slot];
}
// Searches mappedChars for a zero-terminated entry that begins with the
// character at chars [start] and whose remaining characters can all be
// found, in order, in the text that follows it.
//
// chars: the text to examine; must be a string or a StringBuilder.
// start: index in chars of the character that begins the candidate sequence.
// Returns the index of the matching entry in mappedChars, or 0 when there
// is no match.
static int GetPrimaryCompositeCharIndex (object chars, int start)
{
	string s = chars as string;
	StringBuilder sb = chars as StringBuilder;
	char startCh = s != null ? s [start] : sb [start];
	int charsLength = sb != null ? sb.Length : s.Length;

	int idx = GetPrimaryCompositeHelperIndex ((int) startCh);
	if (idx == 0)
		return 0;

	// Consecutive entries that share the same first character are tried in turn.
	while (mappedChars [idx] == startCh) {
		int prevCB = 0;
		int curCB = 0;
		// i walks the current map entry, j walks the text. j never falls
		// behind i because it only ever advances further than i does.
		for (int i = 1, j = 1; ; i++, j++) {
			prevCB = curCB;

			if (mappedChars [idx + i] == 0)
				return idx; // whole entry matched
			// The text is read at start + j, so that is the index to
			// bounds-check (checking start + i could let the read
			// below run past the end of the text).
			if (start + j >= charsLength)
				return 0; // ran out of text: didn't match

			// Scan forward in the text for the next entry character,
			// giving up when the scan is blocked.
			bool match = false;
			do {
				char curCh = s != null ? s [start + j] : sb [start + j];
				curCB = GetCombiningClass (curCh);
				if (mappedChars [idx + i] == curCh) {
					match = true;
					break;
				}
				if (curCB < prevCB) // blocked. Give up this map entry.
					break;
				if (++j + start >= charsLength || curCB == 0)
					break;
			} while (true);

			if (match)
				continue; // check next character in the current map entry string.

			// No match: skip the rest of this entry, including its zero
			// terminator, so idx lands on the first character of the
			// next entry. The scan is relative to idx.
			while (mappedChars [idx + i] != 0)
				i++;
			idx += i + 1;
			break;
		}
	}
	// reached to end of entries
	return 0;
}
// Decomposes source and then recombines it. Returns source itself when
// neither step had to allocate a buffer.
private static string Compose (string source, int checkType)
{
	StringBuilder buffer = null;
	Decompose (source, ref buffer, checkType);

	if (buffer != null) {
		// Decomposition produced a buffer: combine it in place.
		Combine (buffer, 0, checkType);
		return buffer.ToString ();
	}

	// Nothing was decomposed; the string overload only allocates when needed.
	buffer = Combine (source, 0, checkType);
	if (buffer == null)
		return source;
	return buffer.ToString ();
}
// Finds the first character of source whose quick check is not Yes,
// copies source into a new StringBuilder and combines it from that
// position on. Returns null when every character checks as Yes.
// Note: the start argument is not consulted; the scan always begins at 0.
private static StringBuilder Combine (string source, int start, int checkType)
{
	int first = 0;
	while (first < source.Length
		&& QuickCheck (source [first], checkType) == NormalizationCheck.Yes)
		first++;

	if (first >= source.Length)
		return null;

	int capacity = source.Length + source.Length / 10;
	StringBuilder result = new StringBuilder (capacity);
	result.Append (source);
	Combine (result, first, checkType);
	return result;
}
// For code points in 0x3400..0x9FBB the answer comes from the helper index
// table; for everything else it is the IsUnsafe property flag.
private static bool CanBePrimaryComposite (int i)
{
	bool inHelperRange = 0x3400 <= i && i <= 0x9FBB;
	if (!inHelperRange)
		return (PropValue (i) & IsUnsafe) != 0;
	return GetPrimaryCompositeHelperIndex (i) != 0;
}
// Combines sequences in sb in place, starting at index start. For each
// character whose quick check is not Yes, walks back to the nearest
// character with combining class 0, looks for a map entry matching the
// text from there, and if one is found inserts the composite and removes
// the characters that the entry consumed.
private static void Combine (StringBuilder sb, int start, int checkType)
{
	for (int pos = start; pos < sb.Length; pos++) {
		if (QuickCheck (sb [pos], checkType) == NormalizationCheck.Yes)
			continue;

		int origin = pos;
		// FIXME: It should check "blocked" too
		// Walk back to a character of combining class 0. sb [0] itself is
		// not tested, but pos never goes below 0.
		while (pos > 0 && GetCombiningClass ((int) sb [pos]) != 0)
			pos--;

		int mapIndex = 0; // index to mappedChars
		while (pos < origin) {
			mapIndex = GetPrimaryCompositeMapIndex (sb, (int) sb [pos], pos);
			if (mapIndex > 0)
				break;
			pos++;
		}
		if (mapIndex == 0) {
			pos = origin;
			continue;
		}

		int composite = GetPrimaryCompositeFromMapIndex (mapIndex);
		int entryLength = GetNormalizedStringLength (composite);
		if (composite == 0 || entryLength == 0)
			throw new SystemException ("Internal error: should not happen. Input: " + sb);

		sb.Insert (pos, (char) composite); // always single character
		pos++;

		// Remove the characters of the entry one by one; characters
		// that are not part of the entry are skipped over.
		int consumed = 0;
		while (consumed < entryLength) {
			if (sb [pos] != mappedChars [mapIndex + consumed]) {
				pos++;
				continue;
			}
			sb.Remove (pos, 1);
			consumed++;
		}
		pos = origin - 1;
	}
}
// Returns the mappedChars entry index matching the text at bufferPos, or 0
// when cur is flagged FullCompositionExclusion or has a non-zero combining
// class (i.e. is not a starter).
static int GetPrimaryCompositeMapIndex (object o, int cur, int bufferPos)
{
	bool excluded = (PropValue (cur) & FullCompositionExclusion) != 0;
	if (excluded || GetCombiningClass (cur) != 0)
		return 0;
	return GetPrimaryCompositeCharIndex (o, bufferPos);
}
// Decomposes source; returns source itself when no buffer had to be created.
static string Decompose (string source, int checkType)
{
	StringBuilder buffer = null;
	Decompose (source, ref buffer, checkType);
	if (buffer == null)
		return source;
	return buffer.ToString ();
}
// Expands every character of source whose quick check is No into sb
// (created on demand by DecomposeChar), copies the untouched tail, and
// finally reorders combining marks. sb stays null when nothing changed.
static void Decompose (string source, ref StringBuilder sb, int checkType)
{
	int [] scratch = null;
	int copiedUpTo = 0; // first index of source not yet copied into sb

	int pos = 0;
	while (pos < source.Length) {
		if (QuickCheck (source [pos], checkType) == NormalizationCheck.No)
			DecomposeChar (ref sb, ref scratch, source, pos, ref copiedUpTo);
		pos++;
	}

	if (sb != null) {
		int remaining = source.Length - copiedUpTo;
		sb.Append (source, copiedUpTo, remaining);
	}
	ReorderCanonical (source, ref sb, 1);
}
// Puts adjacent combining marks into non-decreasing combining class order.
//
// src:   the original text; only read while sb is null.
// sb:    working buffer. When null, src is scanned and a buffer is created
//        only if an out-of-order pair is found.
// start: index in sb at which the scan begins (values below 1 are treated
//        as 1 because each step compares a character with its predecessor).
static void ReorderCanonical (string src, ref StringBuilder sb, int start)
{
	if (sb == null) {
		// check only with src.
		for (int i = 1; i < src.Length; i++) {
			int level = GetCombiningClass (src [i]);
			if (level == 0)
				continue;
			if (GetCombiningClass (src [i - 1]) > level) {
				sb = new StringBuilder (src.Length);
				sb.Append (src, 0, src.Length);
				ReorderCanonical (src, ref sb, i);
				return;
			}
		}
		return;
	}

	// check only with sb
	for (int i = Math.Max (start, 1); i < sb.Length; ) {
		int level = GetCombiningClass (sb [i]);
		if (level == 0 || GetCombiningClass (sb [i - 1]) <= level) {
			i++;
			continue;
		}

		char c = sb [i - 1];
		sb [i - 1] = sb [i];
		sb [i] = c;

		// The character that moved down to i - 1 may also be out of
		// order with respect to i - 2, so step back and compare again.
		// (Decrementing inside a loop that also increments would just
		// revisit the same position and leave e.g. classes 230, 220, 1
		// as 220, 1, 230.)
		if (i > 1)
			i--;
	}
}
// Appends to sb the not-yet-copied part of s before index i, followed by
// the code points GetCanonical produces for s [i].
//
// sb:    output buffer, created on first use.
// buf:   scratch array for GetCanonical, created on first use.
// s:     source text.
// i:     index in s of the character to expand.
// start: on entry, first index of s not yet copied to sb; set to i + 1.
static void DecomposeChar (ref StringBuilder sb,
	ref int [] buf, string s, int i, ref int start)
{
	if (sb == null)
		sb = new StringBuilder (s.Length + 100);
	sb.Append (s, start, i - start);
	if (buf == null)
		buf = new int [19];
	GetCanonical (s [i], buf, 0);

	// buf is zero-terminated.
	for (int x = 0; buf [x] != 0; x++) {
		int cp = buf [x];
		if (cp <= char.MaxValue) {
			sb.Append ((char) cp);
		} else {
			// Supplementary code point: emit a UTF-16 surrogate pair.
			// The 0x10000 offset is removed first, then the upper and
			// lower 10 bits go to the high and low surrogate.
			// (Note that ">>" binds more loosely than "+", hence the
			// explicit parentheses.)
			int offset = cp - 0x10000;
			sb.Append ((char) ((offset >> 10) + 0xD800));
			sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
		}
	}
	start = i + 1;
}
// Per-character quick check.
// type: 1 = NFD, 2 = NFKC, 3 = NFKD; any other value is handled as NFC.
// Characters in U+AC00..U+D7A3 are always No for the two decomposed forms.
public static NormalizationCheck QuickCheck (char c, int type)
{
	uint flags;
	switch (type) {
	case 1: // NFD
		if (c >= '\uAC00' && c <= '\uD7A3')
			return NormalizationCheck.No;
		flags = PropValue ((int) c);
		if ((flags & NoNfd) != 0)
			return NormalizationCheck.No;
		return NormalizationCheck.Yes;
	case 2: // NFKC
		flags = PropValue ((int) c);
		if ((flags & NoNfkc) != 0)
			return NormalizationCheck.No;
		if ((flags & MaybeNfkc) != 0)
			return NormalizationCheck.Maybe;
		return NormalizationCheck.Yes;
	case 3: // NFKD
		if (c >= '\uAC00' && c <= '\uD7A3')
			return NormalizationCheck.No;
		flags = PropValue ((int) c);
		if ((flags & NoNfkd) != 0)
			return NormalizationCheck.No;
		return NormalizationCheck.Yes;
	default: // NFC
		flags = PropValue ((int) c);
		if ((flags & NoNfc) != 0)
			return NormalizationCheck.No;
		if ((flags & MaybeNfc) != 0)
			return NormalizationCheck.Maybe;
		return NormalizationCheck.Yes;
	}
}
315 /* for now we don't use FC_NFKC closure
316 public static bool IsMultiForm (char c)
318 return (PropValue ((int) c) & 0xF0000000) != 0;
321 public static char SingleForm (char c)
323 uint v = PropValue ((int) c);
324 int idx = (int) ((v & 0x7FFF0000) >> 16);
325 return (char) singleNorm [idx];
328 public static void MultiForm (char c, char [] buf, int index)
330 // FIXME: handle surrogate
331 uint v = PropValue ((int) c);
332 int midx = (int) ((v & 0x7FFF0000) >> 16);
333 buf [index] = (char) multiNorm [midx];
334 buf [index + 1] = (char) multiNorm [midx + 1];
335 buf [index + 2] = (char) multiNorm [midx + 2];
336 buf [index + 3] = (char) multiNorm [midx + 3];
337 if (buf [index + 3] != 0)
338 buf [index + 4] = (char) 0; // zero termination
// Constants used by GetCanonicalHangul to split a Hangul syllable.
const int HangulSBase = 0xAC00;
const int HangulLBase = 0x1100;
const int HangulVBase = 0x1161;
const int HangulTBase = 0x11A7;
const int HangulLCount = 19;
const int HangulVCount = 21;
const int HangulTCount = 28;
const int HangulNCount = HangulVCount * HangulTCount; // 588
const int HangulSCount = HangulLCount * HangulNCount; // 11172
// If s lies in the Hangul syllable range, writes its leading, vowel and
// (when present) trailing components to buf starting at bufIdx, followed
// by a 0 terminator, and returns true. Otherwise returns false and leaves
// buf untouched.
private static bool GetCanonicalHangul (int s, int [] buf, int bufIdx)
{
	int syllable = s - HangulSBase;
	if (syllable < 0 || syllable >= HangulSCount)
		return false;

	int lead = HangulLBase + syllable / HangulNCount;
	int vowel = HangulVBase + (syllable % HangulNCount) / HangulTCount;
	int trail = HangulTBase + syllable % HangulTCount;

	buf [bufIdx] = lead;
	buf [bufIdx + 1] = vowel;
	int end = bufIdx + 2;
	if (trail != HangulTBase) {
		// A trail equal to the base value means "no trailing component".
		buf [end] = trail;
		end++;
	}
	buf [end] = 0;
	return true;
}
// Writes the mapped sequence for c into buf starting at bufIdx and
// terminates it with 0. Hangul syllables are handled by
// GetCanonicalHangul; everything else is copied from mappedChars.
public static void GetCanonical (int c, int [] buf, int bufIdx)
{
	if (GetCanonicalHangul (c, buf, bufIdx))
		return;

	int src = CharMapIdx (c);
	while (mappedChars [src] != 0) {
		buf [bufIdx] = mappedChars [src];
		bufIdx++;
		src++;
	}
	buf [bufIdx] = 0;
}
// Returns whether source is already in the normalization form selected
// by type (same numbering as QuickCheck).
public static bool IsNormalized (string source, int type)
{
	int previousClass = -1;
	for (int i = 0; i < source.Length; i++) {
		// A non-zero combining class lower than its predecessor's means
		// the marks are out of order.
		int currentClass = GetCombiningClass (source [i]);
		if (currentClass != 0 && currentClass < previousClass)
			return false;
		previousClass = currentClass;

		NormalizationCheck verdict = QuickCheck (source [i], type);
		if (verdict == NormalizationCheck.No)
			return false;
		if (verdict != NormalizationCheck.Maybe)
			continue;

		// for those forms with composition, it cannot be checked here
		if (type == 0 || type == 2) // NFC, NFKC
			return source == Normalize (source, type);

		// partly copied from Combine()
		int anchor = i;
		// Walk back to a character of combining class 0. source [0]
		// itself is not tested, but i never goes below 0.
		while (i > 0 && GetCombiningClass ((int) source [i]) != 0)
			i--;
		// Now i is the "starter"
		for (; i < anchor; i++) {
			if (GetPrimaryCompositeCharIndex (source, i) != 0)
				return false;
		}
	}
	return true;
}
// Normalizes source. type 1 and 3 go through Decompose; every other value
// (including 2) goes through Compose.
public static string Normalize (string source, int type)
{
	if (type == 1 || type == 3)
		return Decompose (source, type);
	return Compose (source, type);
}
// Raw pointers to the normalization tables, assigned by the static constructor.

// Read by PropValue ().
static byte* props;
// Zero-terminated sequences of mapped characters.
static int* mappedChars;
// Read by CharMapIdx (); values are used as offsets into mappedChars.
static short* charMapIndex;
// Read by GetPrimaryCompositeHelperIndex ().
static short* helperIndex;
// Read by GetPrimaryCompositeFromMapIndex ().
static ushort* mapIdxToComposite;
// Read by GetCombiningClass ().
static byte* combiningClass;
435 #if GENERATE_TABLE
public static readonly bool IsReady = true; // always

// Table-generation build: points the table pointers at the generated arrays.
static Normalization ()
{
	// A pointer obtained with "fixed" is only valid inside the fixed
	// statement, so it must not be stored in a static field. The arrays
	// are pinned with GC handles instead; the handles are intentionally
	// never freed because the tables are used for the life of the process.
	props = (byte*) PinTable (propsArr);
	mappedChars = (int*) PinTable (mappedCharsArr);
	charMapIndex = (short*) PinTable (charMapIndexArr);
	helperIndex = (short*) PinTable (helperIndexArr);
	mapIdxToComposite = (ushort*) PinTable (mapIdxToCompositeArr);
	combiningClass = (byte*) PinTable (combiningClassArr);
}

// Pins table and returns the address of its first element.
static IntPtr PinTable (Array table)
{
	System.Runtime.InteropServices.GCHandle handle =
		System.Runtime.InteropServices.GCHandle.Alloc (
			table, System.Runtime.InteropServices.GCHandleType.Pinned);
	return handle.AddrOfPinnedObject ();
}
460 #else
static object forLock = new object ();
public static readonly bool isReady;

// True once the static constructor has stored the table pointers.
public static bool IsReady {
	get {
		return isReady;
	}
}

// Runtime internal call that hands back the six table addresses.
[MethodImpl (MethodImplOptions.InternalCall)]
static extern void load_normalization_resource (
	out IntPtr props, out IntPtr mappedChars,
	out IntPtr charMapIndex, out IntPtr helperIndex,
	out IntPtr mapIdxToComposite, out IntPtr combiningClass);

// Fetches the table addresses from the runtime and stores them in the
// typed pointer fields.
static Normalization ()
{
	IntPtr propsPtr, mappedCharsPtr, charMapIndexPtr;
	IntPtr helperIndexPtr, compositePtr, combiningClassPtr;

	lock (forLock) {
		load_normalization_resource (
			out propsPtr, out mappedCharsPtr, out charMapIndexPtr,
			out helperIndexPtr, out compositePtr, out combiningClassPtr);
		props = (byte*) propsPtr;
		mappedChars = (int*) mappedCharsPtr;
		charMapIndex = (short*) charMapIndexPtr;
		helperIndex = (short*) helperIndexPtr;
		mapIdxToComposite = (ushort*) compositePtr;
		combiningClass = (byte*) combiningClassPtr;
	}

	isReady = true;
}
492 #endif
495 // autogenerated code or icall to fill array runs here