Sync with TP.
[libidn.git] / csharp / NFKC.cs
blob32c959d81a9a750a64baedf8daf6f2c8af23ef8c
1 /// <summary> Copyright (C) 2004, 2005 Free Software Foundation, Inc.
2 /// *
3 /// Author: Alexander Gnauck AG-Software
4 /// *
5 /// This file is part of GNU Libidn.
6 /// *
7 /// This program is free software; you can redistribute it and/or
8 /// modify it under the terms of the GNU General Public License as
9 /// published by the Free Software Foundation; either version 2 of the
10 /// License, or (at your option) any later version.
11 /// *
12 /// This program is distributed in the hope that it will be useful,
13 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
14 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 /// General Public License for more details.
16 /// *
17 /// You should have received a copy of the GNU General Public License
18 /// along with this program; if not, write to the Free Software
19 /// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20 /// 02111-1307 USA.
21 /// </summary>
23 using System;
25 namespace gnu.inet.encoding
28 public class NFKC
/// <summary>
/// Applies NFKC normalization to a string.
/// </summary>
/// <remarks>
/// The input is processed one UTF-16 code unit at a time in three steps:
/// decomposition (Hangul algorithmically, everything else through the
/// decomposition table), canonical reordering, and canonical composition.
/// </remarks>
/// <param name="sbIn">The string to normalize.</param>
/// <returns>An NFKC normalized string.</returns>
public static System.String normalizeNFKC(System.String sbIn)
{
    System.Text.StringBuilder sbOut = new System.Text.StringBuilder();

    // Step 1: decompose every character.
    for (int i = 0; i < sbIn.Length; i++)
    {
        char code = sbIn[i];

        // Since Unicode 3.2 the Hangul block runs up to U+D7AF, but
        // decomposeHangul only decomposes SCount syllables starting at
        // SBase and returns every other character unchanged, so routing
        // the whole block through it is safe.
        if (code >= 0xAC00 && code <= 0xD7AF)
        {
            sbOut.Append(decomposeHangul(code));
        }
        else
        {
            int index = decomposeIndex(code);
            if (index == -1)
            {
                // No decomposition mapping: keep the character as is.
                sbOut.Append(code);
            }
            else
            {
                sbOut.Append(DecompositionMappings.m[index]);
            }
        }
    }

    // Step 2: bring the buffer into canonical order.
    canonicalOrdering(sbOut);

    // Step 3: canonical composition.
    // last_start is the index of the most recent starter (combining class 0);
    // last_cc is the combining class of the character preceding position i.
    int last_cc = 0;
    int last_start = 0;

    for (int i = 0; i < sbOut.Length; i++)
    {
        int cc = combiningClass(sbOut[i]);

        // A character may only combine with the last starter when it is not
        // blocked, i.e. when it directly follows the starter (last_cc == 0)
        // or when the preceding character has a strictly lower combining
        // class. Using "!=" here would wrongly let a starter that follows a
        // combining mark compose with the previous starter.
        if (i > 0 && (last_cc == 0 || last_cc < cc))
        {
            char a = sbOut[last_start];
            char b = sbOut[i];

            int c = compose(a, b);

            if (c != -1)
            {
                sbOut[last_start] = (char) c;
                sbOut.Remove(i, 1);
                i--;

                // After the removal, sbOut[i] is the character that now
                // precedes the next one to be examined. If that is the
                // starter itself nothing stands in between; otherwise its
                // combining class decides whether later characters are
                // blocked.
                if (i == last_start)
                {
                    last_cc = 0;
                }
                else
                {
                    last_cc = combiningClass(sbOut[i]);
                }

                continue;
            }
        }

        if (cc == 0)
        {
            last_start = i;
        }

        last_cc = cc;
    }

    return sbOut.ToString();
}
/// <summary>
/// Looks up a character in the decomposition key table with a binary
/// search. The table is read as consecutive (key, value) pairs.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <returns>
/// The value stored next to the matching key, or -1 if the character has
/// no entry.
/// </returns>
internal static int decomposeIndex(char c)
{
    // Search window over pair indices: lo inclusive, hi exclusive.
    int lo = 0;
    int hi = DecompositionKeys.k.Length / 2;

    for (;;)
    {
        int mid = (lo + hi) / 2;
        int key = DecompositionKeys.k[mid * 2];

        if (key == c)
        {
            return DecompositionKeys.k[mid * 2 + 1];
        }

        if (mid == lo)
        {
            // The window cannot shrink any further: character not found.
            return -1;
        }

        if (c > key)
        {
            lo = mid;
        }
        else
        {
            hi = mid;
        }
    }
}
/// <summary>
/// Returns the combining class of a given character, using a two-level
/// table indexed by the high byte and then the low byte of the character.
/// </summary>
/// <param name="c">The character.</param>
/// <returns>
/// The combining class, or 0 when the page index for the character's high
/// byte is negative.
/// </returns>
internal static int combiningClass(char c)
{
    int page = CombiningClass.i[c >> 8];
    if (page < 0)
    {
        // No page for this high byte.
        return 0;
    }

    return CombiningClass.c[page][c & 0xff];
}
/// <summary>
/// Rearranges the characters of a string builder in place so that they
/// respect the canonical ordering properties: a character with a non-zero
/// combining class is moved in front of preceding characters that have a
/// higher combining class.
/// </summary>
/// <param name="sbIn">The string builder to rearrange.</param>
internal static void canonicalOrdering(System.Text.StringBuilder sbIn)
{
    bool swapped;

    // Repeat full passes until one completes without a swap.
    do
    {
        swapped = false;

        // Combining class of the character just before the current position.
        int prevClass = 0;
        if (sbIn.Length > 0)
        {
            prevClass = combiningClass(sbIn[0]);
        }

        for (int pos = 1; pos < sbIn.Length; pos++)
        {
            int curClass = combiningClass(sbIn[pos]);

            if (curClass == 0 || prevClass <= curClass)
            {
                // Already in order relative to its predecessor.
                prevClass = curClass;
                continue;
            }

            // Bubble the character towards the front until the character
            // before it has a combining class that is not greater.
            int k = pos;
            while (k > 0 && combiningClass(sbIn[k - 1]) > curClass)
            {
                char tmp = sbIn[k];
                sbIn[k] = sbIn[k - 1];
                sbIn[k - 1] = tmp;
                swapped = true;
                k--;
            }

            // prevClass is deliberately left unchanged on this path.
        }
    }
    while (swapped);
}
/// <summary>
/// Returns the index of a character inside the composition table, using a
/// page table indexed by the character's high byte.
/// </summary>
/// <param name="a">Character to look up.</param>
/// <returns>
/// The stored index, or -1 if the high byte is beyond the page table or
/// the page entry is -1.
/// </returns>
internal static int composeIndex(char a)
{
    int page = a >> 8;
    if (page < Composition.composePage.Length)
    {
        int row = Composition.composePage[page];
        if (row != -1)
        {
            return Composition.composeData[row][a & 0xff];
        }
    }

    return -1;
}
/// <summary>
/// Tries to compose two characters canonically. Hangul composition is
/// attempted first; otherwise the composition tables are consulted.
/// </summary>
/// <param name="a">First character.</param>
/// <param name="b">Second character.</param>
/// <returns>
/// The composed character, or -1 if no composition could be found.
/// </returns>
internal static int compose(char a, char b)
{
    int hangul = composeHangul(a, b);
    if (hangul != -1)
    {
        return hangul;
    }

    int ai = composeIndex(a);

    // Index range [singleFirstStart, singleSecondStart): the first
    // character is checked against the single partner stored for it.
    bool firstIsSingle = ai >= Composition.singleFirstStart
        && ai < Composition.singleSecondStart;
    if (firstIsSingle)
    {
        int slot = ai - Composition.singleFirstStart;
        if (b != Composition.singleFirst[slot][0])
        {
            return -1;
        }

        return Composition.singleFirst[slot][1];
    }

    int bi = composeIndex(b);

    // Index range [singleSecondStart, ...): the second character is
    // checked against the single partner stored for it.
    if (bi >= Composition.singleSecondStart)
    {
        int slot = bi - Composition.singleSecondStart;
        if (a != Composition.singleSecond[slot][0])
        {
            return -1;
        }

        return Composition.singleSecond[slot][1];
    }

    // Remaining case: look the pair up in the multiFirst matrix.
    bool inMatrix = ai >= 0
        && ai < Composition.multiSecondStart
        && bi >= Composition.multiSecondStart
        && bi < Composition.singleFirstStart;
    if (!inMatrix)
    {
        return -1;
    }

    char[] row = Composition.multiFirst[ai];
    int column = bi - Composition.multiSecondStart;
    if (column >= row.Length)
    {
        return -1;
    }

    // A zero entry means "no composition".
    char composed = row[column];
    return composed == 0 ? -1 : composed;
}
// Hangul-specific constants. The entire Hangul code was copied from:
// http://www.unicode.org/unicode/reports/tr15/

// Base values: SBase is subtracted from a syllable to obtain its index;
// LBase, VBase and TBase are added to the computed jamo indices.
internal const int SBase = 0xAC00,
                   LBase = 0x1100,
                   VBase = 0x1161,
                   TBase = 0x11A7;

// Sizes of the L, V and T index ranges.
internal const int LCount = 19,
                   VCount = 21,
                   TCount = 28;

// Derived sizes: NCount = VCount * TCount (588) and
// SCount = LCount * NCount (11172).
internal static readonly int NCount = VCount * TCount,
                             SCount = LCount * NCount;
/// <summary>
/// Decomposes a Hangul character arithmetically.
/// </summary>
/// <param name="s">A character to decompose.</param>
/// <returns>
/// A string containing the Hangul decomposition of the input character
/// (two characters, or three when the trailing index is non-zero). If the
/// character lies outside the SCount syllables starting at SBase, a
/// string containing the character itself is returned.
/// </returns>
internal static System.String decomposeHangul(char s)
{
    int offset = s - SBase;
    bool isSyllable = offset >= 0 && offset < SCount;
    if (!isSyllable)
    {
        return s.ToString();
    }

    char lead = (char) (LBase + offset / NCount);
    char vowel = (char) (VBase + (offset % NCount) / TCount);
    int trailIndex = offset % TCount;

    System.Text.StringBuilder jamo = new System.Text.StringBuilder();
    jamo.Append(lead);
    jamo.Append(vowel);

    // A trailing index of zero means there is no third character.
    if (trailIndex != 0)
    {
        jamo.Append((char) (TBase + trailIndex));
    }

    return jamo.ToString();
}
/// <summary>
/// Composes two Hangul characters arithmetically.
/// </summary>
/// <param name="a">First character.</param>
/// <param name="b">Second character.</param>
/// <returns>
/// The composed character, or -1 if the two characters cannot be
/// composed.
/// </returns>
internal static int composeHangul(char a, char b)
{
    // 1. Check whether the two characters are L and V.
    int LIndex = a - LBase;
    if (0 <= LIndex && LIndex < LCount)
    {
        int VIndex = b - VBase;
        if (0 <= VIndex && VIndex < VCount)
        {
            // Make a syllable of the form LV.
            return SBase + (LIndex * VCount + VIndex) * TCount;
        }
    }

    // 2. Check whether the two characters are LV and T.
    int SIndex = a - SBase;
    if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0)
    {
        int TIndex = b - TBase;

        // Both bounds are exclusive. TIndex 0 is TBase itself, which
        // decomposeHangul never emits (it means "no trailing character"),
        // so accepting it would silently drop b. TIndex == TCount would
        // produce a + TCount, which is the next LV syllable rather than an
        // LVT form of a.
        if (0 < TIndex && TIndex < TCount)
        {
            // Make a syllable of the form LVT.
            return a + TIndex;
        }
    }

    return -1;
}