Bump versions.
[libidn.git] / csharp / NFKC.cs
blob062b852f96db3b6b918b1ba38038aacf99310a4c
1 /// <summary>
2 /// Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3 /// *
4 /// Author: Alexander Gnauck AG-Software
5 /// *
6 /// This file is part of GNU Libidn.
7 /// *
8 /// This program is free software; you can redistribute it and/or
9 /// modify it under the terms of the GNU General Public License as
10 /// published by the Free Software Foundation; either version 2 of the
11 /// License, or (at your option) any later version.
12 /// *
13 /// This program is distributed in the hope that it will be useful,
14 /// but WITHOUT ANY WARRANTY; without even the implied warranty of
15 /// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 /// General Public License for more details.
17 /// *
18 /// You should have received a copy of the GNU General Public License
19 /// along with this program; if not, write to the Free Software
20 /// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 /// 02111-1307 USA.
22 /// </summary>
24 using System;
26 namespace gnu.inet.encoding
29 public class NFKC
/// <summary>
/// Applies NFKC normalization to a string. The input is first decomposed
/// (Hangul syllables algorithmically, everything else through the
/// decomposition table), then brought into canonical order, and finally
/// recomposed canonically.
/// </summary>
/// <param name="sbIn">The string to normalize.</param>
/// <returns>An NFKC normalized string.</returns>
public static System.String normalizeNFKC(System.String sbIn)
{
    System.Text.StringBuilder sbOut = new System.Text.StringBuilder();

    // Step 1: decomposition.
    for (int i = 0; i < sbIn.Length; i++)
    {
        char code = sbIn[i];

        // In Unicode 3.0, Hangul was defined as the block from U+AC00
        // to U+D7A3; since Unicode 3.2 the block extends until U+D7AF.
        // decomposeHangul only decomposes up to U+D7A3 and hands back
        // the remaining code points of the block unchanged.
        if (code >= 0xAC00 && code <= 0xD7AF)
        {
            sbOut.Append(decomposeHangul(code));
        }
        else
        {
            int index = decomposeIndex(code);
            if (index == -1)
            {
                sbOut.Append(code);
            }
            else
            {
                sbOut.Append(DecompositionMappings.m[index]);
            }
        }
    }

    // Step 2: bring the buffer into canonical order.
    canonicalOrdering(sbOut);

    // Step 3: canonical composition.
    // last_start is the index of the most recent starter; last_cc is the
    // combining class of the character immediately before position i.
    int last_cc = 0;
    int last_start = 0;

    for (int i = 0; i < sbOut.Length; i++)
    {
        int cc = combiningClass(sbOut[i]);

        // A character may only combine with the last starter when it is
        // not blocked: either it directly follows the starter
        // (last_cc == 0) or the preceding mark has a strictly lower
        // combining class. Using "<" rather than "!=" keeps a starter
        // (cc == 0) that follows a combining mark from being composed
        // across that mark.
        if (i > 0 && (last_cc == 0 || last_cc < cc))
        {
            int c = compose(sbOut[last_start], sbOut[i]);

            if (c != -1)
            {
                sbOut[last_start] = (char) c;
                sbOut.Remove(i, 1);
                i--;

                // After the removal, sbOut[i] is the character that
                // precedes the next one to be examined.
                if (i == last_start)
                {
                    last_cc = 0;
                }
                else
                {
                    last_cc = combiningClass(sbOut[i]);
                }

                continue;
            }
        }

        if (cc == 0)
        {
            last_start = i;
        }

        last_cc = cc;
    }

    return sbOut.ToString();
}
/// <summary>
/// Looks a character up in the decomposition key table by binary search.
/// The table is read as consecutive (key, value) pairs: even slots hold
/// the character, odd slots hold the associated mapping index.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <returns>Index if found, -1 otherwise.</returns>
internal static int decomposeIndex(char c)
{
    int lo = 0;
    int hi = DecompositionKeys.k.Length / 2;

    for (;;)
    {
        int mid = (lo + hi) / 2;
        int key = DecompositionKeys.k[2 * mid];

        if (key == c)
        {
            return DecompositionKeys.k[2 * mid + 1];
        }

        // The interval can no longer shrink: the character is absent.
        if (mid == lo)
        {
            return -1;
        }

        if (key < c)
        {
            lo = mid;
        }
        else
        {
            hi = mid;
        }
    }
}
/// <summary>
/// Returns the combining class of a given character, using a two-level
/// lookup: the high byte selects a page, the low byte selects the entry
/// within that page.
/// </summary>
/// <param name="c">The character.</param>
/// <returns>
/// The combining class, or 0 when no page is registered for the
/// character's high byte.
/// </returns>
internal static int combiningClass(char c)
{
    int page = CombiningClass.i[c >> 8];

    // A negative page entry means there is no table for this high byte.
    if (page < 0)
    {
        return 0;
    }

    return CombiningClass.c[page][c & 0xff];
}
/// <summary>
/// Rearranges the characters of a buffer in place so that they respect
/// the canonical ordering properties: a character with a non-zero
/// combining class is moved in front of preceding characters whose
/// combining class is higher. Characters with combining class 0 are
/// never moved and never crossed.
/// </summary>
/// <param name="sbIn">The buffer to rearrange.</param>
internal static void canonicalOrdering(System.Text.StringBuilder sbIn)
{
    bool swapped;

    // Keep sweeping over the buffer until one full pass makes no swap.
    do
    {
        swapped = false;

        int prevClass = 0;
        if (sbIn.Length > 0)
        {
            prevClass = combiningClass(sbIn[0]);
        }

        for (int pos = 0; pos < sbIn.Length - 1; pos++)
        {
            int currClass = combiningClass(sbIn[pos + 1]);

            if (currClass == 0 || prevClass <= currClass)
            {
                // Already in order; the current character becomes the
                // reference for the next comparison.
                prevClass = currClass;
                continue;
            }

            // Sink the out-of-place character towards the front until
            // the character before it has a class that is not higher.
            for (int k = pos + 1; k > 0 && combiningClass(sbIn[k - 1]) > currClass; k--)
            {
                char moved = sbIn[k];
                sbIn[k] = sbIn[k - 1];
                sbIn[k - 1] = moved;
                swapped = true;
            }

            // prevClass stays as it was: the character it describes now
            // sits at pos + 1.
        }
    }
    while (swapped);
}
/// <summary>
/// Returns the index of a character inside the composition table, using
/// the character's high byte to select a page and its low byte to select
/// the entry within that page.
/// </summary>
/// <param name="a">Character to look up.</param>
/// <returns>Index if found, -1 otherwise.</returns>
internal static int composeIndex(char a)
{
    int pageNumber = a >> 8;

    // High bytes beyond the page table have no composition data.
    if (pageNumber >= Composition.composePage.Length)
    {
        return -1;
    }

    int ap = Composition.composePage[pageNumber];
    if (ap != -1)
    {
        return Composition.composeData[ap][a & 0xff];
    }

    return -1;
}
/// <summary>
/// Tries to compose two characters canonically. Hangul composition is
/// attempted first; otherwise the composition tables are consulted in
/// the order single-first, single-second, multi.
/// </summary>
/// <param name="a">First character.</param>
/// <param name="b">Second character.</param>
/// <returns>
/// The composed character, or -1 if no composition could be found.
/// </returns>
internal static int compose(char a, char b)
{
    int hangul = composeHangul(a, b);
    if (hangul != -1)
    {
        return hangul;
    }

    int first = composeIndex(a);

    // "a" composes with exactly one possible second character.
    if (first >= Composition.singleFirstStart && first < Composition.singleSecondStart)
    {
        int firstSlot = first - Composition.singleFirstStart;
        if (b != Composition.singleFirst[firstSlot][0])
        {
            return -1;
        }
        return Composition.singleFirst[firstSlot][1];
    }

    int second = composeIndex(b);

    // "b" composes with exactly one possible first character.
    if (second >= Composition.singleSecondStart)
    {
        int secondSlot = second - Composition.singleSecondStart;
        if (a != Composition.singleSecond[secondSlot][0])
        {
            return -1;
        }
        return Composition.singleSecond[secondSlot][1];
    }

    // General case: both indices must fall into the multi ranges.
    bool inMultiRange = first >= 0
        && first < Composition.multiSecondStart
        && second >= Composition.multiSecondStart
        && second < Composition.singleFirstStart;
    if (!inMultiRange)
    {
        return -1;
    }

    char[] row = Composition.multiFirst[first];
    int column = second - Composition.multiSecondStart;
    if (column >= row.Length)
    {
        return -1;
    }

    // A zero entry marks "no composition" for this pair.
    char composed = row[column];
    if (composed == 0)
    {
        return -1;
    }
    return composed;
}
// Entire hangul code copied from:
// http://www.unicode.org/unicode/reports/tr15/
//
// Several hangul specific constants follow.

/// <summary>Base code point of the composed syllables (U+AC00).</summary>
internal const int SBase = 0xAC00;
/// <summary>Base code point used for the L (leading) index.</summary>
internal const int LBase = 0x1100;
/// <summary>Base code point used for the V (vowel) index.</summary>
internal const int VBase = 0x1161;
/// <summary>
/// Base code point used for the T (trailing) index. A T index of 0
/// stands for "no trailing character".
/// </summary>
internal const int TBase = 0x11A7;
/// <summary>Number of L values.</summary>
internal const int LCount = 19;
/// <summary>Number of V values.</summary>
internal const int VCount = 21;
/// <summary>Number of T values, including the "none" slot at index 0.</summary>
internal const int TCount = 28;

/// <summary>Number of syllables per L value (VCount * TCount = 588).</summary>
internal const int NCount = VCount * TCount;

/// <summary>Total number of composed syllables (LCount * NCount = 11172).</summary>
internal const int SCount = LCount * NCount;
/// <summary>
/// Decomposes a hangul character arithmetically into its L and V parts
/// and, when present, its T part.
/// </summary>
/// <param name="s">A character to decompose.</param>
/// <returns>
/// A string containing the hangul decomposition of the input character.
/// If no hangul decomposition can be found, a string containing the
/// character itself is returned.
/// </returns>
internal static System.String decomposeHangul(char s)
{
    int offset = s - SBase;
    bool isSyllable = offset >= 0 && offset < SCount;
    if (!isSyllable)
    {
        return s.ToString();
    }

    char lead = (char) (LBase + offset / NCount);
    char vowel = (char) (VBase + (offset % NCount) / TCount);
    int trailingIndex = offset % TCount;

    System.String jamo = new System.String(new char[] { lead, vowel });

    // Trailing index 0 means the syllable has no T part.
    if (trailingIndex != 0)
    {
        jamo += (char) (TBase + trailingIndex);
    }

    return jamo;
}
/// <summary>
/// Composes two hangul characters: either an L and a V character into an
/// LV syllable, or an LV syllable and a T character into an LVT syllable.
/// </summary>
/// <param name="a">First character.</param>
/// <param name="b">Second character.</param>
/// <returns>
/// Returns the composed character or -1 if the two characters cannot be
/// composed.
/// </returns>
internal static int composeHangul(char a, char b)
{
    // 1. check to see if two current characters are L and V
    int LIndex = a - LBase;
    if (0 <= LIndex && LIndex < LCount)
    {
        int VIndex = b - VBase;
        if (0 <= VIndex && VIndex < VCount)
        {
            // make syllable of form LV
            return SBase + (LIndex * VCount + VIndex) * TCount;
        }
    }

    // 2. check to see if two current characters are LV and T
    int SIndex = a - SBase;
    if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0)
    {
        int TIndex = b - TBase;

        // Both bounds are exclusive: TIndex 0 is TBase itself, which only
        // stands for "no trailing character", and TIndex == TCount lies
        // one past the last T value and would produce the next LV
        // syllable instead of an LVT syllable.
        if (0 < TIndex && TIndex < TCount)
        {
            // make syllable of form LVT
            return a + TIndex;
        }
    }

    return -1;
}