2 // SimpleCollator.cs : the core collator implementation
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Here's a summary for supporting contractions, expansions and diacritical
33 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
34 // weight value from one to another. For now it applies to all range of
35 // characters, but at some stage we might need to limit the remapping ranges.
37 // A Contraction consists of a string (actually char[]) and a sortkey entry
38 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
39 // as a single character that has the mapped sortkey value. There is no
40 // character which goes across "special rules". When the collator encountered
41 // such a sequence of characters, it just returns the sortkey value without
42 // further replacement.
44 // Since it is still likely to happen that a contraction sequence matches
45 // other character than the identical sequence (depending on CompareOptions
46 // and of course, defined sortkey value itself), comparison cannot be done
47 // at source char[] level.
49 // In IndexOf(), the first character in the target string or the target char
50 // itself is turned into sortkey bytes. If the character has a contraction and
51 // that is sortkey map, then it is used instead. If the contraction exists and
52 // that is replacement map, then the first character of the replacement string
53 // is searched instead. IndexOf() always searches only for the top character,
54 // and if it returned negative value, then it returns -1. Otherwise, it then
55 // tries IsPrefix() from that location. If it returns true, then it returns
58 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
59 // of expansion and there is no proper way to return such indexes within
60 // a single int return value.
62 // For example, try below in .NET:
63 // IndexOf("\u00E6", "a")
64 // IndexOf("\u00E6", "e")
69 using System
.Collections
;
70 using System
.Globalization
;
72 using Uni
= Mono
.Globalization
.Unicode
.MSCompatUnicodeTable
;
73 using UUtil
= Mono
.Globalization
.Unicode
.MSCompatUnicodeTableUtil
;
74 using COpt
= System
.Globalization
.CompareOptions
;
76 namespace Mono
.Globalization
.Unicode
78 internal class SimpleCollator
80 // this environment variable is for debugging quick check.
81 #pragma warning disable 169, 414
82 static bool QuickCheckDisabled
=
83 Environment
.internalGetEnvironmentVariable (
84 "MONO_COLLATION_QUICK_CHECK_DISABLED") == "yes";
85 #pragma warning restore 169, 414
87 unsafe internal struct Context
89 public Context (CompareOptions opt
, byte* alwaysMatchFlags
, byte* neverMatchFlags
, byte* buffer1
, byte* buffer2
, byte* prev1
, bool quickCheckPossible
)
92 AlwaysMatchFlags
= alwaysMatchFlags
;
93 NeverMatchFlags
= neverMatchFlags
;
98 QuickCheckPossible
= quickCheckPossible
;
101 public readonly CompareOptions Option
;
102 public readonly byte* NeverMatchFlags
;
103 public readonly byte* AlwaysMatchFlags
;
104 public byte* Buffer1
;
105 public byte* Buffer2
;
107 public byte* PrevSortKey
;
108 public readonly bool QuickCheckPossible
;
110 public void ClearPrevInfo ()
117 unsafe struct PreviousInfo
120 public byte* SortKey
;
122 public PreviousInfo (bool dummy
)
131 public string Source
;
138 static SimpleCollator invariant
=
139 new SimpleCollator (CultureInfo
.InvariantCulture
);
141 readonly TextInfo textInfo
; // for ToLower().
142 readonly bool frenchSort
;
143 unsafe readonly byte* cjkCatTable
;
144 unsafe readonly byte* cjkLv1Table
;
145 readonly CodePointIndexer cjkIndexer
;
146 unsafe readonly byte* cjkLv2Table
;
147 readonly CodePointIndexer cjkLv2Indexer
;
149 readonly Contraction
[] contractions
;
150 readonly Level2Map
[] level2Maps
;
152 // This flag marks characters as "unsafe", where the character
153 // could be used as part of a contraction (whose length > 1).
154 readonly byte [] unsafeFlags
;
156 const int UnsafeFlagLength
= 0x300 / 8;
158 // readonly byte [] contractionFlags = new byte [16];
160 // This array is internally used inside IndexOf() to store
161 // "no need to check" ASCII characters.
163 // Now that it should be thread safe, this array is allocated
165 // byte [] neverMatchFlags = new byte [128 / 8];
167 #region .ctor() and split functions
169 public SimpleCollator (CultureInfo culture
)
172 textInfo
= culture
.TextInfo
;
175 SetCJKTable (culture
, ref cjkIndexer
,
176 ref cjkCatTable
, ref cjkLv1Table
,
177 ref cjkLv2Indexer
, ref cjkLv2Table
);
180 // Get tailoring info
181 TailoringInfo t
= null;
182 for (CultureInfo ci
= culture
; ci
.LCID
!= 127; ci
= ci
.Parent
) {
183 t
= Uni
.GetTailoringInfo (ci
.LCID
);
187 if (t
== null) // then use invariant
188 t
= Uni
.GetTailoringInfo (127);
190 frenchSort
= t
.FrenchSort
;
191 Uni
.BuildTailoringTables (culture
, t
, ref contractions
,
193 unsafeFlags
= new byte [UnsafeFlagLength
];
194 foreach (Contraction c
in contractions
) {
195 if (c
.Source
.Length
> 1)
196 foreach (char ch
in c
.Source
)
197 unsafeFlags
[(int) ch
/ 8 ]
198 |= (byte) (1 << ((int) ch
& 7));
199 // for (int i = 0; i < c.Source.Length; i++) {
200 // int ch = c.Source [i];
203 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
207 foreach (Contraction c
in invariant
.contractions
) {
208 if (c
.Source
.Length
> 1)
209 foreach (char ch
in c
.Source
)
210 unsafeFlags
[(int) ch
/ 8 ]
211 |= (byte) (1 << ((int) ch
& 7));
212 // for (int i = 0; i < c.Source.Length; i++) {
213 // int ch = c.Source [i];
216 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
220 // FIXME: Since tailorings are mostly for latin
221 // (and in some cases Cyrillic) characters, it would
222 // be much better for performance to store "start
223 // indexes" for > 370 (culture-specific letters).
226 // dump tailoring table
227 Console.WriteLine ("******** building table for {0} : contractions - {1} diacritical - {2}",
228 culture.LCID, contractions.Length, level2Maps.Length);
229 foreach (Contraction c in contractions) {
230 foreach (char cc in c.Source)
231 Console.Write ("{0:X4} ", (int) cc);
232 Console.WriteLine (" -> '{0}'", c.Replacement);
237 unsafe private void SetCJKTable (
238 CultureInfo culture
, ref CodePointIndexer cjkIndexer
,
239 ref byte* catTable
, ref byte* lv1Table
,
240 ref CodePointIndexer lv2Indexer
, ref byte* lv2Table
)
242 string name
= GetNeutralCulture (culture
).Name
;
244 Uni
.FillCJK (name
, ref cjkIndexer
, ref catTable
,
245 ref lv1Table
, ref lv2Indexer
, ref lv2Table
);
248 static CultureInfo
GetNeutralCulture (CultureInfo info
)
250 CultureInfo ret
= info
;
251 while (ret
.Parent
!= null && ret
.Parent
.LCID
!= 127)
258 unsafe byte Category (int cp
)
260 if (cp
< 0x3000 || cjkCatTable
== null)
261 return Uni
.Category (cp
);
262 int idx
= cjkIndexer
.ToIndex (cp
);
263 return idx
< 0 ? Uni
.Category (cp
) : cjkCatTable
[idx
];
266 unsafe byte Level1 (int cp
)
268 if (cp
< 0x3000 || cjkLv1Table
== null)
269 return Uni
.Level1 (cp
);
270 int idx
= cjkIndexer
.ToIndex (cp
);
271 return idx
< 0 ? Uni
.Level1 (cp
) : cjkLv1Table
[idx
];
274 unsafe byte Level2 (int cp
, ExtenderType ext
)
276 if (ext
== ExtenderType
.Buggy
)
278 else if (ext
== ExtenderType
.Conditional
)
281 if (cp
< 0x3000 || cjkLv2Table
== null)
282 return Uni
.Level2 (cp
);
283 int idx
= cjkLv2Indexer
.ToIndex (cp
);
284 byte ret
= idx
< 0 ? (byte) 0 : cjkLv2Table
[idx
];
287 ret
= Uni
.Level2 (cp
);
288 if (level2Maps
.Length
== 0)
290 for (int i
= 0; i
< level2Maps
.Length
; i
++) {
291 if (level2Maps
[i
].Source
== ret
)
292 return level2Maps
[i
].Replace
;
293 else if (level2Maps
[i
].Source
> ret
)
299 static bool IsHalfKana (int cp
, COpt opt
)
301 return (opt
& COpt
.IgnoreWidth
) != 0 ||
302 Uni
.IsHalfWidthKana ((char) cp
);
305 Contraction
GetContraction (string s
, int start
, int end
)
307 // int si = s [start];
308 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
310 Contraction c
= GetContraction (s
, start
, end
, contractions
);
311 if (c
!= null || lcid
== 127)
313 return GetContraction (s
, start
, end
, invariant
.contractions
);
316 Contraction
GetContraction (string s
, int start
, int end
, Contraction
[] clist
)
318 for (int i
= 0; i
< clist
.Length
; i
++) {
319 Contraction ct
= clist
[i
];
320 int diff
= ct
.Source
[0] - s
[start
];
322 return null; // it's already sorted
325 char [] chars
= ct
.Source
;
326 if (end
- start
< chars
.Length
)
329 for (int n
= 0; n
< chars
.Length
; n
++)
330 if (s
[start
+ n
] != chars
[n
]) {
340 Contraction
GetTailContraction (string s
, int start
, int end
)
342 // int si = s [end - 1];
343 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
345 Contraction c
= GetTailContraction (s
, start
, end
, contractions
);
346 if (c
!= null || lcid
== 127)
348 return GetTailContraction (s
, start
, end
, invariant
.contractions
);
351 Contraction
GetTailContraction (string s
, int start
, int end
, Contraction
[] clist
)
353 if (start
== end
|| end
< -1 || start
>= s
.Length
|| s
.Length
<= end
+ 1)
354 throw new SystemException (String
.Format ("MONO internal error. Failed to get TailContraction. start = {0} end = {1} string = '{2}'", start
, end
, s
));
355 for (int i
= 0; i
< clist
.Length
; i
++) {
356 Contraction ct
= clist
[i
];
357 char [] chars
= ct
.Source
;
358 if (chars
.Length
> start
- end
)
360 if (chars
[chars
.Length
- 1] != s
[start
])
364 for (int n
= 0, spos
= start
- chars
.Length
+ 1;
367 if (s
[spos
] != chars
[n
]) {
378 Contraction
GetContraction (char c
)
380 Contraction ct
= GetContraction (c
, contractions
);
381 if (ct
!= null || lcid
== 127)
383 return GetContraction (c
, invariant
.contractions
);
386 Contraction
GetContraction (char c
, Contraction
[] clist
)
388 for (int i
= 0; i
< clist
.Length
; i
++) {
389 Contraction ct
= clist
[i
];
390 if (ct
.Source
[0] > c
)
391 return null; // it's already sorted
392 if (ct
.Source
[0] == c
&& ct
.Source
.Length
== 1)
398 int FilterOptions (int i
, COpt opt
)
400 if ((opt
& COpt
.IgnoreWidth
) != 0) {
401 int x
= Uni
.ToWidthCompat (i
);
405 if ((opt
& COpt
.OrdinalIgnoreCase
) != 0)
406 i
= textInfo
.ToLower ((char) i
);
407 if ((opt
& COpt
.IgnoreCase
) != 0)
408 i
= textInfo
.ToLower ((char) i
);
409 if ((opt
& COpt
.IgnoreKanaType
) != 0)
410 i
= Uni
.ToKanaTypeInsensitive (i
);
422 ExtenderType
GetExtenderType (int i
)
424 // LAMESPEC: Windows expects true for U+3005, but
425 // sometimes it does not represent to repeat just
427 // Windows also expects true for U+3031 and U+3032,
428 // but they should *never* repeat one character.
430 // U+2015 becomes an extender only when it is Japanese
432 return lcid
== 16 ? ExtenderType
.Conditional
:
435 if (i
< 0x3005 || i
> 0xFF70)
436 return ExtenderType
.None
;
441 return ExtenderType
.Simple
;
443 return ExtenderType
.Conditional
;
446 return ExtenderType
.Voiced
;
450 return ExtenderType
.None
;
452 case 0x3005: // LAMESPEC: see above.
453 return ExtenderType
.Buggy
;
454 case 0x3031: // LAMESPEC: see above.
455 case 0x3032: // LAMESPEC: see above.
458 return ExtenderType
.Simple
;
461 return ExtenderType
.Voiced
;
463 return ExtenderType
.Conditional
;
465 return ExtenderType
.None
;
469 static byte ToDashTypeValue (ExtenderType ext
, COpt opt
)
471 if ((opt
& COpt
.IgnoreNonSpace
) != 0) // LAMESPEC: huh, why?
474 case ExtenderType
.None
:
476 case ExtenderType
.Conditional
:
483 int FilterExtender (int i
, ExtenderType ext
, COpt opt
)
485 if (ext
== ExtenderType
.Conditional
&&
486 Uni
.HasSpecialWeight ((char) i
)) {
487 bool half
= IsHalfKana ((char) i
, opt
);
488 bool katakana
= !Uni
.IsHiragana ((char) i
);
489 switch (Level1 (i
) & 7) {
491 return half
? 0xFF71 : katakana
? 0x30A2 : 0x3042;
493 return half
? 0xFF72 : katakana
? 0x30A4 : 0x3044;
495 return half
? 0xFF73 : katakana
? 0x30A6 : 0x3046;
497 return half
? 0xFF74 : katakana
? 0x30A8 : 0x3048;
499 return half
? 0xFF75 : katakana
? 0x30AA : 0x304A;
505 static bool IsIgnorable (int i
, COpt opt
)
507 return Uni
.IsIgnorable (i
, (byte) (1 +
508 ((opt
& COpt
.IgnoreSymbols
) != 0 ? 2 : 0) +
509 ((opt
& COpt
.IgnoreNonSpace
) != 0 ? 4 : 0)));
515 return i
/ 8 >= unsafeFlags
.Length
? true : (unsafeFlags
[i
/ 8] & (1 << (i
% 8))) == 0;
520 public SortKey
GetSortKey (string s
)
522 return GetSortKey (s
, CompareOptions
.None
);
525 public SortKey
GetSortKey (string s
, CompareOptions options
)
527 return GetSortKey (s
, 0, s
.Length
, options
);
530 public SortKey
GetSortKey (string s
, int start
, int length
, CompareOptions options
)
532 SortKeyBuffer buf
= new SortKeyBuffer (lcid
);
533 buf
.Initialize (options
, lcid
, s
, frenchSort
);
534 int end
= start
+ length
;
535 GetSortKey (s
, start
, end
, buf
, options
);
536 return buf
.GetResultAndReset ();
539 unsafe void GetSortKey (string s
, int start
, int end
,
540 SortKeyBuffer buf
, CompareOptions opt
)
542 byte* prevbuf
= stackalloc byte [4];
543 ClearBuffer (prevbuf
, 4);
544 Context ctx
= new Context (opt
, null, null, null, null, prevbuf
, false);
546 for (int n
= start
; n
< end
; n
++) {
549 ExtenderType ext
= GetExtenderType (i
);
550 if (ext
!= ExtenderType
.None
) {
551 i
= FilterExtender (ctx
.PrevCode
, ext
, opt
);
553 FillSortKeyRaw (i
, ext
, buf
, opt
);
554 else if (ctx
.PrevSortKey
!= null) {
555 byte* b
= ctx
.PrevSortKey
;
559 b
[2] != 1 ? b
[2] : Level2 (i
, ext
),
560 b
[3] != 1 ? b
[3] : Uni
.Level3 (i
));
562 // otherwise do nothing.
563 // (if the extender is the first char
564 // in the string, then just ignore.)
568 if (IsIgnorable (i
, opt
))
570 i
= FilterOptions (i
, opt
);
572 Contraction ct
= GetContraction (s
, n
, end
);
574 if (ct
.Replacement
!= null) {
575 GetSortKey (ct
.Replacement
, 0, ct
.Replacement
.Length
, buf
, opt
);
577 byte* b
= ctx
.PrevSortKey
;
578 for (int bi
= 0; bi
< ct
.SortKey
.Length
; bi
++)
579 b
[bi
] = ct
.SortKey
[bi
];
583 b
[2] != 1 ? b
[2] : Level2 (i
, ext
),
584 b
[3] != 1 ? b
[3] : Uni
.Level3 (i
));
587 n
+= ct
.Source
.Length
- 1;
590 if (!Uni
.IsIgnorableNonSpacing (i
))
592 FillSortKeyRaw (i
, ExtenderType
.None
, buf
, opt
);
597 void FillSortKeyRaw (int i
, ExtenderType ext
,
598 SortKeyBuffer buf
, CompareOptions opt
)
600 if (0x3400 <= i
&& i
<= 0x4DB5) {
601 int diff
= i
- 0x3400;
602 buf
.AppendCJKExtension (
603 (byte) (0x10 + diff
/ 254),
604 (byte) (diff
% 254 + 2));
608 UnicodeCategory uc
= char.GetUnicodeCategory ((char) i
);
610 case UnicodeCategory
.PrivateUse
:
611 int diff
= i
- 0xE000;
613 (byte) (0xE5 + diff
/ 254),
614 (byte) (diff
% 254 + 2),
618 case UnicodeCategory
.Surrogate
:
619 FillSurrogateSortKeyRaw (i
, buf
);
623 byte level2
= Level2 (i
, ext
);
624 if (Uni
.HasSpecialWeight ((char) i
)) {
625 byte level1
= Level1 (i
);
631 Uni
.IsJapaneseSmallLetter ((char) i
),
632 ToDashTypeValue (ext
, opt
),
633 !Uni
.IsHiragana ((char) i
),
634 IsHalfKana ((char) i
, opt
)
636 if ((opt
& COpt
.IgnoreNonSpace
) == 0 && ext
== ExtenderType
.Voiced
)
637 // Append voice weight
638 buf
.AppendNormal (1, 1, 1, 0);
648 void FillSurrogateSortKeyRaw (int i
, SortKeyBuffer buf
)
657 lower
= (byte) ((i
== 0xD800) ? 0x3E : 0x3F);
658 } else if (0xD840 <= i
&& i
< 0xD880) {
662 } else if (0xDB80 <= i
&& i
< 0xDC00) {
663 diffbase
= 0xDB80 - 0x40;
667 diffbase
= 0xDC00 - 0xF8 + 2;
671 int diff
= i
- diffbase
;
674 (byte) (segment
+ diff
/ 254),
675 (byte) (diff
% 254 + 2),
684 public int Compare (string s1
, string s2
)
686 return Compare (s1
, s2
, CompareOptions
.None
);
689 public int Compare (string s1
, string s2
, CompareOptions options
)
691 return Compare (s1
, 0, s1
.Length
, s2
, 0, s2
.Length
, options
);
694 private int CompareOrdinal (string s1
, int idx1
, int len1
,
695 string s2
, int idx2
, int len2
)
697 int min
= len1
< len2
? len1
: len2
;
698 int end1
= idx1
+ min
;
699 int end2
= idx2
+ min
;
700 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
701 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
702 for (int i1
= idx1
, i2
= idx2
;
703 i1
< end1
&& i2
< end2
; i1
++, i2
++)
704 if (s1
[i1
] != s2
[i2
])
705 return s1
[i1
] - s2
[i2
];
706 return len1
== len2
? 0 :
707 len1
== min
? - 1 : 1;
710 // mostly equivalent to CompareOrdinal, but the return value is
711 // not based on codepoints.
712 private int CompareQuick (string s1
, int idx1
, int len1
,
713 string s2
, int idx2
, int len2
, out bool sourceConsumed
,
714 out bool targetConsumed
, bool immediateBreakup
)
716 sourceConsumed
= false;
717 targetConsumed
= false;
718 int min
= len1
< len2
? len1
: len2
;
719 int end1
= idx1
+ min
;
720 int end2
= idx2
+ min
;
721 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
722 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
723 for (int i1
= idx1
, i2
= idx2
;
724 i1
< end1
&& i2
< end2
; i1
++, i2
++)
725 if (s1
[i1
] != s2
[i2
]) {
726 if (immediateBreakup
)
728 int ret
= Category (s1
[i1
]) - Category (s2
[i2
]);
730 ret
= Level1 (s1
[i1
]) - Level1 (s2
[i2
]);
733 ret
= Uni
.Level3 (s1
[i1
]) - Uni
.Level3 (s2
[i2
]);
735 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. '{0}' {2} {3} '{1}' {4} {5}", s1
, s2
, idx1
, end1
, idx2
, end2
));
738 sourceConsumed
= len1
<= len2
;
739 targetConsumed
= len1
>= len2
;
740 return len1
== len2
? 0 :
741 len1
== min
? - 1 : 1;
744 private int CompareOrdinalIgnoreCase (string s1
, int idx1
, int len1
,
745 string s2
, int idx2
, int len2
)
747 int min
= len1
< len2
? len1
: len2
;
748 int end1
= idx1
+ min
;
749 int end2
= idx2
+ min
;
750 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
751 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
752 TextInfo ti
= invariant
.textInfo
;
753 for (int i1
= idx1
, i2
= idx2
;
754 i1
< end1
&& i2
< end2
; i1
++, i2
++)
755 if (ti
.ToLower (s1
[i1
]) != ti
.ToLower (s2
[i2
]))
756 return ti
.ToLower (s1
[i1
]) - ti
.ToLower (s2
[i2
]);
757 return len1
== len2
? 0 :
758 len1
== min
? - 1 : 1;
761 public unsafe int Compare (string s1
, int idx1
, int len1
,
762 string s2
, int idx2
, int len2
, CompareOptions options
)
764 // quick equality check
765 if (idx1
== idx2
&& len1
== len2
&&
766 Object
.ReferenceEquals (s1
, s2
))
768 if (options
== CompareOptions
.Ordinal
)
769 return CompareOrdinal (s1
, idx1
, len1
, s2
, idx2
, len2
);
770 if (options
== CompareOptions
.OrdinalIgnoreCase
)
771 return CompareOrdinalIgnoreCase (s1
, idx1
, len1
, s2
, idx2
, len2
);
773 #if false // stable easy version, depends on GetSortKey().
774 SortKey sk1
= GetSortKey (s1
, idx1
, len1
, options
);
775 SortKey sk2
= GetSortKey (s2
, idx2
, len2
, options
);
776 byte [] d1
= sk1
.KeyData
;
777 byte [] d2
= sk2
.KeyData
;
778 int len
= d1
.Length
> d2
.Length
? d2
.Length
: d1
.Length
;
779 for (int i
= 0; i
< len
; i
++)
780 if (d1
[i
] != d2
[i
])
781 return d1
[i
] < d2
[i
] ? -1 : 1;
782 return d1
.Length
== d2
.Length
? 0 : d1
.Length
< d2
.Length
? -1 : 1;
784 byte* sk1
= stackalloc byte [4];
785 byte* sk2
= stackalloc byte [4];
786 ClearBuffer (sk1
, 4);
787 ClearBuffer (sk2
, 4);
788 Context ctx
= new Context (options
, null, null, sk1
, sk2
, null,
789 QuickCheckPossible (s1
, idx1
, idx1
+ len1
, s2
, idx2
, idx2
+ len2
));
792 int ret
= CompareInternal (s1
, idx1
, len1
, s2
, idx2
, len2
, out dummy
, out dummy2
, true, false, ref ctx
);
793 return ret
== 0 ? 0 : ret
< 0 ? -1 : 1;
797 unsafe void ClearBuffer (byte* buffer
, int size
)
799 for (int i
= 0; i
< size
; i
++)
803 bool QuickCheckPossible (string s1
, int idx1
, int end1
,
804 string s2
, int idx2
, int end2
)
807 // looks like it is buggy and inefficient, so just disable it.
810 if (QuickCheckDisabled
)
812 // if (s1.Length > 100 || s2.Length > 100)
814 for (int i
= idx1
; i
< end1
; i
++)
815 if (s1
[i
] < 0x20 && (s1
[i
] < '\x9' || s1
[i
] > '\xD') || s1
[i
] >= 0x80 || s1
[i
] == '-' || s1
[i
] == '\'')
817 for (int i
= idx2
; i
< end2
; i
++)
818 if (s2
[i
] < 0x20 && (s2
[i
] < '\x9' || s2
[i
] > '\xD') || s2
[i
] >= 0x80 || s2
[i
] == '-' || s2
[i
] == '\'')
824 unsafe int CompareInternal (string s1
, int idx1
, int len1
, string s2
,
826 out bool targetConsumed
, out bool sourceConsumed
,
827 bool skipHeadingExtenders
, bool immediateBreakup
,
830 COpt opt
= ctx
.Option
;
833 int end1
= idx1
+ len1
;
834 int end2
= idx2
+ len2
;
835 targetConsumed
= false;
836 sourceConsumed
= false;
837 PreviousInfo prev2
= new PreviousInfo (false);
839 if (opt
== CompareOptions
.None
&& ctx
.QuickCheckPossible
)
840 return CompareQuick (s1
, idx1
, len1
, s2
, idx2
, len2
, out sourceConsumed
, out targetConsumed
, immediateBreakup
);
842 // It holds final result that comes from the comparison
843 // at level 2 or lower. Even if Compare() found the
844 // difference at level 2 or lower, it still has to
845 // continue level 1 comparison. FinalResult is used
846 // when there was no further differences.
848 // It holds the comparison level to do. It starts from
849 // 5, and becomes 1 when we do primary-only comparison.
850 int currentLevel
= 5;
857 // Skip heading extenders
858 if (skipHeadingExtenders
) {
859 for (; idx1
< end1
; idx1
++)
860 if (GetExtenderType (s1
[idx1
]) == ExtenderType
.None
)
862 for (; idx2
< end2
; idx2
++)
863 if (GetExtenderType (s2
[idx2
]) == ExtenderType
.None
)
867 ExtenderType ext1
= ExtenderType
.None
;
868 ExtenderType ext2
= ExtenderType
.None
;
870 int quickCheckPos1
= idx1
;
871 int quickCheckPos2
= idx2
;
872 bool stringSort
= (opt
& COpt
.StringSort
) != 0;
873 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
874 Escape escape1
= new Escape ();
875 Escape escape2
= new Escape ();
878 for (; idx1
< end1
; idx1
++)
879 if (!IsIgnorable (s1
[idx1
], opt
))
881 for (; idx2
< end2
; idx2
++)
882 if (!IsIgnorable (s2
[idx2
], opt
))
886 if (escape1
.Source
== null)
889 start1
= escape1
.Start
;
890 idx1
= escape1
.Index
;
892 quickCheckPos1
= escape1
.Optional
;
893 escape1
.Source
= null;
897 if (escape2
.Source
== null)
900 start2
= escape2
.Start
;
901 idx2
= escape2
.Index
;
903 quickCheckPos2
= escape2
.Optional
;
904 escape2
.Source
= null;
908 // If comparison is unstable, then this part is one of the most doubtful part.
909 // Here we run quick codepoint comparison and run back to "stable" area to
910 // compare characters.
912 // Strictly to say, even the first character
913 // could be compared here, but it messed
914 // backward step, so I just avoided mess.
915 if (quickCheckPos1
< idx1
&& quickCheckPos2
< idx2
) {
916 while (idx1
< end1
&& idx2
< end2
&&
917 s1
[idx1
] == s2
[idx2
]) {
921 if (idx1
== end1
|| idx2
== end2
)
922 continue; // check replacement
924 int backwardEnd1
= quickCheckPos1
;
925 int backwardEnd2
= quickCheckPos2
;
926 quickCheckPos1
= idx1
;
927 quickCheckPos2
= idx2
;
932 for (;idx1
> backwardEnd1
; idx1
--)
933 if (Category (s1
[idx1
]) != 1)
935 for (;idx2
> backwardEnd2
; idx2
--)
936 if (Category (s2
[idx2
]) != 1)
938 for (;idx1
> backwardEnd1
; idx1
--)
939 if (IsSafe (s1
[idx1
]))
941 for (;idx2
> backwardEnd2
; idx2
--)
942 if (IsSafe (s2
[idx2
]))
951 int i1
= FilterOptions (s1
[idx1
], opt
);
952 int i2
= FilterOptions (s2
[idx2
], opt
);
953 bool special1
= false;
954 bool special2
= false;
956 // If current character is an expander, then
957 // repeat the previous character.
958 ext1
= GetExtenderType (i1
);
959 if (ext1
!= ExtenderType
.None
) {
960 if (ctx
.PrevCode
< 0) {
961 if (ctx
.PrevSortKey
== null) {
966 sk1
= ctx
.PrevSortKey
;
969 i1
= FilterExtender (ctx
.PrevCode
, ext1
, opt
);
971 ext2
= GetExtenderType (i2
);
972 if (ext2
!= ExtenderType
.None
) {
973 if (prev2
.Code
< 0) {
974 if (prev2
.SortKey
== null) {
982 i2
= FilterExtender (prev2
.Code
, ext2
, opt
);
985 byte cat1
= Category (i1
);
986 byte cat2
= Category (i2
);
988 // Handle special weight characters
990 if (!stringSort
&& currentLevel
== 5) {
991 lv5At1
= escape1
.Source
!= null ?
992 escape1
.Index
- escape1
.Start
:
994 // here Windows has a bug that it does
995 // not consider thirtiary weight.
996 lv5Value1
= Level1 (i1
) << 8 + Uni
.Level3 (i1
);
1002 if (!stringSort
&& currentLevel
== 5) {
1003 lv5At2
= escape2
.Source
!= null ?
1004 escape2
.Index
- escape2
.Start
:
1006 // here Windows has a bug that it does
1007 // not consider thirtiary weight.
1008 lv5Value2
= Level1 (i2
) << 8 + Uni
.Level3 (i2
);
1013 if (cat1
== 6 || cat2
== 6) {
1014 if (currentLevel
== 5) {
1015 if (lv5Value1
== lv5Value2
) {
1016 // There is not really difference here.
1017 lv5At1
= lv5At2
= -1;
1018 lv5Value1
= lv5Value2
= 0;
1026 Contraction ct1
= null;
1027 if (ext1
== ExtenderType
.None
)
1028 ct1
= GetContraction (s1
, idx1
, end1
);
1033 else if (ct1
!= null) {
1034 offset1
= ct1
.Source
.Length
;
1035 if (ct1
.SortKey
!= null) {
1037 for (int i
= 0; i
< ct1
.SortKey
.Length
; i
++)
1038 sk1
[i
] = ct1
.SortKey
[i
];
1040 ctx
.PrevSortKey
= sk1
;
1042 else if (escape1
.Source
== null) {
1043 escape1
.Source
= s1
;
1044 escape1
.Start
= start1
;
1045 escape1
.Index
= cur1
+ ct1
.Source
.Length
;
1047 escape1
.Optional
= quickCheckPos1
;
1048 s1
= ct1
.Replacement
;
1059 sk1
[1] = Level1 (i1
);
1060 if (!ignoreNonSpace
&& currentLevel
> 1)
1061 sk1
[2] = Level2 (i1
, ext1
);
1062 if (currentLevel
> 2)
1063 sk1
[3] = Uni
.Level3 (i1
);
1064 if (currentLevel
> 3)
1065 special1
= Uni
.HasSpecialWeight ((char) i1
);
1070 Contraction ct2
= null;
1071 if (ext2
== ExtenderType
.None
)
1072 ct2
= GetContraction (s2
, idx2
, end2
);
1076 else if (ct2
!= null) {
1077 idx2
+= ct2
.Source
.Length
;
1078 if (ct2
.SortKey
!= null) {
1080 for (int i
= 0; i
< ct2
.SortKey
.Length
; i
++)
1081 sk2
[i
] = ct2
.SortKey
[i
];
1083 prev2
.SortKey
= sk2
;
1085 else if (escape2
.Source
== null) {
1086 escape2
.Source
= s2
;
1087 escape2
.Start
= start2
;
1088 escape2
.Index
= cur2
+ ct2
.Source
.Length
;
1090 escape2
.Optional
= quickCheckPos2
;
1091 s2
= ct2
.Replacement
;
1102 sk2
[1] = Level1 (i2
);
1103 if (!ignoreNonSpace
&& currentLevel
> 1)
1104 sk2
[2] = Level2 (i2
, ext2
);
1105 if (currentLevel
> 2)
1106 sk2
[3] = Uni
.Level3 (i2
);
1107 if (currentLevel
> 3)
1108 special2
= Uni
.HasSpecialWeight ((char) i2
);
1114 // Add offset here so that it does not skip
1115 // idx1 while s2 has a replacement.
1118 if (!ignoreNonSpace
) {
1119 // add diacritical marks in s1 here
1120 while (idx1
< end1
) {
1121 if (Category (s1
[idx1
]) != 1)
1125 sk1
[2] = (byte) (sk1
[2] +
1126 Level2 (s1
[idx1
], ExtenderType
.None
));
1130 // add diacritical marks in s2 here
1131 while (idx2
< end2
) {
1132 if (Category (s2
[idx2
]) != 1)
1136 sk2
[2] = (byte) (sk2
[2] +
1137 Level2 (s2
[idx2
], ExtenderType
.None
));
1142 int ret
= sk1
[0] - sk2
[0];
1143 ret
= ret
!= 0 ? ret
: sk1
[1] - sk2
[1];
1146 if (currentLevel
== 1)
1148 if (!ignoreNonSpace
) {
1149 ret
= sk1
[2] - sk2
[2];
1152 if (immediateBreakup
)
1153 return -1; // different
1154 currentLevel
= frenchSort
? 2 : 1;
1158 if (currentLevel
== 2)
1160 ret
= sk1
[3] - sk2
[3];
1163 if (immediateBreakup
)
1164 return -1; // different
1168 if (currentLevel
== 3)
1170 if (special1
!= special2
) {
1171 if (immediateBreakup
)
1172 return -1; // different
1173 finalResult
= special1
? 1 : -1;
1178 ret
= CompareFlagPair (
1179 !Uni
.IsJapaneseSmallLetter ((char) i1
),
1180 !Uni
.IsJapaneseSmallLetter ((char) i2
));
1181 ret
= ret
!= 0 ? ret
:
1182 ToDashTypeValue (ext1
, opt
) -
1183 ToDashTypeValue (ext2
, opt
);
1184 ret
= ret
!= 0 ? ret
: CompareFlagPair (
1185 Uni
.IsHiragana ((char) i1
),
1186 Uni
.IsHiragana ((char) i2
));
1187 ret
= ret
!= 0 ? ret
: CompareFlagPair (
1188 !IsHalfKana ((char) i1
, opt
),
1189 !IsHalfKana ((char) i2
, opt
));
1191 if (immediateBreakup
)
1192 return -1; // different
1200 // If there were only level 3 or lower differences,
1201 // then we still have to find diacritical differences
1203 if (!ignoreNonSpace
&&
1204 finalResult
!= 0 && currentLevel
> 2) {
1205 while (idx1
< end1
&& idx2
< end2
) {
1206 if (!Uni
.IsIgnorableNonSpacing (s1
[idx1
]))
1208 if (!Uni
.IsIgnorableNonSpacing (s2
[idx2
]))
1210 finalResult
= Level2 (FilterOptions (s1
[idx1
], opt
), ext1
) - Level2 (FilterOptions (s2
[idx2
], opt
), ext2
);
1211 if (finalResult
!= 0)
1215 // they should work only for the first character
1216 ext1
= ExtenderType
.None
;
1217 ext2
= ExtenderType
.None
;
1220 if (currentLevel
== 1 && finalResult
!= 0) {
1221 while (idx1
< end1
) {
1222 if (Uni
.IsIgnorableNonSpacing (s1
[idx1
]))
1227 while (idx2
< end2
) {
1228 if (Uni
.IsIgnorableNonSpacing (s2
[idx2
]))
1234 // we still have to check level 5
1235 if (finalResult
== 0) {
1236 if (lv5At1
< 0 && lv5At2
>= 0)
1238 else if (lv5At2
< 0 && lv5At1
>= 0)
1241 finalResult
= lv5At1
- lv5At2
;
1242 if (finalResult
== 0)
1243 finalResult
= lv5Value1
- lv5Value2
;
1246 if (finalResult
== 0) {
1248 targetConsumed
= true;
1250 sourceConsumed
= true;
1252 return idx1
!= end1
? 1 : idx2
== end2
? finalResult
: -1;
1255 int CompareFlagPair (bool b1
, bool b2
)
1257 return b1
== b2
? 0 : b1
? 1 : -1;
1262 #region IsPrefix() and IsSuffix()
1264 public bool IsPrefix (string src
, string target
, CompareOptions opt
)
1266 return IsPrefix (src
, target
, 0, src
.Length
, opt
);
1269 public unsafe bool IsPrefix (string s
, string target
, int start
, int length
, CompareOptions opt
)
1271 if (target
.Length
== 0)
1273 byte* sk1
= stackalloc byte [4];
1274 byte* sk2
= stackalloc byte [4];
1275 ClearBuffer (sk1
, 4);
1276 ClearBuffer (sk2
, 4);
1277 Context ctx
= new Context (opt
, null, null, sk1
, sk2
, null,
1278 QuickCheckPossible (s
, start
, start
+ length
, target
, 0, target
.Length
));
1279 return IsPrefix (s
, target
, start
, length
, true, ref ctx
);
1282 unsafe bool IsPrefix (string s
, string target
, int start
, int length
, bool skipHeadingExtenders
, ref Context ctx
)
1284 bool consumed
, dummy
;
1285 CompareInternal (s
, start
, length
,
1286 target
, 0, target
.Length
,
1287 out consumed
, out dummy
, skipHeadingExtenders
,
1294 public bool IsSuffix (string src
, string target
, CompareOptions opt
)
1296 return IsSuffix (src
, target
, src
.Length
- 1, src
.Length
, opt
);
1299 public unsafe bool IsSuffix (string s
, string target
, int start
, int length
, CompareOptions opt
)
1301 if (target
.Length
== 0)
1303 int last
= LastIndexOf (s
, target
, start
, length
, opt
);
1304 return last
>= 0 && Compare (s
, last
, s
.Length
- last
, target
, 0, target
.Length
, opt
) == 0;
1306 // quick check : simple codepoint comparison
1307 if (s.Length >= target.Length) {
1309 int se = start - length;
1310 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1311 if (s [si] != target [i])
1313 if (si == start + target.Length)
1317 PreviousInfo prev = new PreviousInfo (false);
1318 byte* sk1 = stackalloc byte [4];
1319 byte* sk2 = stackalloc byte [4];
1320 ClearBuffer (sk1, 4);
1321 ClearBuffer (sk2, 4);
1322 return IsSuffix (opt, s, target, start, length, ref prev, sk1, sk2);
1326 unsafe bool IsSuffix (string s, string t, int start, int length, ref Context ctx)
1329 COpt opt = ctx.Option;
1331 for (;tstart < t.Length; tstart++)
1332 if (!IsIgnorable (t [tstart], opt))
1334 if (tstart == t.Length)
1335 return true; // as if target is String.Empty.
1338 // This is still not working. If it is not likely to get working, then just remove it.
1340 int send = start - length;
1341 int ti = t.Length - 1;
1344 int sStep = start + 1;
1345 int tStep = t.Length;
1346 bool sourceConsumed, targetConsumed;
1348 for (; send < si; si--)
1349 if (!IsIgnorable (s [si]))
1351 for (; tend < ti; ti--)
1352 if (!IsIgnorable (t [ti]))
1356 for (; send < si; si--)
1357 if (IsSafe (s [si]))
1359 for (; tend < ti; ti--)
1360 if (IsSafe (t [ti]))
1362 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1363 if (CompareInternal (opt, s, si, sStep - si,
1365 out targetConsumed, out sourceConsumed,
1377 // FIXME: it is not efficient for very long target.
1378 bool sourceConsumed, targetConsumed;
1379 int mismatchCount = 0;
1380 for (int i = 0; i < length; i++) {
1381 ctx.ClearPrevInfo ();
1383 int ret = CompareInternal (s, start - i, i + 1,
1384 t, tstart, t.Length - tstart,
1386 // FIXME: could immediately breakup
1387 out sourceConsumed, true, true, ref ctx);
1390 if (!sourceConsumed && targetConsumed)
1392 if (!targetConsumed) {
1394 if (mismatchCount > Uni.MaxExpansionLength)
1395 // The largest length of an
1396 // expansion is 3, so if the
1397 // target was not consumed more
1398 // than 3 times, then the tail
1399 // character does not match.
1409 #region IndexOf() / LastIndexOf()
1413 public int IndexOf (string s
, string target
, CompareOptions opt
)
1415 return IndexOf (s
, target
, 0, s
.Length
, opt
);
1418 int QuickIndexOf (string s
, string target
, int start
, int length
, out bool testWasUnable
)
1420 int testedSourcePos
= -1, testedTargetPos
= -1;
1422 testWasUnable
= true;
1423 if (target
.Length
== 0)
1425 else if (target
.Length
> length
)
1427 testWasUnable
= false;
1429 int end
= start
+ length
- target
.Length
+ 1;
1430 for (int i
= start
; i
< end
; i
++) {
1432 for (int j
= 0; j
< target
.Length
; j
++) {
1433 if (testedTargetPos
< j
) {
1434 if (target
[j
] >= 0x80) {
1435 testWasUnable
= true;
1439 testedTargetPos
= j
;
1441 if (testedSourcePos
< i
+ j
) {
1442 if (s
[i
+ j
] >= 0x80) {
1443 testWasUnable
= true;
1447 testedSourcePos
= i
+ j
;
1449 if (s
[i
+ j
] != target
[j
]) {
1461 public unsafe int IndexOf (string s
, string target
, int start
, int length
, CompareOptions opt
)
1463 if (opt
== CompareOptions
.Ordinal
)
1464 return IndexOfOrdinal (s
, target
, start
, length
);
1465 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1466 return IndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1467 if (opt
== CompareOptions
.None
) {
1469 int ret
= QuickIndexOf (s
, target
, start
, length
, out testWasUnable
);
1474 byte* alwaysMatchFlags
= stackalloc byte [16];
1475 byte* neverMatchFlags
= stackalloc byte [16];
1476 byte* targetSortKey
= stackalloc byte [4];
1477 byte* sk1
= stackalloc byte [4];
1478 byte* sk2
= stackalloc byte [4];
1479 ClearBuffer (alwaysMatchFlags
, 16);
1480 ClearBuffer (neverMatchFlags
, 16);
1481 ClearBuffer (targetSortKey
, 4);
1482 ClearBuffer (sk1
, 4);
1483 ClearBuffer (sk2
, 4);
1484 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1486 return IndexOf (s
, target
, start
, length
,
1487 targetSortKey
, ref ctx
);
1490 // note that it wouldn't be used since ordinal IndexOf()
1491 // should be directed to icall.
1492 int IndexOfOrdinal (string s
, string target
, int start
, int length
)
1494 if (target
.Length
== 0)
1496 else if (target
.Length
> length
)
1499 int end
= start
+ length
- target
.Length
+ 1;
1500 for (int i
= start
; i
< end
; i
++) {
1502 for (int j
= 0; j
< target
.Length
; j
++) {
1503 if (s
[i
+ j
] != target
[j
]) {
1515 int IndexOfOrdinalIgnoreCase (string s
, string target
, int start
, int length
)
1517 if (target
.Length
== 0)
1519 else if (target
.Length
> length
)
1522 int end
= start
+ length
- target
.Length
+ 1;
1523 for (int i
= start
; i
< end
; i
++) {
1525 for (int j
= 0; j
< target
.Length
; j
++) {
1526 // I think almost all text has more lower letters than upper ones. Thus with this invariant comparison ToLower() should be faster since it costs less operations.
1527 if (textInfo
.ToLower (s
[i
+ j
]) != textInfo
.ToLower (target
[j
])) {
1541 public int IndexOf (string s
, char target
, CompareOptions opt
)
1543 return IndexOf (s
, target
, 0, s
.Length
, opt
);
1546 public unsafe int IndexOf (string s
, char target
, int start
, int length
, CompareOptions opt
)
1548 if (opt
== CompareOptions
.Ordinal
)
1549 return IndexOfOrdinal (s
, target
, start
, length
);
1550 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1551 return IndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1552 byte* alwaysMatchFlags
= stackalloc byte [16];
1553 byte* neverMatchFlags
= stackalloc byte [16];
1554 byte* targetSortKey
= stackalloc byte [4];
1555 byte* sk1
= stackalloc byte [4];
1556 byte* sk2
= stackalloc byte [4];
1557 ClearBuffer (alwaysMatchFlags
, 16);
1558 ClearBuffer (neverMatchFlags
, 16);
1559 ClearBuffer (targetSortKey
, 4);
1560 ClearBuffer (sk1
, 4);
1561 ClearBuffer (sk2
, 4);
1562 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1564 // If target is contraction, then use string search.
1565 Contraction ct
= GetContraction (target
);
1567 if (ct
.Replacement
!= null)
1568 return IndexOf (s
, ct
.Replacement
,
1569 start
, length
, targetSortKey
, ref ctx
);
1571 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1572 sk2
[i
] = ct
.SortKey
[i
];
1573 return IndexOfSortKey (s
, start
, length
, sk2
, char.MinValue
, -1, true, ref ctx
);
1576 int ti
= FilterOptions ((int) target
, opt
);
1577 targetSortKey
[0] = Category (ti
);
1578 targetSortKey
[1] = Level1 (ti
);
1579 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1581 Level2 (ti
, ExtenderType
.None
);
1582 targetSortKey
[3] = Uni
.Level3 (ti
);
1583 return IndexOfSortKey (s
, start
, length
,
1584 targetSortKey
, target
, ti
,
1585 !Uni
.HasSpecialWeight ((char) ti
), ref ctx
);
1589 // note that it wouldn't be used since ordinal IndexOf()
1590 // should be directed to icall.
1591 int IndexOfOrdinal (string s
, char target
, int start
, int length
)
1593 int end
= start
+ length
;
1594 for (int i
= start
; i
< end
; i
++)
1595 if (s
[i
] == target
)
1600 int IndexOfOrdinalIgnoreCase (string s
, char target
, int start
, int length
)
1602 int end
= start
+ length
;
1603 target
= textInfo
.ToLower (target
);
1604 for (int i
= start
; i
< end
; i
++)
1605 if (textInfo
.ToLower (s
[i
]) == target
)
1610 // Searches target byte[] keydata
1611 unsafe int IndexOfSortKey (string s
, int start
, int length
, byte* sortkey
, char target
, int ti
, bool noLv4
, ref Context ctx
)
1613 int end
= start
+ length
;
1618 if (MatchesForward (s
, ref idx
, end
, ti
, sortkey
, noLv4
, ref ctx
))
1624 // Searches string. Search head character (or keydata when
1625 // the head is contraction sortkey) and try IsPrefix().
1626 unsafe int IndexOf (string s
, string target
, int start
, int length
, byte* targetSortKey
, ref Context ctx
)
1628 COpt opt
= ctx
.Option
;
1630 for (; tidx
< target
.Length
; tidx
++)
1631 if (!IsIgnorable (target
[tidx
], opt
))
1633 if (tidx
== target
.Length
)
1635 Contraction ct
= GetContraction (target
, tidx
, target
.Length
- tidx
);
1636 string replace
= ct
!= null ? ct
.Replacement
: null;
1637 byte* sk
= replace
== null ? targetSortKey
: null;
1639 char tc
= char.MinValue
;
1641 if (ct
!= null && sk
!= null) {
1642 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1643 sk
[i
] = ct
.SortKey
[i
];
1644 } else if (sk
!= null) {
1646 ti
= FilterOptions (target
[tidx
], opt
);
1647 sk
[0] = Category (ti
);
1648 sk
[1] = Level1 (ti
);
1649 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1650 sk
[2] = Level2 (ti
, ExtenderType
.None
);
1651 sk
[3] = Uni
.Level3 (ti
);
1652 noLv4
= !Uni
.HasSpecialWeight ((char) ti
);
1655 for (tidx
++; tidx
< target
.Length
; tidx
++) {
1656 if (Category (target
[tidx
]) != 1)
1660 sk
[2] = (byte) (sk
[2] + Level2 (target
[tidx
], ExtenderType
.None
));
1666 if (replace
!= null)
1667 idx
= IndexOf (s
, replace
, start
, length
, targetSortKey
, ref ctx
);
1669 idx
= IndexOfSortKey (s
, start
, length
, sk
, tc
, ti
, noLv4
, ref ctx
);
1672 length
-= idx
- start
;
1674 if (IsPrefix (s
, target
, start
, length
, false, ref ctx
))
1676 Contraction cts
= GetContraction (s
, start
, length
);
1678 start
+= cts
.Source
.Length
;
1679 length
-= cts
.Source
.Length
;
1684 } while (length
> 0);
1689 // There are the same number of LastIndexOf() methods as
1690 // IndexOf() related methods, with the same functionalities
1696 public int LastIndexOf (string s
, string target
, CompareOptions opt
)
1698 return LastIndexOf (s
, target
, s
.Length
- 1, s
.Length
, opt
);
1701 public unsafe int LastIndexOf (string s
, string target
, int start
, int length
, CompareOptions opt
)
1703 if (opt
== CompareOptions
.Ordinal
)
1704 return LastIndexOfOrdinal (s
, target
, start
, length
);
1705 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1706 return LastIndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1707 byte* alwaysMatchFlags
= stackalloc byte [16];
1708 byte* neverMatchFlags
= stackalloc byte [16];
1709 byte* targetSortKey
= stackalloc byte [4];
1710 byte* sk1
= stackalloc byte [4];
1711 byte* sk2
= stackalloc byte [4];
1712 ClearBuffer (alwaysMatchFlags
, 16);
1713 ClearBuffer (neverMatchFlags
, 16);
1714 ClearBuffer (targetSortKey
, 4);
1715 ClearBuffer (sk1
, 4);
1716 ClearBuffer (sk2
, 4);
1717 // For some unknown reason CompareQuick() does not work fine w/ LastIndexOf().
1718 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1719 return LastIndexOf (s
, target
, start
, length
,
1720 targetSortKey
, ref ctx
);
1723 int LastIndexOfOrdinal (string s
, string target
, int start
, int length
)
1725 if (target
.Length
== 0)
1727 if (s
.Length
< target
.Length
|| target
.Length
> length
)
1729 int end
= start
- length
+ target
.Length
-1;
1730 char tail
= target
[target
.Length
- 1];
1731 for (int i
= start
; i
> end
;) {
1732 if (s
[i
] != tail
) {
1736 int x
= i
- target
.Length
+ 1;
1738 bool mismatch
= false;
1739 for (int j
= target
.Length
- 2; j
>= 0; j
--)
1740 if (s
[x
+ j
] != target
[j
]) {
1751 int LastIndexOfOrdinalIgnoreCase (string s
, string target
, int start
, int length
)
1753 if (target
.Length
== 0)
1755 if (s
.Length
< length
|| target
.Length
> length
)
1757 int end
= start
- length
+ target
.Length
- 1;
1758 char tail
= textInfo
.ToLower (target
[target
.Length
- 1]);
1759 for (int i
= start
; i
> end
;) {
1760 if (textInfo
.ToLower (s
[i
]) != tail
) {
1764 int x
= i
- target
.Length
+ 1;
1766 bool mismatch
= false;
1767 for (int j
= target
.Length
- 2; j
>= 0; j
--)
1768 if (textInfo
.ToLower (s
[x
+ j
]) != textInfo
.ToLower (target
[j
])) {
1781 public int LastIndexOf (string s
, char target
, CompareOptions opt
)
1783 return LastIndexOf (s
, target
, s
.Length
- 1, s
.Length
, opt
);
1786 public unsafe int LastIndexOf (string s
, char target
, int start
, int length
, CompareOptions opt
)
1788 if (opt
== CompareOptions
.Ordinal
)
1789 return LastIndexOfOrdinal (s
, target
, start
, length
);
1790 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1791 return LastIndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1792 byte* alwaysMatchFlags
= stackalloc byte [16];
1793 byte* neverMatchFlags
= stackalloc byte [16];
1794 byte* targetSortKey
= stackalloc byte [4];
1795 byte* sk1
= stackalloc byte [4];
1796 byte* sk2
= stackalloc byte [4];
1797 ClearBuffer (alwaysMatchFlags
, 16);
1798 ClearBuffer (neverMatchFlags
, 16);
1799 ClearBuffer (targetSortKey
, 4);
1800 ClearBuffer (sk1
, 4);
1801 ClearBuffer (sk2
, 4);
1802 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1804 // If target is a replacement contraction, then use
1806 Contraction ct
= GetContraction (target
);
1808 if (ct
.Replacement
!= null)
1809 return LastIndexOf (s
,
1810 ct
.Replacement
, start
, length
,
1811 targetSortKey
, ref ctx
);
1813 for (int bi
= 0; bi
< ct
.SortKey
.Length
; bi
++)
1814 sk2
[bi
] = ct
.SortKey
[bi
];
1815 return LastIndexOfSortKey (s
, start
,
1821 int ti
= FilterOptions ((int) target
, opt
);
1822 targetSortKey
[0] = Category (ti
);
1823 targetSortKey
[1] = Level1 (ti
);
1824 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1825 targetSortKey
[2] = Level2 (ti
, ExtenderType
.None
);
1826 targetSortKey
[3] = Uni
.Level3 (ti
);
1827 return LastIndexOfSortKey (s
, start
, start
,
1828 length
, targetSortKey
,
1829 ti
, !Uni
.HasSpecialWeight ((char) ti
),
1834 int LastIndexOfOrdinal (string s
, char target
, int start
, int length
)
1838 int end
= start
- length
;
1839 for (int i
= start
; i
> end
; i
--)
1840 if (s
[i
] == target
)
1845 int LastIndexOfOrdinalIgnoreCase (string s
, char target
, int start
, int length
)
1849 int end
= start
- length
;
1850 char c
= textInfo
.ToUpper (target
);
1851 for (int i
= start
; i
> end
; i
--)
1852 if (textInfo
.ToUpper (s
[i
]) == c
)
1857 // Searches target byte[] keydata
1858 unsafe int LastIndexOfSortKey (string s
, int start
, int orgStart
, int length
, byte* sortkey
, int ti
, bool noLv4
, ref Context ctx
)
1860 int end
= start
- length
;
1864 if (MatchesBackward (s
, ref idx
, end
, orgStart
,
1865 ti
, sortkey
, noLv4
, ref ctx
))
1871 // Searches string. Search head character (or keydata when
1872 // the head is contraction sortkey) and try IsPrefix().
1873 unsafe int LastIndexOf (string s
, string target
, int start
, int length
, byte* targetSortKey
, ref Context ctx
)
1875 COpt opt
= ctx
.Option
;
1876 int orgStart
= start
;
1878 for (; tidx
< target
.Length
; tidx
++)
1879 if (!IsIgnorable (target
[tidx
], opt
))
1881 if (tidx
== target
.Length
)
1883 Contraction ct
= GetContraction (target
, tidx
, target
.Length
- tidx
);
1884 string replace
= ct
!= null ? ct
.Replacement
: null;
1885 byte* sk
= replace
== null ? targetSortKey
: null;
1889 if (ct
!= null && sk
!= null) {
1890 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1891 sk
[i
] = ct
.SortKey
[i
];
1892 } else if (sk
!= null) {
1893 ti
= FilterOptions (target
[tidx
], opt
);
1894 sk
[0] = Category (ti
);
1895 sk
[1] = Level1 (ti
);
1896 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1897 sk
[2] = Level2 (ti
, ExtenderType
.None
);
1898 sk
[3] = Uni
.Level3 (ti
);
1899 noLv4
= !Uni
.HasSpecialWeight ((char) ti
);
1902 for (tidx
++; tidx
< target
.Length
; tidx
++) {
1903 if (Category (target
[tidx
]) != 1)
1907 sk
[2] = (byte) (sk
[2] + Level2 (target
[tidx
], ExtenderType
.None
));
1914 if (replace
!= null)
1915 idx
= LastIndexOf (s
, replace
,
1917 targetSortKey
, ref ctx
);
1919 idx
= LastIndexOfSortKey (s
, start
, orgStart
, length
, sk
, ti
, noLv4
, ref ctx
);
1922 length
-= start
- idx
;
1925 if (IsPrefix (s
, target
, idx
, orgStart
- idx
+ 1, false, ref ctx
)) {
1926 for (;idx
< orgStart
; idx
++)
1927 if (!IsIgnorable (s
[idx
], opt
))
1931 Contraction cts
= GetContraction (s
, idx
, orgStart
- idx
+ 1);
1933 start
-= cts
.Source
.Length
;
1934 length
-= cts
.Source
.Length
;
1939 } while (length
> 0);
1943 unsafe bool MatchesForward (string s
, ref int idx
, int end
, int ti
, byte* sortkey
, bool noLv4
, ref Context ctx
)
1946 if (ctx
.AlwaysMatchFlags
!= null && si
< 128 && (ctx
.AlwaysMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0)
1948 if (ctx
.NeverMatchFlags
!= null &&
1950 (ctx
.NeverMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0) {
1954 ExtenderType ext
= GetExtenderType (s
[idx
]);
1955 Contraction ct
= null;
1956 if (MatchesForwardCore (s
, ref idx
, end
, ti
, sortkey
, noLv4
, ext
, ref ct
, ref ctx
)) {
1957 if (ctx
.AlwaysMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
1958 ctx
.AlwaysMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
1961 if (ctx
.NeverMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
1962 ctx
.NeverMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
1966 unsafe bool MatchesForwardCore (string s
, ref int idx
, int end
, int ti
, byte* sortkey
, bool noLv4
, ExtenderType ext
, ref Contraction ct
, ref Context ctx
)
1968 COpt opt
= ctx
.Option
;
1969 byte* charSortKey
= ctx
.Buffer1
;
1970 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
1972 if (ext
== ExtenderType
.None
)
1973 ct
= GetContraction (s
, idx
, end
);
1974 else if (ctx
.PrevCode
< 0) {
1975 if (ctx
.PrevSortKey
== null) {
1979 charSortKey
= ctx
.PrevSortKey
;
1982 si
= FilterExtender (ctx
.PrevCode
, ext
, opt
);
1983 // if lv4 exists, it never matches contraction
1985 idx
+= ct
.Source
.Length
;
1988 if (ct
.SortKey
!= null) {
1989 for (int i
= 0; i
< 4; i
++)
1990 charSortKey
[i
] = sortkey
[i
];
1992 ctx
.PrevSortKey
= charSortKey
;
1994 // Here is the core of LAMESPEC
1995 // described at the top of the source.
1997 return MatchesForward (ct
.Replacement
, ref dummy
,
1998 ct
.Replacement
.Length
, ti
, sortkey
, noLv4
, ref ctx
);
2002 si
= FilterOptions (s
[idx
], opt
);
2004 charSortKey
[0] = Category (si
);
2005 bool noMatch
= false;
2006 if (sortkey
[0] == charSortKey
[0])
2007 charSortKey
[1] = Level1 (si
);
2010 if (!ignoreNonSpace
&& sortkey
[1] == charSortKey
[1])
2011 charSortKey
[2] = Level2 (si
, ext
);
2012 else if (!ignoreNonSpace
)
2015 for (; idx
< end
; idx
++) {
2016 if (Category (s
[idx
]) != 1)
2021 charSortKey
[3] = Uni
.Level3 (si
);
2022 if (charSortKey
[0] != 1)
2025 for (; idx
< end
; idx
++) {
2026 if (Category (s
[idx
]) != 1)
2030 if (charSortKey
[2] == 0)
2031 charSortKey
[2] = 2;
2032 charSortKey
[2] = (byte) (charSortKey
[2]
2033 + Level2 (s
[idx
], ExtenderType
.None
));
2036 return MatchesPrimitive (opt
, charSortKey
, si
, ext
, sortkey
, ti
, noLv4
);
2039 unsafe bool MatchesPrimitive (COpt opt
, byte* source
, int si
, ExtenderType ext
, byte* target
, int ti
, bool noLv4
)
2041 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
2042 if (source
[0] != target
[0] ||
2043 source
[1] != target
[1] ||
2044 (!ignoreNonSpace
&& source
[2] != target
[2]) ||
2045 source
[3] != target
[3])
2047 if (noLv4
&& (si
< 0 || !Uni
.HasSpecialWeight ((char) si
)))
2051 // Since target can never be an extender, if the source
2052 // is an expander and it matters, then they never match.
2053 if (!ignoreNonSpace
&& ext
== ExtenderType
.Conditional
)
2055 if (Uni
.IsJapaneseSmallLetter ((char) si
) !=
2056 Uni
.IsJapaneseSmallLetter ((char) ti
) ||
2057 ToDashTypeValue (ext
, opt
) !=
2058 // FIXME: we will have to specify correct value for target
2059 ToDashTypeValue (ExtenderType
.None
, opt
) ||
2060 !Uni
.IsHiragana ((char) si
) !=
2061 !Uni
.IsHiragana ((char) ti
) ||
2062 IsHalfKana ((char) si
, opt
) !=
2063 IsHalfKana ((char) ti
, opt
))
2068 unsafe bool MatchesBackward (string s
, ref int idx
, int end
, int orgStart
, int ti
, byte* sortkey
, bool noLv4
, ref Context ctx
)
2071 if (ctx
.AlwaysMatchFlags
!= null && si
< 128 && (ctx
.AlwaysMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0)
2073 if (ctx
.NeverMatchFlags
!= null && si
< 128 && (ctx
.NeverMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0) {
2077 ExtenderType ext
= GetExtenderType (s
[idx
]);
2078 Contraction ct
= null;
2079 if (MatchesBackwardCore (s
, ref idx
, end
, orgStart
, ti
, sortkey
, noLv4
, ext
, ref ct
, ref ctx
)) {
2080 if (ctx
.AlwaysMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
2081 ctx
.AlwaysMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
2084 if (ctx
.NeverMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128) {
2085 ctx
.NeverMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
2090 unsafe bool MatchesBackwardCore (string s
, ref int idx
, int end
, int orgStart
, int ti
, byte* sortkey
, bool noLv4
, ExtenderType ext
, ref Contraction ct
, ref Context ctx
)
2092 COpt opt
= ctx
.Option
;
2093 byte* charSortKey
= ctx
.Buffer1
;
2094 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
2097 // To handle extenders in source, we need to
2098 // check next _primary_ character.
2099 if (ext
!= ExtenderType
.None
) {
2100 byte diacritical
= 0;
2101 for (int tmp
= 0; ; tmp
--) {
2102 if (tmp
< 0) // heading extender
2104 if (IsIgnorable (s
[tmp
], opt
))
2106 int tmpi
= FilterOptions (s
[tmp
], opt
);
2107 byte category
= Category (tmpi
);
2108 if (category
== 1) {
2109 diacritical
= Level2 (tmpi
, ExtenderType
.None
);
2112 si
= FilterExtender (tmpi
, ext
, opt
);
2113 charSortKey
[0] = category
;
2114 charSortKey
[1] = Level1 (si
);
2115 if (!ignoreNonSpace
)
2116 charSortKey
[2] = Level2 (si
, ext
);
2117 charSortKey
[3] = Uni
.Level3 (si
);
2118 if (ext
!= ExtenderType
.Conditional
&&
2121 (charSortKey
[2] == 0) ?
2122 (byte) (diacritical
+ 2) :
2128 if (ext
== ExtenderType
.None
)
2129 ct
= GetTailContraction (s
, idx
, end
);
2130 // if lv4 exists, it never matches contraction
2132 idx
-= ct
.Source
.Length
;
2135 if (ct
.SortKey
!= null) {
2136 for (int i
= 0; i
< 4; i
++)
2137 charSortKey
[i
] = sortkey
[i
];
2139 ctx
.PrevSortKey
= charSortKey
;
2141 // Here is the core of LAMESPEC
2142 // described at the top of the source.
2143 int dummy
= ct
.Replacement
.Length
- 1;
2144 return 0 <= LastIndexOfSortKey (
2145 ct
.Replacement
, dummy
, dummy
,
2146 ct
.Replacement
.Length
, sortkey
,
2147 ti
, noLv4
, ref ctx
);
2149 } else if (ext
== ExtenderType
.None
) {
2151 si
= FilterOptions (s
[idx
], opt
);
2153 bool noMatch
= false;
2154 charSortKey
[0] = Category (si
);
2155 if (charSortKey
[0] == sortkey
[0])
2156 charSortKey
[1] = Level1 (si
);
2159 if (!ignoreNonSpace
&& charSortKey
[1] == sortkey
[1])
2160 charSortKey
[2] = Level2 (si
, ext
);
2161 else if (!ignoreNonSpace
)
2165 charSortKey
[3] = Uni
.Level3 (si
);
2166 if (charSortKey
[0] != 1)
2169 if (ext
== ExtenderType
.None
) {
2170 for (int tmp
= cur
+ 1; tmp
< orgStart
; tmp
++) {
2171 if (Category (s
[tmp
]) != 1)
2175 if (charSortKey
[2] == 0)
2176 charSortKey
[2] = 2;
2177 charSortKey
[2] = (byte) (charSortKey
[2]
2178 + Level2 (s
[tmp
], ExtenderType
.None
));
2181 return MatchesPrimitive (opt
, charSortKey
, si
, ext
, sortkey
, ti
, noLv4
);