2 // SimpleCollator.cs : the core collator implementation
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Here's a summary for supporting contractions, expansions and diacritical
33 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
34 // weight value from one to another. For now it applies to all range of
35 // characters, but at some stage we might need to limit the remapping ranges.
37 // A Contraction consists of a string (actually char[]) and a sortkey entry
38 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
39 // as a single character that has the mapped sortkey value. There is no
40 // character which goes across "special rules". When the collator encountered
41 // such a sequence of characters, it just returns the sortkey value without
42 // further replacement.
44 // Since it is still likely to happen that a contraction sequence matches
45 // other character than the identical sequence (depending on CompareOptions
46 // and of course, defined sortkey value itself), comparison cannot be done
47 // at source char[] level.
49 // In IndexOf(), the first character in the target string or the target char
50 // itself is turned into sortkey bytes. If the character has a contraction and
51 // that is sortkey map, then it is used instead. If the contraction exists and
52 // that is replacement map, then the first character of the replacement string
53 // is searched instead. IndexOf() always searches only for the top character,
54 // and if it returned negative value, then it returns -1. Otherwise, it then
55 // tries IsPrefix() from that location. If it returns true, then it returns
58 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
59 // of expansion and there is no proper way to return such indexes within
60 // a single int return value.
62 // For example, try below in .NET:
63 // IndexOf("\u00E6", "a")
64 // IndexOf("\u00E6", "e")
69 using System
.Collections
;
70 using System
.Globalization
;
72 using Uni
= Mono
.Globalization
.Unicode
.MSCompatUnicodeTable
;
73 using UUtil
= Mono
.Globalization
.Unicode
.MSCompatUnicodeTableUtil
;
74 using COpt
= System
.Globalization
.CompareOptions
;
76 namespace Mono
.Globalization
.Unicode
78 internal class SimpleCollator
80 // this environment variable is for debugging quick check.
81 #pragma warning disable 169, 414
82 static bool QuickCheckDisabled
=
83 Environment
.internalGetEnvironmentVariable (
84 "MONO_COLLATION_QUICK_CHECK_DISABLED") == "yes";
85 #pragma warning restore 169, 414
87 unsafe internal struct Context
89 public Context (CompareOptions opt
, byte* alwaysMatchFlags
, byte* neverMatchFlags
, byte* buffer1
, byte* buffer2
, byte* prev1
, bool quickCheckPossible
)
92 AlwaysMatchFlags
= alwaysMatchFlags
;
93 NeverMatchFlags
= neverMatchFlags
;
98 QuickCheckPossible
= quickCheckPossible
;
101 public readonly CompareOptions Option
;
102 public readonly byte* NeverMatchFlags
;
103 public readonly byte* AlwaysMatchFlags
;
104 public byte* Buffer1
;
105 public byte* Buffer2
;
107 public byte* PrevSortKey
;
108 public readonly bool QuickCheckPossible
;
110 public void ClearPrevInfo ()
117 unsafe struct PreviousInfo
120 public byte* SortKey
;
122 public PreviousInfo (bool dummy
)
131 public string Source
;
138 static SimpleCollator invariant
=
139 new SimpleCollator (CultureInfo
.InvariantCulture
);
141 readonly TextInfo textInfo
; // for ToLower().
142 readonly CodePointIndexer cjkIndexer
;
143 readonly Contraction
[] contractions
;
144 readonly Level2Map
[] level2Maps
;
145 // This flag marks characters as "unsafe", where the character
146 // could be used as part of a contraction (whose length > 1).
147 readonly byte [] unsafeFlags
;
149 unsafe readonly byte* cjkCatTable
;
150 unsafe readonly byte* cjkLv1Table
;
151 unsafe readonly byte* cjkLv2Table
;
152 readonly CodePointIndexer cjkLv2Indexer
;
154 readonly bool frenchSort
;
157 const int UnsafeFlagLength
= 0x300 / 8;
159 // readonly byte [] contractionFlags = new byte [16];
161 // This array is internally used inside IndexOf() to store
162 // "no need to check" ASCII characters.
164 // Now that it should be thread safe, this array is allocated
166 // byte [] neverMatchFlags = new byte [128 / 8];
168 #region .ctor() and split functions
170 public SimpleCollator (CultureInfo culture
)
173 textInfo
= culture
.TextInfo
;
176 SetCJKTable (culture
, ref cjkIndexer
,
177 ref cjkCatTable
, ref cjkLv1Table
,
178 ref cjkLv2Indexer
, ref cjkLv2Table
);
181 // Get tailoring info
182 TailoringInfo t
= null;
183 for (CultureInfo ci
= culture
; ci
.LCID
!= 127; ci
= ci
.Parent
) {
184 t
= Uni
.GetTailoringInfo (ci
.LCID
);
188 if (t
== null) // then use invariant
189 t
= Uni
.GetTailoringInfo (127);
191 frenchSort
= t
.FrenchSort
;
192 Uni
.BuildTailoringTables (culture
, t
, ref contractions
,
194 unsafeFlags
= new byte [UnsafeFlagLength
];
195 Console
.WriteLine ("contractions == {0}", contractions
== null);
196 foreach (Contraction c
in contractions
) {
197 if (c
.Source
.Length
> 1)
198 foreach (char ch
in c
.Source
)
199 unsafeFlags
[(int) ch
/ 8 ]
200 |= (byte) (1 << ((int) ch
& 7));
201 // for (int i = 0; i < c.Source.Length; i++) {
202 // int ch = c.Source [i];
205 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
209 foreach (Contraction c
in invariant
.contractions
) {
210 if (c
.Source
.Length
> 1)
211 foreach (char ch
in c
.Source
)
212 unsafeFlags
[(int) ch
/ 8 ]
213 |= (byte) (1 << ((int) ch
& 7));
214 // for (int i = 0; i < c.Source.Length; i++) {
215 // int ch = c.Source [i];
218 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
222 // FIXME: Since tailorings are mostly for latin
223 // (and in some cases Cyrillic) characters, it would
224 // be much better for performance to store "start
225 // indexes" for > 370 (culture-specific letters).
228 // dump tailoring table
229 Console.WriteLine ("******** building table for {0} : contractions - {1} diacritical - {2}",
230 culture.LCID, contractions.Length, level2Maps.Length);
231 foreach (Contraction c in contractions) {
232 foreach (char cc in c.Source)
233 Console.Write ("{0:X4} ", (int) cc);
234 Console.WriteLine (" -> '{0}'", c.Replacement);
239 unsafe private void SetCJKTable (
240 CultureInfo culture
, ref CodePointIndexer cjkIndexer
,
241 ref byte* catTable
, ref byte* lv1Table
,
242 ref CodePointIndexer lv2Indexer
, ref byte* lv2Table
)
244 string name
= GetNeutralCulture (culture
).Name
;
246 Uni
.FillCJK (name
, ref cjkIndexer
, ref catTable
,
247 ref lv1Table
, ref lv2Indexer
, ref lv2Table
);
250 static CultureInfo
GetNeutralCulture (CultureInfo info
)
252 CultureInfo ret
= info
;
253 while (ret
.Parent
!= null && ret
.Parent
.LCID
!= 127)
260 unsafe byte Category (int cp
)
262 if (cp
< 0x3000 || cjkCatTable
== null)
263 return Uni
.Category (cp
);
264 int idx
= cjkIndexer
.ToIndex (cp
);
265 return idx
< 0 ? Uni
.Category (cp
) : cjkCatTable
[idx
];
268 unsafe byte Level1 (int cp
)
270 if (cp
< 0x3000 || cjkLv1Table
== null)
271 return Uni
.Level1 (cp
);
272 int idx
= cjkIndexer
.ToIndex (cp
);
273 return idx
< 0 ? Uni
.Level1 (cp
) : cjkLv1Table
[idx
];
276 unsafe byte Level2 (int cp
, ExtenderType ext
)
278 if (ext
== ExtenderType
.Buggy
)
280 else if (ext
== ExtenderType
.Conditional
)
283 if (cp
< 0x3000 || cjkLv2Table
== null)
284 return Uni
.Level2 (cp
);
285 int idx
= cjkLv2Indexer
.ToIndex (cp
);
286 byte ret
= idx
< 0 ? (byte) 0 : cjkLv2Table
[idx
];
289 ret
= Uni
.Level2 (cp
);
290 if (level2Maps
.Length
== 0)
292 for (int i
= 0; i
< level2Maps
.Length
; i
++) {
293 if (level2Maps
[i
].Source
== ret
)
294 return level2Maps
[i
].Replace
;
295 else if (level2Maps
[i
].Source
> ret
)
301 static bool IsHalfKana (int cp
, COpt opt
)
303 return (opt
& COpt
.IgnoreWidth
) != 0 ||
304 Uni
.IsHalfWidthKana ((char) cp
);
307 Contraction
GetContraction (string s
, int start
, int end
)
309 // int si = s [start];
310 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
312 Contraction c
= GetContraction (s
, start
, end
, contractions
);
313 if (c
!= null || lcid
== 127)
315 return GetContraction (s
, start
, end
, invariant
.contractions
);
318 Contraction
GetContraction (string s
, int start
, int end
, Contraction
[] clist
)
320 for (int i
= 0; i
< clist
.Length
; i
++) {
321 Contraction ct
= clist
[i
];
322 int diff
= ct
.Source
[0] - s
[start
];
324 return null; // it's already sorted
327 char [] chars
= ct
.Source
;
328 if (end
- start
< chars
.Length
)
331 for (int n
= 0; n
< chars
.Length
; n
++)
332 if (s
[start
+ n
] != chars
[n
]) {
342 Contraction
GetTailContraction (string s
, int start
, int end
)
344 // int si = s [end - 1];
345 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
347 Contraction c
= GetTailContraction (s
, start
, end
, contractions
);
348 if (c
!= null || lcid
== 127)
350 return GetTailContraction (s
, start
, end
, invariant
.contractions
);
353 Contraction
GetTailContraction (string s
, int start
, int end
, Contraction
[] clist
)
355 if (start
== end
|| end
< -1 || start
>= s
.Length
|| s
.Length
<= end
+ 1)
356 throw new SystemException (String
.Format ("MONO internal error. Failed to get TailContraction. start = {0} end = {1} string = '{2}'", start
, end
, s
));
357 for (int i
= 0; i
< clist
.Length
; i
++) {
358 Contraction ct
= clist
[i
];
359 char [] chars
= ct
.Source
;
360 if (chars
.Length
> start
- end
)
362 if (chars
[chars
.Length
- 1] != s
[start
])
366 for (int n
= 0, spos
= start
- chars
.Length
+ 1;
369 if (s
[spos
] != chars
[n
]) {
380 Contraction
GetContraction (char c
)
382 Contraction ct
= GetContraction (c
, contractions
);
383 if (ct
!= null || lcid
== 127)
385 return GetContraction (c
, invariant
.contractions
);
388 Contraction
GetContraction (char c
, Contraction
[] clist
)
390 for (int i
= 0; i
< clist
.Length
; i
++) {
391 Contraction ct
= clist
[i
];
392 if (ct
.Source
[0] > c
)
393 return null; // it's already sorted
394 if (ct
.Source
[0] == c
&& ct
.Source
.Length
== 1)
400 int FilterOptions (int i
, COpt opt
)
402 if ((opt
& COpt
.IgnoreWidth
) != 0) {
403 int x
= Uni
.ToWidthCompat (i
);
407 if ((opt
& COpt
.OrdinalIgnoreCase
) != 0)
408 i
= textInfo
.ToLower ((char) i
);
409 if ((opt
& COpt
.IgnoreCase
) != 0)
410 i
= textInfo
.ToLower ((char) i
);
411 if ((opt
& COpt
.IgnoreKanaType
) != 0)
412 i
= Uni
.ToKanaTypeInsensitive (i
);
424 ExtenderType
GetExtenderType (int i
)
426 // LAMESPEC: Windows expects true for U+3005, but
427 // sometimes it does not represent to repeat just
429 // Windows also expects true for U+3031 and U+3032,
430 // but they should *never* repeat one character.
432 // U+2015 becomes an extender only when it is Japanese
434 return lcid
== 16 ? ExtenderType
.Conditional
:
437 if (i
< 0x3005 || i
> 0xFF70)
438 return ExtenderType
.None
;
443 return ExtenderType
.Simple
;
445 return ExtenderType
.Conditional
;
448 return ExtenderType
.Voiced
;
452 return ExtenderType
.None
;
454 case 0x3005: // LAMESPEC: see above.
455 return ExtenderType
.Buggy
;
456 case 0x3031: // LAMESPEC: see above.
457 case 0x3032: // LAMESPEC: see above.
460 return ExtenderType
.Simple
;
463 return ExtenderType
.Voiced
;
465 return ExtenderType
.Conditional
;
467 return ExtenderType
.None
;
471 static byte ToDashTypeValue (ExtenderType ext
, COpt opt
)
473 if ((opt
& COpt
.IgnoreNonSpace
) != 0) // LAMESPEC: huh, why?
476 case ExtenderType
.None
:
478 case ExtenderType
.Conditional
:
485 int FilterExtender (int i
, ExtenderType ext
, COpt opt
)
487 if (ext
== ExtenderType
.Conditional
&&
488 Uni
.HasSpecialWeight ((char) i
)) {
489 bool half
= IsHalfKana ((char) i
, opt
);
490 bool katakana
= !Uni
.IsHiragana ((char) i
);
491 switch (Level1 (i
) & 7) {
493 return half
? 0xFF71 : katakana
? 0x30A2 : 0x3042;
495 return half
? 0xFF72 : katakana
? 0x30A4 : 0x3044;
497 return half
? 0xFF73 : katakana
? 0x30A6 : 0x3046;
499 return half
? 0xFF74 : katakana
? 0x30A8 : 0x3048;
501 return half
? 0xFF75 : katakana
? 0x30AA : 0x304A;
507 static bool IsIgnorable (int i
, COpt opt
)
509 return Uni
.IsIgnorable (i
, (byte) (1 +
510 ((opt
& COpt
.IgnoreSymbols
) != 0 ? 2 : 0) +
511 ((opt
& COpt
.IgnoreNonSpace
) != 0 ? 4 : 0)));
517 return i
/ 8 >= unsafeFlags
.Length
? true : (unsafeFlags
[i
/ 8] & (1 << (i
% 8))) == 0;
522 public SortKey
GetSortKey (string s
)
524 return GetSortKey (s
, CompareOptions
.None
);
527 public SortKey
GetSortKey (string s
, CompareOptions options
)
529 return GetSortKey (s
, 0, s
.Length
, options
);
532 public SortKey
GetSortKey (string s
, int start
, int length
, CompareOptions options
)
534 SortKeyBuffer buf
= new SortKeyBuffer (lcid
);
535 buf
.Initialize (options
, lcid
, s
, frenchSort
);
536 int end
= start
+ length
;
537 GetSortKey (s
, start
, end
, buf
, options
);
538 return buf
.GetResultAndReset ();
541 unsafe void GetSortKey (string s
, int start
, int end
,
542 SortKeyBuffer buf
, CompareOptions opt
)
544 byte* prevbuf
= stackalloc byte [4];
545 ClearBuffer (prevbuf
, 4);
546 Context ctx
= new Context (opt
, null, null, null, null, prevbuf
, false);
548 for (int n
= start
; n
< end
; n
++) {
551 ExtenderType ext
= GetExtenderType (i
);
552 if (ext
!= ExtenderType
.None
) {
553 i
= FilterExtender (ctx
.PrevCode
, ext
, opt
);
555 FillSortKeyRaw (i
, ext
, buf
, opt
);
556 else if (ctx
.PrevSortKey
!= null) {
557 byte* b
= ctx
.PrevSortKey
;
561 b
[2] != 1 ? b
[2] : Level2 (i
, ext
),
562 b
[3] != 1 ? b
[3] : Uni
.Level3 (i
));
564 // otherwise do nothing.
565 // (if the extender is the first char
566 // in the string, then just ignore.)
570 if (IsIgnorable (i
, opt
))
572 i
= FilterOptions (i
, opt
);
574 Contraction ct
= GetContraction (s
, n
, end
);
576 if (ct
.Replacement
!= null) {
577 GetSortKey (ct
.Replacement
, 0, ct
.Replacement
.Length
, buf
, opt
);
579 byte* b
= ctx
.PrevSortKey
;
580 for (int bi
= 0; bi
< ct
.SortKey
.Length
; bi
++)
581 b
[bi
] = ct
.SortKey
[bi
];
585 b
[2] != 1 ? b
[2] : Level2 (i
, ext
),
586 b
[3] != 1 ? b
[3] : Uni
.Level3 (i
));
589 n
+= ct
.Source
.Length
- 1;
592 if (!Uni
.IsIgnorableNonSpacing (i
))
594 FillSortKeyRaw (i
, ExtenderType
.None
, buf
, opt
);
599 void FillSortKeyRaw (int i
, ExtenderType ext
,
600 SortKeyBuffer buf
, CompareOptions opt
)
602 if (0x3400 <= i
&& i
<= 0x4DB5) {
603 int diff
= i
- 0x3400;
604 buf
.AppendCJKExtension (
605 (byte) (0x10 + diff
/ 254),
606 (byte) (diff
% 254 + 2));
610 UnicodeCategory uc
= char.GetUnicodeCategory ((char) i
);
612 case UnicodeCategory
.PrivateUse
:
613 int diff
= i
- 0xE000;
615 (byte) (0xE5 + diff
/ 254),
616 (byte) (diff
% 254 + 2),
620 case UnicodeCategory
.Surrogate
:
621 FillSurrogateSortKeyRaw (i
, buf
);
625 byte level2
= Level2 (i
, ext
);
626 if (Uni
.HasSpecialWeight ((char) i
)) {
627 byte level1
= Level1 (i
);
633 Uni
.IsJapaneseSmallLetter ((char) i
),
634 ToDashTypeValue (ext
, opt
),
635 !Uni
.IsHiragana ((char) i
),
636 IsHalfKana ((char) i
, opt
)
638 if ((opt
& COpt
.IgnoreNonSpace
) == 0 && ext
== ExtenderType
.Voiced
)
639 // Append voice weight
640 buf
.AppendNormal (1, 1, 1, 0);
650 void FillSurrogateSortKeyRaw (int i
, SortKeyBuffer buf
)
659 lower
= (byte) ((i
== 0xD800) ? 0x3E : 0x3F);
660 } else if (0xD840 <= i
&& i
< 0xD880) {
664 } else if (0xDB80 <= i
&& i
< 0xDC00) {
665 diffbase
= 0xDB80 - 0x40;
669 diffbase
= 0xDC00 - 0xF8 + 2;
673 int diff
= i
- diffbase
;
676 (byte) (segment
+ diff
/ 254),
677 (byte) (diff
% 254 + 2),
686 public int Compare (string s1
, string s2
)
688 return Compare (s1
, s2
, CompareOptions
.None
);
691 public int Compare (string s1
, string s2
, CompareOptions options
)
693 return Compare (s1
, 0, s1
.Length
, s2
, 0, s2
.Length
, options
);
696 private int CompareOrdinal (string s1
, int idx1
, int len1
,
697 string s2
, int idx2
, int len2
)
699 int min
= len1
< len2
? len1
: len2
;
700 int end1
= idx1
+ min
;
701 int end2
= idx2
+ min
;
702 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
703 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
704 for (int i1
= idx1
, i2
= idx2
;
705 i1
< end1
&& i2
< end2
; i1
++, i2
++)
706 if (s1
[i1
] != s2
[i2
])
707 return s1
[i1
] - s2
[i2
];
708 return len1
== len2
? 0 :
709 len1
== min
? - 1 : 1;
712 // mostly equivalent to CompareOrdinal, but the return value is
713 // not based on codepoints.
714 private int CompareQuick (string s1
, int idx1
, int len1
,
715 string s2
, int idx2
, int len2
, out bool sourceConsumed
,
716 out bool targetConsumed
, bool immediateBreakup
)
718 sourceConsumed
= false;
719 targetConsumed
= false;
720 int min
= len1
< len2
? len1
: len2
;
721 int end1
= idx1
+ min
;
722 int end2
= idx2
+ min
;
723 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
724 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
725 for (int i1
= idx1
, i2
= idx2
;
726 i1
< end1
&& i2
< end2
; i1
++, i2
++)
727 if (s1
[i1
] != s2
[i2
]) {
728 if (immediateBreakup
)
730 int ret
= Category (s1
[i1
]) - Category (s2
[i2
]);
732 ret
= Level1 (s1
[i1
]) - Level1 (s2
[i2
]);
735 ret
= Uni
.Level3 (s1
[i1
]) - Uni
.Level3 (s2
[i2
]);
737 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. '{0}' {2} {3} '{1}' {4} {5}", s1
, s2
, idx1
, end1
, idx2
, end2
));
740 sourceConsumed
= len1
<= len2
;
741 targetConsumed
= len1
>= len2
;
742 return len1
== len2
? 0 :
743 len1
== min
? - 1 : 1;
746 private int CompareOrdinalIgnoreCase (string s1
, int idx1
, int len1
,
747 string s2
, int idx2
, int len2
)
749 int min
= len1
< len2
? len1
: len2
;
750 int end1
= idx1
+ min
;
751 int end2
= idx2
+ min
;
752 if (idx1
< 0 || idx2
< 0 || end1
> s1
.Length
|| end2
> s2
.Length
)
753 throw new SystemException (String
.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1
, idx2
, len1
, len2
, s1
.Length
, s2
.Length
));
754 TextInfo ti
= invariant
.textInfo
;
755 for (int i1
= idx1
, i2
= idx2
;
756 i1
< end1
&& i2
< end2
; i1
++, i2
++)
757 if (ti
.ToLower (s1
[i1
]) != ti
.ToLower (s2
[i2
]))
758 return ti
.ToLower (s1
[i1
]) - ti
.ToLower (s2
[i2
]);
759 return len1
== len2
? 0 :
760 len1
== min
? - 1 : 1;
763 public unsafe int Compare (string s1
, int idx1
, int len1
,
764 string s2
, int idx2
, int len2
, CompareOptions options
)
766 // quick equality check
767 if (idx1
== idx2
&& len1
== len2
&&
768 Object
.ReferenceEquals (s1
, s2
))
770 if (options
== CompareOptions
.Ordinal
)
771 return CompareOrdinal (s1
, idx1
, len1
, s2
, idx2
, len2
);
772 if (options
== CompareOptions
.OrdinalIgnoreCase
)
773 return CompareOrdinalIgnoreCase (s1
, idx1
, len1
, s2
, idx2
, len2
);
775 #if false // stable easy version, depends on GetSortKey().
776 SortKey sk1
= GetSortKey (s1
, idx1
, len1
, options
);
777 SortKey sk2
= GetSortKey (s2
, idx2
, len2
, options
);
778 byte [] d1
= sk1
.KeyData
;
779 byte [] d2
= sk2
.KeyData
;
780 int len
= d1
.Length
> d2
.Length
? d2
.Length
: d1
.Length
;
781 for (int i
= 0; i
< len
; i
++)
782 if (d1
[i
] != d2
[i
])
783 return d1
[i
] < d2
[i
] ? -1 : 1;
784 return d1
.Length
== d2
.Length
? 0 : d1
.Length
< d2
.Length
? -1 : 1;
786 byte* sk1
= stackalloc byte [4];
787 byte* sk2
= stackalloc byte [4];
788 ClearBuffer (sk1
, 4);
789 ClearBuffer (sk2
, 4);
790 Context ctx
= new Context (options
, null, null, sk1
, sk2
, null,
791 QuickCheckPossible (s1
, idx1
, idx1
+ len1
, s2
, idx2
, idx2
+ len2
));
794 int ret
= CompareInternal (s1
, idx1
, len1
, s2
, idx2
, len2
, out dummy
, out dummy2
, true, false, ref ctx
);
795 return ret
== 0 ? 0 : ret
< 0 ? -1 : 1;
799 unsafe void ClearBuffer (byte* buffer
, int size
)
801 for (int i
= 0; i
< size
; i
++)
805 bool QuickCheckPossible (string s1
, int idx1
, int end1
,
806 string s2
, int idx2
, int end2
)
809 // looks like it is buggy and inefficient, so just disable it.
812 if (QuickCheckDisabled
)
814 // if (s1.Length > 100 || s2.Length > 100)
816 for (int i
= idx1
; i
< end1
; i
++)
817 if (s1
[i
] < 0x20 && (s1
[i
] < '\x9' || s1
[i
] > '\xD') || s1
[i
] >= 0x80 || s1
[i
] == '-' || s1
[i
] == '\'')
819 for (int i
= idx2
; i
< end2
; i
++)
820 if (s2
[i
] < 0x20 && (s2
[i
] < '\x9' || s2
[i
] > '\xD') || s2
[i
] >= 0x80 || s2
[i
] == '-' || s2
[i
] == '\'')
826 unsafe int CompareInternal (string s1
, int idx1
, int len1
, string s2
,
828 out bool targetConsumed
, out bool sourceConsumed
,
829 bool skipHeadingExtenders
, bool immediateBreakup
,
832 COpt opt
= ctx
.Option
;
835 int end1
= idx1
+ len1
;
836 int end2
= idx2
+ len2
;
837 targetConsumed
= false;
838 sourceConsumed
= false;
839 PreviousInfo prev2
= new PreviousInfo (false);
841 if (opt
== CompareOptions
.None
&& ctx
.QuickCheckPossible
)
842 return CompareQuick (s1
, idx1
, len1
, s2
, idx2
, len2
, out sourceConsumed
, out targetConsumed
, immediateBreakup
);
844 // It holds final result that comes from the comparison
845 // at level 2 or lower. Even if Compare() found the
846 // difference at level 2 or lower, it still has to
847 // continue level 1 comparison. FinalResult is used
848 // when there was no further differences.
850 // It holds the comparison level to do. It starts from
851 // 5, and becomes 1 when we do primary-only comparison.
852 int currentLevel
= 5;
859 // Skip heading extenders
860 if (skipHeadingExtenders
) {
861 for (; idx1
< end1
; idx1
++)
862 if (GetExtenderType (s1
[idx1
]) == ExtenderType
.None
)
864 for (; idx2
< end2
; idx2
++)
865 if (GetExtenderType (s2
[idx2
]) == ExtenderType
.None
)
869 ExtenderType ext1
= ExtenderType
.None
;
870 ExtenderType ext2
= ExtenderType
.None
;
872 int quickCheckPos1
= idx1
;
873 int quickCheckPos2
= idx2
;
874 bool stringSort
= (opt
& COpt
.StringSort
) != 0;
875 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
876 Escape escape1
= new Escape ();
877 Escape escape2
= new Escape ();
880 for (; idx1
< end1
; idx1
++)
881 if (!IsIgnorable (s1
[idx1
], opt
))
883 for (; idx2
< end2
; idx2
++)
884 if (!IsIgnorable (s2
[idx2
], opt
))
888 if (escape1
.Source
== null)
891 start1
= escape1
.Start
;
892 idx1
= escape1
.Index
;
894 quickCheckPos1
= escape1
.Optional
;
895 escape1
.Source
= null;
899 if (escape2
.Source
== null)
902 start2
= escape2
.Start
;
903 idx2
= escape2
.Index
;
905 quickCheckPos2
= escape2
.Optional
;
906 escape2
.Source
= null;
910 // If comparison is unstable, then this part is one of the most doubtful part.
911 // Here we run quick codepoint comparison and run back to "stable" area to
912 // compare characters.
914 // Strictly to say, even the first character
915 // could be compared here, but it messed
916 // backward step, so I just avoided mess.
917 if (quickCheckPos1
< idx1
&& quickCheckPos2
< idx2
) {
918 while (idx1
< end1
&& idx2
< end2
&&
919 s1
[idx1
] == s2
[idx2
]) {
923 if (idx1
== end1
|| idx2
== end2
)
924 continue; // check replacement
926 int backwardEnd1
= quickCheckPos1
;
927 int backwardEnd2
= quickCheckPos2
;
928 quickCheckPos1
= idx1
;
929 quickCheckPos2
= idx2
;
934 for (;idx1
> backwardEnd1
; idx1
--)
935 if (Category (s1
[idx1
]) != 1)
937 for (;idx2
> backwardEnd2
; idx2
--)
938 if (Category (s2
[idx2
]) != 1)
940 for (;idx1
> backwardEnd1
; idx1
--)
941 if (IsSafe (s1
[idx1
]))
943 for (;idx2
> backwardEnd2
; idx2
--)
944 if (IsSafe (s2
[idx2
]))
953 int i1
= FilterOptions (s1
[idx1
], opt
);
954 int i2
= FilterOptions (s2
[idx2
], opt
);
955 bool special1
= false;
956 bool special2
= false;
958 // If current character is an expander, then
959 // repeat the previous character.
960 ext1
= GetExtenderType (i1
);
961 if (ext1
!= ExtenderType
.None
) {
962 if (ctx
.PrevCode
< 0) {
963 if (ctx
.PrevSortKey
== null) {
968 sk1
= ctx
.PrevSortKey
;
971 i1
= FilterExtender (ctx
.PrevCode
, ext1
, opt
);
973 ext2
= GetExtenderType (i2
);
974 if (ext2
!= ExtenderType
.None
) {
975 if (prev2
.Code
< 0) {
976 if (prev2
.SortKey
== null) {
984 i2
= FilterExtender (prev2
.Code
, ext2
, opt
);
987 byte cat1
= Category (i1
);
988 byte cat2
= Category (i2
);
990 // Handle special weight characters
992 if (!stringSort
&& currentLevel
== 5) {
993 lv5At1
= escape1
.Source
!= null ?
994 escape1
.Index
- escape1
.Start
:
996 // here Windows has a bug that it does
997 // not consider thirtiary weight.
998 lv5Value1
= Level1 (i1
) << 8 + Uni
.Level3 (i1
);
1004 if (!stringSort
&& currentLevel
== 5) {
1005 lv5At2
= escape2
.Source
!= null ?
1006 escape2
.Index
- escape2
.Start
:
1008 // here Windows has a bug that it does
1009 // not consider thirtiary weight.
1010 lv5Value2
= Level1 (i2
) << 8 + Uni
.Level3 (i2
);
1015 if (cat1
== 6 || cat2
== 6) {
1016 if (currentLevel
== 5) {
1017 if (lv5Value1
== lv5Value2
) {
1018 // There is not really difference here.
1019 lv5At1
= lv5At2
= -1;
1020 lv5Value1
= lv5Value2
= 0;
1028 Contraction ct1
= null;
1029 if (ext1
== ExtenderType
.None
)
1030 ct1
= GetContraction (s1
, idx1
, end1
);
1035 else if (ct1
!= null) {
1036 offset1
= ct1
.Source
.Length
;
1037 if (ct1
.SortKey
!= null) {
1039 for (int i
= 0; i
< ct1
.SortKey
.Length
; i
++)
1040 sk1
[i
] = ct1
.SortKey
[i
];
1042 ctx
.PrevSortKey
= sk1
;
1044 else if (escape1
.Source
== null) {
1045 escape1
.Source
= s1
;
1046 escape1
.Start
= start1
;
1047 escape1
.Index
= cur1
+ ct1
.Source
.Length
;
1049 escape1
.Optional
= quickCheckPos1
;
1050 s1
= ct1
.Replacement
;
1061 sk1
[1] = Level1 (i1
);
1062 if (!ignoreNonSpace
&& currentLevel
> 1)
1063 sk1
[2] = Level2 (i1
, ext1
);
1064 if (currentLevel
> 2)
1065 sk1
[3] = Uni
.Level3 (i1
);
1066 if (currentLevel
> 3)
1067 special1
= Uni
.HasSpecialWeight ((char) i1
);
1072 Contraction ct2
= null;
1073 if (ext2
== ExtenderType
.None
)
1074 ct2
= GetContraction (s2
, idx2
, end2
);
1078 else if (ct2
!= null) {
1079 idx2
+= ct2
.Source
.Length
;
1080 if (ct2
.SortKey
!= null) {
1082 for (int i
= 0; i
< ct2
.SortKey
.Length
; i
++)
1083 sk2
[i
] = ct2
.SortKey
[i
];
1085 prev2
.SortKey
= sk2
;
1087 else if (escape2
.Source
== null) {
1088 escape2
.Source
= s2
;
1089 escape2
.Start
= start2
;
1090 escape2
.Index
= cur2
+ ct2
.Source
.Length
;
1092 escape2
.Optional
= quickCheckPos2
;
1093 s2
= ct2
.Replacement
;
1104 sk2
[1] = Level1 (i2
);
1105 if (!ignoreNonSpace
&& currentLevel
> 1)
1106 sk2
[2] = Level2 (i2
, ext2
);
1107 if (currentLevel
> 2)
1108 sk2
[3] = Uni
.Level3 (i2
);
1109 if (currentLevel
> 3)
1110 special2
= Uni
.HasSpecialWeight ((char) i2
);
1116 // Add offset here so that it does not skip
1117 // idx1 while s2 has a replacement.
1120 if (!ignoreNonSpace
) {
1121 // add diacritical marks in s1 here
1122 while (idx1
< end1
) {
1123 if (Category (s1
[idx1
]) != 1)
1127 sk1
[2] = (byte) (sk1
[2] +
1128 Level2 (s1
[idx1
], ExtenderType
.None
));
1132 // add diacritical marks in s2 here
1133 while (idx2
< end2
) {
1134 if (Category (s2
[idx2
]) != 1)
1138 sk2
[2] = (byte) (sk2
[2] +
1139 Level2 (s2
[idx2
], ExtenderType
.None
));
1144 int ret
= sk1
[0] - sk2
[0];
1145 ret
= ret
!= 0 ? ret
: sk1
[1] - sk2
[1];
1148 if (currentLevel
== 1)
1150 if (!ignoreNonSpace
) {
1151 ret
= sk1
[2] - sk2
[2];
1154 if (immediateBreakup
)
1155 return -1; // different
1156 currentLevel
= frenchSort
? 2 : 1;
1160 if (currentLevel
== 2)
1162 ret
= sk1
[3] - sk2
[3];
1165 if (immediateBreakup
)
1166 return -1; // different
1170 if (currentLevel
== 3)
1172 if (special1
!= special2
) {
1173 if (immediateBreakup
)
1174 return -1; // different
1175 finalResult
= special1
? 1 : -1;
1180 ret
= CompareFlagPair (
1181 !Uni
.IsJapaneseSmallLetter ((char) i1
),
1182 !Uni
.IsJapaneseSmallLetter ((char) i2
));
1183 ret
= ret
!= 0 ? ret
:
1184 ToDashTypeValue (ext1
, opt
) -
1185 ToDashTypeValue (ext2
, opt
);
1186 ret
= ret
!= 0 ? ret
: CompareFlagPair (
1187 Uni
.IsHiragana ((char) i1
),
1188 Uni
.IsHiragana ((char) i2
));
1189 ret
= ret
!= 0 ? ret
: CompareFlagPair (
1190 !IsHalfKana ((char) i1
, opt
),
1191 !IsHalfKana ((char) i2
, opt
));
1193 if (immediateBreakup
)
1194 return -1; // different
1202 // If there were only level 3 or lower differences,
1203 // then we still have to find diacritical differences
1205 if (!ignoreNonSpace
&&
1206 finalResult
!= 0 && currentLevel
> 2) {
1207 while (idx1
< end1
&& idx2
< end2
) {
1208 if (!Uni
.IsIgnorableNonSpacing (s1
[idx1
]))
1210 if (!Uni
.IsIgnorableNonSpacing (s2
[idx2
]))
1212 finalResult
= Level2 (FilterOptions (s1
[idx1
], opt
), ext1
) - Level2 (FilterOptions (s2
[idx2
], opt
), ext2
);
1213 if (finalResult
!= 0)
1217 // they should work only for the first character
1218 ext1
= ExtenderType
.None
;
1219 ext2
= ExtenderType
.None
;
1222 if (currentLevel
== 1 && finalResult
!= 0) {
1223 while (idx1
< end1
) {
1224 if (Uni
.IsIgnorableNonSpacing (s1
[idx1
]))
1229 while (idx2
< end2
) {
1230 if (Uni
.IsIgnorableNonSpacing (s2
[idx2
]))
1236 // we still have to check level 5
1237 if (finalResult
== 0) {
1238 if (lv5At1
< 0 && lv5At2
>= 0)
1240 else if (lv5At2
< 0 && lv5At1
>= 0)
1243 finalResult
= lv5At1
- lv5At2
;
1244 if (finalResult
== 0)
1245 finalResult
= lv5Value1
- lv5Value2
;
1248 if (finalResult
== 0) {
1250 targetConsumed
= true;
1252 sourceConsumed
= true;
1254 return idx1
!= end1
? 1 : idx2
== end2
? finalResult
: -1;
1257 int CompareFlagPair (bool b1
, bool b2
)
1259 return b1
== b2
? 0 : b1
? 1 : -1;
1264 #region IsPrefix() and IsSuffix()
1266 public bool IsPrefix (string src
, string target
, CompareOptions opt
)
1268 return IsPrefix (src
, target
, 0, src
.Length
, opt
);
1271 public unsafe bool IsPrefix (string s
, string target
, int start
, int length
, CompareOptions opt
)
1273 if (target
.Length
== 0)
1275 byte* sk1
= stackalloc byte [4];
1276 byte* sk2
= stackalloc byte [4];
1277 ClearBuffer (sk1
, 4);
1278 ClearBuffer (sk2
, 4);
1279 Context ctx
= new Context (opt
, null, null, sk1
, sk2
, null,
1280 QuickCheckPossible (s
, start
, start
+ length
, target
, 0, target
.Length
));
1281 return IsPrefix (s
, target
, start
, length
, true, ref ctx
);
1284 unsafe bool IsPrefix (string s
, string target
, int start
, int length
, bool skipHeadingExtenders
, ref Context ctx
)
1286 bool consumed
, dummy
;
1287 CompareInternal (s
, start
, length
,
1288 target
, 0, target
.Length
,
1289 out consumed
, out dummy
, skipHeadingExtenders
,
1296 public bool IsSuffix (string src
, string target
, CompareOptions opt
)
1298 return IsSuffix (src
, target
, src
.Length
- 1, src
.Length
, opt
);
1301 public unsafe bool IsSuffix (string s
, string target
, int start
, int length
, CompareOptions opt
)
1303 if (target
.Length
== 0)
1305 int last
= LastIndexOf (s
, target
, start
, length
, opt
);
1306 return last
>= 0 && Compare (s
, last
, s
.Length
- last
, target
, 0, target
.Length
, opt
) == 0;
1308 // quick check : simple codepoint comparison
1309 if (s.Length >= target.Length) {
1311 int se = start - length;
1312 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1313 if (s [si] != target [i])
1315 if (si == start + target.Length)
1319 PreviousInfo prev = new PreviousInfo (false);
1320 byte* sk1 = stackalloc byte [4];
1321 byte* sk2 = stackalloc byte [4];
1322 ClearBuffer (sk1, 4);
1323 ClearBuffer (sk2, 4);
1324 return IsSuffix (opt, s, target, start, length, ref prev, sk1, sk2);
1328 unsafe bool IsSuffix (string s, string t, int start, int length, ref Context ctx)
1331 COpt opt = ctx.Option;
1333 for (;tstart < t.Length; tstart++)
1334 if (!IsIgnorable (t [tstart], opt))
1336 if (tstart == t.Length)
1337 return true; // as if target is String.Empty.
1340 // This is still not working. If it is not likely to get working, then just remove it.
1342 int send = start - length;
1343 int ti = t.Length - 1;
1346 int sStep = start + 1;
1347 int tStep = t.Length;
1348 bool sourceConsumed, targetConsumed;
1350 for (; send < si; si--)
1351 if (!IsIgnorable (s [si]))
1353 for (; tend < ti; ti--)
1354 if (!IsIgnorable (t [ti]))
1358 for (; send < si; si--)
1359 if (IsSafe (s [si]))
1361 for (; tend < ti; ti--)
1362 if (IsSafe (t [ti]))
1364 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1365 if (CompareInternal (opt, s, si, sStep - si,
1367 out targetConsumed, out sourceConsumed,
1379 // FIXME: it is not efficient for very long target.
1380 bool sourceConsumed, targetConsumed;
1381 int mismatchCount = 0;
1382 for (int i = 0; i < length; i++) {
1383 ctx.ClearPrevInfo ();
1385 int ret = CompareInternal (s, start - i, i + 1,
1386 t, tstart, t.Length - tstart,
1388 // FIXME: could immediately breakup
1389 out sourceConsumed, true, true, ref ctx);
1392 if (!sourceConsumed && targetConsumed)
1394 if (!targetConsumed) {
1396 if (mismatchCount > Uni.MaxExpansionLength)
1397 // The largest length of an
1398 // expansion is 3, so if the
1399 // target was not consumed more
1400 // than 3 times, then the tail
1401 // character does not match.
1411 #region IndexOf() / LastIndexOf()
1415 public int IndexOf (string s
, string target
, CompareOptions opt
)
1417 return IndexOf (s
, target
, 0, s
.Length
, opt
);
1420 int QuickIndexOf (string s
, string target
, int start
, int length
, out bool testWasUnable
)
1422 int testedSourcePos
= -1, testedTargetPos
= -1;
1424 testWasUnable
= true;
1425 if (target
.Length
== 0)
1427 else if (target
.Length
> length
)
1429 testWasUnable
= false;
1431 int end
= start
+ length
- target
.Length
+ 1;
1432 for (int i
= start
; i
< end
; i
++) {
1434 for (int j
= 0; j
< target
.Length
; j
++) {
1435 if (testedTargetPos
< j
) {
1436 char c
= target
[j
];
1437 if (c
== 0 || c
>= 0x80) {
1438 testWasUnable
= true;
1442 testedTargetPos
= j
;
1444 if (testedSourcePos
< i
+ j
) {
1446 if (c
== 0 || c
>= 0x80) {
1447 testWasUnable
= true;
1451 testedSourcePos
= i
+ j
;
1453 if (s
[i
+ j
] != target
[j
]) {
1465 public unsafe int IndexOf (string s
, string target
, int start
, int length
, CompareOptions opt
)
1467 if (opt
== CompareOptions
.Ordinal
)
1468 return IndexOfOrdinal (s
, target
, start
, length
);
1469 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1470 return IndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1471 if (opt
== CompareOptions
.None
) {
1473 int ret
= QuickIndexOf (s
, target
, start
, length
, out testWasUnable
);
1478 byte* alwaysMatchFlags
= stackalloc byte [16];
1479 byte* neverMatchFlags
= stackalloc byte [16];
1480 byte* targetSortKey
= stackalloc byte [4];
1481 byte* sk1
= stackalloc byte [4];
1482 byte* sk2
= stackalloc byte [4];
1483 ClearBuffer (alwaysMatchFlags
, 16);
1484 ClearBuffer (neverMatchFlags
, 16);
1485 ClearBuffer (targetSortKey
, 4);
1486 ClearBuffer (sk1
, 4);
1487 ClearBuffer (sk2
, 4);
1488 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1490 return IndexOf (s
, target
, start
, length
,
1491 targetSortKey
, ref ctx
);
1494 // note that it wouldn't be used since ordinal IndexOf()
1495 // should be directed to icall.
1496 int IndexOfOrdinal (string s
, string target
, int start
, int length
)
1498 if (target
.Length
== 0)
1500 else if (target
.Length
> length
)
1503 int end
= start
+ length
- target
.Length
+ 1;
1504 for (int i
= start
; i
< end
; i
++) {
1506 for (int j
= 0; j
< target
.Length
; j
++) {
1507 if (s
[i
+ j
] != target
[j
]) {
1519 int IndexOfOrdinalIgnoreCase (string s
, string target
, int start
, int length
)
1521 if (target
.Length
== 0)
1523 else if (target
.Length
> length
)
1526 int end
= start
+ length
- target
.Length
+ 1;
1527 for (int i
= start
; i
< end
; i
++) {
1529 for (int j
= 0; j
< target
.Length
; j
++) {
1530 // I think almost all text has more lower letters than upper ones. Thus with this invariant comparison ToLower() should be faster since it costs less operations.
1531 if (textInfo
.ToLower (s
[i
+ j
]) != textInfo
.ToLower (target
[j
])) {
1545 public int IndexOf (string s
, char target
, CompareOptions opt
)
1547 return IndexOf (s
, target
, 0, s
.Length
, opt
);
1550 public unsafe int IndexOf (string s
, char target
, int start
, int length
, CompareOptions opt
)
1552 if (opt
== CompareOptions
.Ordinal
)
1553 return IndexOfOrdinal (s
, target
, start
, length
);
1554 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1555 return IndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1556 byte* alwaysMatchFlags
= stackalloc byte [16];
1557 byte* neverMatchFlags
= stackalloc byte [16];
1558 byte* targetSortKey
= stackalloc byte [4];
1559 byte* sk1
= stackalloc byte [4];
1560 byte* sk2
= stackalloc byte [4];
1561 ClearBuffer (alwaysMatchFlags
, 16);
1562 ClearBuffer (neverMatchFlags
, 16);
1563 ClearBuffer (targetSortKey
, 4);
1564 ClearBuffer (sk1
, 4);
1565 ClearBuffer (sk2
, 4);
1566 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1568 // If target is contraction, then use string search.
1569 Contraction ct
= GetContraction (target
);
1571 if (ct
.Replacement
!= null)
1572 return IndexOf (s
, ct
.Replacement
,
1573 start
, length
, targetSortKey
, ref ctx
);
1575 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1576 sk2
[i
] = ct
.SortKey
[i
];
1577 return IndexOfSortKey (s
, start
, length
, sk2
, char.MinValue
, -1, true, ref ctx
);
1580 int ti
= FilterOptions ((int) target
, opt
);
1581 targetSortKey
[0] = Category (ti
);
1582 targetSortKey
[1] = Level1 (ti
);
1583 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1585 Level2 (ti
, ExtenderType
.None
);
1586 targetSortKey
[3] = Uni
.Level3 (ti
);
1587 return IndexOfSortKey (s
, start
, length
,
1588 targetSortKey
, target
, ti
,
1589 !Uni
.HasSpecialWeight ((char) ti
), ref ctx
);
1593 // note that it wouldn't be used since ordinal IndexOf()
1594 // should be directed to icall.
1595 int IndexOfOrdinal (string s
, char target
, int start
, int length
)
1597 int end
= start
+ length
;
1598 for (int i
= start
; i
< end
; i
++)
1599 if (s
[i
] == target
)
1604 int IndexOfOrdinalIgnoreCase (string s
, char target
, int start
, int length
)
1606 int end
= start
+ length
;
1607 target
= textInfo
.ToLower (target
);
1608 for (int i
= start
; i
< end
; i
++)
1609 if (textInfo
.ToLower (s
[i
]) == target
)
1614 // Searches target byte[] keydata
1615 unsafe int IndexOfSortKey (string s
, int start
, int length
, byte* sortkey
, char target
, int ti
, bool noLv4
, ref Context ctx
)
1617 int end
= start
+ length
;
1622 if (MatchesForward (s
, ref idx
, end
, ti
, sortkey
, noLv4
, ref ctx
))
1628 // Searches string. Search head character (or keydata when
1629 // the head is contraction sortkey) and try IsPrefix().
1630 unsafe int IndexOf (string s
, string target
, int start
, int length
, byte* targetSortKey
, ref Context ctx
)
1632 COpt opt
= ctx
.Option
;
1634 for (; tidx
< target
.Length
; tidx
++)
1635 if (!IsIgnorable (target
[tidx
], opt
))
1637 if (tidx
== target
.Length
)
1638 // FIXME: this is likely a hack. A string that is consists of \0 differs from those of other ignorable characters.
1639 return IndexOfOrdinal (target
, '\0', 0, target
.Length
) >= 0 ? IndexOfOrdinal (s
, target
, start
, length
) : start
;
1640 Contraction ct
= GetContraction (target
, tidx
, target
.Length
- tidx
);
1641 string replace
= ct
!= null ? ct
.Replacement
: null;
1642 byte* sk
= replace
== null ? targetSortKey
: null;
1644 char tc
= char.MinValue
;
1646 if (ct
!= null && sk
!= null) {
1647 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1648 sk
[i
] = ct
.SortKey
[i
];
1649 } else if (sk
!= null) {
1651 ti
= FilterOptions (target
[tidx
], opt
);
1652 sk
[0] = Category (ti
);
1653 sk
[1] = Level1 (ti
);
1654 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1655 sk
[2] = Level2 (ti
, ExtenderType
.None
);
1656 sk
[3] = Uni
.Level3 (ti
);
1657 noLv4
= !Uni
.HasSpecialWeight ((char) ti
);
1660 for (tidx
++; tidx
< target
.Length
; tidx
++) {
1661 if (Category (target
[tidx
]) != 1)
1665 sk
[2] = (byte) (sk
[2] + Level2 (target
[tidx
], ExtenderType
.None
));
1671 if (replace
!= null)
1672 idx
= IndexOf (s
, replace
, start
, length
, targetSortKey
, ref ctx
);
1674 idx
= IndexOfSortKey (s
, start
, length
, sk
, tc
, ti
, noLv4
, ref ctx
);
1677 length
-= idx
- start
;
1679 if (IsPrefix (s
, target
, start
, length
, false, ref ctx
))
1681 Contraction cts
= GetContraction (s
, start
, length
);
1683 start
+= cts
.Source
.Length
;
1684 length
-= cts
.Source
.Length
;
1689 } while (length
> 0);
1694 // There are the same number of LastIndexOf() methods as
1695 // IndexOf() related methods, with the same functionalities
1701 public int LastIndexOf (string s
, string target
, CompareOptions opt
)
1703 return LastIndexOf (s
, target
, s
.Length
- 1, s
.Length
, opt
);
1706 public unsafe int LastIndexOf (string s
, string target
, int start
, int length
, CompareOptions opt
)
1708 if (opt
== CompareOptions
.Ordinal
)
1709 return LastIndexOfOrdinal (s
, target
, start
, length
);
1710 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1711 return LastIndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1712 byte* alwaysMatchFlags
= stackalloc byte [16];
1713 byte* neverMatchFlags
= stackalloc byte [16];
1714 byte* targetSortKey
= stackalloc byte [4];
1715 byte* sk1
= stackalloc byte [4];
1716 byte* sk2
= stackalloc byte [4];
1717 ClearBuffer (alwaysMatchFlags
, 16);
1718 ClearBuffer (neverMatchFlags
, 16);
1719 ClearBuffer (targetSortKey
, 4);
1720 ClearBuffer (sk1
, 4);
1721 ClearBuffer (sk2
, 4);
1722 // For some unknown reason CompareQuick() does not work fine w/ LastIndexOf().
1723 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1724 return LastIndexOf (s
, target
, start
, length
,
1725 targetSortKey
, ref ctx
);
1728 int LastIndexOfOrdinal (string s
, string target
, int start
, int length
)
1730 if (target
.Length
== 0)
1732 if (s
.Length
< target
.Length
|| target
.Length
> length
)
1734 int end
= start
- length
+ target
.Length
-1;
1735 char tail
= target
[target
.Length
- 1];
1736 for (int i
= start
; i
> end
;) {
1737 if (s
[i
] != tail
) {
1741 int x
= i
- target
.Length
+ 1;
1743 bool mismatch
= false;
1744 for (int j
= target
.Length
- 2; j
>= 0; j
--)
1745 if (s
[x
+ j
] != target
[j
]) {
1756 int LastIndexOfOrdinalIgnoreCase (string s
, string target
, int start
, int length
)
1758 if (target
.Length
== 0)
1760 if (s
.Length
< length
|| target
.Length
> length
)
1762 int end
= start
- length
+ target
.Length
- 1;
1763 char tail
= textInfo
.ToLower (target
[target
.Length
- 1]);
1764 for (int i
= start
; i
> end
;) {
1765 if (textInfo
.ToLower (s
[i
]) != tail
) {
1769 int x
= i
- target
.Length
+ 1;
1771 bool mismatch
= false;
1772 for (int j
= target
.Length
- 2; j
>= 0; j
--)
1773 if (textInfo
.ToLower (s
[x
+ j
]) != textInfo
.ToLower (target
[j
])) {
1786 public int LastIndexOf (string s
, char target
, CompareOptions opt
)
1788 return LastIndexOf (s
, target
, s
.Length
- 1, s
.Length
, opt
);
1791 public unsafe int LastIndexOf (string s
, char target
, int start
, int length
, CompareOptions opt
)
1793 if (opt
== CompareOptions
.Ordinal
)
1794 return LastIndexOfOrdinal (s
, target
, start
, length
);
1795 if (opt
== CompareOptions
.OrdinalIgnoreCase
)
1796 return LastIndexOfOrdinalIgnoreCase (s
, target
, start
, length
);
1797 byte* alwaysMatchFlags
= stackalloc byte [16];
1798 byte* neverMatchFlags
= stackalloc byte [16];
1799 byte* targetSortKey
= stackalloc byte [4];
1800 byte* sk1
= stackalloc byte [4];
1801 byte* sk2
= stackalloc byte [4];
1802 ClearBuffer (alwaysMatchFlags
, 16);
1803 ClearBuffer (neverMatchFlags
, 16);
1804 ClearBuffer (targetSortKey
, 4);
1805 ClearBuffer (sk1
, 4);
1806 ClearBuffer (sk2
, 4);
1807 Context ctx
= new Context (opt
, alwaysMatchFlags
, neverMatchFlags
, sk1
, sk2
, null, false);
1809 // If target is a replacement contraction, then use
1811 Contraction ct
= GetContraction (target
);
1813 if (ct
.Replacement
!= null)
1814 return LastIndexOf (s
,
1815 ct
.Replacement
, start
, length
,
1816 targetSortKey
, ref ctx
);
1818 for (int bi
= 0; bi
< ct
.SortKey
.Length
; bi
++)
1819 sk2
[bi
] = ct
.SortKey
[bi
];
1820 return LastIndexOfSortKey (s
, start
,
1826 int ti
= FilterOptions ((int) target
, opt
);
1827 targetSortKey
[0] = Category (ti
);
1828 targetSortKey
[1] = Level1 (ti
);
1829 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1830 targetSortKey
[2] = Level2 (ti
, ExtenderType
.None
);
1831 targetSortKey
[3] = Uni
.Level3 (ti
);
1832 return LastIndexOfSortKey (s
, start
, start
,
1833 length
, targetSortKey
,
1834 ti
, !Uni
.HasSpecialWeight ((char) ti
),
1839 int LastIndexOfOrdinal (string s
, char target
, int start
, int length
)
1843 int end
= start
- length
;
1844 for (int i
= start
; i
> end
; i
--)
1845 if (s
[i
] == target
)
1850 int LastIndexOfOrdinalIgnoreCase (string s
, char target
, int start
, int length
)
1854 int end
= start
- length
;
1855 char c
= textInfo
.ToUpper (target
);
1856 for (int i
= start
; i
> end
; i
--)
1857 if (textInfo
.ToUpper (s
[i
]) == c
)
1862 // Searches target byte[] keydata
1863 unsafe int LastIndexOfSortKey (string s
, int start
, int orgStart
, int length
, byte* sortkey
, int ti
, bool noLv4
, ref Context ctx
)
1865 int end
= start
- length
;
1869 if (MatchesBackward (s
, ref idx
, end
, orgStart
,
1870 ti
, sortkey
, noLv4
, ref ctx
))
1876 // Searches string. Search head character (or keydata when
1877 // the head is contraction sortkey) and try IsPrefix().
1878 unsafe int LastIndexOf (string s
, string target
, int start
, int length
, byte* targetSortKey
, ref Context ctx
)
1880 COpt opt
= ctx
.Option
;
1881 int orgStart
= start
;
1883 for (; tidx
< target
.Length
; tidx
++)
1884 if (!IsIgnorable (target
[tidx
], opt
))
1886 if (tidx
== target
.Length
)
1887 // FIXME: this is likely a hack. A string that is consists of \0 differs from those of other ignorable characters.
1888 return IndexOfOrdinal (target
, '\0', 0, target
.Length
) >= 0 ? LastIndexOfOrdinal (s
, target
, start
, length
) : start
;
1889 Contraction ct
= GetContraction (target
, tidx
, target
.Length
- tidx
);
1890 string replace
= ct
!= null ? ct
.Replacement
: null;
1891 byte* sk
= replace
== null ? targetSortKey
: null;
1895 if (ct
!= null && sk
!= null) {
1896 for (int i
= 0; i
< ct
.SortKey
.Length
; i
++)
1897 sk
[i
] = ct
.SortKey
[i
];
1898 } else if (sk
!= null) {
1899 ti
= FilterOptions (target
[tidx
], opt
);
1900 sk
[0] = Category (ti
);
1901 sk
[1] = Level1 (ti
);
1902 if ((opt
& COpt
.IgnoreNonSpace
) == 0)
1903 sk
[2] = Level2 (ti
, ExtenderType
.None
);
1904 sk
[3] = Uni
.Level3 (ti
);
1905 noLv4
= !Uni
.HasSpecialWeight ((char) ti
);
1908 for (tidx
++; tidx
< target
.Length
; tidx
++) {
1909 if (Category (target
[tidx
]) != 1)
1913 sk
[2] = (byte) (sk
[2] + Level2 (target
[tidx
], ExtenderType
.None
));
1920 if (replace
!= null)
1921 idx
= LastIndexOf (s
, replace
,
1923 targetSortKey
, ref ctx
);
1925 idx
= LastIndexOfSortKey (s
, start
, orgStart
, length
, sk
, ti
, noLv4
, ref ctx
);
1928 length
-= start
- idx
;
1931 if (IsPrefix (s
, target
, idx
, orgStart
- idx
+ 1, false, ref ctx
)) {
1932 for (;idx
< orgStart
; idx
++)
1933 if (!IsIgnorable (s
[idx
], opt
))
1937 Contraction cts
= GetContraction (s
, idx
, orgStart
- idx
+ 1);
1939 start
-= cts
.Source
.Length
;
1940 length
-= cts
.Source
.Length
;
1945 } while (length
> 0);
1949 unsafe bool MatchesForward (string s
, ref int idx
, int end
, int ti
, byte* sortkey
, bool noLv4
, ref Context ctx
)
1952 if (ctx
.AlwaysMatchFlags
!= null && si
< 128 && (ctx
.AlwaysMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0)
1954 if (ctx
.NeverMatchFlags
!= null &&
1956 (ctx
.NeverMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0) {
1960 ExtenderType ext
= GetExtenderType (s
[idx
]);
1961 Contraction ct
= null;
1962 if (MatchesForwardCore (s
, ref idx
, end
, ti
, sortkey
, noLv4
, ext
, ref ct
, ref ctx
)) {
1963 if (ctx
.AlwaysMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
1964 ctx
.AlwaysMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
1967 if (ctx
.NeverMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
1968 ctx
.NeverMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
1972 unsafe bool MatchesForwardCore (string s
, ref int idx
, int end
, int ti
, byte* sortkey
, bool noLv4
, ExtenderType ext
, ref Contraction ct
, ref Context ctx
)
1974 COpt opt
= ctx
.Option
;
1975 byte* charSortKey
= ctx
.Buffer1
;
1976 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
1978 if (ext
== ExtenderType
.None
)
1979 ct
= GetContraction (s
, idx
, end
);
1980 else if (ctx
.PrevCode
< 0) {
1981 if (ctx
.PrevSortKey
== null) {
1985 charSortKey
= ctx
.PrevSortKey
;
1988 si
= FilterExtender (ctx
.PrevCode
, ext
, opt
);
1989 // if lv4 exists, it never matches contraction
1991 idx
+= ct
.Source
.Length
;
1994 if (ct
.SortKey
!= null) {
1995 for (int i
= 0; i
< 4; i
++)
1996 charSortKey
[i
] = sortkey
[i
];
1998 ctx
.PrevSortKey
= charSortKey
;
2000 // Here is the core of LAMESPEC
2001 // described at the top of the source.
2003 return MatchesForward (ct
.Replacement
, ref dummy
,
2004 ct
.Replacement
.Length
, ti
, sortkey
, noLv4
, ref ctx
);
2008 si
= FilterOptions (s
[idx
], opt
);
2010 charSortKey
[0] = Category (si
);
2011 bool noMatch
= false;
2012 if (sortkey
[0] == charSortKey
[0])
2013 charSortKey
[1] = Level1 (si
);
2016 if (!ignoreNonSpace
&& sortkey
[1] == charSortKey
[1])
2017 charSortKey
[2] = Level2 (si
, ext
);
2018 else if (!ignoreNonSpace
)
2021 for (; idx
< end
; idx
++) {
2022 if (Category (s
[idx
]) != 1)
2027 charSortKey
[3] = Uni
.Level3 (si
);
2028 if (charSortKey
[0] != 1)
2031 for (; idx
< end
; idx
++) {
2032 if (Category (s
[idx
]) != 1)
2036 if (charSortKey
[2] == 0)
2037 charSortKey
[2] = 2;
2038 charSortKey
[2] = (byte) (charSortKey
[2]
2039 + Level2 (s
[idx
], ExtenderType
.None
));
2042 return MatchesPrimitive (opt
, charSortKey
, si
, ext
, sortkey
, ti
, noLv4
);
2045 unsafe bool MatchesPrimitive (COpt opt
, byte* source
, int si
, ExtenderType ext
, byte* target
, int ti
, bool noLv4
)
2047 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
2048 if (source
[0] != target
[0] ||
2049 source
[1] != target
[1] ||
2050 (!ignoreNonSpace
&& source
[2] != target
[2]) ||
2051 source
[3] != target
[3])
2053 if (noLv4
&& (si
< 0 || !Uni
.HasSpecialWeight ((char) si
)))
2057 // Since target can never be an extender, if the source
2058 // is an expander and it matters, then they never match.
2059 if (!ignoreNonSpace
&& ext
== ExtenderType
.Conditional
)
2061 if (Uni
.IsJapaneseSmallLetter ((char) si
) !=
2062 Uni
.IsJapaneseSmallLetter ((char) ti
) ||
2063 ToDashTypeValue (ext
, opt
) !=
2064 // FIXME: we will have to specify correct value for target
2065 ToDashTypeValue (ExtenderType
.None
, opt
) ||
2066 !Uni
.IsHiragana ((char) si
) !=
2067 !Uni
.IsHiragana ((char) ti
) ||
2068 IsHalfKana ((char) si
, opt
) !=
2069 IsHalfKana ((char) ti
, opt
))
2074 unsafe bool MatchesBackward (string s
, ref int idx
, int end
, int orgStart
, int ti
, byte* sortkey
, bool noLv4
, ref Context ctx
)
2077 if (ctx
.AlwaysMatchFlags
!= null && si
< 128 && (ctx
.AlwaysMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0)
2079 if (ctx
.NeverMatchFlags
!= null && si
< 128 && (ctx
.NeverMatchFlags
[si
/ 8] & (1 << (si
% 8))) != 0) {
2083 ExtenderType ext
= GetExtenderType (s
[idx
]);
2084 Contraction ct
= null;
2085 if (MatchesBackwardCore (s
, ref idx
, end
, orgStart
, ti
, sortkey
, noLv4
, ext
, ref ct
, ref ctx
)) {
2086 if (ctx
.AlwaysMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128)
2087 ctx
.AlwaysMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
2090 if (ctx
.NeverMatchFlags
!= null && ct
== null && ext
== ExtenderType
.None
&& si
< 128) {
2091 ctx
.NeverMatchFlags
[si
/ 8] |= (byte) (1 << (si
% 8));
2096 unsafe bool MatchesBackwardCore (string s
, ref int idx
, int end
, int orgStart
, int ti
, byte* sortkey
, bool noLv4
, ExtenderType ext
, ref Contraction ct
, ref Context ctx
)
2098 COpt opt
= ctx
.Option
;
2099 byte* charSortKey
= ctx
.Buffer1
;
2100 bool ignoreNonSpace
= (opt
& COpt
.IgnoreNonSpace
) != 0;
2103 // To handle extenders in source, we need to
2104 // check next _primary_ character.
2105 if (ext
!= ExtenderType
.None
) {
2106 byte diacritical
= 0;
2107 for (int tmp
= idx
; ; tmp
--) {
2108 if (tmp
< 0) // heading extender
2110 if (IsIgnorable (s
[tmp
], opt
))
2112 int tmpi
= FilterOptions (s
[tmp
], opt
);
2113 byte category
= Category (tmpi
);
2114 if (category
== 1) {
2115 diacritical
= Level2 (tmpi
, ExtenderType
.None
);
2118 si
= FilterExtender (tmpi
, ext
, opt
);
2119 charSortKey
[0] = category
;
2120 charSortKey
[1] = Level1 (si
);
2121 if (!ignoreNonSpace
)
2122 charSortKey
[2] = Level2 (si
, ext
);
2123 charSortKey
[3] = Uni
.Level3 (si
);
2124 if (ext
!= ExtenderType
.Conditional
&&
2127 (charSortKey
[2] == 0) ?
2128 (byte) (diacritical
+ 2) :
2134 if (ext
== ExtenderType
.None
)
2135 ct
= GetTailContraction (s
, idx
, end
);
2136 // if lv4 exists, it never matches contraction
2138 idx
-= ct
.Source
.Length
;
2141 if (ct
.SortKey
!= null) {
2142 for (int i
= 0; i
< 4; i
++)
2143 charSortKey
[i
] = sortkey
[i
];
2145 ctx
.PrevSortKey
= charSortKey
;
2147 // Here is the core of LAMESPEC
2148 // described at the top of the source.
2149 int dummy
= ct
.Replacement
.Length
- 1;
2150 return 0 <= LastIndexOfSortKey (
2151 ct
.Replacement
, dummy
, dummy
,
2152 ct
.Replacement
.Length
, sortkey
,
2153 ti
, noLv4
, ref ctx
);
2155 } else if (ext
== ExtenderType
.None
) {
2157 si
= FilterOptions (s
[idx
], opt
);
2159 bool noMatch
= false;
2160 charSortKey
[0] = Category (si
);
2161 if (charSortKey
[0] == sortkey
[0])
2162 charSortKey
[1] = Level1 (si
);
2165 if (!ignoreNonSpace
&& charSortKey
[1] == sortkey
[1])
2166 charSortKey
[2] = Level2 (si
, ext
);
2167 else if (!ignoreNonSpace
)
2171 charSortKey
[3] = Uni
.Level3 (si
);
2172 if (charSortKey
[0] != 1)
2175 if (ext
== ExtenderType
.None
) {
2176 for (int tmp
= cur
+ 1; tmp
< orgStart
; tmp
++) {
2177 if (Category (s
[tmp
]) != 1)
2181 if (charSortKey
[2] == 0)
2182 charSortKey
[2] = 2;
2183 charSortKey
[2] = (byte) (charSortKey
[2]
2184 + Level2 (s
[tmp
], ExtenderType
.None
));
2187 return MatchesPrimitive (opt
, charSortKey
, si
, ext
, sortkey
, ti
, noLv4
);