1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 /*============================================================
9 ** Purpose: This is the value class representing a Unicode character
10 ** Char methods until we create this functionality.
13 ===========================================================*/
15 using System
.Diagnostics
;
16 using System
.Globalization
;
17 using System
.Runtime
.InteropServices
;
23 [StructLayout(LayoutKind
.Sequential
)]
24 [System
.Runtime
.CompilerServices
.TypeForwardedFrom("mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089")]
25 public readonly struct Char
: IComparable
, IComparable
<char>, IEquatable
<char>, IConvertible
30 private readonly char m_value
; // Do not rename (binary serialization)
35 // The maximum character value.
36 public const char MaxValue
= (char)0xFFFF;
37 // The minimum character value.
38 public const char MinValue
= (char)0x00;
40 private const byte IsWhiteSpaceFlag
= 0x80;
41 private const byte IsUpperCaseLetterFlag
= 0x40;
42 private const byte IsLowerCaseLetterFlag
= 0x20;
43 private const byte UnicodeCategoryMask
= 0x1F;
45 // Contains information about the C0, Basic Latin, C1, and Latin-1 Supplement ranges [ U+0000..U+00FF ], with:
46 // - 0x80 bit if set means 'is whitespace'
47 // - 0x40 bit if set means 'is uppercase letter'
48 // - 0x20 bit if set means 'is lowercase letter'
49 // - bottom 5 bits are the UnicodeCategory of the character
51 // n.b. This data is locked to an earlier version of the Unicode standard (2.0, perhaps?), so
52 // the UnicodeCategory data contained here doesn't necessarily reflect the UnicodeCategory data
53 // contained within the CharUnicodeInfo or Rune types, which generally follow the latest Unicode standard.
55 private static ReadOnlySpan
<byte> Latin1CharInfo
=> new byte[]
57 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
58 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
59 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
60 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18, // U+0030..U+003F
61 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+0040..U+004F
62 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12, // U+0050..U+005F
63 0x1B, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+0060..U+006F
64 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x14, 0x19, 0x15, 0x19, 0x0E, // U+0070..U+007F
65 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0080..U+008F
66 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0090..U+009F
67 0x8B, 0x18, 0x1A, 0x1A, 0x1A, 0x1A, 0x1C, 0x1C, 0x1B, 0x1C, 0x21, 0x16, 0x19, 0x13, 0x1C, 0x1B, // U+00A0..U+00AF
68 0x1C, 0x19, 0x0A, 0x0A, 0x1B, 0x21, 0x1C, 0x18, 0x1B, 0x0A, 0x21, 0x17, 0x0A, 0x0A, 0x0A, 0x18, // U+00B0..U+00BF
69 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+00C0..U+00CF
70 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x19, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x21, // U+00D0..U+00DF
71 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00E0..U+00EF
72 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x19, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00F0..U+00FF
/// <summary>
/// Reports whether <paramref name="ch"/> falls inside the range described by
/// <see cref="Latin1CharInfo"/> (U+0000..U+00FF: ASCII plus the Latin-1 Supplement).
/// </summary>
/// <param name="ch">The character to classify.</param>
/// <returns><c>true</c> when <paramref name="ch"/> is a valid index into the Latin-1 table.</returns>
private static bool IsLatin1(char ch) => (uint)ch < (uint)Latin1CharInfo.Length;
/// <summary>
/// Reports whether <paramref name="ch"/> is an ASCII character (U+0000..U+007F).
/// </summary>
/// <param name="ch">The character to classify.</param>
/// <returns><c>true</c> when <paramref name="ch"/> is at or below U+007F.</returns>
private static bool IsAscii(char ch)
{
    const uint LastAsciiCodePoint = 0x7F;
    return ch <= LastAsciiCodePoint;
}
/// <summary>
/// Looks up the Unicode category of a character in the Latin-1 table.
/// </summary>
/// <param name="ch">A character at or below U+00FF; this precondition is only asserted in debug builds.</param>
/// <returns>The category stored in the low bits of the character's table entry.</returns>
private static UnicodeCategory GetLatin1UnicodeCategory(char ch)
{
    Debug.Assert(IsLatin1(ch), "char.GetLatin1UnicodeCategory(): ch should be <= 00ff");

    // Strip the whitespace/upper/lower flag bits; what remains is the UnicodeCategory value.
    int packed = Latin1CharInfo[ch];
    return (UnicodeCategory)(packed & UnicodeCategoryMask);
}
99 // Overridden Instance Methods
/// <summary>
/// Computes the hash code for this character by placing its 16-bit value in both
/// the low and the high half of the resulting 32-bit integer.
/// </summary>
/// <returns>The character value OR-ed with itself shifted left by 16 bits.</returns>
public override int GetHashCode()
{
    int code = m_value;
    return (code << 16) | code;
}
/// <summary>
/// Compares this character with a boxed object.
/// </summary>
/// <param name="obj">The object to compare with; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when <paramref name="obj"/> is a boxed <see cref="char"/> holding the same value;
/// <c>false</c> for <c>null</c> or any other type.
/// </returns>
// The pattern match performs the type test and the unboxing together, so a null or
// non-char argument yields false instead of reaching an unchecked cast.
public override bool Equals(object? obj) => obj is char other && m_value == other.m_value;
/// <summary>
/// Compares this character with another character.
/// </summary>
/// <param name="obj">The character to compare with.</param>
/// <returns><c>true</c> when both hold the same value.</returns>
[System.Runtime.Versioning.NonVersionable]
public bool Equals(char obj) => obj == m_value;
/// <summary>
/// Compares this character to a boxed object and returns an integer that indicates their
/// relative order.
/// </summary>
/// <param name="value">A boxed <see cref="char"/>, or <c>null</c>.</param>
/// <returns>
/// A negative value when this character sorts before <paramref name="value"/>, zero when they
/// are equal, and a positive value when it sorts after. <c>null</c> is considered to be less
/// than any instance, so a positive value is returned for it.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="value"/> is neither <c>null</c> nor a <see cref="char"/>.</exception>
public int CompareTo(object? value)
{
    // null sorts before every instance.
    if (value is null)
    {
        return 1;
    }

    if (value is char other)
    {
        // The difference of the two 16-bit values always fits in an int, so it cannot overflow.
        return m_value - other.m_value;
    }

    throw new ArgumentException(SR.Arg_MustBeChar);
}
/// <summary>
/// Compares this character to another character.
/// </summary>
/// <param name="value">The character to compare with.</param>
/// <returns>This character's value minus <paramref name="value"/>: negative, zero or positive.</returns>
public int CompareTo(char value)
{
    int difference = m_value - value;
    return difference;
}
/// <summary>
/// Returns the string form of this character by delegating to the static
/// <c>ToString(char)</c> overload.
/// </summary>
public override string ToString() => char.ToString(m_value);
/// <summary>
/// Returns the string form of this character.
/// </summary>
/// <param name="provider">Accepted for interface compatibility; this method does not use it.</param>
/// <returns>The same result as the parameterless <c>ToString()</c>.</returns>
public string ToString(IFormatProvider? provider) => char.ToString(m_value);
162 // Formatting Methods
/// <summary>
/// Provides the string representation of a character.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>The string produced by <c>string.CreateFromChar</c> for <paramref name="c"/>.</returns>
public static string ToString(char c)
{
    return string.CreateFromChar(c);
}
171 public static char Parse(string s
)
175 throw new ArgumentNullException(nameof(s
));
180 throw new FormatException(SR
.Format_NeedSingleChar
);
185 public static bool TryParse(string? s
, out char result
)
203 /*=================================ISDIGIT======================================
204 **A wrapper for char. Returns a boolean indicating whether **
205 **character c is considered to be a digit. **
206 ==============================================================================*/
207 // Determines whether a character is a digit.
208 public static bool IsDigit(char c
)
212 return IsInRange(c
, '0', '9');
214 return CharUnicodeInfo
.GetUnicodeCategory(c
) == UnicodeCategory
.DecimalDigitNumber
;
/// <summary>
/// Tests whether <paramref name="c"/> lies in the inclusive range
/// [<paramref name="min"/>, <paramref name="max"/>] using a single comparison.
/// </summary>
/// <param name="c">The character to test.</param>
/// <param name="min">Inclusive lower bound.</param>
/// <param name="max">Inclusive upper bound.</param>
internal static bool IsInRange(char c, char min, char max)
{
    // Reinterpreting the difference as unsigned turns values below min into very large
    // numbers, so one unsigned comparison checks both bounds.
    uint offset = (uint)(c - min);
    uint span = (uint)(max - min);
    return offset <= span;
}
/// <summary>
/// Tests whether the category <paramref name="c"/> lies in the inclusive range
/// [<paramref name="min"/>, <paramref name="max"/>] of <see cref="UnicodeCategory"/> values.
/// </summary>
/// <param name="c">The category to test.</param>
/// <param name="min">Inclusive lower bound.</param>
/// <param name="max">Inclusive upper bound.</param>
private static bool IsInRange(UnicodeCategory c, UnicodeCategory min, UnicodeCategory max)
{
    // Same unsigned-difference trick as the char overload: one comparison covers both bounds.
    uint offset = (uint)(c - min);
    uint span = (uint)(max - min);
    return offset <= span;
}
/// <summary>
/// Checks whether a category is one of the letter categories, i.e. lies between
/// <see cref="UnicodeCategory.UppercaseLetter"/> and <see cref="UnicodeCategory.OtherLetter"/> inclusive.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckLetter(UnicodeCategory uc) =>
    IsInRange(uc, UnicodeCategory.UppercaseLetter, UnicodeCategory.OtherLetter);
229 /*=================================ISLETTER=====================================
230 **A wrapper for char. Returns a boolean indicating whether **
231 **character c is considered to be a letter. **
232 ==============================================================================*/
233 // Determines whether a character is a letter.
234 public static bool IsLetter(char c
)
238 // For the version of the Unicode standard the Char type is locked to, the
239 // Latin-1 range doesn't include letters in categories other than "upper" and "lower".
240 return (Latin1CharInfo
[c
] & (IsUpperCaseLetterFlag
| IsLowerCaseLetterFlag
)) != 0;
242 return CheckLetter(CharUnicodeInfo
.GetUnicodeCategory(c
));
/// <summary>
/// Tests the whitespace flag of a Latin-1 character's table entry.
/// </summary>
/// <param name="c">A character at or below U+00FF; this precondition is only asserted in debug builds.</param>
/// <returns><c>true</c> when the entry has <c>IsWhiteSpaceFlag</c> set.</returns>
private static bool IsWhiteSpaceLatin1(char c)
{
    Debug.Assert(IsLatin1(c));

    int flags = Latin1CharInfo[c];
    return (flags & IsWhiteSpaceFlag) != 0;
}
251 /*===============================ISWHITESPACE===================================
252 **A wrapper for char. Returns a boolean indicating whether **
253 **character c is considered to be a whitespace character. **
254 ==============================================================================*/
255 // Determines whether a character is whitespace.
256 public static bool IsWhiteSpace(char c
)
260 return IsWhiteSpaceLatin1(c
);
262 return CheckSeparator(CharUnicodeInfo
.GetUnicodeCategory(c
));
265 /*===================================IsUpper====================================
266 **Arguments: c -- the character to be checked.
267 **Returns: True if c is an uppercase character.
268 ==============================================================================*/
269 // Determines whether a character is upper-case.
270 public static bool IsUpper(char c
)
274 return (Latin1CharInfo
[c
] & IsUpperCaseLetterFlag
) != 0;
276 return CharUnicodeInfo
.GetUnicodeCategory(c
) == UnicodeCategory
.UppercaseLetter
;
279 /*===================================IsLower====================================
280 **Arguments: c -- the character to be checked.
281 **Returns: True if c is a lowercase character.
282 ==============================================================================*/
283 // Determines whether a character is lower-case.
284 public static bool IsLower(char c
)
288 return (Latin1CharInfo
[c
] & IsLowerCaseLetterFlag
) != 0;
290 return CharUnicodeInfo
.GetUnicodeCategory(c
) == UnicodeCategory
.LowercaseLetter
;
/// <summary>
/// Checks whether a category is one of the punctuation categories, i.e. lies between
/// <see cref="UnicodeCategory.ConnectorPunctuation"/> and <see cref="UnicodeCategory.OtherPunctuation"/> inclusive.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckPunctuation(UnicodeCategory uc) =>
    IsInRange(uc, UnicodeCategory.ConnectorPunctuation, UnicodeCategory.OtherPunctuation);
298 /*================================IsPunctuation=================================
299 **Arguments: c -- the character to be checked.
300 **Returns: True if c is a punctuation mark.
301 ==============================================================================*/
302 // Determines whether a character is a punctuation mark.
303 public static bool IsPunctuation(char c
)
307 return CheckPunctuation(GetLatin1UnicodeCategory(c
));
309 return CheckPunctuation(CharUnicodeInfo
.GetUnicodeCategory(c
));
/// <summary>
/// Checks whether a category is a letter category or
/// <see cref="UnicodeCategory.DecimalDigitNumber"/>.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckLetterOrDigit(UnicodeCategory uc)
{
    if (CheckLetter(uc))
    {
        return true;
    }

    return uc == UnicodeCategory.DecimalDigitNumber;
}
320 // Determines whether a character is a letter or a digit.
321 public static bool IsLetterOrDigit(char c
)
325 return CheckLetterOrDigit(GetLatin1UnicodeCategory(c
));
327 return CheckLetterOrDigit(CharUnicodeInfo
.GetUnicodeCategory(c
));
/// <summary>
/// Converts a character to upper case using the casing rules of the specified culture.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <param name="culture">The culture whose <see cref="TextInfo"/> performs the conversion.</param>
/// <returns>The result of <c>culture.TextInfo.ToUpper(c)</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is <c>null</c>.</exception>
public static char ToUpper(char c, CultureInfo culture)
{
    if (culture is null)
    {
        throw new ArgumentNullException(nameof(culture));
    }

    TextInfo casing = culture.TextInfo;
    return casing.ToUpper(c);
}
/// <summary>
/// Converts a character to upper case using the current culture's casing rules.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>The result of <c>CultureInfo.CurrentCulture.TextInfo.ToUpper(c)</c>.</returns>
public static char ToUpper(char c) => CultureInfo.CurrentCulture.TextInfo.ToUpper(c);
/// <summary>
/// Converts a character to upper case using the invariant culture's casing rules.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>The result of <c>CultureInfo.InvariantCulture.TextInfo.ToUpper(c)</c>.</returns>
public static char ToUpperInvariant(char c) => CultureInfo.InvariantCulture.TextInfo.ToUpper(c);
/// <summary>
/// Converts a character to lower case using the casing rules of the specified culture.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <param name="culture">The culture whose <see cref="TextInfo"/> performs the conversion.</param>
/// <returns>The result of <c>culture.TextInfo.ToLower(c)</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="culture"/> is <c>null</c>.</exception>
public static char ToLower(char c, CultureInfo culture)
{
    if (culture is null)
    {
        throw new ArgumentNullException(nameof(culture));
    }

    TextInfo casing = culture.TextInfo;
    return casing.ToLower(c);
}
/// <summary>
/// Converts a character to lower case using the current culture's casing rules.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>The result of <c>CultureInfo.CurrentCulture.TextInfo.ToLower(c)</c>.</returns>
public static char ToLower(char c) => CultureInfo.CurrentCulture.TextInfo.ToLower(c);
/// <summary>
/// Converts a character to lower case using the invariant culture's casing rules.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>The result of <c>CultureInfo.InvariantCulture.TextInfo.ToLower(c)</c>.</returns>
public static char ToLowerInvariant(char c) => CultureInfo.InvariantCulture.TextInfo.ToLower(c);
390 // IConvertible implementation
/// <summary>
/// Returns the <see cref="TypeCode"/> for this value type.
/// </summary>
/// <returns>Always <see cref="TypeCode.Char"/>.</returns>
public TypeCode GetTypeCode() => TypeCode.Char;
/// <summary>
/// Explicit <see cref="IConvertible"/> member; conversion from Char to Boolean is not supported.
/// </summary>
/// <param name="provider">Not used.</param>
/// <exception cref="InvalidCastException">Always thrown.</exception>
bool IConvertible.ToBoolean(IFormatProvider? provider)
{
    string message = SR.Format(SR.InvalidCast_FromTo, "Char", "Boolean");
    throw new InvalidCastException(message);
}
402 char IConvertible
.ToChar(IFormatProvider
? provider
)
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToSByte</c>.
/// </summary>
/// <param name="provider">Not used.</param>
sbyte IConvertible.ToSByte(IFormatProvider? provider) => Convert.ToSByte(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToByte</c>.
/// </summary>
/// <param name="provider">Not used.</param>
byte IConvertible.ToByte(IFormatProvider? provider) => Convert.ToByte(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToInt16</c>.
/// </summary>
/// <param name="provider">Not used.</param>
short IConvertible.ToInt16(IFormatProvider? provider) => Convert.ToInt16(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToUInt16</c>.
/// </summary>
/// <param name="provider">Not used.</param>
ushort IConvertible.ToUInt16(IFormatProvider? provider) => Convert.ToUInt16(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToInt32</c>.
/// </summary>
/// <param name="provider">Not used.</param>
int IConvertible.ToInt32(IFormatProvider? provider) => Convert.ToInt32(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToUInt32</c>.
/// </summary>
/// <param name="provider">Not used.</param>
uint IConvertible.ToUInt32(IFormatProvider? provider) => Convert.ToUInt32(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToInt64</c>.
/// </summary>
/// <param name="provider">Not used.</param>
long IConvertible.ToInt64(IFormatProvider? provider) => Convert.ToInt64(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; forwards the character value to <c>Convert.ToUInt64</c>.
/// </summary>
/// <param name="provider">Not used.</param>
ulong IConvertible.ToUInt64(IFormatProvider? provider) => Convert.ToUInt64(m_value);
/// <summary>
/// Explicit <see cref="IConvertible"/> member; conversion from Char to Single is not supported.
/// </summary>
/// <param name="provider">Not used.</param>
/// <exception cref="InvalidCastException">Always thrown.</exception>
float IConvertible.ToSingle(IFormatProvider? provider)
{
    string message = SR.Format(SR.InvalidCast_FromTo, "Char", "Single");
    throw new InvalidCastException(message);
}
/// <summary>
/// Explicit <see cref="IConvertible"/> member; conversion from Char to Double is not supported.
/// </summary>
/// <param name="provider">Not used.</param>
/// <exception cref="InvalidCastException">Always thrown.</exception>
double IConvertible.ToDouble(IFormatProvider? provider)
{
    string message = SR.Format(SR.InvalidCast_FromTo, "Char", "Double");
    throw new InvalidCastException(message);
}
/// <summary>
/// Explicit <see cref="IConvertible"/> member; conversion from Char to Decimal is not supported.
/// </summary>
/// <param name="provider">Not used.</param>
/// <exception cref="InvalidCastException">Always thrown.</exception>
decimal IConvertible.ToDecimal(IFormatProvider? provider)
{
    string message = SR.Format(SR.InvalidCast_FromTo, "Char", "Decimal");
    throw new InvalidCastException(message);
}
/// <summary>
/// Explicit <see cref="IConvertible"/> member; conversion from Char to DateTime is not supported.
/// </summary>
/// <param name="provider">Not used.</param>
/// <exception cref="InvalidCastException">Always thrown.</exception>
DateTime IConvertible.ToDateTime(IFormatProvider? provider)
{
    string message = SR.Format(SR.InvalidCast_FromTo, "Char", "DateTime");
    throw new InvalidCastException(message);
}
/// <summary>
/// Explicit <see cref="IConvertible"/> member; delegates the conversion to
/// <c>Convert.DefaultToType</c>.
/// </summary>
/// <param name="type">The requested target type.</param>
/// <param name="provider">Passed through to <c>Convert.DefaultToType</c>.</param>
object IConvertible.ToType(Type type, IFormatProvider? provider)
{
    IConvertible self = (IConvertible)this;
    return Convert.DefaultToType(self, type, provider);
}
472 public static bool IsControl(char c
)
476 return GetLatin1UnicodeCategory(c
) == UnicodeCategory
.Control
;
478 return CharUnicodeInfo
.GetUnicodeCategory(c
) == UnicodeCategory
.Control
;
481 public static bool IsControl(string s
, int index
)
484 throw new ArgumentNullException(nameof(s
));
485 if (((uint)index
) >= ((uint)s
.Length
))
487 throw new ArgumentOutOfRangeException(nameof(index
));
492 return GetLatin1UnicodeCategory(c
) == UnicodeCategory
.Control
;
494 return CharUnicodeInfo
.GetUnicodeCategory(s
, index
) == UnicodeCategory
.Control
;
497 public static bool IsDigit(string s
, int index
)
500 throw new ArgumentNullException(nameof(s
));
501 if (((uint)index
) >= ((uint)s
.Length
))
503 throw new ArgumentOutOfRangeException(nameof(index
));
508 return IsInRange(c
, '0', '9');
510 return CharUnicodeInfo
.GetUnicodeCategory(s
, index
) == UnicodeCategory
.DecimalDigitNumber
;
513 public static bool IsLetter(string s
, int index
)
516 throw new ArgumentNullException(nameof(s
));
517 if (((uint)index
) >= ((uint)s
.Length
))
519 throw new ArgumentOutOfRangeException(nameof(index
));
524 // The Latin-1 range doesn't include letters in categories other than "upper" and "lower"
525 return (Latin1CharInfo
[c
] & (IsUpperCaseLetterFlag
| IsLowerCaseLetterFlag
)) != 0;
527 return CheckLetter(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
530 public static bool IsLetterOrDigit(string s
, int index
)
533 throw new ArgumentNullException(nameof(s
));
534 if (((uint)index
) >= ((uint)s
.Length
))
536 throw new ArgumentOutOfRangeException(nameof(index
));
541 return CheckLetterOrDigit(GetLatin1UnicodeCategory(c
));
543 return CheckLetterOrDigit(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
546 public static bool IsLower(string s
, int index
)
549 throw new ArgumentNullException(nameof(s
));
550 if (((uint)index
) >= ((uint)s
.Length
))
552 throw new ArgumentOutOfRangeException(nameof(index
));
557 return (Latin1CharInfo
[c
] & IsLowerCaseLetterFlag
) != 0;
560 return CharUnicodeInfo
.GetUnicodeCategory(s
, index
) == UnicodeCategory
.LowercaseLetter
;
/// <summary>
/// Checks whether a category is one of the number categories, i.e. lies between
/// <see cref="UnicodeCategory.DecimalDigitNumber"/> and <see cref="UnicodeCategory.OtherNumber"/> inclusive.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckNumber(UnicodeCategory uc) =>
    IsInRange(uc, UnicodeCategory.DecimalDigitNumber, UnicodeCategory.OtherNumber);
572 public static bool IsNumber(char c
)
578 return IsInRange(c
, '0', '9');
580 return CheckNumber(GetLatin1UnicodeCategory(c
));
582 return CheckNumber(CharUnicodeInfo
.GetUnicodeCategory(c
));
585 public static bool IsNumber(string s
, int index
)
588 throw new ArgumentNullException(nameof(s
));
589 if (((uint)index
) >= ((uint)s
.Length
))
591 throw new ArgumentOutOfRangeException(nameof(index
));
598 return IsInRange(c
, '0', '9');
600 return CheckNumber(GetLatin1UnicodeCategory(c
));
602 return CheckNumber(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
605 ////////////////////////////////////////////////////////////////////////
609 // Determines if the given character is a punctuation character.
611 ////////////////////////////////////////////////////////////////////////
613 public static bool IsPunctuation(string s
, int index
)
616 throw new ArgumentNullException(nameof(s
));
617 if (((uint)index
) >= ((uint)s
.Length
))
619 throw new ArgumentOutOfRangeException(nameof(index
));
624 return CheckPunctuation(GetLatin1UnicodeCategory(c
));
626 return CheckPunctuation(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
/// <summary>
/// Checks whether a category is one of the separator categories, i.e. lies between
/// <see cref="UnicodeCategory.SpaceSeparator"/> and <see cref="UnicodeCategory.ParagraphSeparator"/> inclusive.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckSeparator(UnicodeCategory uc) =>
    IsInRange(uc, UnicodeCategory.SpaceSeparator, UnicodeCategory.ParagraphSeparator);
/// <summary>
/// Tests whether a character is one of the two characters this method treats as Latin-1
/// separators: U+0020 and U+00A0.
/// </summary>
/// <param name="c">The character to test.</param>
private static bool IsSeparatorLatin1(char c)
{
    switch (c)
    {
        case '\x0020': // SPACE
        case '\x00a0': // NO-BREAK SPACE
            return true;

        default:
            return false;
    }
}
645 public static bool IsSeparator(char c
)
649 return IsSeparatorLatin1(c
);
651 return CheckSeparator(CharUnicodeInfo
.GetUnicodeCategory(c
));
654 public static bool IsSeparator(string s
, int index
)
657 throw new ArgumentNullException(nameof(s
));
658 if (((uint)index
) >= ((uint)s
.Length
))
660 throw new ArgumentOutOfRangeException(nameof(index
));
665 return IsSeparatorLatin1(c
);
667 return CheckSeparator(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
/// <summary>
/// Tests whether a character lies between <c>CharUnicodeInfo.HIGH_SURROGATE_START</c> and
/// <c>CharUnicodeInfo.LOW_SURROGATE_END</c> inclusive.
/// </summary>
/// <param name="c">The character to test.</param>
public static bool IsSurrogate(char c) =>
    IsInRange(c, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);
/// <summary>
/// Tests whether the character at <paramref name="index"/> in <paramref name="s"/> is a surrogate.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the character within <paramref name="s"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static bool IsSurrogate(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // The unsigned comparison rejects negative indices and indices past the end in one test.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    char candidate = s[index];
    return IsSurrogate(candidate);
}
/// <summary>
/// Checks whether a category is one of the symbol categories, i.e. lies between
/// <see cref="UnicodeCategory.MathSymbol"/> and <see cref="UnicodeCategory.OtherSymbol"/> inclusive.
/// </summary>
/// <param name="uc">The category to test.</param>
internal static bool CheckSymbol(UnicodeCategory uc) =>
    IsInRange(uc, UnicodeCategory.MathSymbol, UnicodeCategory.OtherSymbol);
697 public static bool IsSymbol(char c
)
701 return CheckSymbol(GetLatin1UnicodeCategory(c
));
703 return CheckSymbol(CharUnicodeInfo
.GetUnicodeCategory(c
));
706 public static bool IsSymbol(string s
, int index
)
709 throw new ArgumentNullException(nameof(s
));
710 if (((uint)index
) >= ((uint)s
.Length
))
712 throw new ArgumentOutOfRangeException(nameof(index
));
717 return CheckSymbol(GetLatin1UnicodeCategory(c
));
719 return CheckSymbol(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
722 public static bool IsUpper(string s
, int index
)
725 throw new ArgumentNullException(nameof(s
));
726 if (((uint)index
) >= ((uint)s
.Length
))
728 throw new ArgumentOutOfRangeException(nameof(index
));
733 return (Latin1CharInfo
[c
] & IsUpperCaseLetterFlag
) != 0;
736 return CharUnicodeInfo
.GetUnicodeCategory(s
, index
) == UnicodeCategory
.UppercaseLetter
;
739 public static bool IsWhiteSpace(string s
, int index
)
742 throw new ArgumentNullException(nameof(s
));
743 if (((uint)index
) >= ((uint)s
.Length
))
745 throw new ArgumentOutOfRangeException(nameof(index
));
752 return IsWhiteSpaceLatin1(ch
);
755 return CheckSeparator(CharUnicodeInfo
.GetUnicodeCategory(s
, index
));
758 public static UnicodeCategory
GetUnicodeCategory(char c
)
762 return GetLatin1UnicodeCategory(c
);
764 return CharUnicodeInfo
.GetUnicodeCategory((int)c
);
/// <summary>
/// Returns the Unicode category of the character at <paramref name="index"/> in <paramref name="s"/>.
/// Latin-1 characters are answered from the local table; everything else is delegated to
/// <c>CharUnicodeInfo.InternalGetUnicodeCategory</c>.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the character within <paramref name="s"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static UnicodeCategory GetUnicodeCategory(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // The unsigned comparison rejects negative indices and indices past the end in one test.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    char current = s[index];
    return IsLatin1(current)
        ? GetLatin1UnicodeCategory(current)
        : CharUnicodeInfo.InternalGetUnicodeCategory(s, index);
}
/// <summary>
/// Returns the numeric value associated with a character, as reported by
/// <c>CharUnicodeInfo.GetNumericValue</c>.
/// </summary>
/// <param name="c">The character to evaluate.</param>
public static double GetNumericValue(char c) => CharUnicodeInfo.GetNumericValue(c);
/// <summary>
/// Returns the numeric value associated with the character at <paramref name="index"/> in
/// <paramref name="s"/>, as reported by <c>CharUnicodeInfo.GetNumericValue</c>.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the character within <paramref name="s"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static double GetNumericValue(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // The unsigned comparison rejects negative indices and indices past the end in one test.
    bool indexInBounds = (uint)index < (uint)s.Length;
    if (!indexInBounds)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    return CharUnicodeInfo.GetNumericValue(s, index);
}
/// <summary>
/// Tests whether a character lies between <c>CharUnicodeInfo.HIGH_SURROGATE_START</c> and
/// <c>CharUnicodeInfo.HIGH_SURROGATE_END</c> inclusive.
/// </summary>
/// <param name="c">The character to test.</param>
public static bool IsHighSurrogate(char c) =>
    IsInRange(c, CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.HIGH_SURROGATE_END);
/// <summary>
/// Tests whether the character at <paramref name="index"/> in <paramref name="s"/> is a high surrogate.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the character within <paramref name="s"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static bool IsHighSurrogate(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // Equivalent to "index < 0 || index >= s.Length": a negative index becomes a huge unsigned value.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    char candidate = s[index];
    return IsHighSurrogate(candidate);
}
/// <summary>
/// Tests whether a character lies between <c>CharUnicodeInfo.LOW_SURROGATE_START</c> and
/// <c>CharUnicodeInfo.LOW_SURROGATE_END</c> inclusive.
/// </summary>
/// <param name="c">The character to test.</param>
public static bool IsLowSurrogate(char c) =>
    IsInRange(c, CharUnicodeInfo.LOW_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);
/// <summary>
/// Tests whether the character at <paramref name="index"/> in <paramref name="s"/> is a low surrogate.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the character within <paramref name="s"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static bool IsLowSurrogate(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // Equivalent to "index < 0 || index >= s.Length": a negative index becomes a huge unsigned value.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    char candidate = s[index];
    return IsLowSurrogate(candidate);
}
/// <summary>
/// Tests whether the two characters starting at <paramref name="index"/> in <paramref name="s"/>
/// form a surrogate pair. When <paramref name="index"/> refers to the last character there is no
/// second character to pair with, and the result is <c>false</c>.
/// </summary>
/// <param name="s">The string to inspect.</param>
/// <param name="index">Position of the first character of the candidate pair.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside <paramref name="s"/>.</exception>
public static bool IsSurrogatePair(string s, int index)
{
    if (s is null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // Equivalent to "index < 0 || index >= s.Length": a negative index becomes a huge unsigned value.
    if ((uint)index >= (uint)s.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(index));
    }

    int next = index + 1;
    return next < s.Length && IsSurrogatePair(s[index], s[next]);
}
/// <summary>
/// Tests whether two characters form a surrogate pair, i.e. the first is a high surrogate
/// and the second is a low surrogate.
/// </summary>
/// <param name="highSurrogate">Candidate high surrogate.</param>
/// <param name="lowSurrogate">Candidate low surrogate.</param>
public static bool IsSurrogatePair(char highSurrogate, char lowSurrogate)
{
    // Rebase each character to the start of its expected range. Both ranges have the same
    // power-of-two width, so OR-ing the two offsets yields a value within the range limit
    // only when each offset is individually within it; one comparison validates both.
    uint highDelta = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
    uint lowDelta = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
    uint combined = highDelta | lowDelta;
    return combined <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
}
872 internal const int UNICODE_PLANE00_END
= 0x00ffff;
873 // The starting codepoint for Unicode plane 1. Plane 1 contains 0x010000 ~ 0x01ffff.
874 internal const int UNICODE_PLANE01_START
= 0x10000;
875 // The end codepoint for Unicode plane 16. This is the maximum code point value allowed for Unicode.
876 // Plane 16 contains 0x100000 ~ 0x10ffff.
877 internal const int UNICODE_PLANE16_END
= 0x10ffff;
/// <summary>
/// Converts a UTF-32 code point into its string form.
/// </summary>
/// <param name="utf32">The code point to convert.</param>
/// <returns>The string produced by the <c>Rune</c> created from <paramref name="utf32"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="utf32"/> is rejected by <c>UnicodeUtility.IsValidUnicodeScalar</c>.
/// </exception>
public static string ConvertFromUtf32(int utf32)
{
    uint scalar = (uint)utf32;
    if (UnicodeUtility.IsValidUnicodeScalar(scalar))
    {
        return Rune.UnsafeCreate(scalar).ToString();
    }

    throw new ArgumentOutOfRangeException(nameof(utf32), SR.ArgumentOutOfRange_InvalidUTF32);
}
/// <summary>
/// Converts a surrogate pair into the UTF-32 code point it encodes.
/// </summary>
/// <param name="highSurrogate">The high surrogate of the pair.</param>
/// <param name="lowSurrogate">The low surrogate of the pair.</param>
/// <returns>The combined code point.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Either argument is outside its surrogate range (raised by <c>ConvertToUtf32_ThrowInvalidArgs</c>).
/// </exception>
public static int ConvertToUtf32(char highSurrogate, char lowSurrogate)
{
    // Widen both characters to 32 bits and rebase each to the start of its expected range.
    uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
    uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

    // The two ranges have the same length, so a single comparison on the OR of the offsets
    // validates both. The helper chooses which argument to blame.
    bool bothInRange = (highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
    if (!bothInRange)
    {
        ConvertToUtf32_ThrowInvalidArgs(highSurrogateOffset);
    }

    // The (0x40 << 10) term accounts for uuuuu = wwww + 1 in the surrogate encoding.
    int highBits = (int)highSurrogateOffset << 10;
    int lowBits = lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
    return highBits + lowBits + (0x40 << 10);
}
/// <summary>
/// Throws the <see cref="ArgumentOutOfRangeException"/> for an invalid surrogate pair, naming
/// whichever argument is at fault. This method never returns normally.
/// </summary>
/// <param name="highSurrogateOffset">
/// Offset of the candidate high surrogate from <c>CharUnicodeInfo.HIGH_SURROGATE_START</c>.
/// </param>
private static void ConvertToUtf32_ThrowInvalidArgs(uint highSurrogateOffset)
{
    // A high surrogate inside its range means the low surrogate must be the invalid one.
    bool highIsValid = highSurrogateOffset <= CharUnicodeInfo.HIGH_SURROGATE_RANGE;
    if (highIsValid)
    {
        throw new ArgumentOutOfRangeException(
            paramName: "lowSurrogate",
            message: SR.ArgumentOutOfRange_InvalidLowSurrogate);
    }

    throw new ArgumentOutOfRangeException(
        paramName: "highSurrogate",
        message: SR.ArgumentOutOfRange_InvalidHighSurrogate);
}
939 /*=============================ConvertToUtf32===================================
940 ** Convert a character or a surrogate pair starting at index of the specified string
942 ** The char pointed by index should be a surrogate pair or a BMP character.
943 ** This method throws if a high-surrogate is not followed by a low surrogate.
944 ** This method throws if a low surrogate is seen without a preceding high surrogate.
945 ==============================================================================*/
947 public static int ConvertToUtf32(string s
, int index
)
951 throw new ArgumentNullException(nameof(s
));
954 if (index
< 0 || index
>= s
.Length
)
956 throw new ArgumentOutOfRangeException(nameof(index
), SR
.ArgumentOutOfRange_Index
);
958 // Check if the character at index is a high surrogate.
959 int temp1
= (int)s
[index
] - CharUnicodeInfo
.HIGH_SURROGATE_START
;
960 if (temp1
>= 0 && temp1
<= 0x7ff)
962 // Found a surrogate char.
965 // Found a high surrogate.
966 if (index
< s
.Length
- 1)
968 int temp2
= (int)s
[index
+ 1] - CharUnicodeInfo
.LOW_SURROGATE_START
;
969 if (temp2
>= 0 && temp2
<= 0x3ff)
971 // Found a low surrogate.
972 return (temp1
* 0x400) + temp2
+ UNICODE_PLANE01_START
;
976 throw new ArgumentException(SR
.Format(SR
.Argument_InvalidHighSurrogate
, index
), nameof(s
));
981 // Found a high surrogate at the end of the string.
982 throw new ArgumentException(SR
.Format(SR
.Argument_InvalidHighSurrogate
, index
), nameof(s
));
987 // Find a low surrogate at the character pointed by index.
988 throw new ArgumentException(SR
.Format(SR
.Argument_InvalidLowSurrogate
, index
), nameof(s
));
991 // Not a high-surrogate or low-surrogate. Generate the UTF32 value for the BMP characters.
992 return (int)s
[index
];