netcore/System.Private.CoreLib/shared/System/Text/Rune.cs

   1 // Licensed to the .NET Foundation under one or more agreements.
   2 // The .NET Foundation licenses this file to you under the MIT license.
   3 // See the LICENSE file in the project root for more information.
   4
   5 using System.Buffers;
   6 using System.Diagnostics;
   7 using System.Globalization;
   8 using System.Runtime.CompilerServices;
   9 using System.Text.Unicode;
  10
  11 namespace System.Text
  12 {
  13     /// <summary>
  14     /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
  15     /// </summary>
  16     /// <remarks>
  17     /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
  18     /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
  19     /// </remarks>
  20     [DebuggerDisplay("{DebuggerDisplay,nq}")]
  21     public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
  22     {
  23         private const byte IsWhiteSpaceFlag = 0x80;
  24         private const byte IsLetterOrDigitFlag = 0x40;
  25         private const byte UnicodeCategoryMask = 0x1F;
  26
  27         // Contains information about the ASCII character range [ U+0000..U+007F ], with:
  28         // - 0x80 bit if set means 'is whitespace'
  29         // - 0x40 bit if set means 'is letter or digit'
  30         // - 0x20 bit is reserved for future use
  31         // - bottom 5 bits are the UnicodeCategory of the character
  32         private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
  33         {
  34             0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E,
  35             0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E,
  36             0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18,
  37             0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18,
  38             0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
  39             0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12,
  40             0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
  41             0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E
  42         };
  43
  44         private readonly uint _value;
  45
  46         /// <summary>
  47         /// Creates a <see cref="Rune"/> from the provided UTF-16 code unit.
  48         /// </summary>
  49         /// <exception cref="ArgumentOutOfRangeException">
  50         /// If <paramref name="ch"/> represents a UTF-16 surrogate code point
  51         /// U+D800..U+DFFF, inclusive.
  52         /// </exception>
  53         public Rune(char ch)
  54         {
  55             uint expanded = ch;
  56             if (UnicodeUtility.IsSurrogateCodePoint(expanded))
  57             {
  58                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
  59             }
  60             _value = expanded;
  61         }
  62
  63         /// <summary>
  64         /// Creates a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
  65         /// </summary>
  66         /// <exception cref="ArgumentOutOfRangeException">
  67         /// If <paramref name="highSurrogate"/> does not represent a UTF-16 high surrogate code point
  68         /// or <paramref name="lowSurrogate"/> does not represent a UTF-16 low surrogate code point.
  69         /// </exception>
  70         public Rune(char highSurrogate, char lowSurrogate)
  71             : this((uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), false)
  72         {
  73         }
  74
  75         /// <summary>
  76         /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
  77         /// </summary>
  78         /// <exception cref="ArgumentOutOfRangeException">
  79         /// If <paramref name="value"/> does not represent a value Unicode scalar value.
  80         /// </exception>
  81         public Rune(int value)
  82             : this((uint)value)
  83         {
  84         }
  85
  86         /// <summary>
  87         /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
  88         /// </summary>
  89         /// <exception cref="ArgumentOutOfRangeException">
  90         /// If <paramref name="value"/> does not represent a value Unicode scalar value.
  91         /// </exception>
  92         [CLSCompliant(false)]
  93         public Rune(uint value)
  94         {
  95             if (!UnicodeUtility.IsValidUnicodeScalar(value))
  96             {
  97                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
  98             }
  99             _value = value;
 100         }
 101
 102         // non-validating ctor
 103         private Rune(uint scalarValue, bool unused)
 104         {
 105             UnicodeDebug.AssertIsValidScalar(scalarValue);
 106             _value = scalarValue;
 107         }
 108
 109         public static bool operator ==(Rune left, Rune right) => (left._value == right._value);
 110
 111         public static bool operator !=(Rune left, Rune right) => (left._value != right._value);
 112
 113         public static bool operator <(Rune left, Rune right) => (left._value < right._value);
 114
 115         public static bool operator <=(Rune left, Rune right) => (left._value <= right._value);
 116
 117         public static bool operator >(Rune left, Rune right) => (left._value > right._value);
 118
 119         public static bool operator >=(Rune left, Rune right) => (left._value >= right._value);
 120
 121         // Operators below are explicit because they may throw.
 122
 123         public static explicit operator Rune(char ch) => new Rune(ch);
 124
 125         [CLSCompliant(false)]
 126         public static explicit operator Rune(uint value) => new Rune(value);
 127
 128         public static explicit operator Rune(int value) => new Rune(value);
 129
 130         // Displayed as "'<char>' (U+XXXX)"; e.g., "'e' (U+0065)"
 131         private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
 132
 133         /// <summary>
 134         /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
 135         /// and therefore representable by a single UTF-8 code unit.
 136         /// </summary>
 137         public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value);
 138
 139         /// <summary>
 140         /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ])
 141         /// and therefore representable by a single UTF-16 code unit.
 142         /// </summary>
 143         public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value);
 144
 145         /// <summary>
 146         /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar.
 147         /// </summary>
 148         public int Plane => UnicodeUtility.GetPlane(_value);
 149
 150         /// <summary>
 151         /// A <see cref="Rune"/> instance that represents the Unicode replacement character U+FFFD.
 152         /// </summary>
 153         public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar);
 154
 155         /// <summary>
 156         /// Returns the length in code units (<see cref="char"/>) of the
 157         /// UTF-16 sequence required to represent this scalar value.
 158         /// </summary>
 159         /// <remarks>
 160         /// The return value will be 1 or 2.
 161         /// </remarks>
 162         public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value);
 163
 164         /// <summary>
 165         /// Returns the length in code units of the
 166         /// UTF-8 sequence required to represent this scalar value.
 167         /// </summary>
 168         /// <remarks>
 169         /// The return value will be 1 through 4, inclusive.
 170         /// </remarks>
 171         public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value);
 172
 173         /// <summary>
 174         /// Returns the Unicode scalar value as an integer.
 175         /// </summary>
 176         public int Value => (int)_value;
 177
 178         private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool toUpper)
 179         {
 180             Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
 181             Debug.Assert(textInfo != null, "This should've been checked by the caller.");
 182
 183             Span<char> original = stackalloc char[2]; // worst case scenario = 2 code units (for a surrogate pair)
 184             Span<char> modified = stackalloc char[2]; // case change should preserve UTF-16 code unit count
 185
 186             int charCount = rune.EncodeToUtf16(original);
 187             original = original.Slice(0, charCount);
 188             modified = modified.Slice(0, charCount);
 189
 190             if (toUpper)
 191             {
 192                 textInfo.ChangeCaseToUpper(original, modified);
 193             }
 194             else
 195             {
 196                 textInfo.ChangeCaseToLower(original, modified);
 197             }
 198
 199             // We use simple case folding rules, which disallows moving between the BMP and supplementary
 200             // planes when performing a case conversion. The helper methods which reconstruct a Rune
 201             // contain debug asserts for this condition.
 202
 203             if (rune.IsBmp)
 204             {
 205                 return UnsafeCreate(modified[0]);
 206             }
 207             else
 208             {
 209                 return UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(modified[0], modified[1]));
 210             }
 211         }
 212
 213         public int CompareTo(Rune other) => this._value.CompareTo(other._value);
 214
 215         /// <summary>
 216         /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-16 source buffer.
 217         /// </summary>
 218         /// <returns>
 219         /// <para>
 220         /// If the source buffer begins with a valid UTF-16 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
 221         /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="charsConsumed"/> the
 222         /// number of <see langword="char"/>s used in the input buffer to encode the <see cref="Rune"/>.
 223         /// </para>
 224         /// <para>
 225         /// If the source buffer is empty or contains only a standalone UTF-16 high surrogate character, returns <see cref="OperationStatus.NeedMoreData"/>,
 226         /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="charsConsumed"/> the length of the input buffer.
 227         /// </para>
 228         /// <para>
 229         /// If the source buffer begins with an ill-formed UTF-16 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
 230         /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="charsConsumed"/> the number of
 231         /// <see langword="char"/>s used in the input buffer to encode the ill-formed sequence.
 232         /// </para>
 233         /// </returns>
 234         /// <remarks>
 235         /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
 236         /// <paramref name="charsConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
 237         /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
 238         /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
 239         /// invalid sequences while iterating through the loop.
 240         /// </remarks>
 241         public static OperationStatus DecodeFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
 242         {
 243             if (!source.IsEmpty)
 244             {
 245                 // First, check for the common case of a BMP scalar value.
 246                 // If this is correct, return immediately.
 247
 248                 char firstChar = source[0];
 249                 if (TryCreate(firstChar, out result))
 250                 {
 251                     charsConsumed = 1;
 252                     return OperationStatus.Done;
 253                 }
 254
 255                 // First thing we saw was a UTF-16 surrogate code point.
 256                 // Let's optimistically assume for now it's a high surrogate and hope
 257                 // that combining it with the next char yields useful results.
 258
 259                 if (1 < (uint)source.Length)
 260                 {
 261                     char secondChar = source[1];
 262                     if (TryCreate(firstChar, secondChar, out result))
 263                     {
 264                         // Success! Formed a supplementary scalar value.
 265                         charsConsumed = 2;
 266                         return OperationStatus.Done;
 267                     }
 268                     else
 269                     {
 270                         // Either the first character was a low surrogate, or the second
 271                         // character was not a low surrogate. This is an error.
 272                         goto InvalidData;
 273                     }
 274                 }
 275                 else if (!char.IsHighSurrogate(firstChar))
 276                 {
 277                     // Quick check to make sure we're not going to report NeedMoreData for
 278                     // a single-element buffer where the data is a standalone low surrogate
 279                     // character. Since no additional data will ever make this valid, we'll
 280                     // report an error immediately.
 281                     goto InvalidData;
 282                 }
 283             }
 284
 285             // If we got to this point, the input buffer was empty, or the buffer
 286             // was a single element in length and that element was a high surrogate char.
 287
 288             charsConsumed = source.Length;
 289             result = ReplacementChar;
 290             return OperationStatus.NeedMoreData;
 291
 292         InvalidData:
 293
 294             charsConsumed = 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length
 295             result = ReplacementChar;
 296             return OperationStatus.InvalidData;
 297         }
 298
 299         /// <summary>
 300         /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-8 source buffer.
 301         /// </summary>
 302         /// <returns>
 303         /// <para>
 304         /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
 305         /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="bytesConsumed"/> the
 306         /// number of <see langword="byte"/>s used in the input buffer to encode the <see cref="Rune"/>.
 307         /// </para>
 308         /// <para>
 309         /// If the source buffer is empty or contains only a standalone UTF-8 high surrogate character, returns <see cref="OperationStatus.NeedMoreData"/>,
 310         /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the length of the input buffer.
 311         /// </para>
 312         /// <para>
 313         /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
 314         /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the number of
 315         /// <see langword="char"/>s used in the input buffer to encode the ill-formed sequence.
 316         /// </para>
 317         /// </returns>
 318         /// <remarks>
 319         /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
 320         /// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
 321         /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
 322         /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
 323         /// invalid sequences while iterating through the loop.
 324         /// </remarks>
 325         public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune result, out int bytesConsumed)
 326         {
 327             // This method follows the Unicode Standard's recommendation for detecting
 328             // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
 329             // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
 330             // it tries to consume as many code units as possible as long as those code
 331             // units constitute the beginning of a longer well-formed subsequence per Table 3-7.
 332
 333             int index = 0;
 334
 335             // Try reading input[0].
 336
 337             if ((uint)index >= (uint)source.Length)
 338             {
 339                 goto NeedsMoreData;
 340             }
 341
 342             uint tempValue = source[index];
 343             if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
 344             {
 345                 goto NotAscii;
 346             }
 347
 348         Finish:
 349
 350             bytesConsumed = index + 1;
 351             Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
 352             result = UnsafeCreate(tempValue);
 353             return OperationStatus.Done;
 354
 355         NotAscii:
 356
 357             // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
 358             // the range [C2..F4]. If it's outside of that range, it's either a standalone
 359             // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
 360             // four-byte sequence.
 361
 362             if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
 363             {
 364                 goto FirstByteInvalid;
 365             }
 366
 367             tempValue = (tempValue - 0xC2) << 6;
 368
 369             // Try reading input[1].
 370
 371             index++;
 372             if ((uint)index >= (uint)source.Length)
 373             {
 374                 goto NeedsMoreData;
 375             }
 376
 377             // Continuation bytes are of the form [10xxxxxx], which means that their two's
 378             // complement representation is in the range [-65..-128]. This allows us to
 379             // perform a single comparison to see if a byte is a continuation byte.
 380
 381             int thisByteSignExtended = (sbyte)source[index];
 382             if (thisByteSignExtended >= -64)
 383             {
 384                 goto Invalid;
 385             }
 386
 387             tempValue += (uint)thisByteSignExtended;
 388             tempValue += 0x80; // remove the continuation byte marker
 389             tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker
 390
 391             if (tempValue < 0x0800)
 392             {
 393                 Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
 394                 goto Finish; // this is a valid 2-byte sequence
 395             }
 396
 397             // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
 398             // enough information (from just two code units) to detect overlong or surrogate
 399             // sequences, we need to perform these checks now.
 400
 401             if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
 402             {
 403                 // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
 404                 // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
 405                 goto Invalid;
 406             }
 407
 408             if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
 409             {
 410                 // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
 411                 goto Invalid;
 412             }
 413
 414             if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
 415             {
 416                 // This is an overlong 4-byte sequence.
 417                 goto Invalid;
 418             }
 419
 420             // The first two bytes were just fine. We don't need to perform any other checks
 421             // on the remaining bytes other than to see that they're valid continuation bytes.
 422
 423             // Try reading input[2].
 424
 425             index++;
 426             if ((uint)index >= (uint)source.Length)
 427             {
 428                 goto NeedsMoreData;
 429             }
 430
 431             thisByteSignExtended = (sbyte)source[index];
 432             if (thisByteSignExtended >= -64)
 433             {
 434                 goto Invalid; // this byte is not a UTF-8 continuation byte
 435             }
 436
 437             tempValue <<= 6;
 438             tempValue += (uint)thisByteSignExtended;
 439             tempValue += 0x80; // remove the continuation byte marker
 440             tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker
 441
 442             if (tempValue <= 0xFFFF)
 443             {
 444                 Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
 445                 goto Finish; // this is a valid 3-byte sequence
 446             }
 447
 448             // Try reading input[3].
 449
 450             index++;
 451             if ((uint)index >= (uint)source.Length)
 452             {
 453                 goto NeedsMoreData;
 454             }
 455
 456             thisByteSignExtended = (sbyte)source[index];
 457             if (thisByteSignExtended >= -64)
 458             {
 459                 goto Invalid; // this byte is not a UTF-8 continuation byte
 460             }
 461
 462             tempValue <<= 6;
 463             tempValue += (uint)thisByteSignExtended;
 464             tempValue += 0x80; // remove the continuation byte marker
 465             tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker
 466
 467             UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
 468             goto Finish; // this is a valid 4-byte sequence
 469
 470         FirstByteInvalid:
 471
 472             index = 1; // Invalid subsequences are always at least length 1.
 473
 474         Invalid:
 475
 476             Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
 477             bytesConsumed = index;
 478             result = ReplacementChar;
 479             return OperationStatus.InvalidData;
 480
 481         NeedsMoreData:
 482
 483             Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
 484             bytesConsumed = index;
 485             result = ReplacementChar;
 486             return OperationStatus.NeedMoreData;
 487         }
 488
 489         /// <summary>
 490         /// Decodes the <see cref="Rune"/> at the end of the provided UTF-16 source buffer.
 491         /// </summary>
 492         /// <remarks>
 493         /// This method is very similar to <see cref="DecodeFromUtf16(ReadOnlySpan{char}, out Rune, out int)"/>, but it allows
 494         /// the caller to loop backward instead of forward. The typical calling convention is that on each iteration
 495         /// of the loop, the caller should slice off the final <paramref name="charsConsumed"/> elements of
 496         /// the <paramref name="source"/> buffer.
 497         /// </remarks>
 498         public static OperationStatus DecodeLastFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
 499         {
 500             int index = source.Length - 1;
 501             if ((uint)index < (uint)source.Length)
 502             {
 503                 // First, check for the common case of a BMP scalar value.
 504                 // If this is correct, return immediately.
 505
 506                 char finalChar = source[index];
 507                 if (TryCreate(finalChar, out result))
 508                 {
 509                     charsConsumed = 1;
 510                     return OperationStatus.Done;
 511                 }
 512
 513                 if (char.IsLowSurrogate(finalChar))
 514                 {
 515                     // The final character was a UTF-16 low surrogate code point.
 516                     // This must be preceded by a UTF-16 high surrogate code point, otherwise
 517                     // we have a standalone low surrogate, which is always invalid.
 518
 519                     index--;
 520                     if ((uint)index < (uint)source.Length)
 521                     {
 522                         char penultimateChar = source[index];
 523                         if (TryCreate(penultimateChar, finalChar, out result))
 524                         {
 525                             // Success! Formed a supplementary scalar value.
 526                             charsConsumed = 2;
 527                             return OperationStatus.Done;
 528                         }
 529                     }
 530
 531                     // If we got to this point, we saw a standalone low surrogate
 532                     // and must report an error.
 533
 534                     charsConsumed = 1; // standalone surrogate
 535                     result = ReplacementChar;
 536                     return OperationStatus.InvalidData;
 537                 }
 538             }
 539
 540             // If we got this far, the source buffer was empty, or the source buffer ended
 541             // with a UTF-16 high surrogate code point. These aren't errors since they could
 542             // be valid given more input data.
 543
 544             charsConsumed = (int)((uint)(-source.Length) >> 31); // 0 -> 0, all other lengths -> 1
 545             result = ReplacementChar;
 546             return OperationStatus.NeedMoreData;
 547         }
 548
 549         /// <summary>
 550         /// Decodes the <see cref="Rune"/> at the end of the provided UTF-8 source buffer.
 551         /// </summary>
 552         /// <remarks>
 553         /// This method is very similar to <see cref="DecodeFromUtf8(ReadOnlySpan{byte}, out Rune, out int)"/>, but it allows
 554         /// the caller to loop backward instead of forward. The typical calling convention is that on each iteration
 555         /// of the loop, the caller should slice off the final <paramref name="bytesConsumed"/> elements of
 556         /// the <paramref name="source"/> buffer.
 557         /// </remarks>
 558         public static OperationStatus DecodeLastFromUtf8(ReadOnlySpan<byte> source, out Rune value, out int bytesConsumed)
 559         {
 560             int index = source.Length - 1;
 561             if ((uint)index < (uint)source.Length)
 562             {
 563                 // The buffer contains at least one byte. Let's check the fast case where the
 564                 // buffer ends with an ASCII byte.
 565
 566                 uint tempValue = source[index];
 567                 if (UnicodeUtility.IsAsciiCodePoint(tempValue))
 568                 {
 569                     bytesConsumed = 1;
 570                     value = UnsafeCreate(tempValue);
 571                     return OperationStatus.Done;
 572                 }
 573
 574                 // If the final byte is not an ASCII byte, we may be beginning or in the middle of
 575                 // a UTF-8 multi-code unit sequence. We need to back up until we see the start of
 576                 // the multi-code unit sequence; we can detect the leading byte because all multi-byte
 577                 // sequences begin with a byte whose 0x40 bit is set. Since all multi-byte sequences
 578                 // are no greater than 4 code units in length, we only need to search back a maximum
 579                 // of four bytes.
 580
 581                 if (((byte)tempValue & 0x40) != 0)
 582                 {
 583                     // This is a UTF-8 leading byte. We'll do a forward read from here.
 584                     // It'll return invalid (if given C0, F5, etc.) or incomplete. Both are fine.
 585
 586                     return DecodeFromUtf8(source.Slice(index), out value, out bytesConsumed);
 587                 }
 588
 589                 // If we got to this point, the final byte was a UTF-8 continuation byte.
 590                 // Let's check the three bytes immediately preceding this, looking for the starting byte.
 591
 592                 for (int i = 3; i > 0; i--)
 593                 {
 594                     index--;
 595                     if ((uint)index >= (uint)source.Length)
 596                     {
 597                         goto Invalid; // out of data
 598                     }
 599
 600                     // The check below will get hit for ASCII (values 00..7F) and for UTF-8 starting bytes
 601                     // (bits 0xC0 set, values C0..FF). In two's complement this is the range [-64..127].
 602                     // It's just a fast way for us to terminate the search.
 603
 604                     if ((sbyte)source[index] >= -64)
 605                     {
 606                         goto ForwardDecode;
 607                     }
 608                 }
 609
 610             Invalid:
 611
 612                 // If we got to this point, either:
 613                 // - the last 4 bytes of the input buffer are continuation bytes;
 614                 // - the entire input buffer (if fewer than 4 bytes) consists only of continuation bytes; or
 615                 // - there's no UTF-8 leading byte between the final continuation byte of the buffer and
 616                 //   the previous well-formed subsequence or maximal invalid subsequence.
 617                 //
 618                 // In all of these cases, the final byte must be a maximal invalid subsequence of length 1.
 619                 // See comment near the end of this method for more information.
 620
 621                 value = ReplacementChar;
 622                 bytesConsumed = 1;
 623                 return OperationStatus.InvalidData;
 624
 625             ForwardDecode:
 626
 627                 // If we got to this point, we found an ASCII byte or a UTF-8 starting byte at position source[index].
 628                 // Technically this could also mean we found an invalid byte like C0 or F5 at this position, but that's
 629                 // fine since it'll be handled by the forward read. From this position, we'll perform a forward read
 630                 // and see if we consumed the entirety of the buffer.
 631
 632                 source = source.Slice(index);
 633                 Debug.Assert(!source.IsEmpty, "Shouldn't reach this for empty inputs.");
 634
 635                 OperationStatus operationStatus = DecodeFromUtf8(source, out Rune tempRune, out int tempBytesConsumed);
 636                 if (tempBytesConsumed == source.Length)
 637                 {
 638                     // If this forward read consumed the entirety of the end of the input buffer, we can return it
 639                     // as the result of this function. It could be well-formed, incomplete, or invalid. If it's
 640                     // invalid and we consumed the remainder of the buffer, we know we've found the maximal invalid
 641                     // subsequence, which is what we wanted anyway.
 642
 643                     bytesConsumed = tempBytesConsumed;
 644                     value = tempRune;
 645                     return operationStatus;
 646                 }
 647
 648                 // If we got to this point, we know that the final continuation byte wasn't consumed by the forward
 649                 // read that we just performed above. This means that the continuation byte has to be part of an
 650                 // invalid subsequence since there's no UTF-8 leading byte between what we just consumed and the
 651                 // continuation byte at the end of the input. Furthermore, since any maximal invalid subsequence
 652                 // of length > 1 must have a UTF-8 leading byte as its first code unit, this implies that the
 653                 // continuation byte at the end of the buffer is itself a maximal invalid subsequence of length 1.
 654
 655                 goto Invalid;
 656             }
 657             else
 658             {
 659                 // Source buffer was empty.
 660                 value = ReplacementChar;
 661                 bytesConsumed = 0;
 662                 return OperationStatus.NeedMoreData;
 663             }
 664         }
 665
 666         /// <summary>
 667         /// Encodes this <see cref="Rune"/> to a UTF-16 destination buffer.
 668         /// </summary>
 669         /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
 670         /// <returns>The number of <see cref="char"/>s written to <paramref name="destination"/>.</returns>
 671         /// <exception cref="ArgumentException">
 672         /// If <paramref name="destination"/> is not large enough to hold the output.
 673         /// </exception>
 674         public int EncodeToUtf16(Span<char> destination)
 675         {
 676             if (!TryEncodeToUtf16(destination, out int charsWritten))
 677             {
 678                 ThrowHelper.ThrowArgumentException_DestinationTooShort();
 679             }
 680
 681             return charsWritten;
 682         }
 683
 684         /// <summary>
 685         /// Encodes this <see cref="Rune"/> to a UTF-8 destination buffer.
 686         /// </summary>
 687         /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
 688         /// <returns>The number of <see cref="byte"/>s written to <paramref name="destination"/>.</returns>
 689         /// <exception cref="ArgumentException">
 690         /// If <paramref name="destination"/> is not large enough to hold the output.
 691         /// </exception>
 692         public int EncodeToUtf8(Span<byte> destination)
 693         {
 694             if (!TryEncodeToUtf8(destination, out int bytesWritten))
 695             {
 696                 ThrowHelper.ThrowArgumentException_DestinationTooShort();
 697             }
 698
 699             return bytesWritten;
 700         }
 701
 702         public override bool Equals(object? obj) => (obj is Rune other) && this.Equals(other);
 703
 704         public bool Equals(Rune other) => (this == other);
 705
 706         public override int GetHashCode() => Value;
 707
 708         /// <summary>
 709         /// Gets the <see cref="Rune"/> which begins at index <paramref name="index"/> in
 710         /// string <paramref name="input"/>.
 711         /// </summary>
 712         /// <remarks>
 713         /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
 714         /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
 715         /// </remarks>
 716         public static Rune GetRuneAt(string input, int index)
 717         {
 718             int runeValue = ReadRuneFromString(input, index);
 719             if (runeValue < 0)
 720             {
 721                 ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
 722             }
 723
 724             return UnsafeCreate((uint)runeValue);
 725         }
 726
 727         /// <summary>
 728         /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
 729         /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
 730         /// </summary>
 731         public static bool IsValid(int value) => IsValid((uint)value);
 732
 733         /// <summary>
 734         /// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
 735         /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
 736         /// </summary>
 737         [CLSCompliant(false)]
 738         public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
 739
 740         // returns a negative number on failure
 741         internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
 742         {
 743             if (input.IsEmpty)
 744             {
 745                 return -1;
 746             }
 747
 748             // Optimistically assume input is within BMP.
 749
 750             uint returnValue = input[0];
 751             if (UnicodeUtility.IsSurrogateCodePoint(returnValue))
 752             {
 753                 if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue))
 754                 {
 755                     return -1;
 756                 }
 757
 758                 // Treat 'returnValue' as the high surrogate.
 759
 760                 if (1 >= (uint)input.Length)
 761                 {
 762                     return -1; // not an argument exception - just a "bad data" failure
 763                 }
 764
 765                 uint potentialLowSurrogate = input[1];
 766                 if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate))
 767                 {
 768                     return -1;
 769                 }
 770
 771                 returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate);
 772             }
 773
 774             return (int)returnValue;
 775         }
 776
 777         // returns a negative number on failure
 778         private static int ReadRuneFromString(string input, int index)
 779         {
 780             if (input is null)
 781             {
 782                 ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
 783             }
 784
 785             if ((uint)index >= (uint)input!.Length)
 786             {
 787                 ThrowHelper.ThrowArgumentOutOfRange_IndexException();
 788             }
 789
 790             // Optimistically assume input is within BMP.
 791
 792             uint returnValue = input[index];
 793             if (UnicodeUtility.IsSurrogateCodePoint(returnValue))
 794             {
 795                 if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue))
 796                 {
 797                     return -1;
 798                 }
 799
 800                 // Treat 'returnValue' as the high surrogate.
 801                 //
 802                 // If this becomes a hot code path, we can skip the below bounds check by reading
 803                 // off the end of the string using unsafe code. Since strings are null-terminated,
 804                 // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if
 805                 // the string terminates unexpectedly.
 806
 807                 index++;
 808                 if ((uint)index >= (uint)input.Length)
 809                 {
 810                     return -1; // not an argument exception - just a "bad data" failure
 811                 }
 812
 813                 uint potentialLowSurrogate = input[index];
 814                 if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate))
 815                 {
 816                     return -1;
 817                 }
 818
 819                 returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate);
 820             }
 821
 822             return (int)returnValue;
 823         }
 824
 825         /// <summary>
 826         /// Returns a <see cref="string"/> representation of this <see cref="Rune"/> instance.
 827         /// </summary>
 828         public override string ToString()
 829         {
 830             if (IsBmp)
 831             {
 832                 return string.CreateFromChar((char)_value);
 833             }
 834             else
 835             {
 836                 UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out char high, out char low);
 837                 return string.CreateFromChar(high, low);
 838             }
 839         }
 840
 841         /// <summary>
 842         /// Attempts to create a <see cref="Rune"/> from the provided input value.
 843         /// </summary>
 844         public static bool TryCreate(char ch, out Rune result)
 845         {
 846             uint extendedValue = ch;
 847             if (!UnicodeUtility.IsSurrogateCodePoint(extendedValue))
 848             {
 849                 result = UnsafeCreate(extendedValue);
 850                 return true;
 851             }
 852             else
 853             {
 854                 result = default;
 855                 return false;
 856             }
 857         }
 858
 859         /// <summary>
 860         /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
 861         /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16surrogate pair.
 862         /// </summary>
 863         public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
 864         {
 865             // First, extend both to 32 bits, then calculate the offset of
 866             // each candidate surrogate char from the start of its range.
 867
 868             uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
 869             uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;
 870
 871             // This is a single comparison which allows us to check both for validity at once since
 872             // both the high surrogate range and the low surrogate range are the same length.
 873             // If the comparison fails, we call to a helper method to throw the correct exception message.
 874
 875             if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
 876             {
 877                 // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
 878                 result = UnsafeCreate((highSurrogateOffset << 10) + ((uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START) + (0x40u << 10));
 879                 return true;
 880             }
 881             else
 882             {
 883                 // Didn't have a high surrogate followed by a low surrogate.
 884                 result = default;
 885                 return false;
 886             }
 887         }
 888
 889         /// <summary>
 890         /// Attempts to create a <see cref="Rune"/> from the provided input value.
 891         /// </summary>
 892         public static bool TryCreate(int value, out Rune result) => TryCreate((uint)value, out result);
 893
 894         /// <summary>
 895         /// Attempts to create a <see cref="Rune"/> from the provided input value.
 896         /// </summary>
 897         [CLSCompliant(false)]
 898         public static bool TryCreate(uint value, out Rune result)
 899         {
 900             if (UnicodeUtility.IsValidUnicodeScalar(value))
 901             {
 902                 result = UnsafeCreate(value);
 903                 return true;
 904             }
 905             else
 906             {
 907                 result = default;
 908                 return false;
 909             }
 910         }
 911
 912         /// <summary>
 913         /// Encodes this <see cref="Rune"/> to a UTF-16 destination buffer.
 914         /// </summary>
 915         /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
 916         /// <param name="charsWritten">
 917         /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
 918         /// or 0 if the destination buffer is not large enough to contain the output.</param>
 919         /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
 920         /// <remarks>
 921         /// The <see cref="Utf16SequenceLength"/> property can be queried ahead of time to determine
 922         /// the required size of the <paramref name="destination"/> buffer.
 923         /// </remarks>
 924         public bool TryEncodeToUtf16(Span<char> destination, out int charsWritten)
 925         {
 926             if (destination.Length >= 1)
 927             {
 928                 if (IsBmp)
 929                 {
 930                     destination[0] = (char)_value;
 931                     charsWritten = 1;
 932                     return true;
 933                 }
 934                 else if (destination.Length >= 2)
 935                 {
 936                     UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
 937                     charsWritten = 2;
 938                     return true;
 939                 }
 940             }
 941
 942             // Destination buffer not large enough
 943
 944             charsWritten = default;
 945             return false;
 946         }
 947
 948         /// <summary>
 949         /// Encodes this <see cref="Rune"/> to a destination buffer as UTF-8 bytes.
 950         /// </summary>
 951         /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
 952         /// <param name="bytesWritten">
 953         /// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
 954         /// or 0 if the destination buffer is not large enough to contain the output.</param>
 955         /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
 956         /// <remarks>
 957         /// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
 958         /// the required size of the <paramref name="destination"/> buffer.
 959         /// </remarks>
 960         public bool TryEncodeToUtf8(Span<byte> destination, out int bytesWritten)
 961         {
 962             // The bit patterns below come from the Unicode Standard, Table 3-6.
 963
 964             if (destination.Length >= 1)
 965             {
 966                 if (IsAscii)
 967                 {
 968                     destination[0] = (byte)_value;
 969                     bytesWritten = 1;
 970                     return true;
 971                 }
 972
 973                 if (destination.Length >= 2)
 974                 {
 975                     if (_value <= 0x7FFu)
 976                     {
 977                         // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
 978                         destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
 979                         destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
 980                         bytesWritten = 2;
 981                         return true;
 982                     }
 983
 984                     if (destination.Length >= 3)
 985                     {
 986                         if (_value <= 0xFFFFu)
 987                         {
 988                             // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
 989                             destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
 990                             destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
 991                             destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
 992                             bytesWritten = 3;
 993                             return true;
 994                         }
 995
 996                         if (destination.Length >= 4)
 997                         {
 998                             // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
 999                             destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
1000                             destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
1001                             destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
1002                             destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
1003                             bytesWritten = 4;
1004                             return true;
1005                         }
1006                     }
1007                 }
1008             }
1009
1010             // Destination buffer not large enough
1011
1012             bytesWritten = default;
1013             return false;
1014         }
1015
1016         /// <summary>
1017         /// Attempts to get the <see cref="Rune"/> which begins at index <paramref name="index"/> in
1018         /// string <paramref name="input"/>.
1019         /// </summary>
1020         /// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
1021         /// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
1022         /// <remarks>
1023         /// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
1024         /// </remarks>
1025         public static bool TryGetRuneAt(string input, int index, out Rune value)
1026         {
1027             int runeValue = ReadRuneFromString(input, index);
1028             if (runeValue >= 0)
1029             {
1030                 value = UnsafeCreate((uint)runeValue);
1031                 return true;
1032             }
1033             else
1034             {
1035                 value = default;
1036                 return false;
1037             }
1038         }
1039
1040         // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without
1041         // validation. It is the caller's responsibility to have performed manual validation
1042         // before calling this method. If a Rune instance is forcibly constructed
1043         // from invalid input, the APIs on this type have undefined behavior, potentially including
1044         // introducing a security hole in the consuming application.
1045         //
1046         // An example of a security hole resulting from an invalid Rune value, which could result
1047         // in a stack overflow.
1048         //
1049         // public int GetMarvin32HashCode(Rune r) {
1050         //   Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
1051         //   r.TryEncode(buffer, ...);
1052         //   return Marvin32.ComputeHash(buffer.AsBytes());
1053         // }
1054
1055         /// <summary>
1056         /// Creates a <see cref="Rune"/> without performing validation on the input.
1057         /// </summary>
1058         [MethodImpl(MethodImplOptions.AggressiveInlining)]
1059         internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false);
1060
1061         // These are analogs of APIs on System.Char
1062
1063         public static double GetNumericValue(Rune value)
1064         {
1065             if (value.IsAscii)
1066             {
1067                 uint baseNum = value._value - '0';
1068                 return (baseNum <= 9) ? (double)baseNum : -1;
1069             }
1070             else
1071             {
1072                 // not an ASCII char; fall back to globalization table
1073                 return CharUnicodeInfo.InternalGetNumericValue(value.Value);
1074             }
1075         }
1076
1077         public static UnicodeCategory GetUnicodeCategory(Rune value)
1078         {
1079             if (value.IsAscii)
1080             {
1081                 return (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask);
1082             }
1083             else
1084             {
1085                 return GetUnicodeCategoryNonAscii(value);
1086             }
1087         }
1088
1089         private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
1090         {
1091             Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");
1092             return CharUnicodeInfo.GetUnicodeCategory(value.Value);
1093         }
1094
1095         // Returns true iff this Unicode category represents a letter
1096         private static bool IsCategoryLetter(UnicodeCategory category)
1097         {
1098             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter);
1099         }
1100
1101         // Returns true iff this Unicode category represents a letter or a decimal digit
1102         private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
1103         {
1104             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.UppercaseLetter, (uint)UnicodeCategory.OtherLetter)
1105                 || (category == UnicodeCategory.DecimalDigitNumber);
1106         }
1107
1108         // Returns true iff this Unicode category represents a number
1109         private static bool IsCategoryNumber(UnicodeCategory category)
1110         {
1111             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.DecimalDigitNumber, (uint)UnicodeCategory.OtherNumber);
1112         }
1113
1114         // Returns true iff this Unicode category represents a punctuation mark
1115         private static bool IsCategoryPunctuation(UnicodeCategory category)
1116         {
1117             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.ConnectorPunctuation, (uint)UnicodeCategory.OtherPunctuation);
1118         }
1119
1120         // Returns true iff this Unicode category represents a separator
1121         private static bool IsCategorySeparator(UnicodeCategory category)
1122         {
1123             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.SpaceSeparator, (uint)UnicodeCategory.ParagraphSeparator);
1124         }
1125
1126         // Returns true iff this Unicode category represents a symbol
1127         private static bool IsCategorySymbol(UnicodeCategory category)
1128         {
1129             return UnicodeUtility.IsInRangeInclusive((uint)category, (uint)UnicodeCategory.MathSymbol, (uint)UnicodeCategory.OtherSymbol);
1130         }
1131
1132         public static bool IsControl(Rune value)
1133         {
1134             // Per the Unicode stability policy, the set of control characters
1135             // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
1136             // characters will ever be added to the "control characters" group.
1137             // See http://www.unicode.org/policies/stability_policy.html.
1138
1139             // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
1140             // 00..1F (+1) => 01..20 (&~80) => 01..20
1141             // 7F..9F (+1) => 80..A0 (&~80) => 00..20
1142
1143             return (((value._value + 1) & ~0x80u) <= 0x20u);
1144         }
1145
1146         public static bool IsDigit(Rune value)
1147         {
1148             if (value.IsAscii)
1149             {
1150                 return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9');
1151             }
1152             else
1153             {
1154                 return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber;
1155             }
1156         }
1157
1158         public static bool IsLetter(Rune value)
1159         {
1160             if (value.IsAscii)
1161             {
1162                 return (((value._value - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z]
1163             }
1164             else
1165             {
1166                 return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
1167             }
1168         }
1169
1170         public static bool IsLetterOrDigit(Rune value)
1171         {
1172             if (value.IsAscii)
1173             {
1174                 return ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0);
1175             }
1176             else
1177             {
1178                 return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
1179             }
1180         }
1181
1182         public static bool IsLower(Rune value)
1183         {
1184             if (value.IsAscii)
1185             {
1186                 return UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z');
1187             }
1188             else
1189             {
1190                 return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter;
1191             }
1192         }
1193
1194         public static bool IsNumber(Rune value)
1195         {
1196             if (value.IsAscii)
1197             {
1198                 return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9');
1199             }
1200             else
1201             {
1202                 return IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
1203             }
1204         }
1205
1206         public static bool IsPunctuation(Rune value)
1207         {
1208             return IsCategoryPunctuation(GetUnicodeCategory(value));
1209         }
1210
1211         public static bool IsSeparator(Rune value)
1212         {
1213             return IsCategorySeparator(GetUnicodeCategory(value));
1214         }
1215
1216         public static bool IsSymbol(Rune value)
1217         {
1218             return IsCategorySymbol(GetUnicodeCategory(value));
1219         }
1220
1221         public static bool IsUpper(Rune value)
1222         {
1223             if (value.IsAscii)
1224             {
1225                 return UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z');
1226             }
1227             else
1228             {
1229                 return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter;
1230             }
1231         }
1232
1233         public static bool IsWhiteSpace(Rune value)
1234         {
1235             if (value.IsAscii)
1236             {
1237                 return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
1238             }
1239
1240             // U+0085 is special since it's a whitespace character but is in the Control category
1241             // instead of a normal separator category. No other code point outside the ASCII range
1242             // has this mismatch.
1243
1244             if (value._value == 0x0085u)
1245             {
1246                 return true;
1247             }
1248
1249             return IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
1250         }
1251
1252         public static Rune ToLower(Rune value, CultureInfo culture)
1253         {
1254             if (culture is null)
1255             {
1256                 ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
1257             }
1258
1259             // We don't want to special-case ASCII here since the specified culture might handle
1260             // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
1261             // we'll just jump straight to the globalization tables if they're available.
1262
1263             if (GlobalizationMode.Invariant)
1264             {
1265                 return ToLowerInvariant(value);
1266             }
1267
1268             return ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: false);
1269         }
1270
1271         public static Rune ToLowerInvariant(Rune value)
1272         {
1273             // Handle the most common case (ASCII data) first. Within the common case, we expect
1274             // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
1275
1276             if (value.IsAscii)
1277             {
1278                 // It's ok for us to use the UTF-16 conversion utility for this since the high
1279                 // 16 bits of the value will never be set so will be left unchanged.
1280                 return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
1281             }
1282
1283             if (GlobalizationMode.Invariant)
1284             {
1285                 // If the value isn't ASCII and if the globalization tables aren't available,
1286                 // case changing has no effect.
1287                 return value;
1288             }
1289
1290             // Non-ASCII data requires going through the case folding tables.
1291
1292             return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
1293         }
1294
1295         public static Rune ToUpper(Rune value, CultureInfo culture)
1296         {
1297             if (culture is null)
1298             {
1299                 ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
1300             }
1301
1302             // We don't want to special-case ASCII here since the specified culture might handle
1303             // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
1304             // we'll just jump straight to the globalization tables if they're available.
1305
1306             if (GlobalizationMode.Invariant)
1307             {
1308                 return ToUpperInvariant(value);
1309             }
1310
1311             return ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: true);
1312         }
1313
1314         public static Rune ToUpperInvariant(Rune value)
1315         {
1316             // Handle the most common case (ASCII data) first. Within the common case, we expect
1317             // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.
1318
1319             if (value.IsAscii)
1320             {
1321                 // It's ok for us to use the UTF-16 conversion utility for this since the high
1322                 // 16 bits of the value will never be set so will be left unchanged.
1323                 return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
1324             }
1325
1326             if (GlobalizationMode.Invariant)
1327             {
1328                 // If the value isn't ASCII and if the globalization tables aren't available,
1329                 // case changing has no effect.
1330                 return value;
1331             }
1332
1333             // Non-ASCII data requires going through the case folding tables.
1334
1335             return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
1336         }
1337     }
1338 }