Fix StyleCop warning SA1121 (use built-in types)
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / Rune.cs
blobcafc68bf578d7d834645866e64afe7d8e9630e3d
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System.Buffers;
6 using System.Diagnostics;
7 using System.Globalization;
8 using System.Runtime.CompilerServices;
9 using System.Text.Unicode;
11 namespace System.Text
13 /// <summary>
14 /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
15 /// </summary>
16 /// <remarks>
17 /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
18 /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
19 /// </remarks>
20 [DebuggerDisplay("{DebuggerDisplay,nq}")]
21 public readonly struct Rune : IComparable<Rune>, IEquatable<Rune>
// Flags and mask used to decode a single AsciiCharInfo entry.
23 private const byte IsWhiteSpaceFlag = 0x80; // set when the character is whitespace
24 private const byte IsLetterOrDigitFlag = 0x40; // set when the character is a letter or digit
25 private const byte UnicodeCategoryMask = 0x1F; // isolates the UnicodeCategory held in the low 5 bits
27 // Contains information about the ASCII character range [ U+0000..U+007F ], with:
28 // - 0x80 bit if set means 'is whitespace'
29 // - 0x40 bit if set means 'is letter or digit'
30 // - 0x20 bit is reserved for future use
31 // - bottom 5 bits are the UnicodeCategory of the character
// The table has 128 entries (8 rows of 16), laid out in code point order.
32 private static ReadOnlySpan<byte> AsciiCharInfo => new byte[]
34 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E,
35 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E,
36 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18,
37 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18,
38 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
39 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12,
40 0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
41 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E
44 private readonly uint _value;
/// <summary>
/// Initializes a <see cref="Rune"/> from a single UTF-16 code unit.
/// </summary>
/// <param name="ch">The UTF-16 code unit to wrap; it must not be a surrogate.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="ch"/> is a UTF-16 surrogate code point (U+D800..U+DFFF, inclusive).
/// </exception>
public Rune(char ch)
{
    uint codeUnit = ch;

    // A lone surrogate is not a Unicode scalar value, so it cannot be stored in a Rune.
    if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
    {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.ch);
    }

    _value = codeUnit;
}
/// <summary>
/// Initializes a <see cref="Rune"/> from a UTF-16 surrogate pair.
/// </summary>
/// <param name="highSurrogate">The leading (high) surrogate of the pair.</param>
/// <param name="lowSurrogate">The trailing (low) surrogate of the pair.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="highSurrogate"/> is not a UTF-16 high surrogate code point,
/// or <paramref name="lowSurrogate"/> is not a UTF-16 low surrogate code point.
/// </exception>
public Rune(char highSurrogate, char lowSurrogate)
    : this(scalarValue: (uint)char.ConvertToUtf32(highSurrogate, lowSurrogate), unused: false)
{
    // char.ConvertToUtf32 validates the pair, so the non-validating ctor is used for the result.
}
75 /// <summary>
76 /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
77 /// </summary>
78 /// <exception cref="ArgumentOutOfRangeException">
79 /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
80 /// </exception>
// Delegates to the uint overload, which performs the validation; the cast reinterprets
// a negative input as a large unsigned value.
81 public Rune(int value)
82 : this((uint)value)
86 /// <summary>
87 /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
88 /// </summary>
89 /// <exception cref="ArgumentOutOfRangeException">
90 /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
91 /// </exception>
92 [CLSCompliant(false)]
93 public Rune(uint value)
95 if (!UnicodeUtility.IsValidUnicodeScalar(value))
97 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value);
99 _value = value;
102 // Non-validating ctor: the scalar value is only checked through UnicodeDebug.AssertIsValidScalar.
// The 'unused' parameter is never read; it distinguishes this overload from the public Rune(uint) ctor.
103 private Rune(uint scalarValue, bool unused)
105 UnicodeDebug.AssertIsValidScalar(scalarValue);
106 _value = scalarValue;
// Equality and relational operators compare the underlying scalar values.

public static bool operator ==(Rune left, Rune right) => left._value == right._value;

public static bool operator !=(Rune left, Rune right) => left._value != right._value;

public static bool operator <(Rune left, Rune right) => left._value < right._value;

public static bool operator <=(Rune left, Rune right) => left._value <= right._value;

public static bool operator >(Rune left, Rune right) => left._value > right._value;

public static bool operator >=(Rune left, Rune right) => left._value >= right._value;

// The conversions below are explicit because the constructors they invoke may throw.

public static explicit operator Rune(char ch) => new Rune(ch);

[CLSCompliant(false)]
public static explicit operator Rune(uint value) => new Rune(value);

public static explicit operator Rune(int value) => new Rune(value);
130 // Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'". U+FFFD is shown in place of
// the character when the stored value is not a valid scalar value.
131 private string DebuggerDisplay => FormattableString.Invariant($"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
133 /// <summary>
134 /// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
135 /// and therefore representable by a single UTF-8 code unit.
136 /// </summary>
137 public bool IsAscii => UnicodeUtility.IsAsciiCodePoint(_value);
139 /// <summary>
140 /// Returns true if and only if this scalar value is within the BMP ([ U+0000..U+FFFF ])
141 /// and therefore representable by a single UTF-16 code unit.
142 /// </summary>
143 public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value);
145 /// <summary>
146 /// Returns the Unicode plane (0 to 16, inclusive) which contains this scalar.
147 /// </summary>
148 public int Plane => UnicodeUtility.GetPlane(_value);
150 /// <summary>
151 /// A <see cref="Rune"/> instance that represents the Unicode replacement character U+FFFD.
152 /// </summary>
/// <remarks>
/// The decoding methods of this type return this value whenever they report
/// <see cref="System.Buffers.OperationStatus.InvalidData"/> or <see cref="System.Buffers.OperationStatus.NeedMoreData"/>.
/// </remarks>
153 public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar);
155 /// <summary>
156 /// Returns the length in code units (<see cref="char"/>) of the
157 /// UTF-16 sequence required to represent this scalar value.
158 /// </summary>
159 /// <remarks>
160 /// The return value will be 1 or 2.
161 /// </remarks>
162 public int Utf16SequenceLength => UnicodeUtility.GetUtf16SequenceLength(_value);
164 /// <summary>
165 /// Returns the length in code units of the
166 /// UTF-8 sequence required to represent this scalar value.
167 /// </summary>
168 /// <remarks>
169 /// The return value will be 1 through 4, inclusive.
170 /// </remarks>
171 public int Utf8SequenceLength => UnicodeUtility.GetUtf8SequenceLength(_value);
173 /// <summary>
174 /// Returns the Unicode scalar value as an integer.
175 /// </summary>
/// <remarks>
/// The value is stored as an unsigned 32-bit integer and is cast to <see cref="int"/> here.
/// </remarks>
176 public int Value => (int)_value;
/// <summary>
/// Converts <paramref name="rune"/> to upper or lower case using the casing rules of
/// <paramref name="textInfo"/>.
/// </summary>
/// <param name="rune">The scalar value to convert.</param>
/// <param name="textInfo">The casing rules to apply; callers must pass a non-null instance.</param>
/// <param name="toUpper"><see langword="true"/> to upper-case, <see langword="false"/> to lower-case.</param>
/// <returns>The case-converted scalar value.</returns>
private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool toUpper)
{
    Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
    Debug.Assert(textInfo != null, "This should've been checked by the caller.");

    // Two code units is the worst case (a surrogate pair). The case change is expected to
    // preserve the UTF-16 code unit count, so the output buffer has the same size.
    Span<char> inputBuffer = stackalloc char[2];
    Span<char> outputBuffer = stackalloc char[2];

    int codeUnitCount = rune.EncodeToUtf16(inputBuffer);
    Span<char> inputChars = inputBuffer.Slice(0, codeUnitCount);
    Span<char> outputChars = outputBuffer.Slice(0, codeUnitCount);

    if (toUpper)
    {
        textInfo.ChangeCaseToUpper(inputChars, outputChars);
    }
    else
    {
        textInfo.ChangeCaseToLower(inputChars, outputChars);
    }

    // Simple case folding rules are used, which disallow moving between the BMP and the
    // supplementary planes during a case conversion. The helpers that rebuild the Rune
    // contain debug asserts for this condition.
    return rune.IsBmp
        ? UnsafeCreate(outputChars[0])
        : UnsafeCreate(UnicodeUtility.GetScalarFromUtf16SurrogatePair(outputChars[0], outputChars[1]));
}
/// <summary>
/// Compares this instance to <paramref name="other"/> by their underlying scalar values.
/// </summary>
213 public int CompareTo(Rune other) => this._value.CompareTo(other._value);
/// <summary>
/// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-16 source buffer.
/// </summary>
/// <returns>
/// <para>
/// <see cref="OperationStatus.Done"/> if the buffer begins with a valid UTF-16 encoded scalar value.
/// <paramref name="result"/> receives the decoded <see cref="Rune"/> and <paramref name="charsConsumed"/>
/// the number of <see langword="char"/>s (1 or 2) that encode it.
/// </para>
/// <para>
/// <see cref="OperationStatus.NeedMoreData"/> if the buffer is empty or consists solely of a standalone
/// UTF-16 high surrogate. <paramref name="result"/> receives <see cref="ReplacementChar"/> and
/// <paramref name="charsConsumed"/> the length of the input buffer.
/// </para>
/// <para>
/// <see cref="OperationStatus.InvalidData"/> if the buffer begins with an ill-formed UTF-16 sequence.
/// <paramref name="result"/> receives <see cref="ReplacementChar"/> and <paramref name="charsConsumed"/>
/// the number of <see langword="char"/>s that make up the ill-formed sequence (always 1).
/// </para>
/// </returns>
/// <remarks>
/// The usual pattern is to call this method in a loop, slicing <paramref name="source"/> by
/// <paramref name="charsConsumed"/> elements after each call. <paramref name="result"/> then holds either
/// the real scalar value or <see cref="ReplacementChar"/>, which gives automatic U+FFFD substitution of
/// invalid sequences while iterating.
/// </remarks>
public static OperationStatus DecodeFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
{
    if (source.IsEmpty)
    {
        // Nothing to decode yet.
        charsConsumed = source.Length;
        result = ReplacementChar;
        return OperationStatus.NeedMoreData;
    }

    // Common case: the first code unit is a BMP scalar value on its own.
    char leadingChar = source[0];
    if (TryCreate(leadingChar, out result))
    {
        charsConsumed = 1;
        return OperationStatus.Done;
    }

    // The first code unit is a surrogate.
    if (source.Length >= 2)
    {
        // Optimistically treat it as a high surrogate and try to pair it with the next code unit.
        if (TryCreate(leadingChar, source[1], out result))
        {
            charsConsumed = 2;
            return OperationStatus.Done;
        }

        // Otherwise the first char was a low surrogate, or the second char was not a
        // low surrogate; fall through to the error path.
    }
    else if (char.IsHighSurrogate(leadingChar))
    {
        // A single high surrogate could still become valid once more data arrives.
        charsConsumed = source.Length;
        result = ReplacementChar;
        return OperationStatus.NeedMoreData;
    }

    // A standalone low surrogate can never become valid, and an unpaired surrogate is an error.
    // The maximal invalid subsequence for UTF-16 is always a single code unit in length.
    charsConsumed = 1;
    result = ReplacementChar;
    return OperationStatus.InvalidData;
}
299 /// <summary>
300 /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-8 source buffer.
301 /// </summary>
302 /// <returns>
303 /// <para>
304 /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
305 /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="bytesConsumed"/> the
306 /// number of <see langword="byte"/>s used in the input buffer to encode the <see cref="Rune"/>.
307 /// </para>
308 /// <para>
309 /// If the source buffer is empty or contains only an incomplete (but so far well-formed) UTF-8 sequence, returns <see cref="OperationStatus.NeedMoreData"/>,
310 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the length of the input buffer.
311 /// </para>
312 /// <para>
313 /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
314 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the number of
315 /// <see langword="byte"/>s used in the input buffer to encode the ill-formed sequence.
316 /// </para>
317 /// </returns>
318 /// <remarks>
319 /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
320 /// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
321 /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
322 /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
323 /// invalid sequences while iterating through the loop.
324 /// </remarks>
325 public static OperationStatus DecodeFromUtf8(ReadOnlySpan<byte> source, out Rune result, out int bytesConsumed)
327 // This method follows the Unicode Standard's recommendation for detecting
328 // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
329 // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
330 // it tries to consume as many code units as possible as long as those code
331 // units constitute the beginning of a longer well-formed subsequence per Table 3-7.
333 int index = 0;
335 // Try reading input[0].
337 if ((uint)index >= (uint)source.Length)
339 goto NeedsMoreData;
342 uint tempValue = source[index];
343 if (!UnicodeUtility.IsAsciiCodePoint(tempValue))
345 goto NotAscii;
348 Finish:
// Shared success exit: 'index' is the position of the last byte of the sequence and
// 'tempValue' holds the fully decoded scalar value.
350 bytesConsumed = index + 1;
351 Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4]
352 result = UnsafeCreate(tempValue);
353 return OperationStatus.Done;
355 NotAscii:
357 // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
358 // the range [C2..F4]. If it's outside of that range, it's either a standalone
359 // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
360 // four-byte sequence.
362 if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4))
364 goto FirstByteInvalid;
367 tempValue = (tempValue - 0xC2) << 6;
369 // Try reading input[1].
371 index++;
372 if ((uint)index >= (uint)source.Length)
374 goto NeedsMoreData;
377 // Continuation bytes are of the form [10xxxxxx], which means that their two's
378 // complement representation is in the range [-65..-128]. This allows us to
379 // perform a single comparison to see if a byte is a continuation byte.
381 int thisByteSignExtended = (sbyte)source[index];
382 if (thisByteSignExtended >= -64)
384 goto Invalid;
387 tempValue += (uint)thisByteSignExtended;
388 tempValue += 0x80; // remove the continuation byte marker
389 tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker
391 if (tempValue < 0x0800)
393 Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF));
394 goto Finish; // this is a valid 2-byte sequence
397 // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
398 // enough information (from just two code units) to detect overlong or surrogate
399 // sequences, we need to perform these checks now.
401 if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
403 // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
404 // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
405 goto Invalid;
408 if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
410 // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
411 goto Invalid;
414 if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
416 // This is an overlong 4-byte sequence.
417 goto Invalid;
420 // The first two bytes were just fine. We don't need to perform any other checks
421 // on the remaining bytes other than to see that they're valid continuation bytes.
423 // Try reading input[2].
425 index++;
426 if ((uint)index >= (uint)source.Length)
428 goto NeedsMoreData;
431 thisByteSignExtended = (sbyte)source[index];
432 if (thisByteSignExtended >= -64)
434 goto Invalid; // this byte is not a UTF-8 continuation byte
437 tempValue <<= 6;
438 tempValue += (uint)thisByteSignExtended;
439 tempValue += 0x80; // remove the continuation byte marker
440 tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker
442 if (tempValue <= 0xFFFF)
444 Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF));
445 goto Finish; // this is a valid 3-byte sequence
448 // Try reading input[3].
450 index++;
451 if ((uint)index >= (uint)source.Length)
453 goto NeedsMoreData;
456 thisByteSignExtended = (sbyte)source[index];
457 if (thisByteSignExtended >= -64)
459 goto Invalid; // this byte is not a UTF-8 continuation byte
462 tempValue <<= 6;
463 tempValue += (uint)thisByteSignExtended;
464 tempValue += 0x80; // remove the continuation byte marker
465 tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker
467 UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue);
468 goto Finish; // this is a valid 4-byte sequence
470 FirstByteInvalid:
472 index = 1; // Invalid subsequences are always at least length 1.
474 Invalid:
// Shared error exit: 'index' is the number of bytes that form the maximal invalid subsequence.
476 Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3
477 bytesConsumed = index;
478 result = ReplacementChar;
479 return OperationStatus.InvalidData;
481 NeedsMoreData:
// Shared incomplete exit: every byte seen so far was acceptable, but the buffer ended early.
483 Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3
484 bytesConsumed = index;
485 result = ReplacementChar;
486 return OperationStatus.NeedMoreData;
/// <summary>
/// Decodes the <see cref="Rune"/> at the end of the provided UTF-16 source buffer.
/// </summary>
/// <remarks>
/// This is the backward-reading counterpart of
/// <see cref="DecodeFromUtf16(ReadOnlySpan{char}, out Rune, out int)"/>. Callers typically loop,
/// slicing the final <paramref name="charsConsumed"/> elements off the <paramref name="source"/>
/// buffer after each call.
/// </remarks>
public static OperationStatus DecodeLastFromUtf16(ReadOnlySpan<char> source, out Rune result, out int charsConsumed)
{
    if (source.IsEmpty)
    {
        // An empty buffer is not an error; it could be valid given more input data.
        charsConsumed = 0;
        result = ReplacementChar;
        return OperationStatus.NeedMoreData;
    }

    // Common case: the last code unit is a BMP scalar value on its own.
    int lastIndex = source.Length - 1;
    char lastChar = source[lastIndex];
    if (TryCreate(lastChar, out result))
    {
        charsConsumed = 1;
        return OperationStatus.Done;
    }

    if (!char.IsLowSurrogate(lastChar))
    {
        // The buffer ends with a high surrogate, which could be valid given more input data.
        charsConsumed = 1;
        result = ReplacementChar;
        return OperationStatus.NeedMoreData;
    }

    // The last code unit is a low surrogate, so it must be preceded by a high surrogate.
    if (lastIndex >= 1 && TryCreate(source[lastIndex - 1], lastChar, out result))
    {
        // Success! Formed a supplementary scalar value.
        charsConsumed = 2;
        return OperationStatus.Done;
    }

    // A standalone low surrogate is always invalid.
    charsConsumed = 1;
    result = ReplacementChar;
    return OperationStatus.InvalidData;
}
549 /// <summary>
550 /// Decodes the <see cref="Rune"/> at the end of the provided UTF-8 source buffer.
551 /// </summary>
/// <returns>
/// <see cref="OperationStatus.Done"/> if the buffer ends with a well-formed UTF-8 sequence;
/// <see cref="OperationStatus.NeedMoreData"/> if the buffer is empty (with <paramref name="bytesConsumed"/> set to 0)
/// or ends with an incomplete sequence; otherwise <see cref="OperationStatus.InvalidData"/>.
/// In the latter two cases <paramref name="value"/> is set to <see cref="ReplacementChar"/>.
/// </returns>
552 /// <remarks>
553 /// This method is very similar to <see cref="DecodeFromUtf8(ReadOnlySpan{byte}, out Rune, out int)"/>, but it allows
554 /// the caller to loop backward instead of forward. The typical calling convention is that on each iteration
555 /// of the loop, the caller should slice off the final <paramref name="bytesConsumed"/> elements of
556 /// the <paramref name="source"/> buffer.
557 /// </remarks>
558 public static OperationStatus DecodeLastFromUtf8(ReadOnlySpan<byte> source, out Rune value, out int bytesConsumed)
560 int index = source.Length - 1;
561 if ((uint)index < (uint)source.Length)
563 // The buffer contains at least one byte. Let's check the fast case where the
564 // buffer ends with an ASCII byte.
566 uint tempValue = source[index];
567 if (UnicodeUtility.IsAsciiCodePoint(tempValue))
569 bytesConsumed = 1;
570 value = UnsafeCreate(tempValue);
571 return OperationStatus.Done;
574 // If the final byte is not an ASCII byte, we may be beginning or in the middle of
575 // a UTF-8 multi-code unit sequence. We need to back up until we see the start of
576 // the multi-code unit sequence; we can detect the leading byte because all multi-byte
577 // sequences begin with a byte whose 0x40 bit is set. Since all multi-byte sequences
578 // are no greater than 4 code units in length, we only need to search back a maximum
579 // of four bytes.
581 if (((byte)tempValue & 0x40) != 0)
583 // This is a UTF-8 leading byte. We'll do a forward read from here.
584 // It'll return invalid (if given C0, F5, etc.) or incomplete. Both are fine.
586 return DecodeFromUtf8(source.Slice(index), out value, out bytesConsumed);
589 // If we got to this point, the final byte was a UTF-8 continuation byte.
590 // Let's check the three bytes immediately preceding this, looking for the starting byte.
592 for (int i = 3; i > 0; i--)
594 index--;
595 if ((uint)index >= (uint)source.Length)
597 goto Invalid; // out of data
600 // The check below will get hit for ASCII (values 00..7F) and for UTF-8 starting bytes
601 // (bits 0xC0 set, values C0..FF). In two's complement this is the range [-64..127].
602 // It's just a fast way for us to terminate the search.
604 if ((sbyte)source[index] >= -64)
606 goto ForwardDecode;
610 Invalid:
612 // If we got to this point, either:
613 // - the last 4 bytes of the input buffer are continuation bytes;
614 // - the entire input buffer (if fewer than 4 bytes) consists only of continuation bytes; or
615 // - there's no UTF-8 leading byte between the final continuation byte of the buffer and
616 // the previous well-formed subsequence or maximal invalid subsequence.
618 // In all of these cases, the final byte must be a maximal invalid subsequence of length 1.
619 // See comment near the end of this method for more information.
621 value = ReplacementChar;
622 bytesConsumed = 1;
623 return OperationStatus.InvalidData;
625 ForwardDecode:
627 // If we got to this point, we found an ASCII byte or a UTF-8 starting byte at position source[index].
628 // Technically this could also mean we found an invalid byte like C0 or F5 at this position, but that's
629 // fine since it'll be handled by the forward read. From this position, we'll perform a forward read
630 // and see if we consumed the entirety of the buffer.
632 source = source.Slice(index);
633 Debug.Assert(!source.IsEmpty, "Shouldn't reach this for empty inputs.");
635 OperationStatus operationStatus = DecodeFromUtf8(source, out Rune tempRune, out int tempBytesConsumed);
636 if (tempBytesConsumed == source.Length)
638 // If this forward read consumed the entirety of the end of the input buffer, we can return it
639 // as the result of this function. It could be well-formed, incomplete, or invalid. If it's
640 // invalid and we consumed the remainder of the buffer, we know we've found the maximal invalid
641 // subsequence, which is what we wanted anyway.
643 bytesConsumed = tempBytesConsumed;
644 value = tempRune;
645 return operationStatus;
648 // If we got to this point, we know that the final continuation byte wasn't consumed by the forward
649 // read that we just performed above. This means that the continuation byte has to be part of an
650 // invalid subsequence since there's no UTF-8 leading byte between what we just consumed and the
651 // continuation byte at the end of the input. Furthermore, since any maximal invalid subsequence
652 // of length > 1 must have a UTF-8 leading byte as its first code unit, this implies that the
653 // continuation byte at the end of the buffer is itself a maximal invalid subsequence of length 1.
655 goto Invalid;
657 else
659 // Source buffer was empty.
660 value = ReplacementChar;
661 bytesConsumed = 0;
662 return OperationStatus.NeedMoreData;
/// <summary>
/// Writes this <see cref="Rune"/> to <paramref name="destination"/> as UTF-16.
/// </summary>
/// <param name="destination">The buffer that receives the UTF-16 code units.</param>
/// <returns>The number of <see cref="char"/>s written to <paramref name="destination"/>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="destination"/> is too small to hold the output.
/// </exception>
public int EncodeToUtf16(Span<char> destination)
{
    bool encoded = TryEncodeToUtf16(destination, out int written);
    if (!encoded)
    {
        ThrowHelper.ThrowArgumentException_DestinationTooShort();
    }

    return written;
}
/// <summary>
/// Writes this <see cref="Rune"/> to <paramref name="destination"/> as UTF-8.
/// </summary>
/// <param name="destination">The buffer that receives the UTF-8 code units.</param>
/// <returns>The number of <see cref="byte"/>s written to <paramref name="destination"/>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="destination"/> is too small to hold the output.
/// </exception>
public int EncodeToUtf8(Span<byte> destination)
{
    bool encoded = TryEncodeToUtf8(destination, out int written);
    if (!encoded)
    {
        ThrowHelper.ThrowArgumentException_DestinationTooShort();
    }

    return written;
}
/// <summary>
/// Returns <see langword="true"/> when <paramref name="obj"/> is a <see cref="Rune"/> equal to this instance.
/// </summary>
public override bool Equals(object? obj) => obj is Rune other && Equals(other);

/// <summary>
/// Returns <see langword="true"/> when <paramref name="other"/> compares equal to this instance
/// under the <c>==</c> operator.
/// </summary>
public bool Equals(Rune other) => this == other;

/// <summary>
/// Returns <see cref="Value"/> as the hash code.
/// </summary>
public override int GetHashCode() => Value;
/// <summary>
/// Gets the <see cref="Rune"/> that begins at position <paramref name="index"/> of
/// string <paramref name="input"/>.
/// </summary>
/// <remarks>
/// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
/// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
/// </remarks>
public static Rune GetRuneAt(string input, int index)
{
    // The helper validates the arguments and reports malformed data with a negative result.
    int scalarOrError = ReadRuneFromString(input, index);
    if (scalarOrError < 0)
    {
        ThrowHelper.ThrowArgumentException_CannotExtractScalar(ExceptionArgument.index);
    }

    return UnsafeCreate((uint)scalarOrError);
}
727 /// <summary>
728 /// Returns <see langword="true"/> if and only if <paramref name="value"/> is a valid Unicode scalar
729 /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
730 /// </summary>
// Forwards to the uint overload after reinterpreting the input as unsigned.
731 public static bool IsValid(int value) => IsValid((uint)value);
733 /// <summary>
734 /// Returns <see langword="true"/> if and only if <paramref name="value"/> is a valid Unicode scalar
735 /// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
736 /// </summary>
737 [CLSCompliant(false)]
738 public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value);
// Reads the scalar value encoded at the start of 'input'.
// Returns a negative number on failure (empty input, standalone or mismatched surrogate).
internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan<char> input)
{
    if (input.IsEmpty)
    {
        return -1;
    }

    // Optimistically assume the first code unit is a complete BMP scalar value.
    uint leading = input[0];
    if (!UnicodeUtility.IsSurrogateCodePoint(leading))
    {
        return (int)leading;
    }

    // A surrogate must be a high surrogate that is followed by another code unit.
    // A missing second code unit is not an argument error - just a "bad data" failure.
    if (!UnicodeUtility.IsHighSurrogateCodePoint(leading) || input.Length < 2)
    {
        return -1;
    }

    uint trailing = input[1];
    if (!UnicodeUtility.IsLowSurrogateCodePoint(trailing))
    {
        return -1;
    }

    return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(leading, trailing);
}
// Reads the scalar value that starts at input[index].
// Throws for a null string or an out-of-range index; returns a negative number when the
// data at that position does not form a valid scalar value.
private static int ReadRuneFromString(string input, int index)
{
    if (input is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.input);
    }

    if ((uint)index >= (uint)input!.Length)
    {
        ThrowHelper.ThrowArgumentOutOfRange_IndexException();
    }

    // Optimistically assume the code unit is a complete BMP scalar value.
    uint leading = input[index];
    if (!UnicodeUtility.IsSurrogateCodePoint(leading))
    {
        return (int)leading;
    }

    if (!UnicodeUtility.IsHighSurrogateCodePoint(leading))
    {
        return -1;
    }

    // 'leading' is a high surrogate, so a low surrogate must follow it.
    // If this becomes a hot code path, the bounds check below could be skipped by reading
    // off the end of the string using unsafe code: strings are null-terminated, so a valid
    // low surrogate would never be read and the method would still fail correctly.
    int trailingIndex = index + 1;
    if ((uint)trailingIndex >= (uint)input.Length)
    {
        return -1; // not an argument exception - just a "bad data" failure
    }

    uint trailing = input[trailingIndex];
    if (!UnicodeUtility.IsLowSurrogateCodePoint(trailing))
    {
        return -1;
    }

    return (int)UnicodeUtility.GetScalarFromUtf16SurrogatePair(leading, trailing);
}
/// <summary>
/// Returns the <see cref="string"/> form of this <see cref="Rune"/>: one <see cref="char"/> for a
/// BMP value, or a surrogate pair otherwise.
/// </summary>
public override string ToString()
{
    if (!IsBmp)
    {
        // Supplementary-plane values are split into their two UTF-16 surrogates.
        UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out char highSurrogate, out char lowSurrogate);
        return string.CreateFromChar(highSurrogate, lowSurrogate);
    }

    return string.CreateFromChar((char)_value);
}
/// <summary>
/// Attempts to create a <see cref="Rune"/> from a single UTF-16 code unit.
/// </summary>
/// <param name="ch">The UTF-16 code unit to convert.</param>
/// <param name="result">The created <see cref="Rune"/> on success; otherwise the default value.</param>
/// <returns>
/// <see langword="true"/> if <paramref name="ch"/> is not a surrogate code point; otherwise <see langword="false"/>.
/// </returns>
public static bool TryCreate(char ch, out Rune result)
{
    uint codeUnit = ch;
    if (UnicodeUtility.IsSurrogateCodePoint(codeUnit))
    {
        result = default;
        return false;
    }

    result = UnsafeCreate(codeUnit);
    return true;
}
/// <summary>
/// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
/// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
/// </summary>
/// <param name="highSurrogate">The candidate high (leading) surrogate.</param>
/// <param name="lowSurrogate">The candidate low (trailing) surrogate.</param>
/// <param name="result">The created <see cref="Rune"/> on success; otherwise the default value.</param>
/// <returns><see langword="true"/> if the pair is well-formed; otherwise <see langword="false"/>.</returns>
public static bool TryCreate(char highSurrogate, char lowSurrogate, out Rune result)
{
    // First, extend both to 32 bits, then calculate the offset of
    // each candidate surrogate char from the start of its range.
    uint highSurrogateOffset = (uint)highSurrogate - CharUnicodeInfo.HIGH_SURROGATE_START;
    uint lowSurrogateOffset = (uint)lowSurrogate - CharUnicodeInfo.LOW_SURROGATE_START;

    // This is a single comparison which allows us to check both for validity at once since
    // both the high surrogate range and the low surrogate range are the same length.
    // Unlike the constructors, this method never throws: a failed check reports 'false'.
    if ((highSurrogateOffset | lowSurrogateOffset) <= CharUnicodeInfo.HIGH_SURROGATE_RANGE)
    {
        // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
        // The low surrogate offset computed above is reused rather than recomputed.
        result = UnsafeCreate((highSurrogateOffset << 10) + lowSurrogateOffset + (0x40u << 10));
        return true;
    }
    else
    {
        // Didn't have a high surrogate followed by a low surrogate.
        result = default;
        return false;
    }
}
/// <summary>
/// Attempts to create a <see cref="Rune"/> from a 32-bit signed integer.
/// </summary>
/// <param name="value">The candidate scalar value.</param>
/// <param name="result">Receives whatever the unsigned overload produces for the converted value.</param>
/// <returns>The result of the <see cref="uint"/> overload of this method.</returns>
public static bool TryCreate(int value, out Rune result)
{
    // Validation lives in the unsigned overload; this overload only converts and forwards.
    return TryCreate((uint)value, out result);
}
/// <summary>
/// Attempts to create a <see cref="Rune"/> from a 32-bit unsigned integer.
/// </summary>
/// <param name="value">The candidate scalar value.</param>
/// <param name="result">
/// On success, the <see cref="Rune"/> wrapping <paramref name="value"/>; otherwise <see langword="default"/>.
/// </param>
/// <returns>
/// <see langword="true"/> if <paramref name="value"/> is a valid Unicode scalar value;
/// otherwise <see langword="false"/>.
/// </returns>
[CLSCompliant(false)]
public static bool TryCreate(uint value, out Rune result)
{
    if (!UnicodeUtility.IsValidUnicodeScalar(value))
    {
        result = default;
        return false;
    }

    // Validation has been performed, so the non-validating factory is safe to use.
    result = UnsafeCreate(value);
    return true;
}
/// <summary>
/// Writes this <see cref="Rune"/> into a destination buffer as UTF-16.
/// </summary>
/// <param name="destination">The buffer that receives the UTF-16 code units.</param>
/// <param name="charsWritten">
/// The number of <see cref="char"/>s written to <paramref name="destination"/>,
/// or 0 if the destination buffer is too small to hold the output.</param>
/// <returns>True if the value was written to the buffer; otherwise, false.</returns>
/// <remarks>
/// Query the <see cref="Utf16SequenceLength"/> property beforehand to learn how large
/// <paramref name="destination"/> needs to be.
/// </remarks>
public bool TryEncodeToUtf16(Span<char> destination, out int charsWritten)
{
    int capacity = destination.Length;

    // BMP values occupy exactly one code unit.
    if (capacity >= 1 && IsBmp)
    {
        destination[0] = (char)_value;
        charsWritten = 1;
        return true;
    }

    // Reaching this point with room for two code units implies a non-BMP value,
    // since a BMP value would have been written by the branch above.
    if (capacity >= 2)
    {
        UnicodeUtility.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value, out destination[0], out destination[1]);
        charsWritten = 2;
        return true;
    }

    // Destination buffer not large enough
    charsWritten = 0;
    return false;
}
/// <summary>
/// Writes this <see cref="Rune"/> into a destination buffer as UTF-8 bytes.
/// </summary>
/// <param name="destination">The buffer that receives the UTF-8 bytes.</param>
/// <param name="bytesWritten">
/// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
/// or 0 if the destination buffer is too small to hold the output.</param>
/// <returns>True if the value was written to the buffer; otherwise, false.</returns>
/// <remarks>
/// Query the <see cref="Utf8SequenceLength"/> property beforehand to learn how large
/// <paramref name="destination"/> needs to be.
/// </remarks>
public bool TryEncodeToUtf8(Span<byte> destination, out int bytesWritten)
{
    // The bit patterns below come from the Unicode Standard, Table 3-6.
    //
    // Each guard pairs "is there room for N bytes" with "does the value fit in N bytes".
    // Because the guards are tried in ascending order, reaching guard N with room for N bytes
    // implies the value was too large for every shorter form.
    //
    // In each lead-byte expression the marker bits are added above the payload before the
    // shift, so a single shift leaves the marker prefix followed by the top payload bits.

    int capacity = destination.Length;

    if (capacity >= 1 && IsAscii)
    {
        destination[0] = (byte)_value;
        bytesWritten = 1;
        return true;
    }

    if (capacity >= 2 && _value <= 0x7FFu)
    {
        // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
        destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
        destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
        bytesWritten = 2;
        return true;
    }

    if (capacity >= 3 && _value <= 0xFFFFu)
    {
        // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
        destination[0] = (byte)((_value + (0b1110 << 16)) >> 12);
        destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
        destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
        bytesWritten = 3;
        return true;
    }

    if (capacity >= 4)
    {
        // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
        destination[0] = (byte)((_value + (0b11110 << 21)) >> 18);
        destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
        destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
        destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
        bytesWritten = 4;
        return true;
    }

    // Destination buffer not large enough
    bytesWritten = 0;
    return false;
}
/// <summary>
/// Attempts to read the <see cref="Rune"/> that starts at position <paramref name="index"/>
/// of the string <paramref name="input"/>.
/// </summary>
/// <param name="input">The string to read from.</param>
/// <param name="index">The position in <paramref name="input"/> at which to start reading.</param>
/// <param name="value">
/// On success, the extracted <see cref="Rune"/>; otherwise <see langword="default"/>.
/// </param>
/// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
/// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
/// <remarks>
/// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
/// </remarks>
public static bool TryGetRuneAt(string input, int index, out Rune value)
{
    // The reader reports "bad data" with a negative result instead of throwing.
    int scalarOrError = ReadRuneFromString(input, index);
    if (scalarOrError < 0)
    {
        value = default;
        return false;
    }

    value = UnsafeCreate((uint)scalarOrError);
    return true;
}
// Builds a Unicode scalar value from an arbitrary 32-bit integer with no validation at all.
// The caller is responsible for having validated the input before calling this method.
// If a Rune instance is forcibly constructed from invalid input, the APIs on this type
// have undefined behavior, potentially including introducing a security hole in the
// consuming application.
//
// An example of a security hole resulting from an invalid Rune value, which could result
// in a stack overflow.
//
// public int GetMarvin32HashCode(Rune r) {
//   Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
//   r.TryEncodeToUtf16(buffer, ...);
//   return Marvin32.ComputeHash(buffer.AsBytes());
// }

/// <summary>
/// Wraps <paramref name="scalarValue"/> in a <see cref="Rune"/> without validating it.
/// </summary>
/// <param name="scalarValue">A value the caller has already validated.</param>
/// <returns>A <see cref="Rune"/> constructed directly from <paramref name="scalarValue"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Rune UnsafeCreate(uint scalarValue)
{
    return new Rune(scalarValue, false);
}
1061 // These are analogs of APIs on System.Char
/// <summary>
/// Gets the numeric value associated with the specified <see cref="Rune"/>.
/// </summary>
/// <param name="value">The scalar value to inspect.</param>
/// <returns>
/// For ASCII input, the digit value 0 through 9 for '0' through '9' and -1 for anything else.
/// For non-ASCII input, whatever <c>CharUnicodeInfo.InternalGetNumericValue</c> reports.
/// </returns>
public static double GetNumericValue(Rune value)
{
    if (!value.IsAscii)
    {
        // not an ASCII char; fall back to globalization table
        return CharUnicodeInfo.InternalGetNumericValue(value.Value);
    }

    // Unsigned subtraction: anything below '0' wraps to a large value and fails the range test.
    uint digit = value._value - '0';
    if (digit <= 9)
    {
        return digit;
    }

    return -1;
}
/// <summary>
/// Gets the <see cref="UnicodeCategory"/> of the specified <see cref="Rune"/>.
/// </summary>
/// <param name="value">The scalar value to classify.</param>
/// <returns>The Unicode category of <paramref name="value"/>.</returns>
public static UnicodeCategory GetUnicodeCategory(Rune value)
{
    if (!value.IsAscii)
    {
        return GetUnicodeCategoryNonAscii(value);
    }

    // ASCII fast path: the category is stored in the masked low bits of the lookup table entry.
    byte packedInfo = AsciiCharInfo[value.Value];
    return (UnicodeCategory)(packedInfo & UnicodeCategoryMask);
}
/// <summary>
/// Looks up the <see cref="UnicodeCategory"/> of a non-ASCII <see cref="Rune"/> through
/// <see cref="CharUnicodeInfo"/>.
/// </summary>
/// <param name="value">The scalar value to classify; callers are expected to pass non-ASCII values only.</param>
/// <returns>The category reported by <see cref="CharUnicodeInfo"/>.</returns>
private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
{
    // ASCII callers should take the table-driven fast path instead of coming here.
    Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");

    int scalar = value.Value;
    return CharUnicodeInfo.GetUnicodeCategory(scalar);
}
// True iff the category lies in the contiguous enum range UppercaseLetter..OtherLetter,
// i.e. the category represents a letter.
private static bool IsCategoryLetter(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.UppercaseLetter,
        (uint)UnicodeCategory.OtherLetter);
// True iff the category represents a letter (UppercaseLetter..OtherLetter)
// or is exactly DecimalDigitNumber.
private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
{
    if (IsCategoryLetter(category))
    {
        return true;
    }

    return category == UnicodeCategory.DecimalDigitNumber;
}
// True iff the category lies in the contiguous enum range DecimalDigitNumber..OtherNumber,
// i.e. the category represents a number.
private static bool IsCategoryNumber(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.DecimalDigitNumber,
        (uint)UnicodeCategory.OtherNumber);
// True iff the category lies in the contiguous enum range ConnectorPunctuation..OtherPunctuation,
// i.e. the category represents a punctuation mark.
private static bool IsCategoryPunctuation(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.ConnectorPunctuation,
        (uint)UnicodeCategory.OtherPunctuation);
// True iff the category lies in the contiguous enum range SpaceSeparator..ParagraphSeparator,
// i.e. the category represents a separator.
private static bool IsCategorySeparator(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.SpaceSeparator,
        (uint)UnicodeCategory.ParagraphSeparator);
// True iff the category lies in the contiguous enum range MathSymbol..OtherSymbol,
// i.e. the category represents a symbol.
private static bool IsCategorySymbol(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.MathSymbol,
        (uint)UnicodeCategory.OtherSymbol);
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a control character.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// <see langword="true"/> if <paramref name="value"/> is in [ U+0000..U+001F ] or
/// [ U+007F..U+009F ]; otherwise <see langword="false"/>.
/// </returns>
public static bool IsControl(Rune value)
{
    // Per the Unicode stability policy, the set of control characters
    // is forever fixed at [ U+0000..U+001F ], [ U+007F..U+009F ]. No
    // characters will ever be added to the "control characters" group.
    // See http://www.unicode.org/policies/stability_policy.html.

    // Logic below depends on Rune.Value never being -1 (since Rune is a validating type)
    // 00..1F (+1) => 01..20 (&~80) => 01..20
    // 7F..9F (+1) => 80..A0 (&~80) => 00..20

    uint shifted = value._value + 1;
    uint folded = shifted & ~0x80u;
    return folded <= 0x20u;
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a decimal digit.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, <see langword="true"/> for '0' through '9'. For non-ASCII input,
/// <see langword="true"/> if the category is <see cref="UnicodeCategory.DecimalDigitNumber"/>.
/// </returns>
public static bool IsDigit(Rune value)
{
    return value.IsAscii
        ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
        : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber;
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a letter.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, <see langword="true"/> for [A-Za-z]. For non-ASCII input,
/// <see langword="true"/> if the category is one of the letter categories.
/// </returns>
public static bool IsLetter(Rune value)
{
    if (!value.IsAscii)
    {
        return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
    }

    // Clearing the 0x20 bit of the offset from 'A' maps 'a'..'z' onto 'A'..'Z',
    // so one unsigned range check covers [A-Za-z].
    uint offsetFromA = value._value - 'A';
    uint caseFolded = offsetFromA & ~0x20u;
    return caseFolded <= (uint)('Z' - 'A');
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a letter or a decimal digit.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, whether the 'is letter or digit' flag is set in the ASCII lookup table.
/// For non-ASCII input, whether the category is a letter category or
/// <see cref="UnicodeCategory.DecimalDigitNumber"/>.
/// </returns>
public static bool IsLetterOrDigit(Rune value)
{
    if (!value.IsAscii)
    {
        return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
    }

    byte packedInfo = AsciiCharInfo[value.Value];
    return (packedInfo & IsLetterOrDigitFlag) != 0;
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a lowercase letter.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, <see langword="true"/> for 'a' through 'z'. For non-ASCII input,
/// <see langword="true"/> if the category is <see cref="UnicodeCategory.LowercaseLetter"/>.
/// </returns>
public static bool IsLower(Rune value)
{
    return value.IsAscii
        ? UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z')
        : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter;
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a number.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, <see langword="true"/> for '0' through '9'. For non-ASCII input,
/// <see langword="true"/> if the category is one of the number categories.
/// </returns>
public static bool IsNumber(Rune value)
{
    return value.IsAscii
        ? UnicodeUtility.IsInRangeInclusive(value._value, '0', '9')
        : IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a punctuation mark.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// <see langword="true"/> if the category of <paramref name="value"/> is one of the
/// punctuation categories; otherwise <see langword="false"/>.
/// </returns>
public static bool IsPunctuation(Rune value)
{
    UnicodeCategory category = GetUnicodeCategory(value);
    return IsCategoryPunctuation(category);
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a separator.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// <see langword="true"/> if the category of <paramref name="value"/> is one of the
/// separator categories; otherwise <see langword="false"/>.
/// </returns>
public static bool IsSeparator(Rune value)
{
    UnicodeCategory category = GetUnicodeCategory(value);
    return IsCategorySeparator(category);
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is a symbol.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// <see langword="true"/> if the category of <paramref name="value"/> is one of the
/// symbol categories; otherwise <see langword="false"/>.
/// </returns>
public static bool IsSymbol(Rune value)
{
    UnicodeCategory category = GetUnicodeCategory(value);
    return IsCategorySymbol(category);
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is an uppercase letter.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, <see langword="true"/> for 'A' through 'Z'. For non-ASCII input,
/// <see langword="true"/> if the category is <see cref="UnicodeCategory.UppercaseLetter"/>.
/// </returns>
public static bool IsUpper(Rune value)
{
    return value.IsAscii
        ? UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z')
        : GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter;
}
/// <summary>
/// Determines whether the specified <see cref="Rune"/> is white space.
/// </summary>
/// <param name="value">The scalar value to test.</param>
/// <returns>
/// For ASCII input, whether the 'is whitespace' flag is set in the ASCII lookup table.
/// For non-ASCII input, <see langword="true"/> for U+0085 or for any value whose category
/// is one of the separator categories.
/// </returns>
public static bool IsWhiteSpace(Rune value)
{
    if (value.IsAscii)
    {
        byte packedInfo = AsciiCharInfo[value.Value];
        return (packedInfo & IsWhiteSpaceFlag) != 0;
    }

    // U+0085 is special since it's a whitespace character but is in the Control category
    // instead of a normal separator category. No other code point outside the ASCII range
    // has this mismatch.

    return value._value == 0x0085u
        || IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
}
/// <summary>
/// Converts the specified <see cref="Rune"/> to lowercase using the casing rules of
/// <paramref name="culture"/>.
/// </summary>
/// <param name="value">The scalar value to convert.</param>
/// <param name="culture">The culture whose <see cref="CultureInfo.TextInfo"/> drives the conversion.</param>
/// <returns>The lowercase form of <paramref name="value"/>.</returns>
/// <remarks>
/// A null <paramref name="culture"/> is reported through <c>ThrowHelper.ThrowArgumentNullException</c>.
/// </remarks>
public static Rune ToLower(Rune value, CultureInfo culture)
{
    if (culture is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
    }

    // We don't want to special-case ASCII here since the specified culture might handle
    // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
    // we'll just jump straight to the globalization tables if they're available.

    return GlobalizationMode.Invariant
        ? ToLowerInvariant(value)
        : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: false);
}
/// <summary>
/// Converts the specified <see cref="Rune"/> to lowercase using invariant casing rules.
/// </summary>
/// <param name="value">The scalar value to convert.</param>
/// <returns>
/// The lowercase form of <paramref name="value"/>. A non-ASCII value is returned unchanged
/// when <c>GlobalizationMode.Invariant</c> is set.
/// </returns>
public static Rune ToLowerInvariant(Rune value)
{
    // Handle the most common case (ASCII data) first. Within the common case, we expect
    // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.

    if (value.IsAscii)
    {
        // It's ok for us to use the UTF-16 conversion utility for this since the high
        // 16 bits of the value will never be set so will be left unchanged.
        uint lowered = Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value);
        return UnsafeCreate(lowered);
    }

    // If the value isn't ASCII and if the globalization tables aren't available,
    // case changing has no effect. Otherwise non-ASCII data is handed to
    // ChangeCaseCultureAware with the invariant TextInfo.

    return GlobalizationMode.Invariant
        ? value
        : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
}
/// <summary>
/// Converts the specified <see cref="Rune"/> to uppercase using the casing rules of
/// <paramref name="culture"/>.
/// </summary>
/// <param name="value">The scalar value to convert.</param>
/// <param name="culture">The culture whose <see cref="CultureInfo.TextInfo"/> drives the conversion.</param>
/// <returns>The uppercase form of <paramref name="value"/>.</returns>
/// <remarks>
/// A null <paramref name="culture"/> is reported through <c>ThrowHelper.ThrowArgumentNullException</c>.
/// </remarks>
public static Rune ToUpper(Rune value, CultureInfo culture)
{
    if (culture is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
    }

    // We don't want to special-case ASCII here since the specified culture might handle
    // ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
    // we'll just jump straight to the globalization tables if they're available.

    return GlobalizationMode.Invariant
        ? ToUpperInvariant(value)
        : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: true);
}
/// <summary>
/// Converts the specified <see cref="Rune"/> to uppercase using invariant casing rules.
/// </summary>
/// <param name="value">The scalar value to convert.</param>
/// <returns>
/// The uppercase form of <paramref name="value"/>. A non-ASCII value is returned unchanged
/// when <c>GlobalizationMode.Invariant</c> is set.
/// </returns>
public static Rune ToUpperInvariant(Rune value)
{
    // Handle the most common case (ASCII data) first. Within the common case, we expect
    // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.

    if (value.IsAscii)
    {
        // It's ok for us to use the UTF-16 conversion utility for this since the high
        // 16 bits of the value will never be set so will be left unchanged.
        uint raised = Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value);
        return UnsafeCreate(raised);
    }

    // If the value isn't ASCII and if the globalization tables aren't available,
    // case changing has no effect. Otherwise non-ASCII data is handed to
    // ChangeCaseCultureAware with the invariant TextInfo.

    return GlobalizationMode.Invariant
        ? value
        : ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
}