1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 using System
.Diagnostics
;
7 using System
.Globalization
;
8 using System
.Runtime
.CompilerServices
;
9 using System
.Text
.Unicode
;
14 /// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
17 /// This type's constructors and conversion operators validate the input, so consumers can call the APIs
18 /// assuming that the underlying <see cref="Rune"/> instance is well-formed.
20 [DebuggerDisplay("{DebuggerDisplay,nq}")]
21 public readonly struct Rune
: IComparable
<Rune
>, IEquatable
<Rune
>
// Bit layout of each AsciiCharInfo entry: two flag bits on top, UnicodeCategory in the low five bits.
private const byte IsWhiteSpaceFlag = 0b1000_0000;    // 0x80: 'is whitespace'
private const byte IsLetterOrDigitFlag = 0b0100_0000; // 0x40: 'is letter or digit'
private const byte UnicodeCategoryMask = 0b0001_1111; // 0x1F: bottom 5 bits are the UnicodeCategory
27 // Contains information about the ASCII character range [ U+0000..U+007F ], with:
28 // - 0x80 bit if set means 'is whitespace'
29 // - 0x40 bit if set means 'is letter or digit'
30 // - 0x20 bit is reserved for future use
31 // - bottom 5 bits are the UnicodeCategory of the character
32 private static ReadOnlySpan
<byte> AsciiCharInfo
=> new byte[]
34 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E,
35 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E,
36 0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18,
37 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x48, 0x18, 0x18, 0x19, 0x19, 0x19, 0x18,
38 0x18, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
39 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x14, 0x18, 0x15, 0x1B, 0x12,
40 0x1B, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41,
41 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x14, 0x19, 0x15, 0x19, 0x0E
44 private readonly uint _value
;
47 /// Creates a <see cref="Rune"/> from the provided UTF-16 code unit.
49 /// <exception cref="ArgumentOutOfRangeException">
50 /// If <paramref name="ch"/> represents a UTF-16 surrogate code point
51 /// U+D800..U+DFFF, inclusive.
56 if (UnicodeUtility
.IsSurrogateCodePoint(expanded
))
58 ThrowHelper
.ThrowArgumentOutOfRangeException(ExceptionArgument
.ch
);
64 /// Creates a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
66 /// <exception cref="ArgumentOutOfRangeException">
67 /// If <paramref name="highSurrogate"/> does not represent a UTF-16 high surrogate code point
68 /// or <paramref name="lowSurrogate"/> does not represent a UTF-16 low surrogate code point.
70 public Rune(char highSurrogate
, char lowSurrogate
)
71 : this((uint)char.ConvertToUtf32(highSurrogate
, lowSurrogate
), false)
76 /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
78 /// <exception cref="ArgumentOutOfRangeException">
79 /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
81 public Rune(int value)
87 /// Creates a <see cref="Rune"/> from the provided Unicode scalar value.
89 /// <exception cref="ArgumentOutOfRangeException">
90 /// If <paramref name="value"/> does not represent a valid Unicode scalar value.
93 public Rune(uint value)
95 if (!UnicodeUtility
.IsValidUnicodeScalar(value))
97 ThrowHelper
.ThrowArgumentOutOfRangeException(ExceptionArgument
.value);
102 // non-validating ctor
103 private Rune(uint scalarValue
, bool unused
)
105 UnicodeDebug
.AssertIsValidScalar(scalarValue
);
106 _value
= scalarValue
;
/// <summary>
/// Returns <see langword="true"/> when both operands hold the same underlying scalar value.
/// </summary>
public static bool operator ==(Rune left, Rune right)
{
    return left._value == right._value;
}
/// <summary>
/// Returns <see langword="true"/> when the operands hold different underlying scalar values.
/// </summary>
public static bool operator !=(Rune left, Rune right)
{
    return !(left._value == right._value);
}
/// <summary>
/// Returns <see langword="true"/> when the scalar value of <paramref name="left"/>
/// is strictly smaller than that of <paramref name="right"/>.
/// </summary>
public static bool operator <(Rune left, Rune right)
{
    return right._value > left._value;
}
/// <summary>
/// Returns <see langword="true"/> when the scalar value of <paramref name="left"/>
/// does not exceed that of <paramref name="right"/>.
/// </summary>
public static bool operator <=(Rune left, Rune right)
{
    return !(left._value > right._value);
}
/// <summary>
/// Returns <see langword="true"/> when the scalar value of <paramref name="left"/>
/// is strictly larger than that of <paramref name="right"/>.
/// </summary>
public static bool operator >(Rune left, Rune right)
{
    return right._value < left._value;
}
/// <summary>
/// Returns <see langword="true"/> when the scalar value of <paramref name="left"/>
/// is not smaller than that of <paramref name="right"/>.
/// </summary>
public static bool operator >=(Rune left, Rune right)
{
    return !(left._value < right._value);
}
121 // Operators below are explicit because they may throw.
/// <summary>
/// Converts a UTF-16 code unit to a <see cref="Rune"/> by invoking the <see cref="Rune(char)"/> constructor.
/// </summary>
public static explicit operator Rune(char ch)
{
    return new Rune(ch);
}
/// <summary>
/// Converts an unsigned 32-bit value to a <see cref="Rune"/> by invoking the <see cref="Rune(uint)"/> constructor.
/// </summary>
[CLSCompliant(false)]
public static explicit operator Rune(uint value)
{
    return new Rune(value);
}
/// <summary>
/// Converts a signed 32-bit value to a <see cref="Rune"/> by invoking the <see cref="Rune(int)"/> constructor.
/// </summary>
public static explicit operator Rune(int value)
{
    return new Rune(value);
}
// Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'".
// A value that fails IsValid is rendered with U+FFFD in place of the character.
private string DebuggerDisplay
{
    get
    {
        string glyph = IsValid(_value) ? ToString() : "\uFFFD";
        return FormattableString.Invariant($"U+{_value:X4} '{glyph}'");
    }
}
/// <summary>
/// Gets whether this scalar value lies in the ASCII range ([ U+0000..U+007F ]),
/// which means a single UTF-8 code unit is enough to represent it.
/// </summary>
public bool IsAscii
{
    get { return UnicodeUtility.IsAsciiCodePoint(_value); }
}
/// <summary>
/// Gets whether this scalar value lies within the BMP ([ U+0000..U+FFFF ]),
/// which means a single UTF-16 code unit is enough to represent it.
/// </summary>
public bool IsBmp
{
    get { return UnicodeUtility.IsBmpCodePoint(_value); }
}
/// <summary>
/// Gets the Unicode plane (0 to 16, inclusive) in which this scalar resides.
/// </summary>
public int Plane
{
    get { return UnicodeUtility.GetPlane(_value); }
}
/// <summary>
/// Gets a <see cref="Rune"/> instance representing the Unicode replacement character U+FFFD.
/// </summary>
public static Rune ReplacementChar
{
    get { return UnsafeCreate(UnicodeUtility.ReplacementChar); }
}
/// <summary>
/// Gets the number of UTF-16 code units (<see cref="char"/>) needed
/// to represent this scalar value.
/// </summary>
/// <remarks>
/// The return value will be 1 or 2.
/// </remarks>
public int Utf16SequenceLength
{
    get { return UnicodeUtility.GetUtf16SequenceLength(_value); }
}
/// <summary>
/// Gets the number of UTF-8 code units needed to represent this scalar value.
/// </summary>
/// <remarks>
/// The return value will be 1 through 4, inclusive.
/// </remarks>
public int Utf8SequenceLength
{
    get { return UnicodeUtility.GetUtf8SequenceLength(_value); }
}
/// <summary>
/// Gets the Unicode scalar value as a signed 32-bit integer.
/// </summary>
public int Value
{
    get { return (int)_value; }
}
178 private static Rune
ChangeCaseCultureAware(Rune rune
, TextInfo textInfo
, bool toUpper
)
180 Debug
.Assert(!GlobalizationMode
.Invariant
, "This should've been checked by the caller.");
181 Debug
.Assert(textInfo
!= null, "This should've been checked by the caller.");
183 Span
<char> original
= stackalloc char[2]; // worst case scenario = 2 code units (for a surrogate pair)
184 Span
<char> modified
= stackalloc char[2]; // case change should preserve UTF-16 code unit count
186 int charCount
= rune
.EncodeToUtf16(original
);
187 original
= original
.Slice(0, charCount
);
188 modified
= modified
.Slice(0, charCount
);
192 textInfo
.ChangeCaseToUpper(original
, modified
);
196 textInfo
.ChangeCaseToLower(original
, modified
);
199 // We use simple case folding rules, which disallows moving between the BMP and supplementary
200 // planes when performing a case conversion. The helper methods which reconstruct a Rune
201 // contain debug asserts for this condition.
205 return UnsafeCreate(modified
[0]);
209 return UnsafeCreate(UnicodeUtility
.GetScalarFromUtf16SurrogatePair(modified
[0], modified
[1]));
/// <summary>
/// Compares this instance with <paramref name="other"/> by their underlying scalar values.
/// </summary>
/// <returns>The result of <see cref="uint.CompareTo(uint)"/> applied to the two underlying values.</returns>
public int CompareTo(Rune other)
{
    uint mine = this._value;
    uint theirs = other._value;
    return mine.CompareTo(theirs);
}
216 /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-16 source buffer.
220 /// If the source buffer begins with a valid UTF-16 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
221 /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="charsConsumed"/> the
222 /// number of <see langword="char"/>s used in the input buffer to encode the <see cref="Rune"/>.
225 /// If the source buffer is empty or contains only a standalone UTF-16 high surrogate character, returns <see cref="OperationStatus.NeedMoreData"/>,
226 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="charsConsumed"/> the length of the input buffer.
229 /// If the source buffer begins with an ill-formed UTF-16 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
230 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="charsConsumed"/> the number of
231 /// <see langword="char"/>s used in the input buffer to encode the ill-formed sequence.
235 /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
236 /// <paramref name="charsConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
237 /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
238 /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
239 /// invalid sequences while iterating through the loop.
241 public static OperationStatus
DecodeFromUtf16(ReadOnlySpan
<char> source
, out Rune result
, out int charsConsumed
)
245 // First, check for the common case of a BMP scalar value.
246 // If this is correct, return immediately.
248 char firstChar
= source
[0];
249 if (TryCreate(firstChar
, out result
))
252 return OperationStatus
.Done
;
255 // First thing we saw was a UTF-16 surrogate code point.
256 // Let's optimistically assume for now it's a high surrogate and hope
257 // that combining it with the next char yields useful results.
259 if (1 < (uint)source
.Length
)
261 char secondChar
= source
[1];
262 if (TryCreate(firstChar
, secondChar
, out result
))
264 // Success! Formed a supplementary scalar value.
266 return OperationStatus
.Done
;
270 // Either the first character was a low surrogate, or the second
271 // character was not a low surrogate. This is an error.
275 else if (!char.IsHighSurrogate(firstChar
))
277 // Quick check to make sure we're not going to report NeedMoreData for
278 // a single-element buffer where the data is a standalone low surrogate
279 // character. Since no additional data will ever make this valid, we'll
280 // report an error immediately.
285 // If we got to this point, the input buffer was empty, or the buffer
286 // was a single element in length and that element was a high surrogate char.
288 charsConsumed
= source
.Length
;
289 result
= ReplacementChar
;
290 return OperationStatus
.NeedMoreData
;
294 charsConsumed
= 1; // maximal invalid subsequence for UTF-16 is always a single code unit in length
295 result
= ReplacementChar
;
296 return OperationStatus
.InvalidData
;
300 /// Decodes the <see cref="Rune"/> at the beginning of the provided UTF-8 source buffer.
304 /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>,
305 /// and outs via <paramref name="result"/> the decoded <see cref="Rune"/> and via <paramref name="bytesConsumed"/> the
306 /// number of <see langword="byte"/>s used in the input buffer to encode the <see cref="Rune"/>.
309 /// If the source buffer is empty or contains only an incomplete (truncated) UTF-8 sequence, returns <see cref="OperationStatus.NeedMoreData"/>,
310 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the length of the input buffer.
313 /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>,
314 /// and outs via <paramref name="result"/> <see cref="ReplacementChar"/> and via <paramref name="bytesConsumed"/> the number of
315 /// <see langword="byte"/>s used in the input buffer to encode the ill-formed sequence.
319 /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by
320 /// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/>
321 /// will contain the real scalar value if successfully decoded, or it will contain <see cref="ReplacementChar"/> if
322 /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of
323 /// invalid sequences while iterating through the loop.
325 public static OperationStatus
DecodeFromUtf8(ReadOnlySpan
<byte> source
, out Rune result
, out int bytesConsumed
)
327 // This method follows the Unicode Standard's recommendation for detecting
328 // the maximal subpart of an ill-formed subsequence. See The Unicode Standard,
329 // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence,
330 // it tries to consume as many code units as possible as long as those code
331 // units constitute the beginning of a longer well-formed subsequence per Table 3-7.
335 // Try reading input[0].
337 if ((uint)index
>= (uint)source
.Length
)
342 uint tempValue
= source
[index
];
343 if (!UnicodeUtility
.IsAsciiCodePoint(tempValue
))
350 bytesConsumed
= index
+ 1;
351 Debug
.Assert(1 <= bytesConsumed
&& bytesConsumed
<= 4); // Valid subsequences are always length [1..4]
352 result
= UnsafeCreate(tempValue
);
353 return OperationStatus
.Done
;
357 // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in
358 // the range [C2..F4]. If it's outside of that range, it's either a standalone
359 // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range
360 // four-byte sequence.
362 if (!UnicodeUtility
.IsInRangeInclusive(tempValue
, 0xC2, 0xF4))
364 goto FirstByteInvalid
;
367 tempValue
= (tempValue
- 0xC2) << 6;
369 // Try reading input[1].
372 if ((uint)index
>= (uint)source
.Length
)
377 // Continuation bytes are of the form [10xxxxxx], which means that their two's
378 // complement representation is in the range [-65..-128]. This allows us to
379 // perform a single comparison to see if a byte is a continuation byte.
381 int thisByteSignExtended
= (sbyte)source
[index
];
382 if (thisByteSignExtended
>= -64)
387 tempValue
+= (uint)thisByteSignExtended
;
388 tempValue
+= 0x80; // remove the continuation byte marker
389 tempValue
+= (0xC2 - 0xC0) << 6; // remove the leading byte marker
391 if (tempValue
< 0x0800)
393 Debug
.Assert(UnicodeUtility
.IsInRangeInclusive(tempValue
, 0x0080, 0x07FF));
394 goto Finish
; // this is a valid 2-byte sequence
397 // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have
398 // enough information (from just two code units) to detect overlong or surrogate
399 // sequences, we need to perform these checks now.
401 if (!UnicodeUtility
.IsInRangeInclusive(tempValue
, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80)))
403 // The first two bytes were not in the range [[E0 A0]..[F4 8F]].
404 // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence.
408 if (UnicodeUtility
.IsInRangeInclusive(tempValue
, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80)))
410 // This is a UTF-16 surrogate code point, which is invalid in UTF-8.
414 if (UnicodeUtility
.IsInRangeInclusive(tempValue
, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80)))
416 // This is an overlong 4-byte sequence.
420 // The first two bytes were just fine. We don't need to perform any other checks
421 // on the remaining bytes other than to see that they're valid continuation bytes.
423 // Try reading input[2].
426 if ((uint)index
>= (uint)source
.Length
)
431 thisByteSignExtended
= (sbyte)source
[index
];
432 if (thisByteSignExtended
>= -64)
434 goto Invalid
; // this byte is not a UTF-8 continuation byte
438 tempValue
+= (uint)thisByteSignExtended
;
439 tempValue
+= 0x80; // remove the continuation byte marker
440 tempValue
-= (0xE0 - 0xC0) << 12; // remove the leading byte marker
442 if (tempValue
<= 0xFFFF)
444 Debug
.Assert(UnicodeUtility
.IsInRangeInclusive(tempValue
, 0x0800, 0xFFFF));
445 goto Finish
; // this is a valid 3-byte sequence
448 // Try reading input[3].
451 if ((uint)index
>= (uint)source
.Length
)
456 thisByteSignExtended
= (sbyte)source
[index
];
457 if (thisByteSignExtended
>= -64)
459 goto Invalid
; // this byte is not a UTF-8 continuation byte
463 tempValue
+= (uint)thisByteSignExtended
;
464 tempValue
+= 0x80; // remove the continuation byte marker
465 tempValue
-= (0xF0 - 0xE0) << 18; // remove the leading byte marker
467 UnicodeDebug
.AssertIsValidSupplementaryPlaneScalar(tempValue
);
468 goto Finish
; // this is a valid 4-byte sequence
472 index
= 1; // Invalid subsequences are always at least length 1.
476 Debug
.Assert(1 <= index
&& index
<= 3); // Invalid subsequences are always length 1..3
477 bytesConsumed
= index
;
478 result
= ReplacementChar
;
479 return OperationStatus
.InvalidData
;
483 Debug
.Assert(0 <= index
&& index
<= 3); // Incomplete subsequences are always length 0..3
484 bytesConsumed
= index
;
485 result
= ReplacementChar
;
486 return OperationStatus
.NeedMoreData
;
490 /// Decodes the <see cref="Rune"/> at the end of the provided UTF-16 source buffer.
493 /// This method is very similar to <see cref="DecodeFromUtf16(ReadOnlySpan{char}, out Rune, out int)"/>, but it allows
494 /// the caller to loop backward instead of forward. The typical calling convention is that on each iteration
495 /// of the loop, the caller should slice off the final <paramref name="charsConsumed"/> elements of
496 /// the <paramref name="source"/> buffer.
498 public static OperationStatus
DecodeLastFromUtf16(ReadOnlySpan
<char> source
, out Rune result
, out int charsConsumed
)
500 int index
= source
.Length
- 1;
501 if ((uint)index
< (uint)source
.Length
)
503 // First, check for the common case of a BMP scalar value.
504 // If this is correct, return immediately.
506 char finalChar
= source
[index
];
507 if (TryCreate(finalChar
, out result
))
510 return OperationStatus
.Done
;
513 if (char.IsLowSurrogate(finalChar
))
515 // The final character was a UTF-16 low surrogate code point.
516 // This must be preceded by a UTF-16 high surrogate code point, otherwise
517 // we have a standalone low surrogate, which is always invalid.
520 if ((uint)index
< (uint)source
.Length
)
522 char penultimateChar
= source
[index
];
523 if (TryCreate(penultimateChar
, finalChar
, out result
))
525 // Success! Formed a supplementary scalar value.
527 return OperationStatus
.Done
;
531 // If we got to this point, we saw a standalone low surrogate
532 // and must report an error.
534 charsConsumed
= 1; // standalone surrogate
535 result
= ReplacementChar
;
536 return OperationStatus
.InvalidData
;
540 // If we got this far, the source buffer was empty, or the source buffer ended
541 // with a UTF-16 high surrogate code point. These aren't errors since they could
542 // be valid given more input data.
544 charsConsumed
= (int)((uint)(-source
.Length
) >> 31); // 0 -> 0, all other lengths -> 1
545 result
= ReplacementChar
;
546 return OperationStatus
.NeedMoreData
;
550 /// Decodes the <see cref="Rune"/> at the end of the provided UTF-8 source buffer.
553 /// This method is very similar to <see cref="DecodeFromUtf8(ReadOnlySpan{byte}, out Rune, out int)"/>, but it allows
554 /// the caller to loop backward instead of forward. The typical calling convention is that on each iteration
555 /// of the loop, the caller should slice off the final <paramref name="bytesConsumed"/> elements of
556 /// the <paramref name="source"/> buffer.
558 public static OperationStatus
DecodeLastFromUtf8(ReadOnlySpan
<byte> source
, out Rune
value, out int bytesConsumed
)
560 int index
= source
.Length
- 1;
561 if ((uint)index
< (uint)source
.Length
)
563 // The buffer contains at least one byte. Let's check the fast case where the
564 // buffer ends with an ASCII byte.
566 uint tempValue
= source
[index
];
567 if (UnicodeUtility
.IsAsciiCodePoint(tempValue
))
570 value = UnsafeCreate(tempValue
);
571 return OperationStatus
.Done
;
574 // If the final byte is not an ASCII byte, we may be beginning or in the middle of
575 // a UTF-8 multi-code unit sequence. We need to back up until we see the start of
576 // the multi-code unit sequence; we can detect the leading byte because all multi-byte
577 // sequences begin with a byte whose 0x40 bit is set. Since all multi-byte sequences
578 // are no greater than 4 code units in length, we only need to search back a maximum of four bytes.
581 if (((byte)tempValue
& 0x40) != 0)
583 // This is a UTF-8 leading byte. We'll do a forward read from here.
584 // It'll return invalid (if given C0, F5, etc.) or incomplete. Both are fine.
586 return DecodeFromUtf8(source
.Slice(index
), out value, out bytesConsumed
);
589 // If we got to this point, the final byte was a UTF-8 continuation byte.
590 // Let's check the three bytes immediately preceding this, looking for the starting byte.
592 for (int i
= 3; i
> 0; i
--)
595 if ((uint)index
>= (uint)source
.Length
)
597 goto Invalid
; // out of data
600 // The check below will get hit for ASCII (values 00..7F) and for UTF-8 starting bytes
601 // (bits 0xC0 set, values C0..FF). In two's complement this is the range [-64..127].
602 // It's just a fast way for us to terminate the search.
604 if ((sbyte)source
[index
] >= -64)
612 // If we got to this point, either:
613 // - the last 4 bytes of the input buffer are continuation bytes;
614 // - the entire input buffer (if fewer than 4 bytes) consists only of continuation bytes; or
615 // - there's no UTF-8 leading byte between the final continuation byte of the buffer and
616 // the previous well-formed subsequence or maximal invalid subsequence.
618 // In all of these cases, the final byte must be a maximal invalid subsequence of length 1.
619 // See comment near the end of this method for more information.
621 value = ReplacementChar
;
623 return OperationStatus
.InvalidData
;
627 // If we got to this point, we found an ASCII byte or a UTF-8 starting byte at position source[index].
628 // Technically this could also mean we found an invalid byte like C0 or F5 at this position, but that's
629 // fine since it'll be handled by the forward read. From this position, we'll perform a forward read
630 // and see if we consumed the entirety of the buffer.
632 source
= source
.Slice(index
);
633 Debug
.Assert(!source
.IsEmpty
, "Shouldn't reach this for empty inputs.");
635 OperationStatus operationStatus
= DecodeFromUtf8(source
, out Rune tempRune
, out int tempBytesConsumed
);
636 if (tempBytesConsumed
== source
.Length
)
638 // If this forward read consumed the entirety of the end of the input buffer, we can return it
639 // as the result of this function. It could be well-formed, incomplete, or invalid. If it's
640 // invalid and we consumed the remainder of the buffer, we know we've found the maximal invalid
641 // subsequence, which is what we wanted anyway.
643 bytesConsumed
= tempBytesConsumed
;
645 return operationStatus
;
648 // If we got to this point, we know that the final continuation byte wasn't consumed by the forward
649 // read that we just performed above. This means that the continuation byte has to be part of an
650 // invalid subsequence since there's no UTF-8 leading byte between what we just consumed and the
651 // continuation byte at the end of the input. Furthermore, since any maximal invalid subsequence
652 // of length > 1 must have a UTF-8 leading byte as its first code unit, this implies that the
653 // continuation byte at the end of the buffer is itself a maximal invalid subsequence of length 1.
659 // Source buffer was empty.
660 value = ReplacementChar
;
662 return OperationStatus
.NeedMoreData
;
667 /// Encodes this <see cref="Rune"/> to a UTF-16 destination buffer.
669 /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
670 /// <returns>The number of <see cref="char"/>s written to <paramref name="destination"/>.</returns>
671 /// <exception cref="ArgumentException">
672 /// If <paramref name="destination"/> is not large enough to hold the output.
674 public int EncodeToUtf16(Span
<char> destination
)
676 if (!TryEncodeToUtf16(destination
, out int charsWritten
))
678 ThrowHelper
.ThrowArgumentException_DestinationTooShort();
685 /// Encodes this <see cref="Rune"/> to a UTF-8 destination buffer.
687 /// <param name="destination">The buffer to which to write this value as UTF-8.</param>
688 /// <returns>The number of <see cref="byte"/>s written to <paramref name="destination"/>.</returns>
689 /// <exception cref="ArgumentException">
690 /// If <paramref name="destination"/> is not large enough to hold the output.
692 public int EncodeToUtf8(Span
<byte> destination
)
694 if (!TryEncodeToUtf8(destination
, out int bytesWritten
))
696 ThrowHelper
.ThrowArgumentException_DestinationTooShort();
/// <summary>
/// Returns <see langword="true"/> when <paramref name="obj"/> is a <see cref="Rune"/>
/// that is equal to this instance; any other object (or null) yields <see langword="false"/>.
/// </summary>
public override bool Equals(object? obj)
{
    if (obj is Rune other)
    {
        return this.Equals(other);
    }

    return false;
}
/// <summary>
/// Returns <see langword="true"/> when <paramref name="other"/> equals this instance
/// according to the equality operator.
/// </summary>
public bool Equals(Rune other)
{
    return this == other;
}
/// <summary>
/// Returns the hash code for this instance, which is simply <see cref="Value"/>.
/// </summary>
public override int GetHashCode()
{
    return Value;
}
709 /// Gets the <see cref="Rune"/> which begins at index <paramref name="index"/> in
710 /// string <paramref name="input"/>.
713 /// Throws if <paramref name="input"/> is null, if <paramref name="index"/> is out of range, or
714 /// if <paramref name="index"/> does not reference the start of a valid scalar value within <paramref name="input"/>.
716 public static Rune
GetRuneAt(string input
, int index
)
718 int runeValue
= ReadRuneFromString(input
, index
);
721 ThrowHelper
.ThrowArgumentException_CannotExtractScalar(ExceptionArgument
.index
);
724 return UnsafeCreate((uint)runeValue
);
/// <summary>
/// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
/// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
/// </summary>
public static bool IsValid(int value)
{
    // Delegates to the unsigned overload after casting.
    return IsValid((uint)value);
}
/// <summary>
/// Returns <see langword="true"/> iff <paramref name="value"/> is a valid Unicode scalar
/// value, i.e., is in [ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive.
/// </summary>
[CLSCompliant(false)]
public static bool IsValid(uint value)
{
    return UnicodeUtility.IsValidUnicodeScalar(value);
}
740 // returns a negative number on failure
741 internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan
<char> input
)
748 // Optimistically assume input is within BMP.
750 uint returnValue
= input
[0];
751 if (UnicodeUtility
.IsSurrogateCodePoint(returnValue
))
753 if (!UnicodeUtility
.IsHighSurrogateCodePoint(returnValue
))
758 // Treat 'returnValue' as the high surrogate.
760 if (1 >= (uint)input
.Length
)
762 return -1; // not an argument exception - just a "bad data" failure
765 uint potentialLowSurrogate
= input
[1];
766 if (!UnicodeUtility
.IsLowSurrogateCodePoint(potentialLowSurrogate
))
771 returnValue
= UnicodeUtility
.GetScalarFromUtf16SurrogatePair(returnValue
, potentialLowSurrogate
);
774 return (int)returnValue
;
777 // returns a negative number on failure
778 private static int ReadRuneFromString(string input
, int index
)
782 ThrowHelper
.ThrowArgumentNullException(ExceptionArgument
.input
);
785 if ((uint)index
>= (uint)input
!.Length
)
787 ThrowHelper
.ThrowArgumentOutOfRange_IndexException();
790 // Optimistically assume input is within BMP.
792 uint returnValue
= input
[index
];
793 if (UnicodeUtility
.IsSurrogateCodePoint(returnValue
))
795 if (!UnicodeUtility
.IsHighSurrogateCodePoint(returnValue
))
800 // Treat 'returnValue' as the high surrogate.
802 // If this becomes a hot code path, we can skip the below bounds check by reading
803 // off the end of the string using unsafe code. Since strings are null-terminated,
804 // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if
805 // the string terminates unexpectedly.
808 if ((uint)index
>= (uint)input
.Length
)
810 return -1; // not an argument exception - just a "bad data" failure
813 uint potentialLowSurrogate
= input
[index
];
814 if (!UnicodeUtility
.IsLowSurrogateCodePoint(potentialLowSurrogate
))
819 returnValue
= UnicodeUtility
.GetScalarFromUtf16SurrogatePair(returnValue
, potentialLowSurrogate
);
822 return (int)returnValue
;
826 /// Returns a <see cref="string"/> representation of this <see cref="Rune"/> instance.
828 public override string ToString()
832 return string.CreateFromChar((char)_value
);
836 UnicodeUtility
.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value
, out char high
, out char low
);
837 return string.CreateFromChar(high
, low
);
842 /// Attempts to create a <see cref="Rune"/> from the provided input value.
844 public static bool TryCreate(char ch
, out Rune result
)
846 uint extendedValue
= ch
;
847 if (!UnicodeUtility
.IsSurrogateCodePoint(extendedValue
))
849 result
= UnsafeCreate(extendedValue
);
860 /// Attempts to create a <see cref="Rune"/> from the provided UTF-16 surrogate pair.
861 /// Returns <see langword="false"/> if the input values don't represent a well-formed UTF-16 surrogate pair.
863 public static bool TryCreate(char highSurrogate
, char lowSurrogate
, out Rune result
)
865 // First, extend both to 32 bits, then calculate the offset of
866 // each candidate surrogate char from the start of its range.
868 uint highSurrogateOffset
= (uint)highSurrogate
- CharUnicodeInfo
.HIGH_SURROGATE_START
;
869 uint lowSurrogateOffset
= (uint)lowSurrogate
- CharUnicodeInfo
.LOW_SURROGATE_START
;
871 // This is a single comparison which allows us to check both for validity at once since
872 // both the high surrogate range and the low surrogate range are the same length.
873 // If the comparison fails, the inputs are not a well-formed surrogate pair and we report failure to the caller.
875 if ((highSurrogateOffset
| lowSurrogateOffset
) <= CharUnicodeInfo
.HIGH_SURROGATE_RANGE
)
877 // The 0x40u << 10 below is to account for uuuuu = wwww + 1 in the surrogate encoding.
878 result
= UnsafeCreate((highSurrogateOffset
<< 10) + ((uint)lowSurrogate
- CharUnicodeInfo
.LOW_SURROGATE_START
) + (0x40u
<< 10));
883 // Didn't have a high surrogate followed by a low surrogate.
/// <summary>
/// Attempts to create a <see cref="Rune"/> from the provided input value.
/// </summary>
public static bool TryCreate(int value, out Rune result)
{
    // Delegates to the unsigned overload after casting.
    return TryCreate((uint)value, out result);
}
895 /// Attempts to create a <see cref="Rune"/> from the provided input value.
897 [CLSCompliant(false)]
898 public static bool TryCreate(uint value, out Rune result
)
900 if (UnicodeUtility
.IsValidUnicodeScalar(value))
902 result
= UnsafeCreate(value);
913 /// Encodes this <see cref="Rune"/> to a UTF-16 destination buffer.
915 /// <param name="destination">The buffer to which to write this value as UTF-16.</param>
916 /// <param name="charsWritten">
917 /// The number of <see cref="char"/>s written to <paramref name="destination"/>,
918 /// or 0 if the destination buffer is not large enough to contain the output.</param>
919 /// <returns>True if the value was written to the buffer; otherwise, false.</returns>
921 /// The <see cref="Utf16SequenceLength"/> property can be queried ahead of time to determine
922 /// the required size of the <paramref name="destination"/> buffer.
924 public bool TryEncodeToUtf16(Span
<char> destination
, out int charsWritten
)
926 if (destination
.Length
>= 1)
930 destination
[0] = (char)_value
;
934 else if (destination
.Length
>= 2)
936 UnicodeUtility
.GetUtf16SurrogatesFromSupplementaryPlaneScalar(_value
, out destination
[0], out destination
[1]);
942 // Destination buffer not large enough
944 charsWritten
= default;
/// <summary>
/// Encodes this <see cref="Rune"/> to a destination buffer as UTF-8 bytes.
/// </summary>
/// <param name="destination">The buffer to which to write this value as UTF-8.</param>
/// <param name="bytesWritten">
/// The number of <see cref="byte"/>s written to <paramref name="destination"/>,
/// or 0 if the destination buffer is not large enough to contain the output.</param>
/// <returns>True if the value was written to the buffer; otherwise, false.</returns>
/// <remarks>
/// The <see cref="Utf8SequenceLength"/> property can be queried ahead of time to determine
/// the required size of the <paramref name="destination"/> buffer.
/// </remarks>
public bool TryEncodeToUtf8(Span<byte> destination, out int bytesWritten)
{
    // The bit patterns below come from the Unicode Standard, Table 3-6.
    //
    // Each nested length check only runs once the shorter encodings have been ruled out,
    // so a too-small buffer falls through to the common failure path at the bottom.

    if (destination.Length >= 1)
    {
        if (_value <= 0x7Fu)
        {
            // Scalar 0xxxxxxx -> bytes [ 0xxxxxxx ]
            destination[0] = (byte)_value;
            bytesWritten = 1;
            return true;
        }

        if (destination.Length >= 2)
        {
            if (_value <= 0x7FFu)
            {
                // Scalar 00000yyy yyxxxxxx -> bytes [ 110yyyyy 10xxxxxx ]
                destination[0] = (byte)((_value + (0b110u << 11)) >> 6);
                destination[1] = (byte)((_value & 0x3Fu) + 0x80u);
                bytesWritten = 2;
                return true;
            }

            if (destination.Length >= 3)
            {
                if (_value <= 0xFFFFu)
                {
                    // Scalar zzzzyyyy yyxxxxxx -> bytes [ 1110zzzz 10yyyyyy 10xxxxxx ]
                    destination[0] = (byte)((_value + (0b1110u << 16)) >> 12);
                    destination[1] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
                    destination[2] = (byte)((_value & 0x3Fu) + 0x80u);
                    bytesWritten = 3;
                    return true;
                }

                if (destination.Length >= 4)
                {
                    // Scalar 000uuuuu zzzzyyyy yyxxxxxx -> bytes [ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx ]
                    destination[0] = (byte)((_value + (0b11110u << 21)) >> 18);
                    destination[1] = (byte)(((_value & (0x3Fu << 12)) >> 12) + 0x80u);
                    destination[2] = (byte)(((_value & (0x3Fu << 6)) >> 6) + 0x80u);
                    destination[3] = (byte)((_value & 0x3Fu) + 0x80u);
                    bytesWritten = 4;
                    return true;
                }
            }
        }
    }

    // Destination buffer not large enough

    bytesWritten = default;
    return false;
}
/// <summary>
/// Attempts to get the <see cref="Rune"/> which begins at index <paramref name="index"/> in
/// string <paramref name="input"/>.
/// </summary>
/// <param name="input">The string to read from.</param>
/// <param name="index">The index in <paramref name="input"/> at which to start reading.</param>
/// <param name="value">
/// On success, the extracted <see cref="Rune"/>; otherwise the default <see cref="Rune"/>.</param>
/// <returns><see langword="true"/> if a scalar value was successfully extracted from the specified index,
/// <see langword="false"/> if a value could not be extracted due to invalid data.</returns>
/// <remarks>
/// Throws only if <paramref name="input"/> is null or <paramref name="index"/> is out of range.
/// </remarks>
public static bool TryGetRuneAt(string input, int index, out Rune value)
{
    int runeValue = ReadRuneFromString(input, index);

    // NOTE(review): assumes ReadRuneFromString reports invalid data with a negative
    // return value - confirm against its definition.
    if (runeValue >= 0)
    {
        value = UnsafeCreate((uint)runeValue);
        return true;
    }

    // An out parameter must be assigned on the failure path as well.
    value = default;
    return false;
}
// Builds a Rune from an arbitrary 32-bit integer with no validation whatsoever.
// The caller must already have validated the value by hand. A Rune that is forced
// into existence from invalid input puts every API on this type into undefined
// behavior, which may go as far as opening a security hole in the consuming
// application.
//
// An example of a security hole resulting from an invalid Rune value, which could result
// in a stack overflow.
//
// public int GetMarvin32HashCode(Rune r) {
//   Span<char> buffer = stackalloc char[r.Utf16SequenceLength];
//   r.TryEncode(buffer, ...);
//   return Marvin32.ComputeHash(buffer.AsBytes());
// }

/// <summary>
/// Creates a <see cref="Rune"/> without performing validation on the input.
/// </summary>
/// <param name="scalarValue">The value to wrap; assumed by this method to be already validated.</param>
/// <returns>A <see cref="Rune"/> constructed directly from <paramref name="scalarValue"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Rune UnsafeCreate(uint scalarValue)
{
    return new Rune(scalarValue, false);
}
1061 // These are analogs of APIs on System.Char
/// <summary>
/// Gets the numeric value associated with the specified <see cref="Rune"/>.
/// </summary>
/// <param name="value">The rune to inspect.</param>
/// <returns>
/// For ASCII input, the digit value 0 through 9, or -1 if <paramref name="value"/> is not an
/// ASCII digit. For non-ASCII input, whatever the globalization table lookup returns.</returns>
public static double GetNumericValue(Rune value)
{
    if (value.IsAscii)
    {
        // Unsigned subtraction wraps for anything below '0', so one comparison covers both bounds.
        uint baseNum = value._value - '0';
        return (baseNum <= 9) ? (double)baseNum : -1;
    }
    else
    {
        // not an ASCII char; fall back to globalization table
        return CharUnicodeInfo.InternalGetNumericValue(value.Value);
    }
}
/// <summary>
/// Gets the <see cref="UnicodeCategory"/> of the specified <see cref="Rune"/>.
/// </summary>
/// <param name="value">The rune to categorize.</param>
/// <returns>The Unicode category of <paramref name="value"/>.</returns>
public static UnicodeCategory GetUnicodeCategory(Rune value)
{
    if (value.IsAscii)
    {
        // The lookup table only covers the ASCII range; the low 5 bits of each entry hold the category.
        return (UnicodeCategory)(AsciiCharInfo[value.Value] & UnicodeCategoryMask);
    }
    else
    {
        return GetUnicodeCategoryNonAscii(value);
    }
}
// Category lookup for non-ASCII runes; forwards to CharUnicodeInfo.
// Callers are expected to have handled ASCII input themselves (asserted below).
private static UnicodeCategory GetUnicodeCategoryNonAscii(Rune value)
{
    Debug.Assert(!value.IsAscii, "Shouldn't use this non-optimized code path for ASCII characters.");

    UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(value.Value);
    return category;
}
// True exactly when 'category' lies in the enum range UppercaseLetter..OtherLetter (inclusive).
private static bool IsCategoryLetter(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.UppercaseLetter,
        (uint)UnicodeCategory.OtherLetter);
// True exactly when 'category' is a letter category (UppercaseLetter..OtherLetter, inclusive)
// or is DecimalDigitNumber.
private static bool IsCategoryLetterOrDecimalDigit(UnicodeCategory category)
{
    bool isLetter = UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.UppercaseLetter,
        (uint)UnicodeCategory.OtherLetter);

    return isLetter || (category == UnicodeCategory.DecimalDigitNumber);
}
// True exactly when 'category' lies in the enum range DecimalDigitNumber..OtherNumber (inclusive).
private static bool IsCategoryNumber(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.DecimalDigitNumber,
        (uint)UnicodeCategory.OtherNumber);
// True exactly when 'category' lies in the enum range ConnectorPunctuation..OtherPunctuation (inclusive).
private static bool IsCategoryPunctuation(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.ConnectorPunctuation,
        (uint)UnicodeCategory.OtherPunctuation);
// True exactly when 'category' lies in the enum range SpaceSeparator..ParagraphSeparator (inclusive).
private static bool IsCategorySeparator(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.SpaceSeparator,
        (uint)UnicodeCategory.ParagraphSeparator);
// True exactly when 'category' lies in the enum range MathSymbol..OtherSymbol (inclusive).
private static bool IsCategorySymbol(UnicodeCategory category)
    => UnicodeUtility.IsInRangeInclusive(
        (uint)category,
        (uint)UnicodeCategory.MathSymbol,
        (uint)UnicodeCategory.OtherSymbol);
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is a control character,
/// i.e. lies in [ U+0000..U+001F ] or [ U+007F..U+009F ].
/// </summary>
public static bool IsControl(Rune value)
{
    // The Unicode stability policy permanently fixes the control characters at
    // [ U+0000..U+001F ] and [ U+007F..U+009F ]; nothing will ever be added to that group.
    // See http://www.unicode.org/policies/stability_policy.html.

    // This trick relies on Rune.Value never being -1, which holds because Rune validates its input.
    // 00..1F (+1) => 01..20 (&~80) => 01..20
    // 7F..9F (+1) => 80..A0 (&~80) => 00..20

    uint folded = (value._value + 1) & ~0x80u;
    return folded <= 0x20u;
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is an ASCII digit '0'..'9' or,
/// for non-ASCII input, has category <see cref="UnicodeCategory.DecimalDigitNumber"/>.
/// </summary>
public static bool IsDigit(Rune value)
{
    if (value.IsAscii)
    {
        return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9');
    }
    else
    {
        // The non-ASCII category lookup asserts that it is never given ASCII input.
        return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.DecimalDigitNumber;
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is an ASCII letter [A-Za-z] or,
/// for non-ASCII input, belongs to one of the letter categories.
/// </summary>
public static bool IsLetter(Rune value)
{
    if (value.IsAscii)
    {
        // Clearing bit 0x20 after rebasing on 'A' folds lowercase onto uppercase,
        // so a single unsigned comparison covers both ranges.
        return (((value._value - 'A') & ~0x20u) <= (uint)('Z' - 'A')); // [A-Za-z]
    }
    else
    {
        return IsCategoryLetter(GetUnicodeCategoryNonAscii(value));
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is a letter or a decimal digit.
/// </summary>
public static bool IsLetterOrDigit(Rune value)
{
    if (value.IsAscii)
    {
        // The lookup table only covers the ASCII range; bit 0x40 marks 'is letter or digit'.
        return ((AsciiCharInfo[value.Value] & IsLetterOrDigitFlag) != 0);
    }
    else
    {
        return IsCategoryLetterOrDecimalDigit(GetUnicodeCategoryNonAscii(value));
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is an ASCII letter 'a'..'z' or,
/// for non-ASCII input, has category <see cref="UnicodeCategory.LowercaseLetter"/>.
/// </summary>
public static bool IsLower(Rune value)
{
    if (value.IsAscii)
    {
        return UnicodeUtility.IsInRangeInclusive(value._value, 'a', 'z');
    }
    else
    {
        // The non-ASCII category lookup asserts that it is never given ASCII input.
        return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.LowercaseLetter;
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is an ASCII digit '0'..'9' or,
/// for non-ASCII input, belongs to one of the number categories.
/// </summary>
public static bool IsNumber(Rune value)
{
    if (value.IsAscii)
    {
        return UnicodeUtility.IsInRangeInclusive(value._value, '0', '9');
    }
    else
    {
        // The non-ASCII category lookup asserts that it is never given ASCII input.
        return IsCategoryNumber(GetUnicodeCategoryNonAscii(value));
    }
}
/// <summary>
/// Returns <see langword="true"/> if the category of <paramref name="value"/> is a punctuation category.
/// </summary>
public static bool IsPunctuation(Rune value)
    => IsCategoryPunctuation(GetUnicodeCategory(value));
/// <summary>
/// Returns <see langword="true"/> if the category of <paramref name="value"/> is a separator category.
/// </summary>
public static bool IsSeparator(Rune value)
    => IsCategorySeparator(GetUnicodeCategory(value));
/// <summary>
/// Returns <see langword="true"/> if the category of <paramref name="value"/> is a symbol category.
/// </summary>
public static bool IsSymbol(Rune value)
    => IsCategorySymbol(GetUnicodeCategory(value));
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is an ASCII letter 'A'..'Z' or,
/// for non-ASCII input, has category <see cref="UnicodeCategory.UppercaseLetter"/>.
/// </summary>
public static bool IsUpper(Rune value)
{
    if (value.IsAscii)
    {
        return UnicodeUtility.IsInRangeInclusive(value._value, 'A', 'Z');
    }
    else
    {
        // The non-ASCII category lookup asserts that it is never given ASCII input.
        return GetUnicodeCategoryNonAscii(value) == UnicodeCategory.UppercaseLetter;
    }
}
/// <summary>
/// Returns <see langword="true"/> if <paramref name="value"/> is a white-space character.
/// </summary>
public static bool IsWhiteSpace(Rune value)
{
    if (value.IsAscii)
    {
        // The lookup table only covers the ASCII range; bit 0x80 marks 'is whitespace'.
        return (AsciiCharInfo[value.Value] & IsWhiteSpaceFlag) != 0;
    }

    // U+0085 is special since it's a whitespace character but is in the Control category
    // instead of a normal separator category. No other code point outside the ASCII range
    // has this mismatch.

    if (value._value == 0x0085u)
    {
        return true;
    }

    return IsCategorySeparator(GetUnicodeCategoryNonAscii(value));
}
/// <summary>
/// Converts <paramref name="value"/> to lowercase using the casing rules of <paramref name="culture"/>.
/// </summary>
/// <param name="value">The rune to convert.</param>
/// <param name="culture">The culture whose <see cref="CultureInfo.TextInfo"/> supplies the casing rules.</param>
/// <returns>The lowercase form of <paramref name="value"/>.</returns>
public static Rune ToLower(Rune value, CultureInfo culture)
{
    if (culture is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
    }

    // ASCII gets no fast path here: a specific culture may case ASCII characters
    // differently from the invariant culture (e.g., Turkish I). When globalization
    // tables are available we go straight to them; otherwise we use the invariant routine.

    return GlobalizationMode.Invariant
        ? ToLowerInvariant(value)
        : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: false);
}
/// <summary>
/// Converts <paramref name="value"/> to lowercase using the casing rules of the invariant culture.
/// </summary>
/// <param name="value">The rune to convert.</param>
/// <returns>
/// The lowercase form of <paramref name="value"/>, or <paramref name="value"/> unchanged when it
/// is non-ASCII and globalization tables are unavailable.</returns>
public static Rune ToLowerInvariant(Rune value)
{
    // Handle the most common case (ASCII data) first. Within the common case, we expect
    // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.

    if (value.IsAscii)
    {
        // It's ok for us to use the UTF-16 conversion utility for this since the high
        // 16 bits of the value will never be set so will be left unchanged.
        return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
    }

    if (GlobalizationMode.Invariant)
    {
        // If the value isn't ASCII and if the globalization tables aren't available,
        // case changing has no effect.
        return value;
    }

    // Non-ASCII data requires going through the case folding tables.

    return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
}
/// <summary>
/// Converts <paramref name="value"/> to uppercase using the casing rules of <paramref name="culture"/>.
/// </summary>
/// <param name="value">The rune to convert.</param>
/// <param name="culture">The culture whose <see cref="CultureInfo.TextInfo"/> supplies the casing rules.</param>
/// <returns>The uppercase form of <paramref name="value"/>.</returns>
public static Rune ToUpper(Rune value, CultureInfo culture)
{
    if (culture is null)
    {
        ThrowHelper.ThrowArgumentNullException(ExceptionArgument.culture);
    }

    // ASCII gets no fast path here: a specific culture may case ASCII characters
    // differently from the invariant culture (e.g., Turkish I). When globalization
    // tables are available we go straight to them; otherwise we use the invariant routine.

    return GlobalizationMode.Invariant
        ? ToUpperInvariant(value)
        : ChangeCaseCultureAware(value, culture!.TextInfo, toUpper: true);
}
/// <summary>
/// Converts <paramref name="value"/> to uppercase using the casing rules of the invariant culture.
/// </summary>
/// <param name="value">The rune to convert.</param>
/// <returns>
/// The uppercase form of <paramref name="value"/>, or <paramref name="value"/> unchanged when it
/// is non-ASCII and globalization tables are unavailable.</returns>
public static Rune ToUpperInvariant(Rune value)
{
    // Handle the most common case (ASCII data) first. Within the common case, we expect
    // that there'll be a mix of lowercase & uppercase chars, so make the conversion branchless.

    if (value.IsAscii)
    {
        // It's ok for us to use the UTF-16 conversion utility for this since the high
        // 16 bits of the value will never be set so will be left unchanged.
        return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
    }

    if (GlobalizationMode.Invariant)
    {
        // If the value isn't ASCII and if the globalization tables aren't available,
        // case changing has no effect.
        return value;
    }

    // Non-ASCII data requires going through the case folding tables.

    return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
}