Introduce Utf8Span, which is a span of UTF-8 text (dotnet/coreclr#26711)
[mono-project.git] / netcore / System.Private.CoreLib / shared / System / Text / Unicode / Utf8Utility.cs
blob01aa0a9ac9f627ce329c3483f91fd8751042d0d4
1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
5 using System.Buffers;
6 using System.Diagnostics;
7 using System.IO;
8 using System.Runtime.CompilerServices;
9 using System.Runtime.InteropServices;
10 using Internal.Runtime.CompilerServices;
12 namespace System.Text.Unicode
14 internal static partial class Utf8Utility
16 /// <summary>
17 /// The maximum number of bytes that can result from UTF-8 transcoding
18 /// any Unicode scalar value.
19 /// </summary>
/// <remarks>
/// Because this is the maximum for any single scalar value, it can serve as a
/// per-scalar upper bound when sizing a destination buffer for UTF-8 output.
/// </remarks>
20 internal const int MaxBytesPerScalar = 4;
22 /// <summary>
23 /// The UTF-8 representation of <see cref="UnicodeUtility.ReplacementChar"/>.
24 /// </summary>
/// <remarks>
/// This is the three-byte sequence <c>EF BF BD</c>.
/// NOTE(review): the <c>=&gt; new byte[] { ... }</c> getter shape presumably relies on the
/// C# compiler emitting a span over static data rather than allocating an array on each
/// access - confirm for the compiler in use before changing this to a field or a method.
/// </remarks>
25 private static ReadOnlySpan<byte> ReplacementCharSequence => new byte[] { 0xEF, 0xBF, 0xBD };
/// <summary>
/// Locates the first invalid UTF-8 sequence in <paramref name="utf8Data"/>.
/// </summary>
/// <param name="utf8Data">The bytes to inspect.</param>
/// <param name="isAscii">
/// On return, <see langword="true"/> if all data observed (up to the first invalid sequence
/// or the end of the buffer, whichever comes first) is ASCII; otherwise <see langword="false"/>.
/// </param>
/// <returns>
/// The byte index at which the first invalid sequence begins, or -1 if the buffer
/// contains no invalid sequences.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
{
    fixed (byte* pStart = &MemoryMarshal.GetReference(utf8Data))
    {
        byte* pInvalid = GetPointerToFirstInvalidByte(pStart, utf8Data.Length, out int charCountAdjustment, out _);

        // A zero adjustment means the UTF-16 char count equals the UTF-8 byte count,
        // which is only the case for ASCII data.
        isAscii = (charCountAdjustment == 0);

        int offset = (int)(void*)Unsafe.ByteOffset(ref *pStart, ref *pInvalid);

        // An offset at (or past) the end of the span means nothing invalid was found.
        if (offset >= utf8Data.Length)
        {
            return -1;
        }

        return offset;
    }
}
46 #if FEATURE_UTF8STRING
/// <summary>
/// Determines whether <paramref name="utf8Data"/> consists solely of well-formed UTF-8 data.
/// </summary>
/// <param name="utf8Data">The bytes to inspect.</param>
/// <returns>
/// <see langword="true"/> if no invalid sequence was found before the end of the span;
/// otherwise <see langword="false"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe bool IsWellFormedUtf8(ReadOnlySpan<byte> utf8Data)
{
    fixed (byte* pStart = &MemoryMarshal.GetReference(utf8Data))
    {
        byte* pEnd = pStart + (uint)utf8Data.Length;
        byte* pInvalid = GetPointerToFirstInvalidByte(pStart, utf8Data.Length, out int _, out _);

        // For well-formed data the returned pointer is the end of the span.
        return pInvalid == pEnd;
    }
}
/// <summary>
/// Returns <paramref name="value"/> if it is null, empty, or contains only well-formed UTF-8 data;
/// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as
/// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
/// with U+FFFD.
/// </summary>
/// <param name="value">The instance to validate. May be null.</param>
/// <returns>
/// The original <paramref name="value"/> when no fix-up is needed (including null and empty
/// inputs); otherwise a newly created instance with each invalid sequence replaced by U+FFFD.
/// </returns>
public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
{
    // The null check must come first: reading value.Length on a null reference would throw
    // instead of honoring the documented "returns value if it is null" contract.
    if (value is null || value.Length == 0)
    {
        return value;
    }

    ReadOnlySpan<byte> valueAsBytes = value.AsBytes();

    int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
    if (idxOfFirstInvalidData < 0)
    {
        // No invalid sequences - hand back the original instance without allocating.
        return value;
    }

    // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
    // (The faster implementation is in the dev/utf8string_bak branch currently.)

    // Everything before the first invalid sequence is known to be good; copy it verbatim.
    MemoryStream memStream = new MemoryStream();
    memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));

    // The remainder is non-empty here because the index returned above is always
    // strictly less than the span length, so a do/while loop is safe.
    valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
    do
    {
        if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
        {
            // Valid scalar value - copy data as-is to MemoryStream
            memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
        }
        else
        {
            // Invalid scalar value - copy U+FFFD to MemoryStream
            memStream.Write(ReplacementCharSequence);
        }

        // In both cases skip past the input bytes that were just processed.
        valueAsBytes = valueAsBytes.Slice(bytesConsumed);
    } while (!valueAsBytes.IsEmpty);

    bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
    Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");

    // Every invalid sequence was replaced above, so the buffer is passed on as well-formed.
    return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true);
}
110 #endif // FEATURE_UTF8STRING