1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
6 using System
.Diagnostics
;
8 using System
.Runtime
.CompilerServices
;
9 using System
.Runtime
.InteropServices
;
10 using Internal
.Runtime
.CompilerServices
;
12 namespace System
.Text
.Unicode
14 internal static partial class Utf8Utility
/// <summary>
/// Upper bound on the number of bytes produced when any single Unicode scalar value
/// is transcoded to UTF-8.
/// </summary>
internal const int MaxBytesPerScalar = 4;
/// <summary>
/// The three-byte UTF-8 encoding (EF BF BD) of <see cref="UnicodeUtility.ReplacementChar"/>.
/// </summary>
private static ReadOnlySpan<byte> ReplacementCharSequence
{
    get => new byte[] { 0xEF, 0xBF, 0xBD };
}
/// <summary>
/// Locates the first invalid UTF-8 sequence in <paramref name="utf8Data"/>.
/// </summary>
/// <param name="utf8Data">The buffer to scan.</param>
/// <param name="isAscii">
/// On return, <see langword="true"/> if all data observed (up to the first invalid sequence or the
/// end of the buffer, whichever comes first) is ASCII; otherwise <see langword="false"/>.
/// </param>
/// <returns>
/// The byte index at which the first invalid UTF-8 sequence begins, or -1 if the buffer
/// contains no invalid sequences.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
{
    fixed (byte* pStart = &MemoryMarshal.GetReference(utf8Data))
    {
        byte* pInvalid = GetPointerToFirstInvalidByte(pStart, utf8Data.Length, out int charCountAdjustment, out _);

        // A zero adjustment means the UTF-16 char count equals the UTF-8 byte count, i.e. the data is ASCII.
        isAscii = (charCountAdjustment == 0);

        // Distance, in bytes, from the start of the buffer to the byte the validator stopped on.
        int offset = (int)(void*)Unsafe.ByteOffset(ref *pStart, ref *pInvalid);
        if (offset >= utf8Data.Length)
        {
            // The validator reached the end of the buffer, so nothing invalid was found.
            return -1;
        }

        return offset;
    }
}
46 #if FEATURE_UTF8STRING
/// <summary>
/// Determines whether <paramref name="utf8Data"/> consists solely of well-formed UTF-8 data.
/// </summary>
/// <param name="utf8Data">The buffer to validate.</param>
/// <returns>
/// <see langword="true"/> if no invalid UTF-8 sequence was found in <paramref name="utf8Data"/>;
/// otherwise <see langword="false"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe bool IsWellFormedUtf8(ReadOnlySpan<byte> utf8Data)
{
    fixed (byte* pStart = &MemoryMarshal.GetReference(utf8Data))
    {
        byte* pEnd = pStart + (uint)utf8Data.Length;

        // For well-formed input the returned pointer lands exactly on the end of the span.
        byte* pInvalid = GetPointerToFirstInvalidByte(pStart, utf8Data.Length, out int _, out _);
        return pInvalid == pEnd;
    }
}
/// <summary>
/// Returns <paramref name="value"/> if it is null or contains only well-formed UTF-8 data;
/// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as
/// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
/// with U+FFFD.
/// </summary>
/// <param name="value">The string to validate. May be null.</param>
/// <returns>
/// <paramref name="value"/> itself when it is null, empty, or already well-formed; otherwise a
/// newly created <see cref="Utf8String"/> with each invalid sequence replaced by U+FFFD.
/// </returns>
public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
{
    // Honor the documented contract: null (and empty) inputs are returned unchanged
    // rather than dereferenced.
    if (value is null || value.Length == 0)
    {
        return value;
    }

    ReadOnlySpan<byte> valueAsBytes = value.AsBytes();

    int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
    if (idxOfFirstInvalidData < 0)
    {
        // Already well-formed; hand back the same instance without allocating.
        return value;
    }

    // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
    // (The faster implementation is in the dev/utf8string_bak branch currently.)

    using (MemoryStream memStream = new MemoryStream())
    {
        // Everything before the first invalid sequence has already been validated; copy it verbatim.
        memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));

        // The index returned above is always less than the buffer length, so the remainder
        // is non-empty and the do/while body may safely run at least once.
        valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
        do
        {
            if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
            {
                // Valid scalar value - copy data as-is to MemoryStream
                memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
            }
            else
            {
                // Invalid scalar value - copy U+FFFD to MemoryStream
                memStream.Write(ReplacementCharSequence);
            }

            // Advance past the sequence just examined, whether it was valid or not.
            valueAsBytes = valueAsBytes.Slice(bytesConsumed);
        } while (!valueAsBytes.IsEmpty);

        bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
        Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");

        // Every invalid sequence was replaced above, so the buffer can be treated as well-formed.
        return Utf8String.DangerousCreateWithoutValidation(memStreamBuffer, assumeWellFormed: true);
    }
}
110 #endif // FEATURE_UTF8STRING