From 12b737d980f0a8bc6f275bd8eabb0eb1011f6f90 Mon Sep 17 00:00:00 2001 From: Ben Adams Date: Thu, 13 Jun 2019 22:21:40 +0100 Subject: [PATCH] Intrinsicify SpanHelpers.IndexOf(char) (dotnet/coreclr#22505) * Helpers to support Intrinsics in SpanHelpers.Char * Intrinsicify SpanHelpers.IndexOf(char) * Feedback * fix * Fix assert * Improve comment warning * fix * fix * Fix * Fix Signed-off-by: dotnet-bot --- .../shared/System/SpanHelpers.Char.cs | 336 +++++++++++++++++---- .../System.Private.CoreLib/shared/System/String.cs | 103 +------ 2 files changed, 279 insertions(+), 160 deletions(-) diff --git a/netcore/System.Private.CoreLib/shared/System/SpanHelpers.Char.cs b/netcore/System.Private.CoreLib/shared/System/SpanHelpers.Char.cs index 9bf1c57244a..cfa2750328c 100644 --- a/netcore/System.Private.CoreLib/shared/System/SpanHelpers.Char.cs +++ b/netcore/System.Private.CoreLib/shared/System/SpanHelpers.Char.cs @@ -5,14 +5,17 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using Internal.Runtime.CompilerServices; #if BIT64 using nuint = System.UInt64; +using nint = System.Int64; #else using nuint = System.UInt32; +using nint = System.Int32; #endif namespace System @@ -218,93 +221,243 @@ namespace System { Debug.Assert(length >= 0); - fixed (char* pChars = &searchSpace) - { - char* pCh = pChars; - char* pEndCh = pCh + length; + nint offset = 0; + nint lengthToExamine = length; - if (Vector.IsHardwareAccelerated && length >= Vector.Count * 2) + if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0) + { + // Input isn't char aligned, we won't be able to align it to a Vector + } + else if (Sse2.IsSupported) + { + // Avx2 branch also operates on Sse2 sizes, so check is combined. + // Needs to be double length to allow us to align the data first. + if (length >= Vector128.Count * 2) { - // Figure out how many characters to read sequentially until we are vector aligned - // This is equivalent to: - // unaligned = ((int)pCh % Unsafe.SizeOf>()) / elementsPerByte - // length = (Vector.Count - unaligned) % Vector.Count - const int elementsPerByte = sizeof(ushort) / sizeof(byte); - int unaligned = ((int)pCh & (Unsafe.SizeOf>() - 1)) / elementsPerByte; - length = (Vector.Count - unaligned) & (Vector.Count - 1); + lengthToExamine = UnalignedCountVector128(ref searchSpace); } - - SequentialScan: - while (length >= 4) + } + else if (Vector.IsHardwareAccelerated) + { + // Needs to be double length to allow us to align the data first. + if (length >= Vector.Count * 2) { - length -= 4; + lengthToExamine = UnalignedCountVector(ref searchSpace); + } + } - if (pCh[0] == value) - goto Found; - if (pCh[1] == value) - goto Found1; - if (pCh[2] == value) - goto Found2; - if (pCh[3] == value) - goto Found3; + SequentialScan: + // In the non-vector case lengthToExamine is the total length. + // In the vector case lengthToExamine first aligns to Vector, + // then in a second pass after the Vector lengths is the + // remaining data that is shorter than a Vector length. + while (lengthToExamine >= 4) + { + ref char current = ref Add(ref searchSpace, offset); + + if (value == current) + goto Found; + if (value == Add(ref current, 1)) + goto Found1; + if (value == Add(ref current, 2)) + goto Found2; + if (value == Add(ref current, 3)) + goto Found3; + + offset += 4; + lengthToExamine -= 4; + } - pCh += 4; - } + while (lengthToExamine > 0) + { + if (value == Add(ref searchSpace, offset)) + goto Found; - while (length > 0) + offset += 1; + lengthToExamine -= 1; + } + + // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow + // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. + if (Avx2.IsSupported) + { + if (offset < length) { - length--; + Debug.Assert(length - offset >= Vector128.Count); + if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (nint)(Vector256.Count - 1)) != 0) + { + // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches + // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256, + // before moving to processing Vector256. + + // If the input searchSpan has been fixed or pinned, this ensures we do not fault across memory pages + // while searching for an end of string. Specifically that this assumes that the length is either correct + // or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an + // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found, + // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather + // than ever causing an AV. + + // If the searchSpan has not been fixed or pinned the GC can relocate it during the execution of this + // method, so the alignment only acts as best endeavour. The GC cost is likely to dominate over + // the misalignment that may occur after; to we default to giving the GC a free hand to relocate and + // its up to the caller whether they are operating over fixed data. + Vector128 values = Vector128.Create((ushort)value); + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as below + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + } + else + { + // Find bitflag offset of first match and add to current offset + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } - if (pCh[0] == value) - goto Found; + lengthToExamine = GetCharVector256SpanLength(offset, length); + if (lengthToExamine > 0) + { + Vector256 values = Vector256.Create((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector256.Count); + + Vector256 search = LoadVector256(ref searchSpace, offset); + int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector256.Count; + lengthToExamine -= Vector256.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } - pCh++; + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 values = Vector128.Create((ushort)value); + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + // Don't need to change lengthToExamine here as we don't use its current value again. + } + else + { + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } } + } + else if (Sse2.IsSupported) + { + if (offset < length) + { + Debug.Assert(length - offset >= Vector128.Count); - // We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Vector.IsHardwareAccelerated && pCh < pEndCh) + lengthToExamine = GetCharVector128SpanLength(offset, length); + if (lengthToExamine > 0) + { + Vector128 values = Vector128.Create((ushort)value); + do + { + Debug.Assert(lengthToExamine >= Vector128.Count); + + Vector128 search = LoadVector128(ref searchSpace, offset); + + // Same method as above + int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte()); + if (matches == 0) + { + // Zero flags set so no matches + offset += Vector128.Count; + lengthToExamine -= Vector128.Count; + continue; + } + + // Find bitflag offset of first match and add to current offset, + // flags are in bytes so divide for chars + return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char))); + } while (lengthToExamine > 0); + } + + if (offset < length) + { + lengthToExamine = length - offset; + goto SequentialScan; + } + } + } + else if (Vector.IsHardwareAccelerated) + { + if (offset < length) { - // Get the highest multiple of Vector.Count that is within the search space. - // That will be how many times we iterate in the loop below. - // This is equivalent to: length = Vector.Count * ((int)(pEndCh - pCh) / Vector.Count) - length = (int)((pEndCh - pCh) & ~(Vector.Count - 1)); + Debug.Assert(length - offset >= Vector.Count); - // Get comparison Vector - Vector vComparison = new Vector(value); + lengthToExamine = GetCharVectorSpanLength(offset, length); - while (length > 0) + if (lengthToExamine > 0) { - // Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned - Debug.Assert(((int)pCh & (Unsafe.SizeOf>() - 1)) == 0); - Vector vMatches = Vector.Equals(vComparison, Unsafe.Read>(pCh)); - if (Vector.Zero.Equals(vMatches)) + Vector values = new Vector((ushort)value); + do { - pCh += Vector.Count; - length -= Vector.Count; - continue; - } - // Find offset of first match - return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches); + Debug.Assert(lengthToExamine >= Vector.Count); + + var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset)); + if (Vector.Zero.Equals(matches)) + { + offset += Vector.Count; + lengthToExamine -= Vector.Count; + continue; + } + + // Find offset of first match + return (int)(offset + LocateFirstFoundChar(matches)); + } while (lengthToExamine > 0); } - if (pCh < pEndCh) + if (offset < length) { - length = (int)(pEndCh - pCh); + lengthToExamine = length - offset; goto SequentialScan; } } - - return -1; - Found3: - pCh++; - Found2: - pCh++; - Found1: - pCh++; - Found: - return (int)(pCh - pChars); } + return -1; + Found3: + return (int)(offset + 3); + Found2: + return (int)(offset + 2); + Found1: + return (int)(offset + 1); + Found: + return (int)(offset); } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -876,5 +1029,62 @@ namespace System { return 3 - (BitOperations.LeadingZeroCount(match) >> 4); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ref char Add(ref char source, nint elementOffset) + => ref Unsafe.Add(ref source, (IntPtr)elementOffset); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector LoadVector(ref char start, nint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (IntPtr)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector128 LoadVector128(ref char start, nint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (IntPtr)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe Vector256 LoadVector256(ref char start, nint offset) + => Unsafe.ReadUnaligned>(ref Unsafe.As(ref Unsafe.Add(ref start, (IntPtr)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe UIntPtr LoadUIntPtr(ref char start, nint offset) + => Unsafe.ReadUnaligned(ref Unsafe.As(ref Unsafe.Add(ref start, (IntPtr)offset))); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nint GetCharVectorSpanLength(nint offset, nint length) + => ((length - offset) & ~(Vector.Count - 1)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nint GetCharVector128SpanLength(nint offset, nint length) + => ((length - offset) & ~(Vector128.Count - 1)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nint GetCharVector256SpanLength(nint offset, nint length) + => ((length - offset) & ~(Vector256.Count - 1)); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nint UnalignedCountVector(ref char searchSpace) + { + const int ElementsPerByte = sizeof(ushort) / sizeof(byte); + // Figure out how many characters to read sequentially until we are vector aligned + // This is equivalent to: + // unaligned = ((int)pCh % Unsafe.SizeOf>()) / ElementsPerByte + // length = (Vector.Count - unaligned) % Vector.Count + + // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data. + // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it + // isn't too important to pin to maintain the alignment. + return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte ) & (Vector.Count - 1); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe nint UnalignedCountVector128(ref char searchSpace) + { + const int ElementsPerByte = sizeof(ushort) / sizeof(byte); + // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data. + // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it + // isn't too important to pin to maintain the alignment. + return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte ) & (Vector128.Count - 1); + } } } diff --git a/netcore/System.Private.CoreLib/shared/System/String.cs b/netcore/System.Private.CoreLib/shared/System/String.cs index 34558d074c5..e2b5cd4e857 100644 --- a/netcore/System.Private.CoreLib/shared/System/String.cs +++ b/netcore/System.Private.CoreLib/shared/System/String.cs @@ -567,110 +567,19 @@ namespace System return new StringRuneEnumerator(this); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe int wcslen(char* ptr) { - char* end = ptr; - - // First make sure our pointer is aligned on a word boundary - int alignment = IntPtr.Size - 1; - - // If ptr is at an odd address (e.g. 0x5), this loop will simply iterate all the way - while (((uint)end & (uint)alignment) != 0) - { - if (*end == 0) goto FoundZero; - end++; - } - -#if !BIT64 - // The following code is (somewhat surprisingly!) significantly faster than a naive loop, - // at least on x86 and the current jit. - - // The loop condition below works because if "end[0] & end[1]" is non-zero, that means - // neither operand can have been zero. If is zero, we have to look at the operands individually, - // but we hope this going to fairly rare. - - // In general, it would be incorrect to access end[1] if we haven't made sure - // end[0] is non-zero. However, we know the ptr has been aligned by the loop above - // so end[0] and end[1] must be in the same word (and therefore page), so they're either both accessible, or both not. - - while ((end[0] & end[1]) != 0 || (end[0] != 0 && end[1] != 0)) - { - end += 2; - } - - Debug.Assert(end[0] == 0 || end[1] == 0); - if (end[0] != 0) end++; -#else // !BIT64 - // Based on https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - - // 64-bit implementation: process 1 ulong (word) at a time - - // What we do here is add 0x7fff from each of the - // 4 individual chars within the ulong, using MagicMask. - // If the char > 0 and < 0x8001, it will have its high bit set. - // We then OR with MagicMask, to set all the other bits. - // This will result in all bits set (ulong.MaxValue) for any - // char that fits the above criteria, and something else otherwise. - - // Note that for any char > 0x8000, this will be a false - // positive and we will fallback to the slow path and - // check each char individually. This is OK though, since - // we optimize for the common case (ASCII chars, which are < 0x80). - - // NOTE: We can access a ulong a time since the ptr is aligned, - // and therefore we're only accessing the same word/page. (See notes - // for the 32-bit version above.) - - const ulong MagicMask = 0x7fff7fff7fff7fff; - - while (true) + // IndexOf processes memory in aligned chunks, and thus it won't crash even if it accesses memory beyond the null terminator. + int length = SpanHelpers.IndexOf(ref *ptr, '\0', int.MaxValue); + if (length < 0) { - ulong word = *(ulong*)end; - word += MagicMask; // cause high bit to be set if not zero, and <= 0x8000 - word |= MagicMask; // set everything besides the high bits - - if (word == ulong.MaxValue) // 0xffff... - { - // all of the chars have their bits set (and therefore none can be 0) - end += 4; - continue; - } - - // at least one of them didn't have their high bit set! - // go through each char and check for 0. - - if (end[0] == 0) goto EndAt0; - if (end[1] == 0) goto EndAt1; - if (end[2] == 0) goto EndAt2; - if (end[3] == 0) goto EndAt3; - - // if we reached here, it was a false positive-- just continue - end += 4; + ThrowMustBeNullTerminatedString(); } - EndAt3: end++; - EndAt2: end++; - EndAt1: end++; - EndAt0: -#endif // !BIT64 - - FoundZero: - Debug.Assert(*end == 0); - - int count = (int)(end - ptr); - -#if BIT64 - // Check for overflow - if (ptr + count != end) - throw new ArgumentException(SR.Arg_MustBeNullTerminatedString); -#else - Debug.Assert(ptr + count == end); -#endif - - return count; + return length; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe int strlen(byte* ptr) { -- 2.11.4.GIT