Add SearchValues polyfill and use it in CharacterMap

This commit is contained in:
Miha Zupan
2023-11-24 02:23:55 +01:00
parent 4eea9db35c
commit 1f1364e69b
3 changed files with 134 additions and 210 deletions

View File

@@ -2,14 +2,10 @@
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using System.Buffers;
using System.Diagnostics;
using System.Linq;
using System.Runtime.CompilerServices;
#if NETCOREAPP3_1_OR_GREATER
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
namespace Markdig.Helpers;
@@ -19,13 +15,9 @@ namespace Markdig.Helpers;
/// <typeparam name="T"></typeparam>
public sealed class CharacterMap<T> where T : class
{
#if NETCOREAPP3_1_OR_GREATER
private readonly Vector128<byte> _asciiBitmap;
#endif
private readonly T[] asciiMap;
private readonly Dictionary<uint, T>? nonAsciiMap;
private readonly BoolVector128 isOpeningCharacter;
private readonly SearchValues<char> _values;
private readonly T[] _asciiMap;
private readonly Dictionary<uint, T>? _nonAsciiMap;
/// <summary>
/// Initializes a new instance of the <see cref="CharacterMap{T}"/> class.
@@ -35,64 +27,38 @@ public sealed class CharacterMap<T> where T : class
public CharacterMap(IEnumerable<KeyValuePair<char, T>> maps)
{
if (maps is null) ThrowHelper.ArgumentNullException(nameof(maps));
var charSet = new HashSet<char>();
int maxChar = 0;
foreach (var map in maps)
{
var openingChar = map.Key;
charSet.Add(openingChar);
if (openingChar < 128)
{
maxChar = Math.Max(maxChar, openingChar);
if (openingChar == 0)
{
ThrowHelper.ArgumentOutOfRangeException("Null is not a valid opening character.", nameof(maps));
}
}
else
{
nonAsciiMap ??= new Dictionary<uint, T>();
}
charSet.Add(map.Key);
}
OpeningCharacters = charSet.ToArray();
Array.Sort(OpeningCharacters);
asciiMap = new T[maxChar + 1];
_asciiMap = new T[128];
foreach (var state in maps)
{
char openingChar = state.Key;
if (openingChar < 128)
{
asciiMap[openingChar] ??= state.Value;
isOpeningCharacter.Set(openingChar);
_asciiMap[openingChar] ??= state.Value;
}
else if (!nonAsciiMap!.ContainsKey(openingChar))
else
{
nonAsciiMap[openingChar] = state.Value;
_nonAsciiMap ??= new Dictionary<uint, T>();
if (!_nonAsciiMap.ContainsKey(openingChar))
{
_nonAsciiMap[openingChar] = state.Value;
}
}
}
#if NETCOREAPP3_1_OR_GREATER
if (nonAsciiMap is null)
{
long bitmap_0_3 = 0;
long bitmap_4_7 = 0;
foreach (char openingChar in OpeningCharacters)
{
int position = (openingChar >> 4) | ((openingChar & 0x0F) << 3);
if (position < 64) bitmap_0_3 |= 1L << position;
else bitmap_4_7 |= 1L << (position - 64);
}
_asciiBitmap = Vector128.Create(bitmap_0_3, bitmap_4_7).AsByte();
}
#endif
_values = SearchValues.Create(OpeningCharacters);
}
/// <summary>
@@ -110,7 +76,7 @@ public sealed class CharacterMap<T> where T : class
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
T[] asciiMap = this.asciiMap;
T[] asciiMap = _asciiMap;
if (openingChar < (uint)asciiMap.Length)
{
return asciiMap[openingChar];
@@ -118,13 +84,12 @@ public sealed class CharacterMap<T> where T : class
else
{
T? map = null;
nonAsciiMap?.TryGetValue(openingChar, out map);
_nonAsciiMap?.TryGetValue(openingChar, out map);
return map;
}
}
}
/// <summary>
/// Searches for an opening character from a registered parser in the specified string.
/// </summary>
@@ -132,167 +97,20 @@ public sealed class CharacterMap<T> where T : class
/// <param name="start">The start.</param>
/// <param name="end">The end.</param>
/// <returns>Index position within the string of the first opening character found in the specified text; if not found, returns -1</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int IndexOfOpeningCharacter(string text, int start, int end)
{
Debug.Assert(text is not null);
Debug.Assert(start >= 0 && end >= 0);
Debug.Assert(end - start + 1 >= 0);
Debug.Assert(end - start + 1 <= text.Length);
if (nonAsciiMap is null)
ReadOnlySpan<char> span = text.AsSpan(start, end - start + 1);
int index = span.IndexOfAny(_values);
if (index >= 0)
{
#if NETCOREAPP3_1_OR_GREATER
if (Ssse3.IsSupported && BitConverter.IsLittleEndian)
{
// Based on http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm
// Optimized for sets in the [1, 127] range
int lengthMinusOne = end - start;
int charsToProcessVectorized = lengthMinusOne & ~(2 * Vector128<short>.Count - 1);
int finalStart = start + charsToProcessVectorized;
if (start < finalStart)
{
ref char textStartRef = ref Unsafe.Add(ref Unsafe.AsRef(in text.GetPinnableReference()), start);
Vector128<byte> bitmap = _asciiBitmap;
do
{
// Load 32 bytes (16 chars) into two Vector128<short>s (chars)
// Drop the high byte of each char
// Pack the remaining bytes into a single Vector128<byte>
Vector128<byte> input = Sse2.PackUnsignedSaturate(
Unsafe.ReadUnaligned<Vector128<short>>(ref Unsafe.As<char, byte>(ref textStartRef)),
Unsafe.ReadUnaligned<Vector128<short>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref textStartRef, Vector128<short>.Count))));
// Extract the higher nibble of each character ((input >> 4) & 0xF)
Vector128<byte> higherNibbles = Sse2.And(Sse2.ShiftRightLogical(input.AsUInt16(), 4).AsByte(), Vector128.Create((byte)0xF));
// Lookup the matching higher nibble for each character based on the lower nibble
// PSHUFB will set the result to 0 for any non-ASCII (> 127) character
Vector128<byte> bitsets = Ssse3.Shuffle(bitmap, input);
// Calculate a bitmask (1 << (higherNibble % 8)) for each character
Vector128<byte> bitmask = Ssse3.Shuffle(Vector128.Create(0x8040201008040201).AsByte(), higherNibbles);
// Check which characters are present in the set
// We are relying on bitsets being zero for non-ASCII characters
Vector128<byte> result = Sse2.And(bitsets, bitmask);
if (!result.Equals(Vector128<byte>.Zero))
{
int resultMask = ~Sse2.MoveMask(Sse2.CompareEqual(result, Vector128<byte>.Zero));
return start + BitOperations.TrailingZeroCount((uint)resultMask);
}
start += 2 * Vector128<short>.Count;
textStartRef = ref Unsafe.Add(ref textStartRef, 2 * Vector128<short>.Count);
}
while (start != finalStart);
}
}
ref char textRef = ref Unsafe.AsRef(in text.GetPinnableReference());
for (; start <= end; start++)
{
if (IntPtr.Size == 4)
{
uint c = Unsafe.Add(ref textRef, start);
if (c < 128 && isOpeningCharacter[c])
{
return start;
}
}
else
{
ulong c = Unsafe.Add(ref textRef, start);
if (c < 128 && isOpeningCharacter[c])
{
return start;
}
}
}
#else
unsafe
{
fixed (char* pText = text)
{
for (int i = start; i <= end; i++)
{
char c = pText[i];
if (c < 128 && isOpeningCharacter[c])
{
return i;
}
}
}
}
#endif
return -1;
index += start;
}
else
{
return IndexOfOpeningCharacterNonAscii(text, start, end);
}
}
private int IndexOfOpeningCharacterNonAscii(string text, int start, int end)
{
#if NETCOREAPP3_1_OR_GREATER
ref char textRef = ref Unsafe.AsRef(in text.GetPinnableReference());
for (int i = start; i <= end; i++)
{
char c = Unsafe.Add(ref textRef, i);
if (c < 128 ? isOpeningCharacter[c] : nonAsciiMap!.ContainsKey(c))
{
return i;
}
}
#else
unsafe
{
fixed (char* pText = text)
{
for (int i = start; i <= end; i++)
{
char c = pText[i];
if (c < 128 ? isOpeningCharacter[c] : nonAsciiMap!.ContainsKey(c))
{
return i;
}
}
}
}
#endif
return -1;
return index;
}
}
/// <summary>
/// Fixed-size table of 128 boolean flags, one per ASCII code point, stored inline in the struct.
/// </summary>
internal unsafe struct BoolVector128
{
    // Inline buffer: slot i is true once Set((char)i) has been called.
    private fixed bool _flags[128];

    /// <summary>
    /// Marks the given ASCII character as present.
    /// </summary>
    /// <param name="c">Character to flag; must be below 128 (checked only by a debug assertion).</param>
    public void Set(char c)
    {
        Debug.Assert(c < 128);

        _flags[c] = true;
    }

    /// <summary>
    /// Reads the flag stored for the given code point.
    /// </summary>
    /// <param name="c">Code point to look up; must be below 128 (checked only by a debug assertion).</param>
    public readonly bool this[uint c]
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get
        {
            Debug.Assert(c < 128);

            return _flags[c];
        }
    }

    /// <summary>
    /// Reads the flag stored for the given code point using a 64-bit index.
    /// </summary>
    /// <param name="c">Code point to look up; must be below 128. The debug assertion also requires a 64-bit process.</param>
    public readonly bool this[ulong c]
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get
        {
            Debug.Assert(IntPtr.Size == 8 && c < 128);

            return _flags[c];
        }
    }
}

View File

@@ -36,9 +36,7 @@ public sealed class LiteralInlineParser : InlineParser
var startPosition = processor.GetSourcePosition(slice.Start, out int line, out int column);
// Slightly faster to perform our own search for opening characters
var nextStart = processor.Parsers.IndexOfOpeningCharacter(text, slice.Start + 1, slice.End);
//var nextStart = str.IndexOfAny(processor.SpecialCharacters, slice.Start + 1, slice.Length - 1);
int length;
if (nextStart < 0)

View File

@@ -0,0 +1,108 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
#if !NET8_0_OR_GREATER
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace System.Buffers;
/// <summary>
/// Stand-in for the <c>System.Buffers.SearchValues</c> factory and the matching span
/// <c>IndexOfAny</c> extensions, compiled only for targets older than .NET 8.
/// </summary>
internal static class SearchValues
{
    /// <summary>
    /// Builds a searchable set from the given characters.
    /// </summary>
    /// <param name="values">Characters that should count as matches.</param>
    /// <returns>A set that can be passed to <c>IndexOfAny</c>.</returns>
    public static SearchValues<char> Create(ReadOnlySpan<char> values)
    {
        return new PreNet8CompatSearchValues(values);
    }

    /// <summary>
    /// Finds the first position in <paramref name="span"/> holding any character of <paramref name="values"/>.
    /// </summary>
    /// <param name="span">Text to scan.</param>
    /// <param name="values">Set of characters to look for.</param>
    /// <returns>Zero-based index of the first match, or -1 when none is found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
    public static int IndexOfAny(this ReadOnlySpan<char> span, SearchValues<char> values)
    {
        // Fail with ArgumentNullException, like the .NET 8 API, instead of a NullReferenceException
        // from the virtual call below.
        if (values is null)
        {
            throw new ArgumentNullException(nameof(values));
        }

        return values.IndexOfAny(span);
    }

    /// <summary>
    /// Finds the first position in <paramref name="span"/> holding any character of <paramref name="values"/>.
    /// </summary>
    /// <param name="span">Text to scan.</param>
    /// <param name="values">Set of characters to look for.</param>
    /// <returns>Zero-based index of the first match, or -1 when none is found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
    public static int IndexOfAny(this Span<char> span, SearchValues<char> values)
    {
        // Share the null guard and lookup with the read-only overload.
        return IndexOfAny((ReadOnlySpan<char>)span, values);
    }
}
/// <summary>
/// Minimal stand-in for the generic <c>System.Buffers.SearchValues&lt;T&gt;</c> type,
/// compiled only for targets older than .NET 8.
/// </summary>
/// <typeparam name="T">Element type of the spans that can be searched.</typeparam>
internal abstract class SearchValues<T>
{
    /// <summary>
    /// Finds the first element of <paramref name="span"/> that belongs to this set.
    /// </summary>
    /// <param name="span">Elements to scan. Typed with <typeparamref name="T"/> so the signature follows the class's type argument.</param>
    /// <returns>Zero-based index of the first match, or -1 when none is found.</returns>
    public abstract int IndexOfAny(ReadOnlySpan<T> span);
}
/// <summary>
/// Scalar <see cref="SearchValues{T}"/> implementation for <see cref="char"/>.
/// Characters below 128 are tracked in an inline flag table; all other characters go into a hash set
/// that is only allocated when at least one such character is supplied.
/// </summary>
internal sealed class PreNet8CompatSearchValues : SearchValues<char>
{
    private readonly BoolVector128 _ascii;
    private readonly HashSet<char>? _nonAscii;

    /// <summary>
    /// Records every character of <paramref name="values"/> as a member of the set.
    /// </summary>
    /// <param name="values">Characters that should count as matches.</param>
    public PreNet8CompatSearchValues(ReadOnlySpan<char> values)
    {
        for (int i = 0; i < values.Length; i++)
        {
            char value = values[i];

            if (value >= 128)
            {
                // Lazily create the set the first time a non-ASCII character shows up.
                (_nonAscii ??= new HashSet<char>()).Add(value);
                continue;
            }

            _ascii.Set(value);
        }
    }

    /// <summary>
    /// Scans <paramref name="span"/> from the start for the first character contained in this set.
    /// </summary>
    /// <param name="span">Text to scan.</param>
    /// <returns>Zero-based index of the first match, or -1 when none is found.</returns>
    public override int IndexOfAny(ReadOnlySpan<char> span)
    {
        HashSet<char>? nonAscii = _nonAscii;

        if (nonAscii is not null)
        {
            // Mixed set: ASCII goes through the flag table, everything else through the hash set.
            for (int position = 0; position < span.Length; position++)
            {
                char current = span[position];
                bool isMatch = current < 128 ? _ascii[current] : nonAscii.Contains(current);

                if (isMatch)
                {
                    return position;
                }
            }

            return -1;
        }

        // ASCII-only set: anything at or above 128 can never match.
        for (int position = 0; position < span.Length; position++)
        {
            char current = span[position];

            if (current < 128 && _ascii[current])
            {
                return position;
            }
        }

        return -1;
    }

    /// <summary>
    /// Inline table of 128 boolean flags, one per ASCII code point.
    /// </summary>
    private unsafe struct BoolVector128
    {
        private fixed bool _flags[128];

        /// <summary>Marks <paramref name="c"/> as present; it must be below 128 (debug-asserted only).</summary>
        public void Set(char c)
        {
            Debug.Assert(c < 128);

            _flags[c] = true;
        }

        /// <summary>Reads the flag for code point <paramref name="c"/>; it must be below 128 (debug-asserted only).</summary>
        public readonly bool this[uint c]
        {
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            get
            {
                Debug.Assert(c < 128);

                return _flags[c];
            }
        }
    }
}
#endif