Merge pull request #305 from MihaZupan/emoji-and-abbreviations-parser

Emoji and abbreviations parser
This commit is contained in:
Alexandre Mutel
2019-02-08 17:37:59 +01:00
committed by GitHub
9 changed files with 1385 additions and 259 deletions

View File

@@ -1,4 +1,4 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
@@ -52,7 +52,7 @@ namespace Testamina.Markdig.Benchmarks
}
}
/*
public class TestMatchPerf
{
private readonly TextMatchHelper matcher;
@@ -82,4 +82,5 @@ namespace Testamina.Markdig.Benchmarks
}
}
}
*/
}

View File

@@ -1,4 +1,4 @@
// Generated: 21. 01. 2019 14:26:34
// Generated: 6. 02. 2019 16:15:54
// --------------------------------
// Abbreviations
@@ -193,5 +193,44 @@ namespace Markdig.Tests.Specs.Abbreviations
Console.WriteLine("Example 9\nSection Extensions / Abbreviation\n");
TestParser.TestSpec("*[PR]: Pull Request\n\nPRAA", "<p>PRAA</p>", "abbreviations|advanced");
}
// Single character abbreviations should be matched
[Test]
public void ExtensionsAbbreviation_Example010()
{
    // Example 10 -- Section: Extensions / Abbreviation
    //
    // Markdown input:
    //     *[A]: Foo
    //
    //     A
    //
    // Expected HTML output:
    //     <p><abbr title="Foo">A</abbr></p>
    const string markdown = "*[A]: Foo\n\nA";
    const string expectedHtml = "<p><abbr title=\"Foo\">A</abbr></p>";
    const string extensions = "abbreviations|advanced";

    Console.WriteLine("Example 10\nSection Extensions / Abbreviation\n");
    TestParser.TestSpec(markdown, expectedHtml, extensions);
}
// The longest matching abbreviation should be used
[Test]
public void ExtensionsAbbreviation_Example011()
{
    // Example 11 -- Section: Extensions / Abbreviation
    //
    // Markdown input:
    //     *[Foo]: foo
    //     *[Foo Bar]: foobar
    //
    //     Foo B
    //
    // Expected HTML output:
    //     <p><abbr title="foo">Foo</abbr> B</p>
    const string markdown = "*[Foo]: foo\n*[Foo Bar]: foobar\n\nFoo B";
    const string expectedHtml = "<p><abbr title=\"foo\">Foo</abbr> B</p>";
    const string extensions = "abbreviations|advanced";

    Console.WriteLine("Example 11\nSection Extensions / Abbreviation\n");
    TestParser.TestSpec(markdown, expectedHtml, extensions);
}
}
}

View File

@@ -97,4 +97,25 @@ Abbreviations should only match when surrounded by whitespace:
PRAA
.
<p>PRAA</p>
````````````````````````````````
Single character abbreviations should be matched
```````````````````````````````` example
*[A]: Foo
A
.
<p><abbr title="Foo">A</abbr></p>
````````````````````````````````
The longest matching abbreviation should be used
```````````````````````````````` example
*[Foo]: foo
*[Foo Bar]: foobar
Foo B
.
<p><abbr title="foo">Foo</abbr> B</p>
````````````````````````````````

View File

@@ -1,4 +1,4 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using System.Collections.Generic;
@@ -12,7 +12,7 @@ namespace Markdig.Extensions.Abbreviations
/// <summary>
/// A block parser for abbreviations.
/// </summary>
/// <seealso cref="Markdig.Parsers.BlockParser" />
/// <seealso cref="BlockParser" />
public class AbbreviationParser : BlockParser
{
/// <summary>
@@ -40,8 +40,7 @@ namespace Markdig.Extensions.Abbreviations
}
SourceSpan labelSpan;
string label;
if (!LinkHelper.TryParseLabel(ref slice, out label, out labelSpan))
if (!LinkHelper.TryParseLabel(ref slice, out string label, out labelSpan))
{
return BlockState.None;
}
@@ -85,8 +84,7 @@ namespace Markdig.Extensions.Abbreviations
}
// Build a text matcher from the abbreviations labels
var labels = new HashSet<string>(abbreviations.Keys);
var matcher = new TextMatchHelper(labels);
var prefixTree = new CompactPrefixTree<Abbreviation>(abbreviations);
inlineProcessor.LiteralInlineParser.PostMatch += (InlineProcessor processor, ref StringSlice slice) =>
{
@@ -98,20 +96,35 @@ namespace Markdig.Extensions.Abbreviations
// This is slow, but we don't have much choice
var content = literal.Content;
var text = content.Text;
for (int i = content.Start; i < content.End; i++)
{
string match;
if (matcher.TryMatch(text, i, content.End - i + 1, out match) && IsValidAbbreviation(match, content, i))
{
var indexAfterMatch = i + match.Length;
// We should have a match, but in case...
Abbreviation abbr;
if (!abbreviations.TryGetValue(match, out abbr))
for (int i = content.Start; i <= content.End; i++)
{
// Abbreviation must be a whole word == start at the start of a line or after a whitespace
if (i != 0)
{
for (i = i - 1; i <= content.End; i++)
{
if (text[i].IsWhitespace())
{
i++;
goto ValidAbbreviationStart;
}
}
break;
}
ValidAbbreviationStart:;
if (prefixTree.TryMatchLongest(text, i, content.End - i + 1, out KeyValuePair<string, Abbreviation> abbreviationMatch))
{
var match = abbreviationMatch.Key;
if (!IsValidAbbreviationEnding(match, content, i))
{
continue;
continue;
}
var indexAfterMatch = i + match.Length;
// If we don't have a container, create a new one
if (container == null)
{
@@ -124,13 +137,11 @@ namespace Markdig.Extensions.Abbreviations
};
}
int line;
int column;
var abbrInline = new AbbreviationInline(abbr)
var abbrInline = new AbbreviationInline(abbreviationMatch.Value)
{
Span =
{
Start = processor.GetSourcePosition(i, out line, out column),
Start = processor.GetSourcePosition(i, out int line, out int column),
},
Line = line,
Column = column
@@ -138,13 +149,9 @@ namespace Markdig.Extensions.Abbreviations
abbrInline.Span.End = abbrInline.Span.Start + match.Length - 1;
// Append the previous literal
if (i > content.Start)
{
if (literal.Parent == null)
{
container.AppendChild(literal);
}
if (i > content.Start && literal.Parent == null)
{
container.AppendChild(literal);
}
literal.Span.End = abbrInline.Span.Start - 1;
@@ -152,11 +159,10 @@ namespace Markdig.Extensions.Abbreviations
literal.Content.End = i - 1;
// Appned the abbreviation
// Append the abbreviation
container.AppendChild(abbrInline);
// If this is the end of the string, clear the literal
// and exit
// If this is the end of the string, clear the literal and exit
if (content.End == indexAfterMatch - 1)
{
literal = null;
@@ -188,34 +194,12 @@ namespace Markdig.Extensions.Abbreviations
};
}
private static bool IsValidAbbreviation(string match, StringSlice content, int matchIndex)
private static bool IsValidAbbreviationEnding(string match, StringSlice content, int matchIndex)
{
// The word matched must be embraced by punctuation or whitespace or \0.
var index = matchIndex - 1;
while (index >= content.Start)
{
var c = content.PeekCharAbsolute(index);
if (!(c == '\0' || c.IsWhitespace() || c.IsAsciiPunctuation()))
{
return false;
}
if (c.IsAlphaNumeric())
{
return false;
}
if (!c.IsAsciiPunctuation() || c.IsWhitespace())
{
break;
}
index--;
}
// This will check if the next char at the end of the StringSlice is whitespace, punctuation or \0.
var contentNew = content;
contentNew.End = content.End + 1;
index = matchIndex + match.Length;
int index = matchIndex + match.Length;
while (index <= contentNew.End)
{
var c = contentNew.PeekCharAbsolute(index);

View File

@@ -3,8 +3,7 @@
// See the license.txt file in the project root for more information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.Generic;
using Markdig.Helpers;
using Markdig.Parsers;
@@ -13,13 +12,13 @@ namespace Markdig.Extensions.Emoji
/// <summary>
/// The inline parser used for emoji.
/// </summary>
/// <seealso cref="Markdig.Parsers.InlineParser" />
/// <seealso cref="InlineParser" />
public class EmojiParser : InlineParser
{
private static readonly Dictionary<string, string> EmojiToUnicodeDefault;
private static readonly Dictionary<string, string> SmileyToEmojiDefault;
private TextMatchHelper textMatchHelper;
private CompactPrefixTree<string> _emojiPrefixTree;
/// <summary>
/// Initializes a new instance of the <see cref="EmojiParser"/> class.
@@ -28,8 +27,6 @@ namespace Markdig.Extensions.Emoji
{
EnableSmiley = enableSmiley;
OpeningCharacters = null;
EmojiToUnicode = new Dictionary<string, string>(EmojiToUnicodeDefault);
SmileyToEmoji = new Dictionary<string, string>(SmileyToEmojiDefault);
}
/// <summary>
@@ -37,91 +34,97 @@ namespace Markdig.Extensions.Emoji
/// </summary>
public bool EnableSmiley { get; set; }
/// <summary>
/// Gets the emoji to unicode mapping. This can be modified before this parser is initialized.
/// </summary>
public Dictionary<string, string> EmojiToUnicode { get; }
private Dictionary<string, string> _emojiToUnicode;
/// <summary>
/// Gets the emoji to unicode mapping. This can be modified before this parser is initialized.
/// </summary>
/// <remarks>
/// The dictionary is created on first access as a copy of the default mapping,
/// using ordinal key comparison.
/// </remarks>
public Dictionary<string, string> EmojiToUnicode
{
    get
    {
        // Lazy init: the copy is only made if somebody actually asks for it
        if (_emojiToUnicode == null)
        {
            _emojiToUnicode = new Dictionary<string, string>(EmojiToUnicodeDefault, StringComparer.Ordinal);
        }
        return _emojiToUnicode;
    }
}
/// <summary>
/// Gets the smiley to emoji mapping. This can be modified before this parser is initialized.
/// </summary>
public Dictionary<string, string> SmileyToEmoji { get; }
private Dictionary<string, string> _smileyToEmoji;
/// <summary>
/// Gets the smiley to emoji mapping. This can be modified before this parser is initialized.
/// </summary>
/// <remarks>
/// The dictionary is created on first access as a copy of the default mapping,
/// using ordinal key comparison.
/// </remarks>
public Dictionary<string, string> SmileyToEmoji
{
    get
    {
        // Lazy init: the copy is only made if somebody actually asks for it
        if (_smileyToEmoji == null)
        {
            _smileyToEmoji = new Dictionary<string, string>(SmileyToEmojiDefault, StringComparer.Ordinal);
        }
        return _smileyToEmoji;
    }
}
public override void Initialize()
{
var firstChars = new HashSet<char>();
var textToMatch = new HashSet<string>();
{
// Don't allocate a new dictionary if we don't need it
var emojiToUnicode = _emojiToUnicode ?? EmojiToUnicodeDefault;
foreach (var emoji in EmojiToUnicode)
{
firstChars.Add(emoji.Key[0]);
textToMatch.Add(emoji.Key);
}
foreach (var smiley in SmileyToEmoji)
{
firstChars.Add(smiley.Key[0]);
textToMatch.Add(smiley.Key);
}
textMatchHelper = new TextMatchHelper(textToMatch);
OpeningCharacters = new List<char>(firstChars).ToArray();
Array.Sort(OpeningCharacters);
if (EnableSmiley)
{
// Don't allocate a new dictionary if we don't need it
var smileyToEmoji = _smileyToEmoji ?? SmileyToEmojiDefault;
int jointCount = emojiToUnicode.Count + smileyToEmoji.Count;
// Count * 2 seems to be a good fit for the data set
_emojiPrefixTree = new CompactPrefixTree<string>(jointCount, jointCount * 2);
foreach (var emoji in emojiToUnicode)
_emojiPrefixTree.Add(emoji);
// This is not the best data set for the prefix tree as it will have to check the first character linearly
// A work-around would require a bunch of substrings / removing the leading ':' from emojis, neither one is pretty
// This way we sacrifice a few microseconds for not introducing breaking changes, emojis aren't all that common anyhow
var firstChars = new HashSet<char> { ':' };
foreach (var smiley in smileyToEmoji)
{
if (!emojiToUnicode.TryGetValue(smiley.Value, out string unicode))
throw new ArgumentException("Invalid smiley target: {0} is not present in the emoji dictionary", smiley.Value);
firstChars.Add(smiley.Key[0]);
if (!_emojiPrefixTree.TryAdd(smiley.Key, unicode))
throw new ArgumentException("Smiley {0} is already present in the Emoji dictionary", smiley.Key);
}
OpeningCharacters = new List<char>(firstChars).ToArray();
}
else
{
OpeningCharacters = new[] { ':' };
_emojiPrefixTree = new CompactPrefixTree<string>(emojiToUnicode);
};
}
public override bool Match(InlineProcessor processor, ref StringSlice slice)
{
string match;
// Previous char must be a space
if (!slice.PeekCharExtra(-1).IsWhiteSpaceOrZero())
if (!slice.PeekCharExtra(-1).IsWhiteSpaceOrZero())
{
return false;
}
// Try to match an emoji
if (!_emojiPrefixTree.TryMatchLongest(slice.Text, slice.Start, slice.Length, out KeyValuePair<string, string> match))
{
return false;
}
// Try to match an existing emoji
var startPosition = slice.Start;
if (!textMatchHelper.TryMatch(slice.Text, slice.Start, slice.Length, out match))
{
return false;
}
string emoji = match;
if (EnableSmiley)
{
// If we have a smiley, we decode it to emoji
if (!SmileyToEmoji.TryGetValue(match, out emoji))
{
emoji = match;
}
}
// Decode the eomji to unicode
string unicode;
if (!EmojiToUnicode.TryGetValue(emoji, out unicode))
{
// Should not happen but in case
return false;
}
// Move the cursor to the character after the matched string
slice.Start += match.Length;
// Push the EmojiInline
int line;
int column;
processor.Inline = new EmojiInline(unicode)
processor.Inline = new EmojiInline(match.Value)
{
Span =
{
Start = processor.GetSourcePosition(startPosition, out line, out column),
Start = processor.GetSourcePosition(slice.Start, out int line, out int column),
},
Line = line,
Column = column,
Match = match
Match = match.Key
};
processor.Inline.Span.End = processor.Inline.Span.Start + match.Length - 1;
processor.Inline.Span.End = processor.Inline.Span.Start + match.Key.Length - 1;
// Move the cursor to the character after the matched string
slice.Start += match.Key.Length;
return true;
}
@@ -1082,12 +1085,12 @@ namespace Markdig.Extensions.Emoji
// Custom arrows
{"<-", ":custom_arrow_left:" },
{"->", ":custom_arrow_rigth:" },
{"<->", ":custom_arrow_left_rigth:" },
{"->", ":custom_arrow_right:" },
{"<->", ":custom_arrow_left_right:" },
{"<=", ":custom_arrow_left_strong:" },
{"=>", ":custom_arrow_rigth_strong:" },
{"<=>", ":custom_arrow_left_rigth_strong:" },
{"=>", ":custom_arrow_right_strong:" },
{"<=>", ":custom_arrow_left_right_strong:" },
};
}
#endregion

File diff suppressed because it is too large Load Diff

View File

@@ -1,127 +0,0 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using System;
using System.Collections.Generic;
namespace Markdig.Helpers
{
/// <summary>
/// Matches a text against a set of strings, using internally a character tree to speed up the lookup.
/// </summary>
public class TextMatchHelper
{
    private readonly CharNode root;

    /// <summary>
    /// Initializes a new instance of the <see cref="TextMatchHelper"/> class.
    /// </summary>
    /// <param name="matches">The matches to match against. Null or empty entries are ignored.</param>
    /// <exception cref="System.ArgumentNullException">If <paramref name="matches"/> is null.</exception>
    public TextMatchHelper(HashSet<string> matches)
    {
        if (matches == null) throw new ArgumentNullException(nameof(matches));

        root = new CharNode();
        foreach (var str in matches)
        {
            // A null or empty entry can never be matched; skip it instead of failing while indexing into it
            if (string.IsNullOrEmpty(str))
            {
                continue;
            }
            Insert(str);
        }
    }

    /// <summary>
    /// Tries to match in the text, at offset position, the list of string matches registered to this instance.
    /// The longest registered string that is a prefix of the inspected range is returned.
    /// </summary>
    /// <param name="text">The text.</param>
    /// <param name="offset">The offset in <paramref name="text"/> at which matching starts.</param>
    /// <param name="length">The maximum number of characters to inspect. Characters past the end of <paramref name="text"/> are never read.</param>
    /// <param name="match">The match string if the match was successful; <c>null</c> otherwise.</param>
    /// <returns>
    /// <c>true</c> if the match was successful; <c>false</c> otherwise
    /// </returns>
    /// <exception cref="System.ArgumentNullException">If <paramref name="text"/> is null.</exception>
    public bool TryMatch(string text, int offset, int length, out string match)
    {
        if (text == null) throw new ArgumentNullException(nameof(text));

        var node = root;
        match = null;

        // Stop at the end of the text even when the caller passes a length that overshoots it
        while (length > 0 && offset < text.Length)
        {
            CharNode nextNode;
            if (!node.TryGetValue(text[offset], out nextNode))
            {
                break;
            }

            node = nextNode;

            // Remember the longest complete entry seen so far: if the walk continues into a longer
            // entry that ends up not matching, we still report this one (e.g. "Foo" for "Foo B"
            // when both "Foo" and "Foo Bar" are registered).
            if (node.Content != null)
            {
                match = node.Content;
            }

            offset++;
            length--;
        }

        return match != null;
    }

    /// <summary>
    /// Adds a non-empty string to the tree, creating one node per character along the way.
    /// </summary>
    private void Insert(string str)
    {
        var node = root;
        for (int i = 0; i < str.Length; i++)
        {
            CharNode nextNode;
            if (!node.TryGetValue(str[i], out nextNode))
            {
                nextNode = new CharNode();
                node.Add(str[i], nextNode);
            }
            node = nextNode;
        }

        // The node reached by the last character carries the complete string
        node.Content = str;
    }

    /// <summary>
    /// A tree node: children are keyed by the next character, and <see cref="Content"/> is set
    /// when the path from the root to this node spells a registered string.
    /// </summary>
    private sealed class CharNode : Dictionary<char, CharNode>
    {
        public string Content { get; set; }
    }
}
}

View File

@@ -0,0 +1,90 @@
using System;
using System.Diagnostics;
namespace Markdig.Helpers
{
    /// <summary>
    /// Centralizes the creation of argument-related exceptions so that callers only pass enum values.
    /// Inspired by CoreLib, taken from https://github.com/MihaZupan/SharpCollections, cc @MihaZupan
    /// </summary>
    internal static class ThrowHelper
    {
        /// <summary>
        /// Throws an <see cref="ArgumentNullException"/> naming the given argument.
        /// </summary>
        public static void ThrowArgumentNullException(ExceptionArgument argument)
        {
            throw new ArgumentNullException(GetArgumentName(argument));
        }

        /// <summary>
        /// Throws an <see cref="ArgumentException"/> for the given argument, with the message of the given reason.
        /// </summary>
        public static void ThrowArgumentException(ExceptionArgument argument, ExceptionReason reason)
        {
            // ArgumentException takes (message, paramName), the reverse of ArgumentOutOfRangeException
            throw new ArgumentException(GetExceptionReason(reason), GetArgumentName(argument));
        }

        /// <summary>
        /// Throws an <see cref="ArgumentOutOfRangeException"/> for the given argument, with the message of the given reason.
        /// </summary>
        public static void ThrowArgumentOutOfRangeException(ExceptionArgument argument, ExceptionReason reason)
        {
            // ArgumentOutOfRangeException takes (paramName, message)
            throw new ArgumentOutOfRangeException(GetArgumentName(argument), GetExceptionReason(reason));
        }

        /// <summary>
        /// Throws an <see cref="IndexOutOfRangeException"/>.
        /// </summary>
        public static void ThrowIndexOutOfRangeException()
        {
            throw new IndexOutOfRangeException();
        }

        /// <summary>
        /// Maps an <see cref="ExceptionArgument"/> to the parameter name reported in the exception.
        /// </summary>
        private static string GetArgumentName(ExceptionArgument argument)
        {
            switch (argument)
            {
                // These enum members are named exactly like the parameters they stand for
                case ExceptionArgument.key:
                case ExceptionArgument.input:
                case ExceptionArgument.value:
                case ExceptionArgument.length:
                case ExceptionArgument.text:
                    return argument.ToString();

                case ExceptionArgument.offsetLength:
                    return "offset and length";

                default:
                    Debug.Assert(false, "The enum value is not defined, please check the ExceptionArgument Enum.");
                    return "";
            }
        }

        /// <summary>
        /// Maps an <see cref="ExceptionReason"/> to the exception message text.
        /// </summary>
        private static string GetExceptionReason(ExceptionReason reason)
        {
            switch (reason)
            {
                case ExceptionReason.String_Empty:
                    return "String must not be empty.";

                case ExceptionReason.SmallCapacity:
                    return "Capacity was less than the current size.";

                case ExceptionReason.InvalidOffsetLength:
                    return "Offset and length must refer to a position in the string.";

                case ExceptionReason.DuplicateKey:
                    return "The given key is already present in the dictionary.";

                default:
                    Debug.Assert(false, "The enum value is not defined, please check the ExceptionReason Enum.");
                    return "";
            }
        }
    }

    /// <summary>
    /// Identifies the argument an exception thrown through <see cref="ThrowHelper"/> refers to.
    /// Member names are used verbatim as parameter names, hence the camelCase.
    /// </summary>
    internal enum ExceptionArgument
    {
        key,
        input,
        value,
        length,
        offsetLength,
        text
    }

    /// <summary>
    /// Identifies the message of an exception thrown through <see cref="ThrowHelper"/>.
    /// </summary>
    internal enum ExceptionReason
    {
        String_Empty,
        SmallCapacity,
        InvalidOffsetLength,
        DuplicateKey,
    }
}

View File

@@ -7,7 +7,7 @@
<NeutralLanguage>en-US</NeutralLanguage>
<VersionPrefix>0.15.7</VersionPrefix>
<Authors>Alexandre Mutel</Authors>
<TargetFrameworks>net35;net40;portable40-net40+sl5+win8+wp8+wpa81;netstandard1.1;netstandard2.0;uap10.0</TargetFrameworks>
<TargetFrameworks>net35;net40;portable40-net40+sl5+win8+wp8+wpa81;netstandard1.1;netstandard2.0;uap10.0;netcoreapp2.1</TargetFrameworks>
<AssemblyName>Markdig</AssemblyName>
<PackageId>Markdig</PackageId>
<PackageId Condition="'$(SignAssembly)' == 'true'">Markdig.Signed</PackageId>
@@ -18,6 +18,7 @@
<PackageProjectUrl>https://github.com/lunet-io/markdig</PackageProjectUrl>
<NetStandardImplicitPackageVersion Condition=" '$(TargetFramework)' == 'netstandard1.1' ">1.6.0</NetStandardImplicitPackageVersion>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<LangVersion>7.3</LangVersion>
</PropertyGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net35' ">
@@ -54,6 +55,10 @@
<PropertyGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' ">
<DefineConstants>$(DefineConstants);SUPPORT_FIXED_STRING;SUPPORT_UNSAFE</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition=" '$(TargetFramework)' == 'netcoreapp2.1' ">
<DefineConstants>$(DefineConstants);SUPPORT_FIXED_STRING;SUPPORT_UNSAFE;NETCORE</DefineConstants>
</PropertyGroup>
<PropertyGroup Condition=" '$(TargetFramework)' == 'uap10.0' ">
<TargetPlatformIdentifier>UAP</TargetPlatformIdentifier>