Compare commits

...

6 Commits

Author SHA1 Message Date
Alexandre Mutel
98c687b4ed Merge pull request #666 from meziantou/issues/665
Fix ANE when parsing empty documents with trackTrivia enabled
2022-09-27 07:24:22 +02:00
Gérald Barré
8e4a732efe Fix ANE when parsing empty documents with trackTrivia enabled 2022-09-26 21:55:28 -04:00
Alexandre Mutel
bce4b70dc6 Merge pull request #649 from MihaZupan/commonmark-whitespace-punctuation
Align Whitespace and Punctuation definitions with CommonMark
2022-08-12 07:46:24 +02:00
Alexandre Mutel
1f71520de9 Merge pull request #650 from gfoidl/htmlhelper-TryParseHtmlTagOpenTag_remove_branches
Remove some branches in HtmlHelper.TryParseHtmlTagOpenTag by using bitmask
2022-08-12 07:45:31 +02:00
Günther Foidl
bfd7b6460c Remove some branches in HtmlHelper.TryParseHtmlTagOpenTag by using bitmasks 2022-07-21 12:11:03 +02:00
Miha Zupan
0e26ec5382 Align Whitespace and Punctuation definitions with CommonMark 2022-07-17 20:22:26 +02:00
8 changed files with 212 additions and 69 deletions

View File

@@ -0,0 +1,92 @@
using System.Collections.Generic;
using System.Globalization;
using Markdig.Helpers;
using NUnit.Framework;
namespace Markdig.Tests
{
    /// <summary>
    /// Exhaustive tests for the character classification helpers in <see cref="CharHelper"/>.
    /// Every test walks all UTF-16 code units (U+0000 to U+FFFF) and compares the helper's answer
    /// with a reference predicate written directly from the CommonMark definitions quoted below.
    /// </summary>
    public class TestCharHelper
    {
        // An ASCII punctuation character is
        // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
        // :, ;, <, =, >, ?, @ (U+003A–0040),
        // [, \, ], ^, _, ` (U+005B–0060),
        // {, |, }, or ~ (U+007B–007E).
        private static readonly HashSet<char> s_asciiPunctuation = new()
        {
            '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
            ':', ';', '<', '=', '>', '?', '@',
            '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~'
        };

        // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories
        // Pc, Pd, Pe, Pf, Pi, Po, or Ps.
        private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new()
        {
            UnicodeCategory.ConnectorPunctuation,
            UnicodeCategory.DashPunctuation,
            UnicodeCategory.ClosePunctuation,
            UnicodeCategory.FinalQuotePunctuation,
            UnicodeCategory.InitialQuotePunctuation,
            UnicodeCategory.OtherPunctuation,
            UnicodeCategory.OpenPunctuation
        };

        /// <summary>
        /// Formats a character as its code point (for example <c>U+00A0</c>) so that a failing
        /// assertion identifies which of the 65,536 inputs was misclassified.
        /// </summary>
        private static string Describe(char c) => $"U+{(int)c:X4}";

        /// <summary>
        /// Reference punctuation predicate: ASCII characters are looked up in the explicit list above,
        /// everything else is classified by its Unicode general category.
        /// </summary>
        private static bool ExpectedIsPunctuation(char c)
        {
            return c <= 127
                ? s_asciiPunctuation.Contains(c)
                : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c));
        }

        /// <summary>
        /// Reference whitespace predicate.
        /// </summary>
        private static bool ExpectedIsWhitespace(char c)
        {
            // A Unicode whitespace character is any code point in the Unicode Zs general category,
            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
            return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' ||
                   CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
        }

        [Test]
        public void IsWhitespace()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c), $"IsWhitespace({Describe(c)})");
            }
        }

        [Test]
        public void CheckUnicodeCategory()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // U+0000 is expected to be reported as both space and punctuation.
                bool expectedSpace = c == 0 || ExpectedIsWhitespace(c);
                bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c);

                CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual);

                Assert.AreEqual(expectedSpace, spaceActual, $"CheckUnicodeCategory({Describe(c)}): space");
                Assert.AreEqual(expectedPunctuation, punctuationActual, $"CheckUnicodeCategory({Describe(c)}): punctuation");
            }
        }

        [Test]
        public void IsSpaceOrPunctuation()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // Same expectation as CheckUnicodeCategory, folded into a single flag.
                bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c);

                Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c), $"IsSpaceOrPunctuation({Describe(c)})");
            }
        }
    }
}

View File

@@ -8,6 +8,7 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Markdig.Extensions.JiraLinks;
using Markdig.Renderers.Roundtrip;
using Markdig.Syntax;
using NUnit.Framework;
@@ -67,6 +68,15 @@ namespace Markdig.Tests
TestDescendantsOrder.TestSchemas(specsSyntaxTrees);
}
/// <summary>
/// Parses an empty string with trivia tracking enabled and checks that rendering the
/// resulting document through <see cref="RoundtripRenderer"/> yields an empty string again.
/// </summary>
[Test]
public void ParseEmptyDocumentWithTrackTriviaEnabled()
{
    var emptyDocument = Markdown.Parse("", trackTrivia: true);

    using (var output = new StringWriter())
    {
        var renderer = new RoundtripRenderer(output);
        renderer.Render(emptyDocument);

        Assert.AreEqual("", output.ToString());
    }
}
public static void TestSpec(string inputText, string expectedOutputText, string extensions = null, bool plainText = false, string context = null)
{
context ??= string.Empty;

View File

@@ -10,9 +10,7 @@ namespace Markdig.Tests
{
[TestCase("| S | T |\r\n|---|---| \r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")]
[TestCase("| S | \r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)]
public void TestTableBug(string markdown, int tableCount = 1)
{

View File

@@ -53,7 +53,7 @@ namespace Markdig.Helpers
// A right-flanking delimiter run is a delimiter run that is
// (1) not preceded by Unicode whitespace, and either
// (1a) not preceded by a punctuation character, or
// (2a) not preceded by a punctuation character, or
// (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character.
// For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.
canClose = !prevIsWhiteSpace &&
@@ -144,9 +144,37 @@ namespace Markdig.Helpers
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this char c)
{
// 2.1 Characters and lines
// A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D).
return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r');
// 2.1 Characters and lines
// A Unicode whitespace character is any code point in the Unicode Zs general category,
// or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
if (c <= ' ')
{
const long Mask =
(1L << ' ') |
(1L << '\t') |
(1L << '\n') |
(1L << '\f') |
(1L << '\r');
return (Mask & (1L << c)) != 0;
}
return c >= '\u00A0' && IsWhitespaceRare(c);
static bool IsWhitespaceRare(char c)
{
// return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
if (c < 5760)
{
return c == '\u00A0';
}
else
{
return c <= 12288 &&
(c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288);
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -171,46 +199,47 @@ namespace Markdig.Helpers
// Check if a char is a space or a punctuation
public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation)
{
// Credits: code from CommonMark.NET
// Copyright (c) 2014, Kārlis Gaņģis All rights reserved.
// See license for details: https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
if (c <= 'ÿ')
if (IsWhitespace(c))
{
space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085';
punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
space = true;
punctuation = false;
}
else if (c <= 127)
{
space = c == '\0';
punctuation = c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
space = category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator;
punctuation = !space &&
(category == UnicodeCategory.ConnectorPunctuation
// A Unicode punctuation character is an ASCII punctuation character
// or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps.
space = false;
UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
punctuation = category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
|| category == UnicodeCategory.InitialQuotePunctuation
|| category == UnicodeCategory.FinalQuotePunctuation
|| category == UnicodeCategory.OtherPunctuation);
|| category == UnicodeCategory.OtherPunctuation;
}
}
// Same as CheckUnicodeCategory
internal static bool IsSpaceOrPunctuation(this char c)
{
if (c <= 'ÿ')
if (IsWhitespace(c))
{
return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' ||
(c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
return true;
}
else if (c <= 127)
{
return c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
return category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator
|| category == UnicodeCategory.ConnectorPunctuation
return category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
@@ -289,44 +318,16 @@ namespace Markdig.Helpers
public static bool IsAsciiPunctuation(this char c)
{
// 2.1 Characters and lines
// An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~.
switch (c)
{
case '!':
case '"':
case '#':
case '$':
case '%':
case '&':
case '\'':
case '(':
case ')':
case '*':
case '+':
case ',':
case '-':
case '.':
case '/':
case ':':
case ';':
case '<':
case '=':
case '>':
case '?':
case '@':
case '[':
case '\\':
case ']':
case '^':
case '_':
case '`':
case '{':
case '|':
case '}':
case '~':
return true;
}
return false;
// An ASCII punctuation character is
// !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
// :, ;, <, =, >, ?, @ (U+003A–0040),
// [, \, ], ^, _, ` (U+005B–0060),
// {, |, }, or ~ (U+007B–007E).
return c <= 127 && (
IsInInclusiveRange(c, 33, 47) ||
IsInInclusiveRange(c, 58, 64) ||
IsInInclusiveRange(c, 91, 96) ||
IsInInclusiveRange(c, 123, 126));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]

View File

@@ -4,6 +4,7 @@
using System;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
namespace Markdig.Helpers
{
@@ -193,7 +194,7 @@ namespace Markdig.Helpers
{
return false;
}
if (c == ' ' || c == '\n' || c == '"' || c == '\'' || c == '=' || c == '<' || c == '>' || c == '`')
if (IsSpaceOrSpecialHtmlChar(c))
{
break;
}
@@ -202,6 +203,26 @@ namespace Markdig.Helpers
c = text.NextChar();
}
// Returns true for a space, a line feed, or one of the characters " ' = < > `.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static bool IsSpaceOrSpecialHtmlChar(char c)
{
    // One bit per matching character whose code point is at most '>' (62);
    // each of them therefore owns a distinct bit of a 64-bit mask.
    const long LowCharBits =
        (1L << ' ') | (1L << '\n') | (1L << '"') | (1L << '\'') |
        (1L << '<') | (1L << '=') | (1L << '>');

    // '`' (96) is the only matching character above '>', so it is compared directly.
    return c <= '>'
        ? ((LowCharBits >> c) & 1L) != 0
        : c == '`';
}
// We need at least one char after '='
if (matchCount == 0)
{
@@ -227,7 +248,7 @@ namespace Markdig.Helpers
while (true)
{
c = text.NextChar();
if (c.IsAlphaNumeric() || c == '_' || c == ':' || c == '.' || c == '-')
if (c.IsAlphaNumeric() || IsCharToAppend(c))
{
builder.Append(c);
}
@@ -235,6 +256,23 @@ namespace Markdig.Helpers
{
break;
}
// Returns true for '-', '.', ':' and '_'; every other character yields false.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static bool IsCharToAppend(char c)
{
    // Distance from '-' (the smallest accepted character). The unsigned cast turns
    // anything below '-' into a huge value, so one comparison rejects both sides of the range.
    uint offset = (uint)(c - '-');
    if (offset > '_' - '-')
    {
        return false;
    }

    // One bit per accepted character, indexed by its distance from '-'.
    const long AcceptedOffsets =
        (1L << ('-' - '-'))
        | (1L << ('.' - '-'))
        | (1L << (':' - '-'))
        | (1L << ('_' - '-'));

    return ((AcceptedOffsets >> (int)offset) & 1L) != 0;
}
}
hasAttribute = true;

View File

@@ -55,7 +55,7 @@ namespace Markdig.Helpers
public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName);
[DoesNotReturn]
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName);
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message);
[DoesNotReturn]
public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index");

View File

@@ -33,7 +33,7 @@ namespace Markdig.Parsers.Inlines
/// <summary>
/// The character of this emphasis.
/// </summary>
public char Character { get; }
public char Character { get; }
/// <summary>
/// The minimum number of characters this emphasis is expected to have (must be >=1)

View File

@@ -65,7 +65,11 @@ namespace Markdig.Parsers
var noBlocksFoundBlock = new EmptyBlock(null);
List<StringSlice> linesBefore = blockProcessor.UseLinesBefore();
noBlocksFoundBlock.LinesAfter = new List<StringSlice>();
noBlocksFoundBlock.LinesAfter.AddRange(linesBefore);
if (linesBefore != null)
{
noBlocksFoundBlock.LinesAfter.AddRange(linesBefore);
}
document.Add(noBlocksFoundBlock);
}
else if (lastBlock != null && blockProcessor.LinesBefore != null)