Merge pull request #666 from meziantou/issues/665

Fix ArgumentNullException (ANE) when parsing empty documents with trackTrivia enabled
2026-02-06 21:36:15 +00:00 · 2022-09-27 07:24:22 +02:00 · 2022-09-26 21:55:28 -04:00 · 2022-08-12 07:46:24 +02:00 · 2022-08-12 07:45:31 +02:00 · 2022-07-21 12:11:03 +02:00
8 changed files with 212 additions and 69 deletions
--- a/src/Markdig.Tests/TestCharHelper.cs
+++ b/src/Markdig.Tests/TestCharHelper.cs
@@ -0,0 +1,92 @@
+using System.Collections.Generic;
+using System.Globalization;
+using Markdig.Helpers;
+using NUnit.Framework;
+
+namespace Markdig.Tests
+{
+    public class TestCharHelper
+    {
+        // An ASCII punctuation character is
+        // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
+        // :, ;, <, =, >, ?, @ (U+003A–0040),
+        // [, \, ], ^, _, ` (U+005B–0060),
+        // {, |, }, or ~ (U+007B–007E).
+        private static readonly HashSet<char> s_asciiPunctuation = new()
+        {
+            '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
+            ':', ';', '<', '=', '>', '?', '@',
+            '[', '\\', ']', '^', '_', '`',
+            '{', '|', '}', '~'
+        };
+
+        // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories
+        // Pc, Pd, Pe, Pf, Pi, Po, or Ps.
+        private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new()
+        {
+            UnicodeCategory.ConnectorPunctuation,
+            UnicodeCategory.DashPunctuation,
+            UnicodeCategory.ClosePunctuation,
+            UnicodeCategory.FinalQuotePunctuation,
+            UnicodeCategory.InitialQuotePunctuation,
+            UnicodeCategory.OtherPunctuation,
+            UnicodeCategory.OpenPunctuation
+        };
+
+        private static bool ExpectedIsPunctuation(char c)
+        {
+            return c <= 127
+                ? s_asciiPunctuation.Contains(c)
+                : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c));
+        }
+
+        private static bool ExpectedIsWhitespace(char c)
+        {
+            // A Unicode whitespace character is any code point in the Unicode Zs general category,
+            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
+            return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' ||
+                CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
+        }
+
+        [Test]
+        public void IsWhitespace()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+
+                Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c));
+            }
+        }
+
+        [Test]
+        public void CheckUnicodeCategory()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+
+                bool expectedSpace = c == 0 || ExpectedIsWhitespace(c);
+                bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c);
+
+                CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual);
+
+                Assert.AreEqual(expectedSpace, spaceActual);
+                Assert.AreEqual(expectedPunctuation, punctuationActual);
+            }
+        }
+
+        [Test]
+        public void IsSpaceOrPunctuation()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+
+                bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c);
+
+                Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c));
+            }
+        }
+    }
+}
--- a/src/Markdig.Tests/TestParser.cs
+++ b/src/Markdig.Tests/TestParser.cs
@@ -8,6 +8,7 @@ using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using Markdig.Extensions.JiraLinks;
+using Markdig.Renderers.Roundtrip;
 using Markdig.Syntax;
 using NUnit.Framework;

@@ -67,6 +68,15 @@ namespace Markdig.Tests
            TestDescendantsOrder.TestSchemas(specsSyntaxTrees);
        }

+        [Test]
+        public void ParseEmptyDocumentWithTrackTriviaEnabled()
+        {
+            var document = Markdown.Parse("", trackTrivia: true);
+            using var sw = new StringWriter();
+            new RoundtripRenderer(sw).Render(document);
+            Assert.AreEqual("", sw.ToString());
+        }
+
        public static void TestSpec(string inputText, string expectedOutputText, string extensions = null, bool plainText = false, string context = null)
        {
            context ??= string.Empty;
--- a/src/Markdig.Tests/TestPipeTable.cs
+++ b/src/Markdig.Tests/TestPipeTable.cs
@@ -10,9 +10,7 @@ namespace Markdig.Tests
    {
        [TestCase("| S | T |\r\n|---|---| \r\n| G | H |")]
        [TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")]
-        [TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")]
        [TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")]
-        [TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")]
        [TestCase("| S | \r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)]
        public void TestTableBug(string markdown, int tableCount = 1)
        {
--- a/src/Markdig/Helpers/CharHelper.cs
+++ b/src/Markdig/Helpers/CharHelper.cs
@@ -53,7 +53,7 @@ namespace Markdig.Helpers

            // A right-flanking delimiter run is a delimiter run that is
            // (1) not preceded by Unicode whitespace, and either
-            // (1a) not preceded by a punctuation character, or
+            // (2a) not preceded by a punctuation character, or
            // (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character.
            // For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.
            canClose = !prevIsWhiteSpace &&
@@ -144,9 +144,37 @@ namespace Markdig.Helpers
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static bool IsWhitespace(this char c)
        {
-            // 2.1 Characters and lines 
-            // A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D).
-            return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r');
+            // 2.1 Characters and lines
+            // A Unicode whitespace character is any code point in the Unicode Zs general category,
+            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
+            if (c <= ' ')
+            {
+                const long Mask =
+                    (1L << ' ') |
+                    (1L << '\t') |
+                    (1L << '\n') |
+                    (1L << '\f') |
+                    (1L << '\r');
+
+                return (Mask & (1L << c)) != 0;
+            }
+
+            return c >= '\u00A0' && IsWhitespaceRare(c);
+
+            static bool IsWhitespaceRare(char c)
+            {
+                // return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
+
+                if (c < 5760)
+                {
+                    return c == '\u00A0';
+                }
+                else
+                {
+                    return c <= 12288 &&
+                        (c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288);
+                }
+            }
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -171,46 +199,47 @@ namespace Markdig.Helpers
        // Check if a char is a space or a punctuation
        public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation)
        {
-            // Credits: code from CommonMark.NET
-            // Copyright (c) 2014, Kārlis Gaņģis All rights reserved. 
-            // See license for details:  https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
-            if (c <= 'ÿ')
+            if (IsWhitespace(c))
            {
-                space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085';
-                punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
+                space = true;
+                punctuation = false;
+            }
+            else if (c <= 127)
+            {
+                space = c == '\0';
+                punctuation = c == '\0' || IsAsciiPunctuation(c);
            }
            else
            {
-                var category = CharUnicodeInfo.GetUnicodeCategory(c);
-                space = category == UnicodeCategory.SpaceSeparator
-                    || category == UnicodeCategory.LineSeparator
-                    || category == UnicodeCategory.ParagraphSeparator;
-                punctuation = !space &&
-                    (category == UnicodeCategory.ConnectorPunctuation
+                // A Unicode punctuation character is an ASCII punctuation character
+                // or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps.
+                space = false;
+                UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
+                punctuation = category == UnicodeCategory.ConnectorPunctuation
                    || category == UnicodeCategory.DashPunctuation
                    || category == UnicodeCategory.OpenPunctuation
                    || category == UnicodeCategory.ClosePunctuation
                    || category == UnicodeCategory.InitialQuotePunctuation
                    || category == UnicodeCategory.FinalQuotePunctuation
-                    || category == UnicodeCategory.OtherPunctuation);
+                    || category == UnicodeCategory.OtherPunctuation;
            }
        }

        // Same as CheckUnicodeCategory
        internal static bool IsSpaceOrPunctuation(this char c)
        {
-            if (c <= 'ÿ')
+            if (IsWhitespace(c))
            {
-                return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' ||
-                    (c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
+                return true;
+            }
+            else if (c <= 127)
+            {
+                return c == '\0' || IsAsciiPunctuation(c);
            }
            else
            {
                var category = CharUnicodeInfo.GetUnicodeCategory(c);
-                return category == UnicodeCategory.SpaceSeparator
-                    || category == UnicodeCategory.LineSeparator
-                    || category == UnicodeCategory.ParagraphSeparator
-                    || category == UnicodeCategory.ConnectorPunctuation
+                return category == UnicodeCategory.ConnectorPunctuation
                    || category == UnicodeCategory.DashPunctuation
                    || category == UnicodeCategory.OpenPunctuation
                    || category == UnicodeCategory.ClosePunctuation
@@ -289,44 +318,16 @@ namespace Markdig.Helpers
        public static bool IsAsciiPunctuation(this char c)
        {
            // 2.1 Characters and lines 
-            // An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~.
-            switch (c)
-            {
-                case '!':
-                case '"':
-                case '#':
-                case '$':
-                case '%':
-                case '&':
-                case '\'':
-                case '(':
-                case ')':
-                case '*':
-                case '+':
-                case ',':
-                case '-':
-                case '.':
-                case '/':
-                case ':':
-                case ';':
-                case '<':
-                case '=':
-                case '>':
-                case '?':
-                case '@':
-                case '[':
-                case '\\':
-                case ']':
-                case '^':
-                case '_':
-                case '`':
-                case '{':
-                case '|':
-                case '}':
-                case '~':
-                    return true;
-            }
-            return false;
+            // An ASCII punctuation character is
+            // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
+            // :, ;, <, =, >, ?, @ (U+003A–0040),
+            // [, \, ], ^, _, ` (U+005B–0060),
+            // {, |, }, or ~ (U+007B–007E).
+            return c <= 127 && (
+                IsInInclusiveRange(c, 33, 47) ||
+                IsInInclusiveRange(c, 58, 64) ||
+                IsInInclusiveRange(c, 91, 96) ||
+                IsInInclusiveRange(c, 123, 126));
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
--- a/src/Markdig/Helpers/HtmlHelper.cs
+++ b/src/Markdig/Helpers/HtmlHelper.cs
@@ -4,6 +4,7 @@

 using System;
 using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;

 namespace Markdig.Helpers
 {
@@ -193,7 +194,7 @@ namespace Markdig.Helpers
                                {
                                    return false;
                                }
-                                if (c == ' ' || c == '\n' || c == '"' || c == '\'' || c == '=' || c == '<' || c == '>' || c == '`')
+                                if (IsSpaceOrSpecialHtmlChar(c))
                                {
                                    break;
                                }
@@ -202,6 +203,26 @@ namespace Markdig.Helpers
                                c = text.NextChar();
                            }

+                            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                            static bool IsSpaceOrSpecialHtmlChar(char c)
+                            {
+                                if (c > '>')
+                                {
+                                    return c == '`';
+                                }
+
+                                const long BitMask =
+                                      (1L << ' ')
+                                    | (1L << '\n')
+                                    | (1L << '"')
+                                    | (1L << '\'')
+                                    | (1L << '=')
+                                    | (1L << '<')
+                                    | (1L << '>');
+
+                                return (BitMask & (1L << c)) != 0;
+                            }
+
                            // We need at least one char after '='
                            if (matchCount == 0)
                            {
@@ -227,7 +248,7 @@ namespace Markdig.Helpers
                        while (true)
                        {
                            c = text.NextChar();
-                            if (c.IsAlphaNumeric() || c == '_' || c == ':' || c == '.' || c == '-')
+                            if (c.IsAlphaNumeric() || IsCharToAppend(c))
                            {
                                builder.Append(c);
                            }
@@ -235,6 +256,23 @@ namespace Markdig.Helpers
                            {
                                break;
                            }
+
+                            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+                            static bool IsCharToAppend(char c)
+                            {
+                                if ((uint)(c - '-') > '_' - '-')
+                                {
+                                    return false;
+                                }
+
+                                const long BitMask =
+                                      (1L << '_')
+                                    | (1L << ':')
+                                    | (1L << '.')
+                                    | (1L << '-');
+
+                                return (BitMask & (1L << c)) != 0;
+                            }
                        }

                        hasAttribute = true;
--- a/src/Markdig/Helpers/ThrowHelper.cs
+++ b/src/Markdig/Helpers/ThrowHelper.cs
@@ -55,7 +55,7 @@ namespace Markdig.Helpers
        public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName);

        [DoesNotReturn]
-        public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName);
+        public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message);

        [DoesNotReturn]
        public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index");
--- a/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
+++ b/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
@@ -33,7 +33,7 @@ namespace Markdig.Parsers.Inlines
        /// <summary>
        /// The character of this emphasis.
        /// </summary>
-        public  char Character { get; }
+        public char Character { get; }

        /// <summary>
        /// The minimum number of character this emphasis is expected to have (must be >=1)
--- a/src/Markdig/Parsers/MarkdownParser.cs
+++ b/src/Markdig/Parsers/MarkdownParser.cs
@@ -65,7 +65,11 @@ namespace Markdig.Parsers
                        var noBlocksFoundBlock = new EmptyBlock(null);
                        List<StringSlice> linesBefore = blockProcessor.UseLinesBefore();
                        noBlocksFoundBlock.LinesAfter = new List<StringSlice>();
-                        noBlocksFoundBlock.LinesAfter.AddRange(linesBefore);
+                        if (linesBefore != null)
+                        {
+                            noBlocksFoundBlock.LinesAfter.AddRange(linesBefore);
+                        }
+
                        document.Add(noBlocksFoundBlock);
                    }
                    else if (lastBlock != null && blockProcessor.LinesBefore != null)
Author	SHA1	Message	Date
Alexandre Mutel	98c687b4ed	Merge pull request #666 from meziantou/issues/665 Fix ANE when parsing empty documents with trackTrivia enabled	2022-09-27 07:24:22 +02:00
Gérald Barré	8e4a732efe	Fix ANE when parsing empty documents with trackTrivia enabled	2022-09-26 21:55:28 -04:00
Alexandre Mutel	bce4b70dc6	Merge pull request #649 from MihaZupan/commonmark-whitespace-punctuation Align Whitespace and Punctuation definitions with CommonMark	2022-08-12 07:46:24 +02:00
Alexandre Mutel	1f71520de9	Merge pull request #650 from gfoidl/htmlhelper-TryParseHtmlTagOpenTag_remove_branches Remove some branches in HtmlHelper.TryParseHtmlTagOpenTag by using bitmask	2022-08-12 07:45:31 +02:00
Günther Foidl	bfd7b6460c	Remove some branches in HtmlHelper.TryParseHtmlTagOpenTag by using bitmasks	2022-07-21 12:11:03 +02:00
Miha Zupan	0e26ec5382	Align Whitespace and Punctuation definitions with CommonMark	2022-07-17 20:22:26 +02:00