Compare commits

..

3 Commits

Author SHA1 Message Date
Miha Zupan
cd7b9ca0ef Test netstandard (#915)
* Add GH Action to test netstandard 2.0 and 2.1

* Account for TFM changes in tests project
2025-11-17 18:46:26 +01:00
Alexandre Mutel
fb698598e4 Use central package management 2025-11-17 08:19:42 +01:00
mos379
12590e5fbe feat(link-helper): improve ASCII normalization handling (#911)
* feat(link-helper): improve ASCII normalization handling

Enhanced the `Urilize` method to better handle ASCII normalization and special characters. Added support for decomposing characters when `allowOnlyAscii` is true and skipping diacritical marks. Introduced handling for special German, Scandinavian, and Icelandic characters via new helper methods: `IsSpecialScandinavianOrGermanChar` and `NormalizeScandinavianOrGermanChar`.

Reorganized `using` directives for better clarity. Updated the processing loop in `Urilize` to handle normalized spans and ASCII equivalents more effectively. These changes improve link generation compatibility across various languages.

* Add tests for Scandinavian and German character normalization

Added tests for NormalizeScandinavianOrGermanChar method to validate character normalization for various special characters in both ASCII and non-ASCII contexts.

* test(link-helper): update ASCII transliteration tests

Updated test cases in `TestUrilizeOnlyAscii_Simple` to reflect
changes in `LinkHelper.Urilize` behavior. Non-ASCII characters
like `æ` and `ø` are now transliterated to their ASCII
equivalents (`ae` and `oe`) instead of being removed.
2025-11-10 22:01:35 +01:00
10 changed files with 248 additions and 35 deletions

44
.github/workflows/test-netstandard.yml vendored Normal file
View File

@@ -0,0 +1,44 @@
# Pull-request workflow: rewrites the library's TargetFrameworks to a single
# netstandard TFM (one job per matrix entry), then builds and runs the test
# project against it in both Debug and Release.
name: Test netstandard

on:
  pull_request:

jobs:
  test-netstandard:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        netstandard-version:
          - 'netstandard2.0'
          - 'netstandard2.1'

    steps:
      - uses: actions/checkout@v4

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: |
            8.0.x
            9.0.x

      # Markdig.targets is forced to the matrix TFM; the test project is forced
      # to net8.0;net9.0. The grep calls print the patched lines to the job log.
      - name: Patch build to test ${{ matrix.netstandard-version }}
        run: |
          cd src
          sed -i 's/<TargetFrameworks>.*<\/TargetFrameworks>/<TargetFrameworks>${{ matrix.netstandard-version }}<\/TargetFrameworks>/' Markdig/Markdig.targets
          sed -i 's/<TargetFrameworks>.*<\/TargetFrameworks>/<TargetFrameworks>net8.0;net9.0<\/TargetFrameworks>/' Markdig.Tests/Markdig.Tests.csproj
          echo "Markdig.targets TFMs:"
          grep "TargetFrameworks" Markdig/Markdig.targets
          echo "Markdig.Tests.csproj TFMs:"
          grep "TargetFrameworks" Markdig.Tests/Markdig.Tests.csproj

      # Restore once; the build steps below pass --no-restore.
      - name: Restore dependencies
        run: dotnet restore src/Markdig.Tests/Markdig.Tests.csproj

      - name: Test Debug
        run: |
          dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Debug --no-restore
          dotnet test src/Markdig.Tests/Markdig.Tests.csproj -c Debug --no-build

      - name: Test Release
        run: |
          dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Release --no-restore
          dotnet test src/Markdig.Tests/Markdig.Tests.csproj -c Release --no-build

View File

@@ -0,0 +1,23 @@
<Project>
<!-- Central package management: package versions are declared once in this file
     as PackageVersion items, keyed by package id. -->
<PropertyGroup>
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
<!-- Transitive pinning is switched off explicitly.
     NOTE(review): presumably this limits the versions below to direct references; confirm against the NuGet CPM documentation. -->
<CentralPackageTransitivePinningEnabled>false</CentralPackageTransitivePinningEnabled>
</PropertyGroup>
<!-- Versions declared unconditionally, for every target framework. -->
<ItemGroup>
<PackageVersion Include="BenchmarkDotNet" Version="0.14.0" />
<PackageVersion Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.14.0" />
<PackageVersion Include="CommonMark.NET" Version="0.15.1" />
<PackageVersion Include="Markdown" Version="2.2.1" />
<PackageVersion Include="MarkdownSharp" Version="2.0.5" />
<PackageVersion Include="Microsoft.ApplicationInsights.AspNetCore" Version="2.23.0" />
<PackageVersion Include="Microsoft.Diagnostics.Runtime" Version="3.1.512801" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="18.0.1" />
<PackageVersion Include="MinVer" Version="6.0.0" />
<PackageVersion Include="NUnit" Version="4.4.0" />
<PackageVersion Include="NUnit3TestAdapter" Version="5.2.0" />
<PackageVersion Include="SharpFuzz" Version="2.2.0" />
</ItemGroup>
<!-- System.Memory's version is declared only when building for net462 or netstandard2.0. -->
<ItemGroup Condition=" '$(TargetFramework)' == 'net462' OR '$(TargetFramework)' == 'netstandard2.0'">
<PackageVersion Include="System.Memory" Version="4.6.3" />
</ItemGroup>
</Project>

View File

@@ -19,12 +19,12 @@
</Content>
</ItemGroup>
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.14.0" />
<PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="0.14.0" />
<PackageReference Include="CommonMark.NET" Version="0.15.1" />
<PackageReference Include="Markdown" Version="2.2.1" />
<PackageReference Include="MarkdownSharp" Version="2.0.5" />
<PackageReference Include="Microsoft.Diagnostics.Runtime" Version="3.1.512801" />
<PackageReference Include="BenchmarkDotNet" />
<PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" />
<PackageReference Include="CommonMark.NET" />
<PackageReference Include="Markdown" />
<PackageReference Include="MarkdownSharp" />
<PackageReference Include="Microsoft.Diagnostics.Runtime" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Markdig\Markdig.csproj" />

View File

@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
@@ -9,7 +9,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="SharpFuzz" Version="2.2.0" />
<PackageReference Include="SharpFuzz" />
</ItemGroup>
<ItemGroup>

View File

@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net6.0;net8.0;net9.0</TargetFrameworks>
<TargetFrameworks>net8.0;net9.0</TargetFrameworks>
<OutputType>Exe</OutputType>
<IsPackable>false</IsPackable>
<ImplicitUsings>enable</ImplicitUsings>
@@ -13,9 +13,9 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
<PackageReference Include="NUnit" Version="4.3.2" />
<PackageReference Include="NUnit3TestAdapter" Version="4.6.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="NUnit" />
<PackageReference Include="NUnit3TestAdapter" />
</ItemGroup>
<ItemGroup>
@@ -36,10 +36,10 @@
<InputSpecFiles Remove="Specs\readme.md" />
<!-- Allow Visual Studio up-to-date check to verify that nothing has changed - https://github.com/dotnet/project-system/blob/main/docs/up-to-date-check.md -->
<UpToDateCheckInput Include="@(InputSpecFiles)" />
<OutputSpecFiles Include="@(InputSpecFiles->'%(RelativeDir)%(Filename).generated.cs')" />
<OutputSpecFiles Include="@(InputSpecFiles-&gt;'%(RelativeDir)%(Filename).generated.cs')" />
</ItemGroup>
<Target Name="GeneratedSpecsFile" BeforeTargets="BeforeCompile;CoreCompile" Inputs="@(ItemSpecExecutable);@(InputSpecFiles)" Outputs="@(ItemSpecExecutable->'%(RelativeDir)%(Filename).timestamp');@(InputSpecFiles->'%(RelativeDir)%(Filename).generated.cs')">
<Target Name="GeneratedSpecsFile" BeforeTargets="BeforeCompile;CoreCompile" Inputs="@(ItemSpecExecutable);@(InputSpecFiles)" Outputs="@(ItemSpecExecutable-&gt;'%(RelativeDir)%(Filename).timestamp');@(InputSpecFiles-&gt;'%(RelativeDir)%(Filename).generated.cs')">
<Message Importance="high" Text="Regenerating Specs Files" />
<Exec Command="dotnet $(SpecExecutable)" />
<WriteLinesToFile File="$(SpecTimestamp)" Lines="$([System.DateTime]::Now)" />

View File

@@ -327,8 +327,8 @@ public class TestLinkHelper
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
}
[TestCase("bær", "br")]
[TestCase("bør", "br")]
[TestCase("bær", "baer")]
[TestCase("bør", "boer")]
[TestCase("bΘr", "br")]
[TestCase("四五", "")]
public void TestUrilizeOnlyAscii_NonAscii(string input, string expectedResult)
@@ -343,6 +343,75 @@ public class TestLinkHelper
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, true));
}
// Covers the letters handled by NormalizeScandinavianOrGermanChar that have no
// canonical (NFD) decomposition: ß, æ/Æ, ø/Ø, þ and ð.
//
// Because NFD leaves them untouched, the expected slug is identical whether
// allowOnlyAscii is true or false, so each case is asserted in both modes.
//
// ä, ö, ü and å are intentionally not listed here: with allowOnlyAscii=true they
// are decomposed to base letter + combining mark and the mark is dropped
// (ü -> u), whereas with allowOnlyAscii=false they are expanded (ü -> ue),
// so their expected output depends on the mode.

// German sharp s
[TestCase("Straße", "strasse")]        // ß -> ss
// Danish/Norwegian ligature and slashed o
[TestCase("æble", "aeble")]            // æ -> ae
[TestCase("Ærø", "aeroe")]             // Æ -> Ae, ø -> oe, result lowercased
[TestCase("København", "koebenhavn")]  // ø -> oe
[TestCase("Øresund", "oeresund")]      // Ø -> Oe, result lowercased
// Icelandic thorn and eth
[TestCase("þing", "thing")]            // þ -> th
[TestCase("bað", "bad")]               // ð -> d
// Several special letters in one heading
[TestCase("øst-æble", "oest-aeble")]   // ø -> oe, æ -> ae
public void TestUrilizeScandinavianGermanChars(string input, string expectedResult)
{
    // ASCII-only mode is checked first, then the non-ASCII mode.
    foreach (bool onlyAscii in new[] { true, false })
    {
        Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, onlyAscii));
    }
}
// allowOnlyAscii=true only.
// Accented letters that have an NFD decomposition (ä, ö, ü, å, ó, Í, ...) are
// reduced to their base letter because the combining mark is skipped.
// Letters without a decomposition (ß, Þ) still get their explicit replacement.
[TestCase("schön", "schon")]                // ö -> o
[TestCase("Mädchen", "madchen")]            // ä -> a
[TestCase("Übung", "ubung")]                // Ü -> U, result lowercased
[TestCase("Düsseldorf", "dusseldorf")]      // ü -> u
[TestCase("Käse", "kase")]                  // ä -> a
[TestCase("gå", "ga")]                      // å -> a
[TestCase("Ålesund", "alesund")]            // Å -> A, result lowercased
[TestCase("grüßen", "grussen")]             // ü -> u, ß -> ss
[TestCase("Þór", "thor")]                   // Þ -> Th, ó -> o, result lowercased
[TestCase("Íslandsbanki", "islandsbanki")]  // Í -> I, result lowercased
public void TestUrilizeOnlyAscii_GermanUmlautsDecompose(string input, string expectedResult)
{
    var asciiSlug = LinkHelper.Urilize(input, true);

    Assert.AreEqual(expectedResult, asciiSlug);
}
// allowOnlyAscii=false only.
// No decomposition happens in this mode, so ä, ö, ü and å reach
// NormalizeScandinavianOrGermanChar and are expanded to two letters.
// Other accented letters (ó, í) are not in that mapping and stay as they are.
[TestCase("schön", "schoen")]               // ö -> oe
[TestCase("Mädchen", "maedchen")]           // ä -> ae
[TestCase("Übung", "uebung")]               // Ü -> Ue, result lowercased
[TestCase("Düsseldorf", "duesseldorf")]     // ü -> ue
[TestCase("Käse", "kaese")]                 // ä -> ae
[TestCase("gå", "gaa")]                     // å -> aa
[TestCase("Ålesund", "aalesund")]           // Å -> Aa, result lowercased
[TestCase("grüßen", "gruessen")]            // ü -> ue, ß -> ss
[TestCase("Þór", "thór")]                   // Þ -> Th (lowercased), ó kept
[TestCase("Íslandsbanki", "íslandsbanki")]  // Í only lowercased, accent kept
public void TestUrilizeNonAscii_GermanUmlautsExpanded(string input, string expectedResult)
{
    var unicodeSlug = LinkHelper.Urilize(input, false);

    Assert.AreEqual(expectedResult, unicodeSlug);
}
[TestCase("123", "")]
[TestCase("1,-b", "b")]
[TestCase("b1,-", "b1")] // Not Pandoc equivalent: b1-
@@ -360,11 +429,11 @@ public class TestLinkHelper
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
}
[TestCase("bær", "bær")]
[TestCase("æ5el", "æ5el")]
[TestCase("-æ5el", "æ5el")]
[TestCase("-frø-", "frø")]
[TestCase("-fr-ø", "fr-ø")]
[TestCase("bær", "baer")]
[TestCase("æ5el", "ae5el")]
[TestCase("-æ5el", "ae5el")]
[TestCase("-frø-", "froe")]
[TestCase("-fr-ø", "fr-oe")]
public void TestUrilizeNonAscii_Simple(string input, string expectedResult)
{
Assert.AreEqual(expectedResult, LinkHelper.Urilize(input, false));
@@ -393,4 +462,4 @@ public class TestLinkHelper
{
TestParser.TestSpec("[Foo]\n\n[Foo]: http://ünicode.com", "<p><a href=\"http://xn--nicode-2ya.com\">Foo</a></p>");
}
}
}

View File

@@ -14,7 +14,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.ApplicationInsights.AspNetCore" Version="2.22.0" />
<PackageReference Include="Microsoft.ApplicationInsights.AspNetCore" />
</ItemGroup>
<ItemGroup>

View File

@@ -2,11 +2,13 @@
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using Markdig.Syntax;
using System.Buffers;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.CompilerServices;
using Markdig.Syntax;
using System.Text;
namespace Markdig.Helpers;
@@ -30,11 +32,38 @@ public static class LinkHelper
var headingBuffer = new ValueStringBuilder(stackalloc char[ValueStringBuilder.StackallocThreshold]);
bool hasLetter = keepOpeningDigits && headingText.Length > 0 && char.IsLetterOrDigit(headingText[0]);
bool previousIsSpace = false;
for (int i = 0; i < headingText.Length; i++)
// First normalize the string to decompose characters if allowOnlyAscii is true
string normalizedString = string.Empty;
if (allowOnlyAscii)
{
var c = headingText[i];
var normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
for (int j = 0; j < (normalized?.Length ?? 1); j++)
normalizedString = headingText.ToString().Normalize(NormalizationForm.FormD);
}
var textToProcess = string.IsNullOrEmpty(normalizedString) ? headingText : normalizedString.AsSpan();
for (int i = 0; i < textToProcess.Length; i++)
{
var c = textToProcess[i];
// Skip combining diacritical marks when normalized
if (allowOnlyAscii && CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
{
continue;
}
// Handle German umlauts and Norwegian/Danish characters explicitly (they don't decompose properly)
ReadOnlySpan<char> normalized;
if (IsSpecialScandinavianOrGermanChar(c))
{
normalized = NormalizeScandinavianOrGermanChar(c);
}
else
{
normalized = allowOnlyAscii ? CharNormalizer.ConvertToAscii(c) : null;
}
for (int j = 0; j < (normalized.Length < 1 ? 1 : normalized.Length); j++)
{
if (normalized != null)
{
@@ -101,6 +130,50 @@ public static class LinkHelper
return headingBuffer.ToString();
}
/// <summary>
/// Tells whether <paramref name="c"/> is one of the letters that
/// <see cref="NormalizeScandinavianOrGermanChar"/> replaces with an ASCII sequence.
/// </summary>
/// <param name="c">The character to classify.</param>
/// <returns>
/// <c>true</c> for the German umlauts and ß, for æ, ø, å and for the Icelandic þ and ð
/// (upper- and lower-case forms where both are listed); otherwise <c>false</c>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsSpecialScandinavianOrGermanChar(char c) =>
    c is 'ä' or 'Ä'      // German umlauts
      or 'ö' or 'Ö'
      or 'ü' or 'Ü'
      or 'ß'             // German sharp s (no upper-case entry)
      or 'æ' or 'Æ'      // Norwegian/Danish/Swedish
      or 'ø' or 'Ø'
      or 'å' or 'Å'
      or 'þ' or 'Þ'      // Icelandic thorn
      or 'ð' or 'Ð';     // Icelandic eth
/// <summary>
/// Maps a German, Scandinavian or Icelandic special letter to its ASCII replacement.
/// </summary>
/// <param name="c">The character to translate.</param>
/// <returns>
/// The replacement characters, keeping the case of the input on the first letter
/// (for example <c>ä</c> gives <c>ae</c> and <c>Ä</c> gives <c>Ae</c>), or an empty span
/// when <paramref name="c"/> is not one of the handled letters.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ReadOnlySpan<char> NormalizeScandinavianOrGermanChar(char c)
{
    // Cases are grouped by replacement, so letters from different languages
    // that share a transliteration sit together.
    switch (c)
    {
        case 'ä':
        case 'æ':
            return "ae";
        case 'Ä':
        case 'Æ':
            return "Ae";

        case 'ö':
        case 'ø':
            return "oe";
        case 'Ö':
        case 'Ø':
            return "Oe";

        case 'ü':
            return "ue";
        case 'Ü':
            return "Ue";

        case 'ß':
            return "ss";

        case 'å':
            return "aa";
        case 'Å':
            return "Aa";

        case 'þ':
            return "th";
        case 'Þ':
            return "Th";

        case 'ð':
            return "d";
        case 'Ð':
            return "D";

        default:
            return ReadOnlySpan<char>.Empty;
    }
}
public static string UrilizeAsGfm(string headingText)
{
return UrilizeAsGfm(headingText.AsSpan());
@@ -218,7 +291,8 @@ public static class LinkHelper
}
state = 1;
break;
} else if (c == '@')
}
else if (c == '@')
{
if (state > 0)
{
@@ -234,7 +308,7 @@ public static class LinkHelper
}
// append ':' or '@'
builder.Append(c);
builder.Append(c);
if (state < 0)
{

View File

@@ -25,17 +25,16 @@
</PropertyGroup>
<ItemGroup Condition=" '$(TargetFramework)' == 'net462' OR '$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="System.Memory" Version="4.6.0" />
<PackageReference Include="System.Memory" />
</ItemGroup>
<ItemGroup>
<None Include="../../img/markdig.png" Pack="true" PackagePath="" />
<None Include="../../readme.md" Pack="true" PackagePath="/"/>
<PackageReference Include="MinVer" Version="4.3.0">
<None Include="../../readme.md" Pack="true" PackagePath="/" />
<PackageReference Include="MinVer">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="8.0.*" PrivateAssets="All"/>
</ItemGroup>
<Target Name="PatchVersion" AfterTargets="MinVer">

View File

@@ -2,13 +2,17 @@
<Folder Name="/Build/">
<File Path="../.editorconfig" />
<File Path="../.gitattributes" />
<File Path="../.github/workflows/ci.yml" />
<File Path="../.gitignore" />
<File Path="../changelog.md" />
<File Path="../license.txt" />
<File Path="../readme.md" />
<File Path="Directory.Packages.props" />
<File Path="global.json" />
</Folder>
<Folder Name="/Build/GitHub Actions/">
<File Path="../.github/workflows/ci.yml" />
<File Path="../.github/workflows/test-netstandard.yml" />
</Folder>
<Project Path="Markdig.Benchmarks/Markdig.Benchmarks.csproj">
<BuildDependency Project="Markdig/Markdig.csproj" />
</Project>