Add CJK-friendly Emphasis Extension (#921)

* Add CJK-friendly Emphasis Extension

* Add auto-generated test file

* Add name for configuration

* Remove useless default value assignments

Co-authored-by: Miha Zupan <mihazupan.zupan1@gmail.com>

* Make `CheckOpenCloseDelimiterCjkFriendly` internal only

* Remove `CjkFriendlyEmphasisExtension` class

* Add some comments including links

* Add direct tests on `CharHelper.CheckOpenCloseDelimiterCjkFriendly`

* Fix generated tests

* Add `#if NET`

* Skip Rune-dependent tests in .NET Standard tests

* Add missing XML Documentation Comments

* Fix URL

* Change test condition

* Add test in .NET Framework 4.8.1

* Add netstandard2.0 to SpecFileGen

* Add fallback for netstandard2.0

* Fix

* Revert "Fix"

This reverts commit 42e998b085.

* Revert "Add fallback for netstandard2.0"

This reverts commit 7400a7bb0b.

* Revert "Add netstandard2.0 to SpecFileGen"

This reverts commit f9aa8e1e8d.

* Revert "Add test in .NET Framework 4.8.1"

This reverts commit d8d6d516ed.

* Fix missing indent

---------

Co-authored-by: Miha Zupan <mihazupan.zupan1@gmail.com>
This commit is contained in:
Tatsunori Uchino
2026-03-01 21:47:54 +09:00
committed by GitHub
parent 7959e3b912
commit 6d5a124863
12 changed files with 474 additions and 6 deletions

View File

@@ -35,10 +35,10 @@ jobs:
- name: Test Debug
run: |
dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Debug --no-restore
dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Debug --no-restore -p:MarkdigNoRuneTests=true
dotnet test src/Markdig.Tests/Markdig.Tests.csproj -c Debug --no-build
- name: Test Release
run: |
dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Release --no-restore
dotnet build src/Markdig.Tests/Markdig.Tests.csproj -c Release --no-restore -p:MarkdigNoRuneTests=true
dotnet test src/Markdig.Tests/Markdig.Tests.csproj -c Release --no-build

View File

@@ -51,6 +51,7 @@ You can **try Markdig online** and compare it to other implementations on [babel
- [**Diagrams**](src/Markdig.Tests/Specs/DiagramsSpecs.md) extension whenever a fenced code block contains a special keyword, it will be converted to a div block with the content as-is (currently, supports [`mermaid`](https://mermaid.js.org) and [`nomnoml`](https://github.com/skanaar/nomnoml) diagrams)
- [**YAML Front Matter**](src/Markdig.Tests/Specs/YamlSpecs.md) to parse without evaluating the front matter and to discard it from the HTML output (typically used for previewing without the front matter in MarkdownEditor)
- [**JIRA links**](src/Markdig.Tests/Specs/JiraLinks.md) to automatically generate links for JIRA project references (Thanks to @clarkd: https://github.com/clarkd/MarkdigJiraLinker)
- [**CJK-friendly Emphasis**](src/Markdig.Tests/Specs/CJKFriendlyEmphasis.md) to mitigate a CommonMark specification issue in CJK languages (Thanks to @tats-u: https://github.com/tats-u/markdown-cjk-friendly)
- Starting with Markdig version `0.20.0+`, Markdig is compatible only with `NETStandard 2.0`, `NETStandard 2.1`, `NETCoreApp 2.1` and `NETCoreApp 3.1`.
If you are looking for support for an old .NET Framework 3.5 or 4.0, you can download Markdig `0.18.3`.

View File

@@ -12,6 +12,10 @@
<NoWarn>$(NoWarn);NETSDK1138</NoWarn>
</PropertyGroup>
<PropertyGroup Condition="'$(MarkdigNoRuneTests)' == 'true'">
<DefineConstants>$(DefineConstants);MARKDIG_NO_RUNE_TESTS</DefineConstants>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="NUnit" />

View File

@@ -0,0 +1,67 @@
// --------------------------------
// CJK-friendly Emphasis
// --------------------------------
using System;
using NUnit.Framework;
namespace Markdig.Tests.Specs.CJKFriendlyEmphasis
{
// NOTE(review): this file mirrors Specs/CJKFriendlyEmphasis.md and looks generated (presumably by SpecFileGen).
// Confirm before editing by hand; manual changes may be overwritten on regeneration.
[TestFixture]
public class TestCJKFriendlyEmphasisExtension
{
// ## CJK-friendly Emphasis Extension
//
// See https://github.com/tats-u/markdown-cjk-friendly/blob/main/specification.md for details about the spec of this extension.
//
// This extension drastically mitigates [the long-standing issue (specification flaw)](https://github.com/commonmark/commonmark-spec/issues/650) in CommonMark that emphasis in CJK languages is often not parsed as expected.
//
// The plain CommonMark cannot recognize even the following emphasis in CJK languages:
[Test]
public void CJKFriendlyEmphasisExtension_Example001()
{
// Example 1
// Section: CJK-friendly Emphasis Extension
//
// The following Markdown:
// **この文を強調できますかCan I emphasize this sentence**残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。
//
// Should be rendered as:
// <p><strong>この文を強調できますかCan I emphasize this sentence</strong>残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。</p>

// Arguments: Markdown input, expected HTML, extension list ("cjk-friendly-emphasis"), and a context label for the example.
TestParser.TestSpec("**この文を強調できますかCan I emphasize this sentence**残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。", "<p><strong>この文を強調できますかCan I emphasize this sentence</strong>残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。</p>", "cjk-friendly-emphasis", context: "Example 1\nSection CJK-friendly Emphasis Extension\n");
}
// NOTE(review): the block below was emitted as a comment, not as a test method, so it is never executed.
// Its fence appears to have a different length from the fences of the other examples in the spec file - confirm whether that is intentional.
// ````````````````````````````````` example
// 我可以强调**这个`code`**吗Can I emphasize **this `code`**
// .
// <p>我可以强调<code>这个`code`</code>吗Can I emphasize <strong>this <code>code</code></strong></p>
// `````````````````````````````````
[Test]
public void CJKFriendlyEmphasisExtension_Example002()
{
// Example 2
// Section: CJK-friendly Emphasis Extension
//
// The following Markdown:
// **이 용어(This term)**를 강조해 주세요. (Please emphasize **this term**.)
//
// Should be rendered as:
// <p><strong>이 용어(This term)</strong>를 강조해 주세요. (Please emphasize <strong>this term</strong>.)</p>

// Arguments: Markdown input, expected HTML, extension list ("cjk-friendly-emphasis"), and a context label for the example.
TestParser.TestSpec("**이 용어(This term)**를 강조해 주세요. (Please emphasize **this term**.)", "<p><strong>이 용어(This term)</strong>를 강조해 주세요. (Please emphasize <strong>this term</strong>.)</p>", "cjk-friendly-emphasis", context: "Example 2\nSection CJK-friendly Emphasis Extension\n");
}
// You can compare the results with and without this extension: https://tats-u.github.io/markdown-cjk-friendly/?sc8=KirjgZPjga7mlofjgpLlvLfoqr_jgafjgY3jgb7jgZnjgYvvvIhDYW4gSSBlbXBoYXNpemUgdGhpcyBzZW50ZW5jZe-8ie-8nyoq5q6L5b-144Gq44GM44KJ44GT44Gu5paH44Gu44Gb44GE44Gn44Gn44GN44G-44Gb44KT77yIVW5mb3J0dW5hdGVseSBub3QgcG9zc2libGUgZHVlIHRvIHRoaXMgc2VudGVuY2XvvInjgIIKCuaIkeWPr-S7peW8uuiwgyoq6L-Z5LiqYGNvZGVgKirlkJfvvIhDYW4gSSBlbXBoYXNpemUgKip0aGlzIGBjb2RlYCoq77yJ77yfCgoqKuydtCDsmqnslrQoVGhpcyB0ZXJtKSoq66W8IOqwleyhsO2VtCDso7zshLjsmpQuIChQbGVhc2UgZW1waGFzaXplICoqdGhpcyB0ZXJtKiouKQo&gfm=1&engine=markdown-it
//
// You will find how poor the plain CommonMark is for CJK languages.
//
// To use this extension, configure the pipeline as follows:
//
// ```csharp
// var pipeline = new MarkdownPipelineBuilder()
// .UseCjkFriendlyEmphasis() // Add this
// .Build();
// ```
}
}

View File

@@ -0,0 +1,37 @@
## CJK-friendly Emphasis Extension
See https://github.com/tats-u/markdown-cjk-friendly/blob/main/specification.md for details about the spec of this extension.
This extension drastically mitigates [the long-standing issue (specification flaw)](https://github.com/commonmark/commonmark-spec/issues/650) in CommonMark that emphasis in CJK languages is often not parsed as expected.
The plain CommonMark cannot recognize even the following emphasis in CJK languages:
```````````````````````````````` example
**この文を強調できますかCan I emphasize this sentence**残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。
.
<p><strong>この文を強調できますかCan I emphasize this sentence</strong>残念ながらこの文のせいでできませんUnfortunately not possible due to this sentence。</p>
````````````````````````````````
````````````````````````````````` example
我可以强调**这个`code`**吗Can I emphasize **this `code`**
.
<p>我可以强调<code>这个`code`</code>吗Can I emphasize <strong>this <code>code</code></strong></p>
`````````````````````````````````
```````````````````````````````` example
**이 용어(This term)**를 강조해 주세요. (Please emphasize **this term**.)
.
<p><strong>이 용어(This term)</strong>를 강조해 주세요. (Please emphasize <strong>this term</strong>.)</p>
````````````````````````````````
You can compare the results with and without this extension: https://tats-u.github.io/markdown-cjk-friendly/?sc8=KirjgZPjga7mlofjgpLlvLfoqr_jgafjgY3jgb7jgZnjgYvvvIhDYW4gSSBlbXBoYXNpemUgdGhpcyBzZW50ZW5jZe-8ie-8nyoq5q6L5b-144Gq44GM44KJ44GT44Gu5paH44Gu44Gb44GE44Gn44Gn44GN44G-44Gb44KT77yIVW5mb3J0dW5hdGVseSBub3QgcG9zc2libGUgZHVlIHRvIHRoaXMgc2VudGVuY2XvvInjgIIKCuaIkeWPr-S7peW8uuiwgyoq6L-Z5LiqYGNvZGVgKirlkJfvvIhDYW4gSSBlbXBoYXNpemUgKip0aGlzIGBjb2RlYCoq77yJ77yfCgoqKuydtCDsmqnslrQoVGhpcyB0ZXJtKSoq66W8IOqwleyhsO2VtCDso7zshLjsmpQuIChQbGVhc2UgZW1waGFzaXplICoqdGhpcyB0ZXJtKiouKQo&gfm=1&engine=markdown-it
The comparison shows how poorly plain CommonMark handles emphasis in CJK languages.
To use this extension, configure the pipeline as follows:
```csharp
var pipeline = new MarkdownPipelineBuilder()
.UseCjkFriendlyEmphasis() // Add this
.Build();
```

View File

@@ -32,5 +32,6 @@ You will find from the following links the supported extensions in markdig and t
- [**Diagrams**](DiagramsSpecs.md)
- [**YAML frontmatter**](YamlSpecs.md)
- [**JIRA links**](JiraLinks.md)
- [**CJK-friendly Emphasis**](CJKFriendlyEmphasis.md)
> Notice that the links above are not yet the final documentation but are "specification" files used for testing the correctness of markdig for each extension

View File

@@ -0,0 +1,204 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.
using Markdig.Helpers;
using System;
using System.Buffers;
using System.Collections.Generic;
using System.Linq;
#if !NET || !MARKDIG_NO_RUNE_TESTS
using System.Text;
#endif
using System.Threading.Tasks;
namespace Markdig.Tests
{
[TestFixture]
public class TestCjkFriendlyEmphasis
{
// Builds a pipeline whose only customization is CJK-friendly emphasis.
private static MarkdownPipeline GetPipeline()
{
    var builder = new MarkdownPipelineBuilder().UseCjkFriendlyEmphasis();
    return builder.Build();
}
// Builds a pipeline with CJK-friendly emphasis followed by the emphasis extras
// (the configuration used by the `~~` strikethrough cases in this fixture).
private static MarkdownPipeline GetPipelineWithStrikethrough()
{
    var builder = new MarkdownPipelineBuilder()
        .UseCjkFriendlyEmphasis()
        .UseEmphasisExtras();
    return builder.Build();
}
[Test]
[TestCase("これは**私のやりたかったこと。**だからするの。", "<p>これは<strong>私のやりたかったこと。</strong>だからするの。</p>\n")]
[TestCase("**[製品ほげ](./product-foo)****[](./product-bar)**", "<p><strong><a href=\"./product-foo\">製品ほげ</a></strong>と<strong><a href=\"./product-bar\">製品ふが</a></strong>をお試しください</p>\n")]
[TestCase("先頭の**`コード`も注意。**", "<p>先頭の<strong><code>コード</code>も注意。</strong></p>\n")]
[TestCase("**末尾の`コード`**も注意。", "<p><strong>末尾の<code>コード</code></strong>も注意。</p>\n")]
[TestCase("税込**¥10,000**で入手できます。", "<p>税込<strong>¥10,000</strong>で入手できます。</p>\n")]
[TestCase("""太郎は**"こんにちわ"**といった""", "<p>太郎は<strong>&quot;こんにちわ&quot;</strong>といった</p>\n")]
[TestCase("**C#**や**F#**は**「.NET」**というプラットフォーム上で動作します。", "<p><strong>C#</strong>や<strong>F#</strong>は<strong>「.NET」</strong>というプラットフォーム上で動作します。</p>\n")]
[TestCase(".NET**.NET Frameworkは不可**では、", "<p>.NET<strong>.NET Frameworkは不可</strong>では、</p>\n")]
[TestCase("大塚︀**(U+585A U+FE00)** 大塚**(U+FA10)**", "<p>大塚︀<strong>(U+585A U+FE00)</strong> 大塚<strong>(U+FA10)</strong></p>\n")]
[TestCase("〽︎**(庵点)**は、\n\n","<p>〽︎<strong>(庵点)</strong>は、</p>\n")]
[TestCase("**true。**false\n\n", "<p><strong>true。</strong>false</p>\n")]
[TestCase("禰󠄀**(ね)**豆子", "<p>禰󠄀<strong>(ね)</strong>豆子</p>\n")]
public void TestCjkFriendlyEmphasisJapanese(string source, string expected)
{
    // Render each Japanese sample with the CJK-friendly pipeline and compare the HTML verbatim.
    string html = Markdown.ToHtml(source, GetPipeline());

    Assert.AreEqual(expected, html);
}
[Test]
[TestCase("**이 [링크](https://example.kr/)**만을 강조하고 싶다.", "<p><strong>이 <a href=\"https://example.kr/\">링크</a></strong>만을 강조하고 싶다.</p>\n")]
[TestCase("**스크립트(script)**라고", "<p><strong>스크립트(script)</strong>라고</p>\n")]
[TestCase("패키지를 발행하려면 **`npm publish`**를 실행하십시오.", "<p>패키지를 발행하려면 <strong><code>npm publish</code></strong>를 실행하십시오.</p>\n")]
[TestCase("**안녕(hello)**하세요.", "<p><strong>안녕(hello)</strong>하세요.</p>\n")]
[TestCase("ᅡ**(a)**", "<p>ᅡ<strong>(a)</strong></p>\n")]
[TestCase("**(k)**ᄏ", "<p><strong>(k)</strong>ᄏ</p>\n")]
public void TestCjkFriendlyEmphasisKorean(string source, string expected)
{
    // Render each Korean sample with the CJK-friendly pipeline and compare the HTML verbatim.
    MarkdownPipeline cjkPipeline = GetPipeline();
    string rendered = Markdown.ToHtml(source, cjkPipeline);

    Assert.AreEqual(expected, rendered);
}
[Test]
[TestCase("__注意__注意事項", "<p><strong>注意</strong>:注意事項</p>\n")]
[TestCase("注意__注意事項__", "<p>注意:<strong>注意事項</strong></p>\n")]
[TestCase("正體字。_Traditional._", "<p>正體字。︁<em>Traditional.</em></p>\n")]
[TestCase("正體字。__Hong Kong and Taiwan.__", "<p>正體字。︁<strong>Hong Kong and Taiwan.</strong></p>\n")]
[TestCase("简体字 / 新字体。_Simplified._", "<p>简体字 / 新字体。︀<em>Simplified.</em></p>\n")]
[TestCase("简体字 / 新字体。__Mainland China or Japan.__", "<p>简体字 / 新字体。︀<strong>Mainland China or Japan.</strong></p>\n")]
[TestCase("“Git”__Hub__", "<p>“Git”<strong>Hub</strong></p>\n")]
public void TestCjkFriendlyEmphasisUnderscore(string source, string expected)
{
    // Underscore-delimited samples, rendered with the CJK-friendly pipeline.
    string html = Markdown.ToHtml(source, GetPipeline());

    Assert.AreEqual(expected, html);
}
[Test]
[TestCase("a~~a()~~あ", "<p>a<del>a()</del>あ</p>\n")]
[TestCase("あ~~()a~~a", "<p>あ<del>()a</del>a</p>\n")]
[TestCase("𩸽~~()a~~a", "<p>𩸽<del>()a</del>a</p>\n")]
[TestCase("a~~a()~~𩸽", "<p>a<del>a()</del>𩸽</p>\n")]
[TestCase("葛󠄀~~()a~~a", "<p>葛󠄀<del>()a</del>a</p>\n")]
[TestCase("羽︀~~()a~~a", "<p>羽︀<del>()a</del>a</p>\n")]
[TestCase("a~~「a~~」", "<p>a<del>「a</del>」</p>\n")]
[TestCase("「~~a」~~a", "<p>「<del>a」</del>a</p>\n")]
[TestCase("~~a~~~~a~~", "<p><del>a</del><del>a</del></p>\n")]
[TestCase("~~日本語。︀~~English.", "<p><del>日本語。︀</del>English.</p>\n")]
[TestCase("~~“a”~~a", "<p><del>“a”</del>a</p>\n")]
public void TestCjkFriendlyEmphasisGfmStrikethrough(string source, string expected)
{
    // `~~` samples need the pipeline that also enables the emphasis extras.
    MarkdownPipeline strikethroughPipeline = GetPipelineWithStrikethrough();
    string rendered = Markdown.ToHtml(source, strikethroughPipeline);

    Assert.AreEqual(expected, rendered);
}
[Test]
[TestCase("a**〰**a", "<p>a<strong>〰</strong>a</p>\n")]
[TestCase("a**〽**a", "<p>a<strong>〽</strong>a</p>\n")]
[TestCase("a**🈂**a", "<p>a<strong>🈂</strong>a</p>\n")]
[TestCase("a**🈷**a", "<p>a<strong>🈷</strong>a</p>\n")]
[TestCase("a**㊗**a", "<p>a<strong>㊗</strong>a</p>\n")]
[TestCase("a**㊙**a", "<p>a<strong>㊙</strong>a</p>\n")]
public void TestCjkFriendlyPseudoEmoji(string source, string expected)
{
    // Samples whose emphasized content is a single CJK symbol placed between Latin letters.
    string html = Markdown.ToHtml(source, GetPipeline());

    Assert.AreEqual(expected, html);
}
#if !NET || !MARKDIG_NO_RUNE_TESTS
// delimiter: '*', '_' = each character, '?' = either
// can open/close = whether the places can be in the range of emphasis
// 2 before, previous, can close, delimiter, can open, next
// *****Basic*****
[TestCase("\0", " ", false, '?', false, " ")]
[TestCase("\0", "𰻞", true, '?', false, " ")]
[TestCase("\0", " ", false, '?', true, "𰻞")]
[TestCase("\0", "𝜵", false, '?', true, "A")]
[TestCase("\0", "A", true, '?', false, "𝜵")]
[TestCase("\0", "𝜵", true, '*', true, "𰻞")]
[TestCase("\0", "A", true, '*', true, "𰻞")]
[TestCase("\0", "𰻞", true, '*', true, "𝜵")]
[TestCase("\0", "𰻞", true, '*', true, "A")]
[TestCase("\0", "𰻞", true, '*', true, "」")]
[TestCase("\0", "「", true, '*', true, "𰻞")]
[TestCase("\0", "A", true, '*', true, "」")]
[TestCase("\0", "「", true, '*', true, "A")]
[TestCase("\0", "𝜵", false, '_', true, "𰻞")]
[TestCase("\0", "A", false, '_', false, "𰻞")]
[TestCase("\0", "𰻞", true, '_', false, "𝜵")]
[TestCase("\0", "𰻞", false, '_', false, "A")]
[TestCase("\0", "𰻞", true, '_', false, "」")]
[TestCase("\0", "「", false, '_', true, "𰻞")]
[TestCase("\0", "A", true, '_', false, "」")]
[TestCase("\0", "「", false, '_', true, "A")]
// *****IVS*****
[TestCase("𩸽", "\U000E0101", true, '*', true, "𝜵")]
[TestCase("𩸽", "\U000E0101", true, '_', false, "𝜵")]
[TestCase("𩸽", "\U000E0101", true, '*', true, "𝜵")]
[TestCase("𩸽", "\U000E0101", true, '_', false, "𝜵")]
// Non-Han + U+E01XX does not appear in the wild
[TestCase("\0", "\U000E0101", true, '*', true, "𝜵")]
[TestCase("\0", "\U000E0101", true, '_', false, "𝜵")]
[TestCase("\0", "\U000E0101", true, '*', true, "𝜵")]
[TestCase("\0", "\U000E0101", true, '_', false, "𝜵")]
// *****SVS*****
[TestCase("羽", "\uFE00", true, '*', true, "𝜵")]
[TestCase("羽", "\uFE00", true, '_', false, "𝜵")]
[TestCase("羽", "\uFE00", true, '*', true, "𝜵")]
[TestCase("羽", "\uFE00", true, '_', false, "𝜵")]
// Slashed zero
[TestCase("0", "\uFE00", true, '?', false, "𝜵")]
[TestCase("0", "\uFE00", true, '?', false, "𝜵")]
[TestCase("“", "\uFE00", false, '?', true, "A")]
[TestCase("“", "\uFE01", true, '*', true, "A")]
[TestCase("“", "\uFE01", false, '_', true, "A")]
[TestCase("\0", "“", false, '?', true, "A")]
[TestCase("\0", "A", true, '?', false, "“")]
// *****Emoji*****
// Default text presentation
[TestCase("\0", "㊙", true, '*', true, "A")]
[TestCase("\0", "㊙", false, '_', true, "A")]
[TestCase("\0", "A", true, '*', true, "㊙")]
[TestCase("\0", "A", true, '_', false, "㊙")]
// Default emoji presentation
[TestCase("\0", "🈯", false, '?', true, "A")]
[TestCase("\0", "A", true, '?', false, "🈯")]
// EAW = Ambiguous (not CJK)
[TestCase("\0", "☎", false, '?', true, "A")]
// Text presentation sequences
[TestCase("㊙", "\uFE0E", true, '*', true, "A")]
[TestCase("㊙", "\uFE0E", false, '_', true, "A")]
// Caution: default emoji presentation character + text presentation selector has not been supported yet
[TestCase("🈯", "\uFE0E", false, '?', true, "A")]
// Emoji presentation sequences
[TestCase("㊙", "\uFE0F", true, '*', true, "A")]
[TestCase("㊙", "\uFE0F", false, '_', false, "A")]
[TestCase("🈯", "\uFE0F", true, '*', true, "A")]
[TestCase("🈯", "\uFE0F", false, '_', false, "A")]
// *****Korean*****
[TestCase("\0", "한", true, '*', true, "𝜵")]
[TestCase("\0", "𝜵", true, '*', true, "한")]
// A part of NFD form
[TestCase("\0", "ᆫ", true, '*', true, "𝜵")]
[TestCase("\0", "𝜵", true, '*', true, "ᆫ")]
[Test]
public void TestCheckOpenCloseDelimiterCjkFriendly(string twoPrevStr, string prevStr, bool shouldBeClosable, char delim, bool shouldBeOpenable, string nextStr)
{
    // Decode the leading rune of each string argument; every decode must succeed before continuing.
    OperationStatus twoPrevStatus = Rune.DecodeFromUtf16(twoPrevStr, out Rune twoPrev, out _);
    Assert.AreEqual(OperationStatus.Done, twoPrevStatus);
    OperationStatus prevStatus = Rune.DecodeFromUtf16(prevStr, out Rune prev, out _);
    Assert.AreEqual(OperationStatus.Done, prevStatus);
    OperationStatus nextStatus = Rune.DecodeFromUtf16(nextStr, out Rune next, out _);
    Assert.AreEqual(OperationStatus.Done, nextStatus);

    // '*' is checked with enableWithinWord = true; '_' and '?' are first checked with false.
    bool withinWord = delim == '*';
    CharHelper.CheckOpenCloseDelimiterCjkFriendly(prev, next, twoPrev, withinWord, out bool canOpen, out bool canClose);
    Assert.AreEqual(shouldBeOpenable, canOpen, "isOpen");
    Assert.AreEqual(shouldBeClosable, canClose, "isClose");

    if (delim != '?')
    {
        return;
    }

    // '?' means "either delimiter": the same expectations must also hold with enableWithinWord = true.
    CharHelper.CheckOpenCloseDelimiterCjkFriendly(prev, next, twoPrev, true, out canOpen, out canClose);
    Assert.AreEqual(shouldBeOpenable, canOpen, "isOpen (*)");
    Assert.AreEqual(shouldBeClosable, canClose, "isClose (*)");
}
#endif
}
}

View File

@@ -9,7 +9,7 @@ namespace Markdig.Tests;
[TestFixture]
public class TestStringSlice
{
#if NET
#if !NET || !MARKDIG_NO_RUNE_TESTS
[Test]
public void TestRuneBmp()
{

View File

@@ -165,6 +165,117 @@ public static class CharHelper
}
}
// The signature of this method is still unstable and can be changed in the future. `internal`-only as for now.
/// <summary>
/// Computes whether a delimiter run located between <paramref name="pc"/> and <paramref name="c"/>
/// can open and/or close emphasis, using the CJK-friendly variant of the flanking rules.
/// </summary>
/// <param name="pc">The rune immediately before the delimiter run.</param>
/// <param name="c">The rune immediately after the delimiter run.</param>
/// <param name="twoPreviousRune">The rune before <paramref name="pc"/>. It is only consulted when <paramref name="pc"/> is a variation selector in U+FE00..U+FE0E.</param>
/// <param name="enableWithinWord"><c>true</c> to apply the CJK-aware rules (the path used for <c>*</c>); <c>false</c> takes the shortcut described below as the fast path for <c>_</c>.</param>
/// <param name="canOpen">Set to <c>true</c> when the delimiter run may open emphasis.</param>
/// <param name="canClose">Set to <c>true</c> when the delimiter run may close emphasis.</param>
internal static void CheckOpenCloseDelimiterCjkFriendly(Rune pc, Rune c, Rune twoPreviousRune, bool enableWithinWord, out bool canOpen, out bool canClose)
{
pc.CheckUnicodeCategory(out bool prevIsWhiteSpace, out bool prevIsPunctuation);
c.CheckUnicodeCategory(out bool nextIsWhiteSpace, out bool nextIsPunctuation);
// https://github.com/tats-u/markdown-cjk-friendly/commit/3c4217bea8248e9abc8be4e7c68748a88557662d
// The above flankingness check can be simplified under the following conditions:
// - If the delimiter run is adjacent to a whitespace character, the flankingness does not depend on the existence of a punctuation character (and (in CJK-friendly emphasis) a CJK character).
// - If the delimiter run is `_`, some rules can be simplified. Additionally, in CJK-friendly emphasis, the flankingness does not depend on whether the delimiter run is adjacent to a CJK character.
if (prevIsWhiteSpace || nextIsWhiteSpace)
{
// Fastest path
canOpen = !nextIsWhiteSpace;
canClose = !prevIsWhiteSpace;
return;
}
// From here on neither neighbour is whitespace.
bool isMainTwoPrevious = false;
Rune mainPreviousRune = pc;
// If the previous rune is a variation selector (U+FE00..U+FE0E), judge the rune before it instead
// and recompute the punctuation flag from that rune.
if (IsNonEmojiGeneralUseVariantSelector(pc))
{
isMainTwoPrevious = true;
mainPreviousRune = twoPreviousRune;
mainPreviousRune.CheckUnicodeCategory(out var _, out prevIsPunctuation);
}
// These values are final when enableWithinWord is false; otherwise they are widened below.
canOpen = prevIsPunctuation;
canClose = nextIsPunctuation;
if (!enableWithinWord)
{
// Fast path for `_` (does not depend on the existence of a CJK character)
return;
}
// The previous side counts as CJK when the effective previous rune is in the IsCjk ranges, or
// - (selector case) it is one of U+2018/U+2019/U+201C/U+201D followed by U+FE01, or
// - (no selector) pc itself lies in U+E0100..U+E01EF.
bool prevIsCjk = IsCjk(mainPreviousRune) || (isMainTwoPrevious ? IsCjkAmbiousPunctuation(mainPreviousRune, pc) : IsIdeographicVariationSelector(mainPreviousRune));
bool nextIsCjk = IsCjk(c);
bool eitherIsCjk = prevIsCjk || nextIsCjk;
// A CJK neighbour on either side is enough to allow both opening and closing.
canOpen |= eitherIsCjk || !nextIsPunctuation;
canClose |= eitherIsCjk || !prevIsPunctuation;
// https://github.com/tats-u/markdown-cjk-friendly/blob/main/specification.md
// https://github.com/tats-u/markdown-cjk-friendly/blob/main/ranges.md
// Matches U+FE00..U+FE0E; U+FE0F is deliberately outside the range.
static bool IsNonEmojiGeneralUseVariantSelector(Rune r) => r.Value is >= 0xFE00 and <= 0xFE0E;
// Matches U+E0100..U+E01EF.
static bool IsIdeographicVariationSelector(Rune r) => r.Value is >= 0xE0100 and <= 0xE01EF;
// Matches a quotation mark (U+2018, U+2019, U+201C, U+201D) followed by the selector U+FE01.
// NOTE(review): "Ambious" looks like a typo for "Ambiguous"; the name is local to this method.
static bool IsCjkAmbiousPunctuation(Rune main, Rune vs) => vs.Value is 0xFE01 && main.Value is 0x2018 or 0x2019 or 0x201C or 0x201D;
// As of Unicode 17
// Code-point ranges treated as CJK by this method (see ranges.md linked above).
static bool IsCjk(Rune r) => r.Value is
>= 0x1100 and ( // Fast path for most non-CJK characters
<= 0x11ff
or 0x20a9
or >= 0x2329 and <= 0x232a
or >= 0x2630 and <= 0x2637
or >= 0x268a and <= 0x268f
or >= 0x2e80 and <= 0x2e99
or >= 0x2e9b and <= 0x2ef3
or >= 0x2f00 and <= 0x2fd5
or >= 0x2ff0 and <= 0x303e
or >= 0x3041 and <= 0x3096
or >= 0x3099 and <= 0x30ff
or >= 0x3105 and <= 0x312f
or >= 0x3131 and <= 0x318e
or >= 0x3190 and <= 0x31e5
or >= 0x31ef and <= 0x321e
or >= 0x3220 and <= 0x3247
or >= 0x3250 and <= 0xa48c
or >= 0xa490 and <= 0xa4c6
or >= 0xa960 and <= 0xa97c
or >= 0xac00 and <= 0xd7a3
or >= 0xd7b0 and <= 0xd7c6
or >= 0xd7cb and <= 0xd7fb
or >= 0xf900 and <= 0xfaff
or >= 0xfe10 and <= 0xfe19
or >= 0xfe30 and <= 0xfe52
or >= 0xfe54 and <= 0xfe66
or >= 0xfe68 and <= 0xfe6b
or >= 0xff01 and <= 0xffbe
or >= 0xffc2 and <= 0xffc7
or >= 0xffca and <= 0xffcf
or >= 0xffd2 and <= 0xffd7
or >= 0xffda and <= 0xffdc
or >= 0xffe0 and <= 0xffe6
or >= 0xffe8 and <= 0xffee
or >= 0x16fe0 and <= 0x16fe4
or >= 0x16ff0 and <= 0x16ff6
or >= 0x17000 and <= 0x18cd5
or >= 0x18cff and <= 0x18d1e
or >= 0x18d80 and <= 0x18df2
or >= 0x1aff0 and <= 0x1aff3
or >= 0x1aff5 and <= 0x1affb
or >= 0x1affd and <= 0x1affe
or >= 0x1b000 and <= 0x1b122
or 0x1b132
or >= 0x1b150 and <= 0x1b152
or 0x1b155
or >= 0x1b164 and <= 0x1b167
or >= 0x1b170 and <= 0x1b2fb
or >= 0x1d300 and <= 0x1d356
or >= 0x1d360 and <= 0x1d376
or 0x1f200
or 0x1f202
or >= 0x1f210 and <= 0x1f219
or >= 0x1f21b and <= 0x1f22e
or >= 0x1f230 and <= 0x1f231
or 0x1f237
or 0x1f23b
or >= 0x1f240 and <= 0x1f248
or >= 0x1f260 and <= 0x1f265
or >= 0x20000 and <= 0x3fffd
);
}
/// <summary>
/// Determines whether roman letter partial.
/// </summary>

View File

@@ -108,7 +108,7 @@ public static class MarkdownExtensions
pipeline.Extensions.ReplaceOrAdd<AlertExtension>(new AlertExtension() { RenderKind = renderKind });
return pipeline;
}
/// <summary>
/// Uses this extension to enable autolinks from text `http://`, `https://`, `ftp://`, `mailto:`, `www.xxx.yyy`
/// </summary>
@@ -515,6 +515,18 @@ public static class MarkdownExtensions
return pipeline;
}
/// <summary>
/// Turns on CJK-friendly emphasis parsing, so that <c>**</c> next to punctuation in CJK text is far more likely to be recognized as emphasis.
/// </summary>
/// <param name="pipeline">The pipeline builder to configure.</param>
/// <returns>The same pipeline builder, to allow call chaining.</returns>
/// <remarks>Has no effect when the builder contains no <see cref="EmphasisInlineParser"/>.</remarks>
/// <see href="https://github.com/tats-u/markdown-cjk-friendly/"/>
public static MarkdownPipelineBuilder UseCjkFriendlyEmphasis(this MarkdownPipelineBuilder pipeline)
{
    var emphasisParser = pipeline.InlineParsers.FindExact<EmphasisInlineParser>();
    if (emphasisParser is not null)
    {
        emphasisParser.CjkFriendlyEmphasis = true;
    }

    return pipeline;
}
/// <summary>
/// This will disable the HTML support in the markdown processor (for constraint/safe parsing).
/// </summary>
@@ -653,6 +665,9 @@ public static class MarkdownExtensions
case "globalization":
pipeline.UseGlobalization();
break;
case "cjk-friendly-emphasis":
pipeline.UseCjkFriendlyEmphasis();
break;
default:
throw new ArgumentException($"Invalid extension `{extension}` from `{extensions}`", nameof(extensions));
}

View File

@@ -43,6 +43,12 @@ public class EmphasisInlineParser : InlineParser, IPostInlineProcessor
/// </summary>
public List<EmphasisDescriptor> EmphasisDescriptors { get; }
/// <summary>
/// Gets or sets a value indicating whether emphasis delimiters are evaluated with the CJK-friendly rules.
/// Defaults to <c>false</c>.
/// </summary>
/// <seealso href="https://github.com/tats-u/markdown-cjk-friendly"/>
public bool CjkFriendlyEmphasis { get; set; }
/// <summary>
/// Determines whether this parser is using the specified character as an emphasis delimiter.
/// </summary>
@@ -161,16 +167,28 @@ public class EmphasisInlineParser : InlineParser, IPostInlineProcessor
var emphasisDesc = emphasisMap![delimiterChar]!;
Rune pc = (Rune)0;
Rune twoPreviousChar = default;
if (processor.Inline is HtmlEntityInline htmlEntityInline)
{
if (htmlEntityInline.Transcoded.Length > 0)
{
pc = htmlEntityInline.Transcoded.RuneAt(htmlEntityInline.Transcoded.End);
if (CjkFriendlyEmphasis)
{
twoPreviousChar = htmlEntityInline.Transcoded.RuneAt(htmlEntityInline.Transcoded.End - pc.Utf16SequenceLength);
}
}
}
if (pc.Value == 0)
{
pc = slice.PeekRuneExtra(-1);
if (CjkFriendlyEmphasis)
{
// This cannot be a delegate (Func<Rune>?) because slice is a reference
twoPreviousChar = slice.PeekRuneExtra(-1 - pc.Utf16SequenceLength);
}
// delimiterChar is BMP, so slice.PeekCharExtra(-2) is (a part of) the character two positions back.
if (pc == (Rune)delimiterChar && slice.PeekCharExtra(-2) != '\\')
{
@@ -199,8 +217,17 @@ public class EmphasisInlineParser : InlineParser, IPostInlineProcessor
Rune.DecodeFromUtf16(htmlString, out c, out _);
}
bool canOpen;
bool canClose;
// Calculate Open-Close for current character
CharHelper.CheckOpenCloseDelimiter(pc, c, emphasisDesc.EnableWithinWord, out bool canOpen, out bool canClose);
if (CjkFriendlyEmphasis)
{
CharHelper.CheckOpenCloseDelimiterCjkFriendly(pc, c, twoPreviousChar, emphasisDesc.EnableWithinWord, out canOpen, out canClose);
}
else
{
CharHelper.CheckOpenCloseDelimiter(pc, c, emphasisDesc.EnableWithinWord, out canOpen, out canClose);
}
// We have potentially an open or close emphasis
if (canOpen || canClose)

View File

@@ -87,6 +87,7 @@ class Program
new Spec("Jira Links", "JiraLinks.md", "jiralinks"),
new Spec("Globalization", "GlobalizationSpecs.md", "globalization+advanced+emojis"),
new Spec("Figures, Footers and Cites", "FigureFooterAndCiteSpecs.md", "figures+footers+citations|advanced"),
new Spec("CJK-friendly Emphasis", "CJKFriendlyEmphasis.md", "cjk-friendly-emphasis"),
new NormalizeSpec("Headings", "Headings.md", ""),
@@ -358,7 +359,7 @@ class Program
static string CompressedName(string name)
{
string compressedName = "";
foreach (var part in name.Replace(',', ' ').Split(' ', StringSplitOptions.RemoveEmptyEntries))
foreach (var part in name.Replace(',' , ' ').Replace('-', ' ').Split(' ', StringSplitOptions.RemoveEmptyEntries))
{
compressedName += char.IsLower(part[0])
? char.ToUpper(part[0]) + (part.Length > 1 ? part.Substring(1) : "")