Compare commits

...

1 Commits

Author SHA1 Message Date
Leonard Hecker
1bcafa3408 wip 2024-12-22 16:20:13 +01:00
2 changed files with 373 additions and 110 deletions

View File

@@ -4,12 +4,14 @@ using System.Xml.Linq;
using TrieType = uint;
// Used as an indicator in joinRules for ÷ ("does not join").
// Used as an indicator in our rules for ÷ ("does not join").
// Underscore is one of the few characters that are permitted as an identifier,
// are monospace in most fonts and also visually distinct from the digits.
const byte _ = 3;
const int _ = -1;
// JoinRules doesn't quite follow UAX #29, as it states:
// @formatter:off
// joinRules doesn't quite follow UAX #29, as it states:
// > Note: Testing two adjacent characters is insufficient for determining a boundary.
//
// I completely agree, however it makes the implementation complex and slow, and it only benefits what can be considered
@@ -50,49 +52,236 @@ const byte _ = 3;
//
// This is a great reference for the resulting table:
// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
byte[][][] joinRules =
int[][][] joinRules =
[
// Base table
[
/* | leading -> trailing codepoint */
/* v | Other | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
/* Other | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Extend | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* RI | */ [_ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Prepend | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
/* HangulL | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLVT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* InCBLinker | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
/* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ExtPic | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ZWJ | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
/* leading trailing codepoint */
/* | Other | CR | LF | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
/* Other | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* CR | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* LF | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Extend | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* RI | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Prepend | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
/* HangulL | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLVT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* InCBLinker | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
/* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ExtPic | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ZWJ | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
],
// Once we have encountered a Regional Indicator pair we'll enter this table.
// It's a copy of the base table, but instead of RI × RI, we're RI ÷ RI.
[
/* | leading -> trailing codepoint */
/* v | Other | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
/* Other | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Extend | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* RI | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Prepend | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
/* HangulL | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLVT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* InCBLinker | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
/* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ExtPic | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ZWJ | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
/* leading trailing codepoint */
/* | Other | CR | LF | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
/* Other | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* CR | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* LF | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Extend | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* RI | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* Prepend | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
/* HangulL | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* HangulLVT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* InCBLinker | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
/* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ExtPic | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
/* ZWJ | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
],
];
// Documentation for our UAX #14 line break implementation based on Unicode 16.1,
// but heavily modified to allow for use with lookup tables:
//
// NOTE: If you convert these rules into a lookup table, you must apply them in reverse order.
// This is because the rules are ordered from most to least important (e.g. LB8 overrides LB18).
//
// Resolve line breaking classes:
// LB1: Assign a line breaking class [...].
// ❌ Unicode does that for us via the "lb" attribute.
//
// Start and end of text:
// LB2: Never break at the start of text.
// ❌ Functionality not needed.
// LB3: Always break at the end of text.
// ❌ Functionality not needed.
//
// Mandatory breaks:
// LB4: Always break after hard line breaks.
// ❌ Handled by our ucd_* functions.
// LB5: Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
// ❌ Handled by our ucd_* functions.
// LB6: Do not break before hard line breaks.
// ❌ Handled by our ucd_* functions.
//
// Explicit breaks and non-breaks:
// LB7: Do not break before spaces or zero width space.
// ❌ It's way simpler to treat spaces as if they always break.
// LB8: Break before any character following a zero-width space, even if one or more spaces intervene.
// ⍻ ZW ÷ modified from ZW SP* ÷ because that level of accuracy isn't worth it here.
// LB8a: Do not break after a zero width joiner.
// ❌ Our ucd_* functions never break within grapheme clusters.
//
// Combining marks:
// LB9: Do not break a combining character sequence; treat it as if it has the line breaking class of the base character in all of the following rules. Treat ZWJ as if it were CM.
// ❌ Our ucd_* functions never break within grapheme clusters.
// LB10: Treat any remaining combining mark or ZWJ as AL.
// ❌ To be honest, I'm not entirely sure I understand the implications of this rule.
//
// Word joiner:
// LB11: Do not break before or after Word joiner and related characters.
// ✔ × WJ
// ✔ WJ ×
//
// Non-breaking characters:
// LB12: Do not break after NBSP and related characters.
// ✔ GL ×
// LB12a: Do not break before NBSP and related characters, except after spaces and hyphens.
// ✔ [^SP BA HY] × GL
//
// Opening and closing:
// LB13: Do not break before ']' or '!' or '/', even after spaces.
// ✔ × CL
// ✔ × CP
// ✔ × EX
// ✔ × SY
// LB14: Do not break after '[', even after spaces.
// ⍻ OP × modified from OP SP* × just because it's simpler. It would be nice to address this.
// LB15a: Do not break after an unresolved initial punctuation that lies at the start of the line, after a space, after opening punctuation, or after an unresolved quotation mark, even after spaces.
// ❌ Not implemented. Seemed too complex for little gain?
// LB15b: Do not break before an unresolved final punctuation that lies at the end of the line, before a space, before a prohibited break, or before an unresolved quotation mark, even after spaces.
// ❌ Not implemented. Seemed too complex for little gain?
// LB15c: Break before a decimal mark that follows a space, for instance, in 'subtract .5'.
// ⍻ SP ÷ IS modified from SP ÷ IS NU because this fits neatly with LB15d.
// LB15d: Otherwise, do not break before ';', ',', or '.', even after spaces.
// ✔ × IS
// LB16: Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.
// ❌ Not implemented. Could be useful in the future, but its usefulness seemed limited to me.
// LB17: Do not break within '——', even with intervening spaces.
// ❌ Not implemented. Neither terminal applications nor code use em-dashes much anyway.
//
// Spaces:
// LB18: Break after spaces.
// ❌ Implemented because we didn't implement LB7.
//
// Special case rules:
// LB19: Do not break before non-initial unresolved quotation marks, such as ' ” ' or ' " ', nor after non-final unresolved quotation marks, such as ' “ ' or ' " '.
// ⍻ × QU modified from × [ QU - \p{Pi} ]
// ⍻ QU × modified from [ QU - \p{Pf} ] ×
// We implement the Unicode 16.0 rules instead of the 16.1 rules, because that's simpler and allows us to use a LUT.
// LB19a: Unless surrounded by East Asian characters, do not break either side of any unresolved quotation marks.
// ❌ [^$EastAsian] × QU
// ❌ × QU ( [^$EastAsian] | eot )
// ❌ QU × [^$EastAsian]
// ❌ ( sot | [^$EastAsian] ) QU ×
// Same as LB19.
// LB20: Break before and after unresolved CB.
// ❌ We break by default. Unicode inline objects are super irrelevant in a terminal in either case.
// LB20a: Do not break after a word-initial hyphen.
// ❌ Not implemented. Seemed not worth the hassle as the window will almost always be >1 char wide.
// LB21: Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, and other non-starters, or after acute accents.
// ✔ × BA
// ✔ × HY
// ✔ × NS
// ✔ BB ×
// LB21a: Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew.
// ❌ Not implemented. Perhaps in the future.
// LB21b: Do not break between Solidus and Hebrew letters.
// ❌ Not implemented. Perhaps in the future.
// LB22: Do not break before ellipses.
// ✔ × IN
//
// Numbers:
// LB23: Do not break between digits and letters.
// ✔ (AL | HL) × NU
// ✔ NU × (AL | HL)
// LB23a: Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// ✔ PR × (ID | EB | EM)
// ✔ (ID | EB | EM) × PO
// LB24: Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.
// ✔ (PR | PO) × (AL | HL)
// ✔ (AL | HL) × (PR | PO)
// LB25: Do not break numbers:
// ⍻ CL × PO modified from NU ( SY | IS )* CL × PO
// ⍻ CP × PO modified from NU ( SY | IS )* CP × PO
// ⍻ CL × PR modified from NU ( SY | IS )* CL × PR
// ⍻ CP × PR modified from NU ( SY | IS )* CP × PR
// ⍻ ( NU | SY | IS ) × PO modified from NU ( SY | IS )* × PO
// ⍻ ( NU | SY | IS ) × PR modified from NU ( SY | IS )* × PR
// ⍻ PO × OP modified from PO × OP NU
// ⍻ PO × OP modified from PO × OP IS NU
// ✔ PO × NU
// ⍻ PR × OP modified from PR × OP NU
// ⍻ PR × OP modified from PR × OP IS NU
// ✔ PR × NU
// ✔ HY × NU
// ✔ IS × NU
// ⍻ ( NU | SY | IS ) × NU modified from NU ( SY | IS )* × NU
// Most were simplified because the cases this additionally allows don't matter much here.
//
// Korean syllable blocks
// LB26: Do not break a Korean syllable.
// ❌ Our ucd_* functions never break within grapheme clusters.
// LB27: Treat a Korean Syllable Block the same as ID.
// ❌ Our ucd_* functions never break within grapheme clusters.
//
// Finally, join alphabetic letters into words and break everything else.
// LB28: Do not break between alphabetics ("at").
// ✔ (AL | HL) × (AL | HL)
// LB28a: Do not break inside the orthographic syllables of Brahmic scripts.
// ❌ Our ucd_* functions never break within grapheme clusters.
// LB29: Do not break between numeric punctuation and alphabetics ("e.g.").
// ✔ IS × (AL | HL)
// LB30: Do not break between letters, numbers, or ordinary symbols and opening or closing parentheses.
// ✔ (AL | HL | NU) × [OP-$EastAsian]
// ✔ [CP-$EastAsian] × (AL | HL | NU)
// LB30a: Break between two regional indicator symbols if and only if there are an even number of regional indicators preceding the position of the break.
// ❌ Our ucd_* functions never break within grapheme clusters.
// LB30b: Do not break between an emoji base (or potential emoji) and an emoji modifier.
// ❌ Our ucd_* functions never break within grapheme clusters.
// LB31: Break everywhere else.
// ❌ Our default behavior.
int[][] joinRulesLineBreak =
[
/* ↓ leading → trailing codepoint */
/* | Other | WordJoiner | ZeroWidthSpace | Glue | Space | BreakAfter | BreakBefore | Hyphen | ClosePunctuation | CloseParenthesis_EA | CloseParenthesis_NotEA | Exclamation | Inseparable | Nonstarter | OpenPunctuation_EA | OpenPunctuation_NotEA | Quotation | InfixNumericSeparator | Numeric | PostfixNumeric | PrefixNumeric | SymbolsAllowingBreakAfter | Alphabetic | Ideographic | */
/* Other | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* WordJoiner | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* ZeroWidthSpace | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
/* Glue | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Space | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* BreakAfter | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* BreakBefore | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Hyphen | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* ClosePunctuation | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* CloseParenthesis_EA | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* CloseParenthesis_NotEA | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
/* Exclamation | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* Inseparable | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* Nonstarter | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* OpenPunctuation_EA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* OpenPunctuation_NotEA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Quotation | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* InfixNumericSeparator | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
/* Numeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
/* PostfixNumeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */],
/* PrefixNumeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* SymbolsAllowingBreakAfter | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
/* Alphabetic | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
/* Ideographic | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
];
// @formatter:on
if (args.Length != 1)
{
Console.WriteLine(
@@ -113,7 +302,7 @@ var ucd = ExtractValuesFromUcd(args[0]);
// 4 still gives ~30% savings over 3 stages and going beyond 5 gives diminishing returns (<10%).
var trie = BuildBestTrie(ucd.Values, 2, 8, 4);
// The joinRules tables above use 2 bits per value. This packs them into 32-bit integers to save space.
var rules = PrepareRulesTable(joinRules);
var rules = joinRules.Select(table => PrepareRulesTable(table, 2, 3)).ToArray();
// Each rules item has the same length. Each item is 32 bits = 4 bytes.
var totalSize = trie.TotalSize + rules.Length * rules[0].Length * sizeof(TrieType);
@@ -134,9 +323,12 @@ foreach (var (expected, cp) in ucd.Values.Select((v, i) => (v, i)))
// All the remaining code starting here simply generates the C++ output.
var buf = new StringBuilder();
buf.Append("// Generated by GraphemeTableGen\n");
buf.Append($"// on {DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}, from {ucd.Description}, {totalSize} bytes\n");
buf.Append("// clang-format off\n");
buf.Append($$"""
// Generated by GraphemeTableGen
// on {{DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}}, from {{ucd.Description}}, {{totalSize}} bytes
// clang-format off
""");
foreach (var stage in trie.Stages)
{
@@ -147,7 +339,7 @@ foreach (var stage in trie.Stages)
width = stage.Mask + 1;
}
buf.Append($"static constexpr uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
buf.Append($"static const uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
foreach (var (value, j) in stage.Values.Select((v, j) => (v, j)))
{
if (j % width == 0)
@@ -159,7 +351,7 @@ foreach (var stage in trie.Stages)
buf.Append("\n};\n");
}
buf.Append($"static constexpr uint32_t s_joinRules[{rules.Length}][{rules[0].Length}] = {{\n");
buf.Append($"static const uint32_t s_join_rules[{rules.Length}][{rules[0].Length}] = {{\n");
foreach (var table in rules)
{
buf.Append(" {\n");
@@ -171,11 +363,11 @@ foreach (var table in rules)
}
buf.Append("};\n");
buf.Append("constexpr int ucdLookup(const char32_t cp) noexcept\n");
buf.Append("inline int ucd_lookup(const uint32_t cp)\n");
buf.Append("{\n");
foreach (var stage in trie.Stages)
{
buf.Append($" const auto s{stage.Index} = s_stage{stage.Index}[");
buf.Append($" const uint{stage.Bits}_t s{stage.Index} = s_stage{stage.Index}[");
if (stage.Index == 0)
{
buf.Append($"cp >> {stage.Shift}");
@@ -190,21 +382,27 @@ foreach (var stage in trie.Stages)
buf.Append($" return s{trie.Stages.Count - 1};\n");
buf.Append("}\n");
buf.Append("constexpr int ucdGraphemeJoins(const int state, const int lead, const int trail) noexcept\n");
buf.Append("{\n");
buf.Append(" const auto l = lead & 15;\n");
buf.Append(" const auto t = trail & 15;\n");
buf.Append($" return (s_joinRules[state][l] >> (t * 2)) & 3;\n");
buf.Append("}\n");
buf.Append("constexpr bool ucdGraphemeDone(const int state) noexcept\n");
buf.Append("{\n");
buf.Append($" return state == 3;\n");
buf.Append("}\n");
buf.Append("constexpr int ucdToCharacterWidth(const int val) noexcept\n");
buf.Append("{\n");
buf.Append(" return val >> 6;\n");
buf.Append("}\n");
buf.Append("// clang-format on\n");
// Emits the C++ helper functions that accompany the generated tables. The text is written
// verbatim, except for the single {{...}} interpolation of (int)ClusterBreak.Control:
// * ucd_grapheme_joins: masks lead and trail to their low 4 bits and extracts the 2-bit
//   entry for that pair from s_join_rules[state].
// * ucd_grapheme_done: true when the state equals 3.
// * ucd_to_character_width: returns val >> 6.
// * ucd_is_newline: compares val against the numeric value of ClusterBreak.Control.
// NOTE(review): ucd_is_newline compares the whole, unmasked val using `>`. Given that
// ucd_to_character_width reads bits from val >> 6, and that CR and LF are listed before
// Control in the joinRules tables, a check on the masked class value in the other
// direction may have been intended — confirm against the ClusterBreak enum.
buf.Append($$"""
inline int ucd_grapheme_joins(const int state, const int lead, const int trail)
{
const int l = lead & 15;
const int t = trail & 15;
return (s_join_rules[state][l] >> (t * 2)) & 3;
}
inline bool ucd_grapheme_done(const int state)
{
return state == 3;
}
inline int ucd_to_character_width(const int val)
{
return val >> 6;
}
inline int ucd_is_newline(const int val)
{
return val > {{(int)ClusterBreak.Control}};
}
// clang-format on
""");
Console.Write(buf);
return;
@@ -224,6 +422,7 @@ static Ucd ExtractValuesFromUcd(string path)
foreach (var group in doc.Root!.Descendants(ns + "group"))
{
var groupGeneralCategory = group.Attribute("gc")?.Value;
var groupLineBreak = group.Attribute("lb")?.Value;
var groupGraphemeClusterBreak = group.Attribute("GCB")?.Value;
var groupIndicConjunctBreak = group.Attribute("InCB")?.Value;
var groupExtendedPictographic = group.Attribute("ExtPict")?.Value;
@@ -246,6 +445,7 @@ static Ucd ExtractValuesFromUcd(string path)
}
var generalCategory = ch.Attribute("gc")?.Value ?? groupGeneralCategory ?? "";
var lineBreak = ch.Attribute("lb")?.Value ?? groupLineBreak ?? "";
var graphemeClusterBreak = ch.Attribute("GCB")?.Value ?? groupGraphemeClusterBreak ?? "";
var indicConjunctBreak = ch.Attribute("InCB")?.Value ?? groupIndicConjunctBreak ?? "";
var extendedPictographic = ch.Attribute("ExtPict")?.Value ?? groupExtendedPictographic ?? "";
@@ -257,7 +457,9 @@ static Ucd ExtractValuesFromUcd(string path)
// We ignore GB3 which demands that CR × LF do not break apart, because
// * these control characters won't normally reach our text storage
// * otherwise we're in a raw write mode and historically conhost stores them in separate cells
"CR" or "LF" or "CN" => ClusterBreak.Control, // Carriage Return, Line Feed, Control
"CR" => ClusterBreak.CR, // Carriage Return
"LF" => ClusterBreak.LF, // Line Feed
"CN" => ClusterBreak.Control, // Control
"EX" or "SM" => ClusterBreak.Extend, // Extend, SpacingMark
"PP" => ClusterBreak.Prepend, // Prepend
"ZWJ" => ClusterBreak.ZWJ, // Zero Width Joiner
@@ -296,7 +498,7 @@ static Ucd ExtractValuesFromUcd(string path)
{
"N" or "Na" or "H" => CharacterWidth.Narrow, // Half-width, Narrow, Neutral
"F" or "W" => CharacterWidth.Wide, // Wide, Full-width
"A" => CharacterWidth.Ambiguous, // Ambiguous
"A" => CharacterWidth.Narrow, // Ambiguous
_ => throw new Exception($"Unrecognized ea {eastAsian} for U+{firstCp:X4} to U+{lastCp:X4}")
};
@@ -315,10 +517,44 @@ static Ucd ExtractValuesFromUcd(string path)
width = CharacterWidth.ZeroWidth;
break;
case "Me" or "Mn" or "Cf":
width = CharacterWidth.ZeroWidth;
width = CharacterWidth.ZeroWidth;
break;
}
// True if the codepoint's East Asian Width is Fullwidth, Wide or Halfwidth.
// OP and CP are split along this flag below (presumably for UAX #14 rule LB30, which
// treats East Asian parentheses differently -- confirm once the line break rules land).
var lbEa = eastAsian == "F" || eastAsian == "W" || eastAsian == "H";

// Collapse the UCD "lb" property into our condensed LineBreak enum.
// Every class that isn't listed explicitly ends up as LineBreak.Other.
var lb = lineBreak switch
{
    // Non-tailorable line breaking classes
    "WJ" => LineBreak.WordJoiner,
    "ZW" => LineBreak.ZeroWidthSpace,
    "GL" => LineBreak.Glue,
    "SP" => LineBreak.Space,

    // Break opportunities
    "BA" => LineBreak.BreakAfter,
    "BB" => LineBreak.BreakBefore,
    "HY" => LineBreak.Hyphen,

    // Characters prohibiting certain breaks
    "CL" => LineBreak.ClosePunctuation,
    "CP" => lbEa ? LineBreak.CloseParenthesis_EA : LineBreak.CloseParenthesis_NotEA,
    "EX" => LineBreak.Exclamation,
    "IN" => LineBreak.Inseparable,
    "NS" => LineBreak.Nonstarter,
    "OP" => lbEa ? LineBreak.OpenPunctuation_EA : LineBreak.OpenPunctuation_NotEA,
    "QU" => LineBreak.Quotation,

    // Numeric context
    "IS" => LineBreak.InfixNumericSeparator,
    "NU" => LineBreak.Numeric,
    "PO" => LineBreak.PostfixNumeric,
    "PR" => LineBreak.PrefixNumeric,
    "SY" => LineBreak.SymbolsAllowingBreakAfter,

    // Other characters
    "AL" or "HL" => LineBreak.Alphabetic,
    "ID" or "EB" or "EM" => LineBreak.Ideographic,

    _ => LineBreak.Other,
};
Fill(firstCp, lastCp, TrieValue(cb, width));
}
}
@@ -336,6 +572,12 @@ static Ucd ExtractValuesFromUcd(string path)
// By default, CharacterWidth.Ambiguous, but by convention .Narrow in terminals.
Fill(0x2500, 0x259F, TrieValue(ClusterBreak.Other, CharacterWidth.Narrow));
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
// We achieve this here by explicitly giving this codepoint a wide width in the generated table.
// The consumer of the table adds up the widths of all codepoints in a grapheme cluster
// and is expected to clamp the resulting width back to <= 2.
Fill(0xFE0F, 0xFE0F, TrieValue(ClusterBreak.Extend, CharacterWidth.Wide));
return new Ucd
{
Description = description,
@@ -354,38 +596,37 @@ static Ucd ExtractValuesFromUcd(string path)
}
}
// Because each item in the list of 2D rule tables only uses 2 bits and not all 8 in each byte,
// Because each item in the list of 2D rule tables only uses 1-2 bits,
// this function packs them into chunks of 32-bit integers to save space.
static uint[][] PrepareRulesTable(byte[][][] rules)
static uint[] PrepareRulesTable(int[][] rules, int bitWidth, int nonJoinerValue)
{
var compressed = new uint[rules.Length][];
for (var i = 0; i < compressed.Length; i++)
{
compressed[i] = new uint[16];
}
var compressed = new uint[rules.Length];
foreach (var (table, prevIndex) in rules.Select((v, i) => (v, i)))
foreach (var lead in Enumerable.Range(0, rules.Length))
{
foreach (var (row, lead) in table.Select((v, i) => (v, i)))
var row = rules[lead];
uint nextIndices = 0;
if (row.Length > 32 / bitWidth)
{
if (table[lead].Length > 16)
{
throw new Exception("Can't pack row into 32 bits");
}
uint nextIndices = 0;
foreach (var (nextIndex, trail) in row.Select((v, i) => (v, i)))
{
if (nextIndex > 3)
{
throw new Exception("Can't pack table index into 2 bits");
}
nextIndices |= (uint)(nextIndex << (trail * 2));
}
compressed[prevIndex][lead] = nextIndices;
throw new Exception("Can't pack row into 32 bits");
}
// Pack every cell of this row into `bitWidth` bits, with the cell for
// trailing class `trail` stored at bit offset `trail * bitWidth`.
for (var trail = 0; trail < row.Length; trail++)
{
    var value = row[trail];
    // Negative cells are the "does not join" marker used in the rule tables.
    // The caller decides how that marker is encoded in the packed output.
    if (value < 0)
    {
        value = nonJoinerValue;
    }
    // Reject anything that doesn't fit into the cell. This includes a negative
    // nonJoinerValue, which would otherwise smear sign bits over neighboring cells.
    if (value < 0 || value > (1 << bitWidth) - 1)
    {
        throw new Exception($"Can't pack table index into {bitWidth} bits");
    }
    nextIndices |= (uint)(value << (trail * bitWidth));
}
compressed[lead] = nextIndices;
}
return compressed;
@@ -549,7 +790,6 @@ internal enum CharacterWidth
internal enum ClusterBreak
{
Other, // GB999
Control, // GB3, GB4, GB5 -- includes CR, LF
Extend, // GB9, GB9a -- includes SpacingMark
RI, // GB12, GB13
Prepend, // GB9b
@@ -562,6 +802,50 @@ internal enum ClusterBreak
InCBConsonant, // GB9c
ExtPic, // GB11
ZWJ, // GB9, GB11
// These are intentionally ordered last, as this allows
// us to simplify the ucd_is_newline implementation.
Control, // GB4, GB5
CR, // GB3, GB4, GB5
LF, // GB3, GB4, GB5
}
// Condensed set of UAX #14 line breaking classes used by this generator.
// The two-letter codes are the values of the UCD "lb" property that map to each member.
// The "_EA" / "_NotEA" variants distinguish codepoints whose East Asian Width
// is F, W or H from all others.
// The declaration order defines the numeric values, so new members must not be inserted casually.
internal enum LineBreak
{
    // Anything that isn't covered by one of the classes below.
    Other,

    // ---- Non-tailorable Line Breaking Classes ----

    // WJ: Word Joiner
    WordJoiner,
    // ZW: Zero Width Space
    ZeroWidthSpace,
    // GL: Non-breaking ("Glue")
    Glue,
    // SP: Space
    Space,

    // ---- Break Opportunities ----

    // BA: Break After
    BreakAfter,
    // BB: Break Before
    BreakBefore,
    // HY: Hyphen
    Hyphen,

    // ---- Characters Prohibiting Certain Breaks ----

    // CL: Close Punctuation
    ClosePunctuation,
    // CP: Close Parenthesis, East Asian
    CloseParenthesis_EA,
    // CP: Close Parenthesis, not East Asian
    CloseParenthesis_NotEA,
    // EX: Exclamation/Interrogation
    Exclamation,
    // IN: Inseparable
    Inseparable,
    // NS: Nonstarter
    Nonstarter,
    // OP: Open Punctuation, East Asian
    OpenPunctuation_EA,
    // OP: Open Punctuation, not East Asian
    OpenPunctuation_NotEA,
    // QU: Quotation
    Quotation,

    // ---- Numeric Context ----

    // IS: Infix Numeric Separator
    InfixNumericSeparator,
    // NU: Numeric
    Numeric,
    // PO: Postfix Numeric
    PostfixNumeric,
    // PR: Prefix Numeric
    PrefixNumeric,
    // SY: Symbols Allowing Break After
    SymbolsAllowingBreakAfter,

    // ---- Other Characters ----

    // AL & HL: Alphabetic, Hebrew Letter
    Alphabetic,
    // ID & EB & EM: Ideographic, Emoji Base, Emoji Modifier
    Ideographic,
}
internal class Ucd

View File

@@ -846,16 +846,6 @@ bool CodepointWidthDetector::_graphemeNext(GraphemeState& s, const std::wstring_
{
w = _ambiguousWidth;
}
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
// We achieve this here by explicitly giving this codepoint a wide width.
// Later down below we'll clamp width back to <= 2.
if (cp == 0xFE0F)
{
w = 2;
}
width += w;
}
@@ -943,16 +933,6 @@ bool CodepointWidthDetector::_graphemePrev(GraphemeState& s, const std::wstring_
{
w = _ambiguousWidth;
}
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
// We achieve this here by explicitly giving this codepoint a wide width.
// Later down below we'll clamp width back to <= 2.
if (cp == 0xFE0F)
{
w = 2;
}
width += w;
}
@@ -1100,7 +1080,6 @@ bool CodepointWidthDetector::_graphemePrevWcswidth(GraphemeState& s, const std::
{
w = _ambiguousWidth;
}
width += w;
const auto hasWidth = width != 0;