mirror of
https://github.com/microsoft/terminal.git
synced 2026-04-07 23:01:09 +00:00
Compare commits
1 Commits
dev/duhowe
...
dev/lhecke
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1bcafa3408 |
@@ -4,12 +4,14 @@ using System.Xml.Linq;
|
||||
|
||||
using TrieType = uint;
|
||||
|
||||
// Used as an indicator in joinRules for ÷ ("does not join").
|
||||
// Used as an indicator in our rules for ÷ ("does not join").
|
||||
// Underscore is one of the few characters that are permitted as an identifier,
|
||||
// are monospace in most fonts and also visually distinct from the digits.
|
||||
const byte _ = 3;
|
||||
const int _ = -1;
|
||||
|
||||
// JoinRules doesn't quite follow UAX #29, as it states:
|
||||
// @formatter:off
|
||||
|
||||
// joinRules doesn't quite follow UAX #29, as it states:
|
||||
// > Note: Testing two adjacent characters is insufficient for determining a boundary.
|
||||
//
|
||||
// I completely agree, however it makes the implementation complex and slow, and it only benefits what can be considered
|
||||
@@ -50,49 +52,236 @@ const byte _ = 3;
|
||||
//
|
||||
// This is a great reference for the resulting table:
|
||||
// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
|
||||
byte[][][] joinRules =
|
||||
int[][][] joinRules =
|
||||
[
|
||||
// Base table
|
||||
[
|
||||
/* | leading -> trailing codepoint */
|
||||
/* v | Other | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
|
||||
/* Other | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Extend | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* RI | */ [_ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Prepend | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
|
||||
/* HangulL | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLVT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBLinker | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ExtPic | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ZWJ | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
|
||||
/* ↓ leading → trailing codepoint */
|
||||
/* | Other | CR | LF | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
|
||||
/* Other | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* CR | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* LF | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Extend | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* RI | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Prepend | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
|
||||
/* HangulL | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLVT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBLinker | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ExtPic | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ZWJ | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
|
||||
],
|
||||
// Once we have encountered a Regional Indicator pair we'll enter this table.
|
||||
// It's a copy of the base table, but instead of RI × RI, we're RI ÷ RI.
|
||||
[
|
||||
/* | leading -> trailing codepoint */
|
||||
/* v | Other | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
|
||||
/* Other | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Extend | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* RI | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Prepend | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
|
||||
/* HangulL | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLV | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLVT | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBLinker | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ExtPic | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ZWJ | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
|
||||
/* ↓ leading → trailing codepoint */
|
||||
/* | Other | CR | LF | Control | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | */
|
||||
/* Other | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* CR | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* LF | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Control | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Extend | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* RI | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* Prepend | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */],
|
||||
/* HangulL | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLV | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* HangulLVT | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBLinker | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */],
|
||||
/* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ExtPic | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */],
|
||||
/* ZWJ | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /* | */],
|
||||
],
|
||||
];
|
||||
|
||||
// Documentation for our UAX #14 line break implementation based on Unicode 16.1,
|
||||
// but heavily modified to allow for use with lookup tables:
|
||||
//
|
||||
// NOTE: If you convert these rules into a lookup table, you must apply them in reverse order.
|
||||
// This is because the rules are ordered from most to least important (e.g. LB8 overrides LB18).
|
||||
//
|
||||
// Resolve line breaking classes:
|
||||
// LB1: Assign a line breaking class [...].
|
||||
// ❌ Unicode does that for us via the "lb" attribute.
|
||||
//
|
||||
// Start and end of text:
|
||||
// LB2: Never break at the start of text.
|
||||
// ❌ Functionality not needed.
|
||||
// LB3: Always break at the end of text.
|
||||
// ❌ Functionality not needed.
|
||||
//
|
||||
// Mandatory breaks:
|
||||
// LB4: Always break after hard line breaks.
|
||||
// ❌ Handled by our ucd_* functions.
|
||||
// LB5: Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
|
||||
// ❌ Handled by our ucd_* functions.
|
||||
// LB6: Do not break before hard line breaks.
|
||||
// ❌ Handled by our ucd_* functions.
|
||||
//
|
||||
// Explicit breaks and non-breaks:
|
||||
// LB7: Do not break before spaces or zero width space.
|
||||
// ❌ It's way simpler to treat spaces as if they always break.
|
||||
// LB8: Break before any character following a zero-width space, even if one or more spaces intervene.
|
||||
// ⍻ ZW ÷ modified from ZW SP* ÷ because being this strict about accuracy isn't worth it here.
|
||||
// LB8a: Do not break after a zero width joiner.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
//
|
||||
// Combining marks:
|
||||
// LB9: Do not break a combining character sequence; treat it as if it has the line breaking class of the base character in all of the following rules. Treat ZWJ as if it were CM.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
// LB10: Treat any remaining combining mark or ZWJ as AL.
|
||||
// ❌ To be honest, I'm not entirely sure I understand the implications of this rule.
|
||||
//
|
||||
// Word joiner:
|
||||
// LB11: Do not break before or after Word joiner and related characters.
|
||||
// ✔ × WJ
|
||||
// ✔ WJ ×
|
||||
//
|
||||
// Non-breaking characters:
|
||||
// LB12: Do not break after NBSP and related characters.
|
||||
// ✔ GL ×
|
||||
// LB12a: Do not break before NBSP and related characters, except after spaces and hyphens.
|
||||
// ✔ [^SP BA HY] × GL
|
||||
//
|
||||
// Opening and closing:
|
||||
// LB13: Do not break before ']' or '!' or '/', even after spaces.
|
||||
// ✔ × CL
|
||||
// ✔ × CP
|
||||
// ✔ × EX
|
||||
// ✔ × SY
|
||||
// LB14: Do not break after '[', even after spaces.
|
||||
// ⍻ OP × modified from OP SP* × just because it's simpler. It would be nice to address this.
|
||||
// LB15a: Do not break after an unresolved initial punctuation that lies at the start of the line, after a space, after opening punctuation, or after an unresolved quotation mark, even after spaces.
|
||||
// ❌ Not implemented. Seemed too complex for little gain?
|
||||
// LB15b: Do not break before an unresolved final punctuation that lies at the end of the line, before a space, before a prohibited break, or before an unresolved quotation mark, even after spaces.
|
||||
// ❌ Not implemented. Seemed too complex for little gain?
|
||||
// LB15c: Break before a decimal mark that follows a space, for instance, in 'subtract .5'.
|
||||
// ⍻ SP ÷ IS modified from SP ÷ IS NU because this fits neatly with LB15d.
|
||||
// LB15d: Otherwise, do not break before ';', ',', or '.', even after spaces.
|
||||
// ✔ × IS
|
||||
// LB16: Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.
|
||||
// ❌ Not implemented. Could be useful in the future, but its usefulness seemed limited to me.
|
||||
// LB17: Do not break within '——', even with intervening spaces.
|
||||
// ❌ Not implemented. Neither terminal applications nor code use em-dashes much anyway.
|
||||
//
|
||||
// Spaces:
|
||||
// LB18: Break after spaces.
|
||||
// ❌ Implemented because we didn't implement LB7.
|
||||
//
|
||||
// Special case rules:
|
||||
// LB19: Do not break before non-initial unresolved quotation marks, such as ' ” ' or ' " ', nor after non-final unresolved quotation marks, such as ' “ ' or ' " '.
|
||||
// ⍻ × QU modified from × [ QU - \p{Pi} ]
|
||||
// ⍻ QU × modified from [ QU - \p{Pf} ] ×
|
||||
// We implement the Unicode 16.0 instead of 16.1 rules, because it's simpler and allows us to use a LUT.
|
||||
// LB19a: Unless surrounded by East Asian characters, do not break either side of any unresolved quotation marks.
|
||||
// ❌ [^$EastAsian] × QU
|
||||
// ❌ × QU ( [^$EastAsian] | eot )
|
||||
// ❌ QU × [^$EastAsian]
|
||||
// ❌ ( sot | [^$EastAsian] ) QU ×
|
||||
// Same as LB19.
|
||||
// LB20: Break before and after unresolved CB.
|
||||
// ❌ We break by default. Unicode inline objects are super irrelevant in a terminal in either case.
|
||||
// LB20a: Do not break after a word-initial hyphen.
|
||||
// ❌ Not implemented. Seemed not worth the hassle as the window will almost always be >1 char wide.
|
||||
// LB21: Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, and other non-starters, or after acute accents.
|
||||
// ✔ × BA
|
||||
// ✔ × HY
|
||||
// ✔ × NS
|
||||
// ✔ BB ×
|
||||
// LB21a: Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew.
|
||||
// ❌ Not implemented. Perhaps in the future.
|
||||
// LB21b: Do not break between Solidus and Hebrew letters.
|
||||
// ❌ Not implemented. Perhaps in the future.
|
||||
// LB22: Do not break before ellipses.
|
||||
// ✔ × IN
|
||||
//
|
||||
// Numbers:
|
||||
// LB23: Do not break between digits and letters.
|
||||
// ✔ (AL | HL) × NU
|
||||
// ✔ NU × (AL | HL)
|
||||
// LB23a: Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
|
||||
// ✔ PR × (ID | EB | EM)
|
||||
// ✔ (ID | EB | EM) × PO
|
||||
// LB24: Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.
|
||||
// ✔ (PR | PO) × (AL | HL)
|
||||
// ✔ (AL | HL) × (PR | PO)
|
||||
// LB25: Do not break numbers:
|
||||
// ⍻ CL × PO modified from NU ( SY | IS )* CL × PO
|
||||
// ⍻ CP × PO modified from NU ( SY | IS )* CP × PO
|
||||
// ⍻ CL × PR modified from NU ( SY | IS )* CL × PR
|
||||
// ⍻ CP × PR modified from NU ( SY | IS )* CP × PR
|
||||
// ⍻ ( NU | SY | IS ) × PO modified from NU ( SY | IS )* × PO
|
||||
// ⍻ ( NU | SY | IS ) × PR modified from NU ( SY | IS )* × PR
|
||||
// ⍻ PO × OP modified from PO × OP NU
|
||||
// ⍻ PO × OP modified from PO × OP IS NU
|
||||
// ✔ PO × NU
|
||||
// ⍻ PR × OP modified from PR × OP NU
|
||||
// ⍻ PR × OP modified from PR × OP IS NU
|
||||
// ✔ PR × NU
|
||||
// ✔ HY × NU
|
||||
// ✔ IS × NU
|
||||
// ⍻ ( NU | SY | IS ) × NU modified from NU ( SY | IS )* × NU
|
||||
// Most were simplified because the cases this additionally allows don't matter much here.
|
||||
//
|
||||
// Korean syllable blocks
|
||||
// LB26: Do not break a Korean syllable.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
// LB27: Treat a Korean Syllable Block the same as ID.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
//
|
||||
// Finally, join alphabetic letters into words and break everything else.
|
||||
// LB28: Do not break between alphabetics ("at").
|
||||
// ✔ (AL | HL) × (AL | HL)
|
||||
// LB28a: Do not break inside the orthographic syllables of Brahmic scripts.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
// LB29: Do not break between numeric punctuation and alphabetics ("e.g.").
|
||||
// ✔ IS × (AL | HL)
|
||||
// LB30: Do not break between letters, numbers, or ordinary symbols and opening or closing parentheses.
|
||||
// ✔ (AL | HL | NU) × [OP-$EastAsian]
|
||||
// ✔ [CP-$EastAsian] × (AL | HL | NU)
|
||||
// LB30a: Break between two regional indicator symbols if and only if there are an even number of regional indicators preceding the position of the break.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
// LB30b: Do not break between an emoji base (or potential emoji) and an emoji modifier.
|
||||
// ❌ Our ucd_* functions never break within grapheme clusters.
|
||||
// LB31: Break everywhere else.
|
||||
// ❌ Our default behavior.
|
||||
int[][] joinRulesLineBreak =
|
||||
[
|
||||
/* ↓ leading → trailing codepoint */
|
||||
/* | Other | WordJoiner | ZeroWidthSpace | Glue | Space | BreakAfter | BreakBefore | Hyphen | ClosePunctuation | CloseParenthesis_EA | CloseParenthesis_NotEA | Exclamation | Inseparable | Nonstarter | OpenPunctuation_EA | OpenPunctuation_NotEA | Quotation | InfixNumericSeparator | Numeric | PostfixNumeric | PrefixNumeric | SymbolsAllowingBreakAfter | Alphabetic | Ideographic | */
|
||||
/* Other | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* WordJoiner | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* ZeroWidthSpace | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */],
|
||||
/* Glue | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* Space | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* BreakAfter | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* BreakBefore | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* Hyphen | */ [_ /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* ClosePunctuation | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* CloseParenthesis_EA | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* CloseParenthesis_NotEA | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
|
||||
/* Exclamation | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* Inseparable | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* Nonstarter | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* OpenPunctuation_EA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* OpenPunctuation_NotEA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* Quotation | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* InfixNumericSeparator | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
|
||||
/* Numeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
|
||||
/* PostfixNumeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */],
|
||||
/* PrefixNumeric | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
|
||||
/* SymbolsAllowingBreakAfter | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
/* Alphabetic | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */],
|
||||
/* Ideographic | */ [_ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, _ /* | */, _ /* | */, 1 /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, 1 /* | */, _ /* | */, _ /* | */],
|
||||
];
|
||||
|
||||
// @formatter:on
|
||||
|
||||
if (args.Length != 1)
|
||||
{
|
||||
Console.WriteLine(
|
||||
@@ -113,7 +302,7 @@ var ucd = ExtractValuesFromUcd(args[0]);
|
||||
// 4 still gives ~30% savings over 3 stages and going beyond 5 gives diminishing returns (<10%).
|
||||
var trie = BuildBestTrie(ucd.Values, 2, 8, 4);
|
||||
// The joinRules above has 2 bits per value. This packs it into 32-bit integers to save space.
|
||||
var rules = PrepareRulesTable(joinRules);
|
||||
var rules = joinRules.Select(table => PrepareRulesTable(table, 2, 3)).ToArray();
|
||||
// Each rules item has the same length. Each item is 32 bits = 4 bytes.
|
||||
var totalSize = trie.TotalSize + rules.Length * rules[0].Length * sizeof(TrieType);
|
||||
|
||||
@@ -134,9 +323,12 @@ foreach (var (expected, cp) in ucd.Values.Select((v, i) => (v, i)))
|
||||
|
||||
// All the remaining code starting here simply generates the C++ output.
|
||||
var buf = new StringBuilder();
|
||||
buf.Append("// Generated by GraphemeTableGen\n");
|
||||
buf.Append($"// on {DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}, from {ucd.Description}, {totalSize} bytes\n");
|
||||
buf.Append("// clang-format off\n");
|
||||
buf.Append($$"""
|
||||
// Generated by GraphemeTableGen
|
||||
// on {{DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}}, from {{ucd.Description}}, {{totalSize}} bytes
|
||||
// clang-format off
|
||||
|
||||
""");
|
||||
|
||||
foreach (var stage in trie.Stages)
|
||||
{
|
||||
@@ -147,7 +339,7 @@ foreach (var stage in trie.Stages)
|
||||
width = stage.Mask + 1;
|
||||
}
|
||||
|
||||
buf.Append($"static constexpr uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
|
||||
buf.Append($"static const uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
|
||||
foreach (var (value, j) in stage.Values.Select((v, j) => (v, j)))
|
||||
{
|
||||
if (j % width == 0)
|
||||
@@ -159,7 +351,7 @@ foreach (var stage in trie.Stages)
|
||||
buf.Append("\n};\n");
|
||||
}
|
||||
|
||||
buf.Append($"static constexpr uint32_t s_joinRules[{rules.Length}][{rules[0].Length}] = {{\n");
|
||||
buf.Append($"static const uint32_t s_join_rules[{rules.Length}][{rules[0].Length}] = {{\n");
|
||||
foreach (var table in rules)
|
||||
{
|
||||
buf.Append(" {\n");
|
||||
@@ -171,11 +363,11 @@ foreach (var table in rules)
|
||||
}
|
||||
buf.Append("};\n");
|
||||
|
||||
buf.Append("constexpr int ucdLookup(const char32_t cp) noexcept\n");
|
||||
buf.Append("inline int ucd_lookup(const uint32_t cp)\n");
|
||||
buf.Append("{\n");
|
||||
foreach (var stage in trie.Stages)
|
||||
{
|
||||
buf.Append($" const auto s{stage.Index} = s_stage{stage.Index}[");
|
||||
buf.Append($" const uint{stage.Bits}_t s{stage.Index} = s_stage{stage.Index}[");
|
||||
if (stage.Index == 0)
|
||||
{
|
||||
buf.Append($"cp >> {stage.Shift}");
|
||||
@@ -190,21 +382,27 @@ foreach (var stage in trie.Stages)
|
||||
buf.Append($" return s{trie.Stages.Count - 1};\n");
|
||||
buf.Append("}\n");
|
||||
|
||||
buf.Append("constexpr int ucdGraphemeJoins(const int state, const int lead, const int trail) noexcept\n");
|
||||
buf.Append("{\n");
|
||||
buf.Append(" const auto l = lead & 15;\n");
|
||||
buf.Append(" const auto t = trail & 15;\n");
|
||||
buf.Append($" return (s_joinRules[state][l] >> (t * 2)) & 3;\n");
|
||||
buf.Append("}\n");
|
||||
buf.Append("constexpr bool ucdGraphemeDone(const int state) noexcept\n");
|
||||
buf.Append("{\n");
|
||||
buf.Append($" return state == 3;\n");
|
||||
buf.Append("}\n");
|
||||
buf.Append("constexpr int ucdToCharacterWidth(const int val) noexcept\n");
|
||||
buf.Append("{\n");
|
||||
buf.Append(" return val >> 6;\n");
|
||||
buf.Append("}\n");
|
||||
buf.Append("// clang-format on\n");
|
||||
buf.Append($$"""
|
||||
inline int ucd_grapheme_joins(const int state, const int lead, const int trail)
|
||||
{
|
||||
const int l = lead & 15;
|
||||
const int t = trail & 15;
|
||||
return (s_join_rules[state][l] >> (t * 2)) & 3;
|
||||
}
|
||||
inline bool ucd_grapheme_done(const int state)
|
||||
{
|
||||
return state == 3;
|
||||
}
|
||||
inline int ucd_to_character_width(const int val)
|
||||
{
|
||||
return val >> 6;
|
||||
}
|
||||
inline int ucd_is_newline(const int val)
|
||||
{
|
||||
return val > {{(int)ClusterBreak.Control}};
|
||||
}
|
||||
// clang-format on
|
||||
""");
|
||||
|
||||
Console.Write(buf);
|
||||
return;
|
||||
@@ -224,6 +422,7 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
foreach (var group in doc.Root!.Descendants(ns + "group"))
|
||||
{
|
||||
var groupGeneralCategory = group.Attribute("gc")?.Value;
|
||||
var groupLineBreak = group.Attribute("lb")?.Value;
|
||||
var groupGraphemeClusterBreak = group.Attribute("GCB")?.Value;
|
||||
var groupIndicConjunctBreak = group.Attribute("InCB")?.Value;
|
||||
var groupExtendedPictographic = group.Attribute("ExtPict")?.Value;
|
||||
@@ -246,6 +445,7 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
}
|
||||
|
||||
var generalCategory = ch.Attribute("gc")?.Value ?? groupGeneralCategory ?? "";
|
||||
var lineBreak = ch.Attribute("lb")?.Value ?? groupLineBreak ?? "";
|
||||
var graphemeClusterBreak = ch.Attribute("GCB")?.Value ?? groupGraphemeClusterBreak ?? "";
|
||||
var indicConjunctBreak = ch.Attribute("InCB")?.Value ?? groupIndicConjunctBreak ?? "";
|
||||
var extendedPictographic = ch.Attribute("ExtPict")?.Value ?? groupExtendedPictographic ?? "";
|
||||
@@ -257,7 +457,9 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
// We ignore GB3 which demands that CR × LF do not break apart, because
|
||||
// * these control characters won't normally reach our text storage
|
||||
// * otherwise we're in a raw write mode and historically conhost stores them in separate cells
|
||||
"CR" or "LF" or "CN" => ClusterBreak.Control, // Carriage Return, Line Feed, Control
|
||||
"CR" => ClusterBreak.CR, // Carriage Return
|
||||
"LF" => ClusterBreak.LF, // Line Feed
|
||||
"CN" => ClusterBreak.Control, // Control
|
||||
"EX" or "SM" => ClusterBreak.Extend, // Extend, SpacingMark
|
||||
"PP" => ClusterBreak.Prepend, // Prepend
|
||||
"ZWJ" => ClusterBreak.ZWJ, // Zero Width Joiner
|
||||
@@ -296,7 +498,7 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
{
|
||||
"N" or "Na" or "H" => CharacterWidth.Narrow, // Half-width, Narrow, Neutral
|
||||
"F" or "W" => CharacterWidth.Wide, // Wide, Full-width
|
||||
"A" => CharacterWidth.Ambiguous, // Ambiguous
|
||||
"A" => CharacterWidth.Narrow, // Ambiguous
|
||||
_ => throw new Exception($"Unrecognized ea {eastAsian} for U+{firstCp:X4} to U+{lastCp:X4}")
|
||||
};
|
||||
|
||||
@@ -315,10 +517,44 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
width = CharacterWidth.ZeroWidth;
|
||||
break;
|
||||
case "Me" or "Mn" or "Cf":
|
||||
width = CharacterWidth.ZeroWidth;
|
||||
width = CharacterWidth.ZeroWidth;
|
||||
break;
|
||||
}
|
||||
|
||||
var lbEa = eastAsian is "F" or "W" or "H";
|
||||
var lb = lineBreak switch
|
||||
{
|
||||
"WJ" => LineBreak.WordJoiner,
|
||||
"ZW" => LineBreak.ZeroWidthSpace,
|
||||
"GL" => LineBreak.Glue,
|
||||
"SP" => LineBreak.Space,
|
||||
|
||||
"BA" => LineBreak.BreakAfter,
|
||||
"BB" => LineBreak.BreakBefore,
|
||||
"HY" => LineBreak.Hyphen,
|
||||
|
||||
"CL" => LineBreak.ClosePunctuation,
|
||||
"CP" when lbEa => LineBreak.CloseParenthesis_EA,
|
||||
"CP" => LineBreak.CloseParenthesis_NotEA,
|
||||
"EX" => LineBreak.Exclamation,
|
||||
"IN" => LineBreak.Inseparable,
|
||||
"NS" => LineBreak.Nonstarter,
|
||||
"OP" when lbEa => LineBreak.OpenPunctuation_EA,
|
||||
"OP" => LineBreak.OpenPunctuation_NotEA,
|
||||
"QU" => LineBreak.Quotation,
|
||||
|
||||
"IS" => LineBreak.InfixNumericSeparator,
|
||||
"NU" => LineBreak.Numeric,
|
||||
"PO" => LineBreak.PostfixNumeric,
|
||||
"PR" => LineBreak.PrefixNumeric,
|
||||
"SY" => LineBreak.SymbolsAllowingBreakAfter,
|
||||
|
||||
"AL" or "HL" => LineBreak.Alphabetic,
|
||||
"ID" or "EB" or "EM" => LineBreak.Ideographic,
|
||||
|
||||
_ => LineBreak.Other,
|
||||
};
|
||||
|
||||
Fill(firstCp, lastCp, TrieValue(cb, width));
|
||||
}
|
||||
}
|
||||
@@ -336,6 +572,12 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
// By default, CharacterWidth.Ambiguous, but by convention .Narrow in terminals.
|
||||
Fill(0x2500, 0x259F, TrieValue(ClusterBreak.Other, CharacterWidth.Narrow));
|
||||
|
||||
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
|
||||
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
|
||||
// We achieve this here by explicitly giving this codepoint a wide width.
|
||||
// Later down below we'll clamp width back to <= 2.
|
||||
Fill(0xFE0F, 0xFE0F, TrieValue(ClusterBreak.Extend, CharacterWidth.Wide));
|
||||
|
||||
return new Ucd
|
||||
{
|
||||
Description = description,
|
||||
@@ -354,38 +596,37 @@ static Ucd ExtractValuesFromUcd(string path)
|
||||
}
|
||||
}
|
||||
|
||||
// Because each item in the list of 2D rule tables only uses 2 bits and not all 8 in each byte,
|
||||
// Because each item in the list of 2D rule tables only uses 1-2 bits,
|
||||
// this function packs them into chunks of 32-bit integers to save space.
|
||||
static uint[][] PrepareRulesTable(byte[][][] rules)
|
||||
static uint[] PrepareRulesTable(int[][] rules, int bitWidth, int nonJoinerValue)
|
||||
{
|
||||
var compressed = new uint[rules.Length][];
|
||||
for (var i = 0; i < compressed.Length; i++)
|
||||
{
|
||||
compressed[i] = new uint[16];
|
||||
}
|
||||
var compressed = new uint[rules.Length];
|
||||
|
||||
foreach (var (table, prevIndex) in rules.Select((v, i) => (v, i)))
|
||||
foreach (var lead in Enumerable.Range(0, rules.Length))
|
||||
{
|
||||
foreach (var (row, lead) in table.Select((v, i) => (v, i)))
|
||||
var row = rules[lead];
|
||||
uint nextIndices = 0;
|
||||
|
||||
if (row.Length > 32 / bitWidth)
|
||||
{
|
||||
if (table[lead].Length > 16)
|
||||
{
|
||||
throw new Exception("Can't pack row into 32 bits");
|
||||
}
|
||||
|
||||
uint nextIndices = 0;
|
||||
foreach (var (nextIndex, trail) in row.Select((v, i) => (v, i)))
|
||||
{
|
||||
if (nextIndex > 3)
|
||||
{
|
||||
throw new Exception("Can't pack table index into 2 bits");
|
||||
}
|
||||
|
||||
nextIndices |= (uint)(nextIndex << (trail * 2));
|
||||
}
|
||||
|
||||
compressed[prevIndex][lead] = nextIndices;
|
||||
throw new Exception("Can't pack row into 32 bits");
|
||||
}
|
||||
|
||||
foreach (var trail in Enumerable.Range(0, row.Length))
|
||||
{
|
||||
var value = row[trail];
|
||||
if (value < 0)
|
||||
{
|
||||
value = nonJoinerValue;
|
||||
}
|
||||
if (value > (1 << bitWidth) - 1)
|
||||
{
|
||||
throw new Exception("Can't pack table index into 2 bits");
|
||||
}
|
||||
nextIndices |= (uint)(value << (trail * bitWidth));
|
||||
}
|
||||
|
||||
compressed[lead] = nextIndices;
|
||||
}
|
||||
|
||||
return compressed;
|
||||
@@ -549,7 +790,6 @@ internal enum CharacterWidth
|
||||
internal enum ClusterBreak
|
||||
{
|
||||
Other, // GB999
|
||||
Control, // GB3, GB4, GB5 -- includes CR, LF
|
||||
Extend, // GB9, GB9a -- includes SpacingMark
|
||||
RI, // GB12, GB13
|
||||
Prepend, // GB9b
|
||||
@@ -562,6 +802,50 @@ internal enum ClusterBreak
|
||||
InCBConsonant, // GB9c
|
||||
ExtPic, // GB11
|
||||
ZWJ, // GB9, GB11
|
||||
|
||||
// These are intentionally ordered last, as this allows
|
||||
// us to simplify the ucd_is_newline implementation.
|
||||
Control, // GB4, GB5
|
||||
CR, // GB3, GB4, GB5
|
||||
LF, // GB3, GB4, GB5
|
||||
}
|
||||
|
||||
// Line break classes tracked by this generator. The values are assigned from
// the UCD "lb" attribute; any class without a dedicated member maps to Other.
// OP and CP are each split into an _EA and a _NotEA member, depending on
// whether the codepoint's East Asian Width value is F, W or H.
// The trailing comment on each member names the UCD "lb" value(s) it covers.
internal enum LineBreak
|
||||
{
|
||||
Other, // Anything else
|
||||
|
||||
// Non-tailorable Line Breaking Classes
|
||||
WordJoiner, // WJ
|
||||
ZeroWidthSpace, // ZW
|
||||
Glue, // GL
|
||||
Space, // SP
|
||||
|
||||
// Break Opportunities
|
||||
BreakAfter, // BA
|
||||
BreakBefore, // BB
|
||||
Hyphen, // HY
|
||||
|
||||
// Characters Prohibiting Certain Breaks
|
||||
ClosePunctuation, // CL
|
||||
CloseParenthesis_EA, // CP, East Asian
|
||||
CloseParenthesis_NotEA, // CP, not East Asian
|
||||
Exclamation, // EX
|
||||
Inseparable, // IN
|
||||
Nonstarter, // NS
|
||||
OpenPunctuation_EA, // OP, East Asian
|
||||
OpenPunctuation_NotEA, // OP, not East Asian
|
||||
Quotation, // QU
|
||||
|
||||
// Numeric Context
|
||||
InfixNumericSeparator, // IS
|
||||
Numeric, // NU
|
||||
PostfixNumeric, // PO
|
||||
PrefixNumeric, // PR
|
||||
SymbolsAllowingBreakAfter, // SY
|
||||
|
||||
// Other Characters
|
||||
Alphabetic, // AL & HL
|
||||
Ideographic, // ID & EB & EM
|
||||
}
|
||||
|
||||
internal class Ucd
|
||||
|
||||
@@ -846,16 +846,6 @@ bool CodepointWidthDetector::_graphemeNext(GraphemeState& s, const std::wstring_
|
||||
{
|
||||
w = _ambiguousWidth;
|
||||
}
|
||||
|
||||
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
|
||||
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
|
||||
// We achieve this here by explicitly giving this codepoint a wide width.
|
||||
// Later down below we'll clamp width back to <= 2.
|
||||
if (cp == 0xFE0F)
|
||||
{
|
||||
w = 2;
|
||||
}
|
||||
|
||||
width += w;
|
||||
}
|
||||
|
||||
@@ -943,16 +933,6 @@ bool CodepointWidthDetector::_graphemePrev(GraphemeState& s, const std::wstring_
|
||||
{
|
||||
w = _ambiguousWidth;
|
||||
}
|
||||
|
||||
// U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
|
||||
// By convention, this turns them from being ambiguous width (= narrow) into wide ones.
|
||||
// We achieve this here by explicitly giving this codepoint a wide width.
|
||||
// Later down below we'll clamp width back to <= 2.
|
||||
if (cp == 0xFE0F)
|
||||
{
|
||||
w = 2;
|
||||
}
|
||||
|
||||
width += w;
|
||||
}
|
||||
|
||||
@@ -1100,7 +1080,6 @@ bool CodepointWidthDetector::_graphemePrevWcswidth(GraphemeState& s, const std::
|
||||
{
|
||||
w = _ambiguousWidth;
|
||||
}
|
||||
|
||||
width += w;
|
||||
|
||||
const auto hasWidth = width != 0;
|
||||
|
||||
Reference in New Issue
Block a user