wip

2026-04-07 23:01:09 +00:00 · 2024-12-22 16:20:13 +01:00
2 changed files with 373 additions and 110 deletions
--- a/src/tools/GraphemeTableGen/Program.cs
+++ b/src/tools/GraphemeTableGen/Program.cs
@@ -4,12 +4,14 @@ using System.Xml.Linq;

 using TrieType = uint;

-// Used as an indicator in joinRules for ÷ ("does not join").
+// Used as an indicator in our rules for ÷ ("does not join").
 // Underscore is one of the few characters that are permitted as an identifier,
 // are monospace in most fonts and also visually distinct from the digits.
-const byte _ = 3;
+const int _ = -1;

-// JoinRules doesn't quite follow UAX #29, as it states:
+// @formatter:off
+
+// joinRules doesn't quite follow UAX #29, as it states:
 // > Note: Testing two adjacent characters is insufficient for determining a boundary.
 //
 // I completely agree, however it makes the implementation complex and slow, and it only benefits what can be considered
@@ -50,49 +52,236 @@ const byte _ = 3;
 //
 // This is a great reference for the resulting table:
 // https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
-byte[][][] joinRules =
+int[][][] joinRules =
 [
    // Base table
    [
-        /* | leading       -> trailing codepoint                                                                                                                                             */
-        /* v             |   Other  |  Control |  Extend  |    RI    |  Prepend |  HangulL |  HangulV |  HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant |  ExtPic  |    ZWJ   | */
-        /* Other         | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* Control       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
-        /* Extend        | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* RI            | */ [_ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* Prepend       | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, 0 /*    | */, 0 /* | */, 0 /* | */],
-        /* HangulL       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulV       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulT       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulLV      | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulLVT     | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* InCBLinker    | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, 0 /*    | */, _ /* | */, 0 /* | */],
-        /* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* ExtPic        | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* ZWJ           | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, 0 /* | */, 0 /* | */],
+        /* ↓ leading        → trailing codepoint                                                                                                                                                                   */
+        /*               |   Other  |    CR    |    LF    |  Control |  Extend  |    RI    | Prepend  |  HangulL |  HangulV |  HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant |  ExtPic  |    ZWJ   | */
+        /* Other         | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* CR            | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* LF            | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* Control       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* Extend        | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* RI            | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 1 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* Prepend       | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, 0 /*    | */, 0 /* | */, 0 /* | */],
+        /* HangulL       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulV       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulT       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulLV      | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulLVT     | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* InCBLinker    | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, 0 /*    | */, _ /* | */, 0 /* | */],
+        /* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* ExtPic        | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* ZWJ           | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, 0 /* | */, 0 /* | */],
    ],
    // Once we have encountered a Regional Indicator pair we'll enter this table.
    // It's a copy of the base table, but instead of RI × RI, we're RI ÷ RI.
    [
-        /* | leading       -> trailing codepoint                                                                                                                                             */
-        /* v             |   Other  |  Control |  Extend  |    RI    |  Prepend |  HangulL |  HangulV |  HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant |  ExtPic  |    ZWJ   | */
-        /* Other         | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* Control       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
-        /* Extend        | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* RI            | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* Prepend       | */ [0 /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, 0 /*    | */, 0 /* | */, 0 /* | */],
-        /* HangulL       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulV       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulT       | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulLV      | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* HangulLVT     | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* InCBLinker    | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, 0 /*    | */, _ /* | */, 0 /* | */],
-        /* InCBConsonant | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* ExtPic        | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
-        /* ZWJ           | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, 0 /* | */, 0 /* | */],
+        /* ↓ leading         → trailing codepoint                                                                                                                                                                  */
+        /*               |   Other  |    CR    |    LF    |  Control |  Extend  |    RI    | Prepend  |  HangulL |  HangulV |  HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant |  ExtPic  |    ZWJ   | */
+        /* Other         | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* CR            | */ [_ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* LF            | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* Control       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, _ /*  |   */, _ /*    | */, _ /* | */, _ /* | */],
+        /* Extend        | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* RI            | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* Prepend       | */ [0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, 0 /*    | */, 0 /* | */, 0 /* | */],
+        /* HangulL       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, 0 /* | */, 0 /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulV       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulT       | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulLV      | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* HangulLVT     | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* InCBLinker    | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, 0 /*    | */, _ /* | */, 0 /* | */],
+        /* InCBConsonant | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* ExtPic        | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, _ /* | */, 0 /* | */],
+        /* ZWJ           | */ [_ /* | */, _ /* | */, _ /* | */, _ /* | */, 0 /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /* | */, _ /*  |  */, 0 /*  |   */, _ /*    | */, 0 /* | */, 0 /* | */],
    ],
 ];

+// Documentation for our UAX #14 line break implementation based on Unicode 16.1,
+// but heavily modified to allow for use with lookup tables:
+//
+// NOTE: If you convert these rules into a lookup table, you must apply them in reverse order.
+//       This is because the rules are ordered from most to least important (e.g. LB8 overrides LB18).
+//
+// Resolve line breaking classes:
+// LB1:   Assign a line breaking class [...].
+//        ❌ Unicode does that for us via the "lb" attribute.
+//
+// Start and end of text:
+// LB2:   Never break at the start of text.
+//        ❌ Functionality not needed.
+// LB3:   Always break at the end of text.
+//        ❌ Functionality not needed.
+//
+// Mandatory breaks:
+// LB4:   Always break after hard line breaks.
+//        ❌ Handled by our ucd_* functions.
+// LB5:   Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
+//        ❌ Handled by our ucd_* functions.
+// LB6:   Do not break before hard line breaks.
+//        ❌ Handled by our ucd_* functions.
+//
+// Explicit breaks and non-breaks:
+// LB7:   Do not break before spaces or zero width space.
+//        ❌ It's way simpler to treat spaces as if they always break.
+// LB8:   Break before any character following a zero-width space, even if one or more spaces intervene.
+//        ⍻ ZW ÷    modified from    ZW SP* ÷    because it's not worth being this pedantic about accuracy here.
+// LB8a:  Do not break after a zero width joiner.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+//
+// Combining marks:
+// LB9:   Do not break a combining character sequence; treat it as if it has the line breaking class of the base character in all of the following rules. Treat ZWJ as if it were CM.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+// LB10:  Treat any remaining combining mark or ZWJ as AL.
+//        ❌ To be honest, I'm not entirely sure I understand the implications of this rule.
+//
+// Word joiner:
+// LB11:  Do not break before or after Word joiner and related characters.
+//        ✔ × WJ
+//        ✔ WJ ×
+//
+// Non-breaking characters:
+// LB12:  Do not break after NBSP and related characters.
+//        ✔ GL ×
+// LB12a: Do not break before NBSP and related characters, except after spaces and hyphens.
+//        ✔ [^SP BA HY] × GL
+//
+// Opening and closing:
+// LB13:  Do not break before ']' or '!' or '/', even after spaces.
+//        ✔ × CL
+//        ✔ × CP
+//        ✔ × EX
+//        ✔ × SY
+// LB14:  Do not break after '[', even after spaces.
+//        ⍻ OP ×    modified from    OP SP* ×    just because it's simpler. It would be nice to address this.
+// LB15a: Do not break after an unresolved initial punctuation that lies at the start of the line, after a space, after opening punctuation, or after an unresolved quotation mark, even after spaces.
+//        ❌ Not implemented. Seemed too complex for little gain?
+// LB15b: Do not break before an unresolved final punctuation that lies at the end of the line, before a space, before a prohibited break, or before an unresolved quotation mark, even after spaces.
+//        ❌ Not implemented. Seemed too complex for little gain?
+// LB15c: Break before a decimal mark that follows a space, for instance, in 'subtract .5'.
+//        ⍻ SP ÷ IS    modified from    SP ÷ IS NU    because this fits neatly with LB15d.
+// LB15d: Otherwise, do not break before ';', ',', or '.', even after spaces.
+//        ✔ × IS
+// LB16:  Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.
+//        ❌ Not implemented. Could be useful in the future, but its usefulness seemed limited to me.
+// LB17:  Do not break within '——', even with intervening spaces.
+//        ❌ Not implemented. Neither terminal applications nor code use em-dashes much anyway.
+//
+// Spaces:
+// LB18:  Break after spaces.
+//        ✔ SP ÷    Implemented because we didn't implement LB7.
+//
+// Special case rules:
+// LB19:  Do not break before non-initial unresolved quotation marks, such as ' ” ' or ' " ', nor after non-final unresolved quotation marks, such as ' “ ' or ' " '.
+//        ⍻ × QU    modified from    × [ QU - \p{Pi} ]
+//        ⍻ QU ×    modified from    [ QU - \p{Pf} ] ×
+//        We implement the Unicode 16.0 instead of 16.1 rules, because it's simpler and allows us to use a LUT.
+// LB19a: Unless surrounded by East Asian characters, do not break either side of any unresolved quotation marks.
+//        ❌ [^$EastAsian] × QU
+//        ❌ × QU ( [^$EastAsian] | eot )
+//        ❌ QU × [^$EastAsian]
+//        ❌ ( sot | [^$EastAsian] ) QU ×
+//        Not implemented for the same reason as LB19: we follow the simpler Unicode 16.0 rules.
+// LB20:  Break before and after unresolved CB.
+//        ❌ We break by default. Unicode inline objects are super irrelevant in a terminal in either case.
+// LB20a: Do not break after a word-initial hyphen.
+//        ❌ Not implemented. Seemed not worth the hassle as the window will almost always be >1 char wide.
+// LB21:  Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, and other non-starters, or after acute accents.
+//        ✔ × BA
+//        ✔ × HY
+//        ✔ × NS
+//        ✔ BB ×
+// LB21a: Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew.
+//        ❌ Not implemented. Perhaps in the future.
+// LB21b: Do not break between Solidus and Hebrew letters.
+//        ❌ Not implemented. Perhaps in the future.
+// LB22:  Do not break before ellipses.
+//        ✔ × IN
+//
+// Numbers:
+// LB23:  Do not break between digits and letters.
+//        ✔ (AL | HL) × NU
+//        ✔ NU × (AL | HL)
+// LB23a: Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+//        ✔ PR × (ID | EB | EM)
+//        ✔ (ID | EB | EM) × PO
+// LB24:  Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.
+//        ✔ (PR | PO) × (AL | HL)
+//        ✔ (AL | HL) × (PR | PO)
+// LB25:  Do not break numbers:
+//        ⍻ CL × PO                  modified from    NU ( SY | IS )* CL × PO
+//        ⍻ CP × PO                  modified from    NU ( SY | IS )* CP × PO
+//        ⍻ CL × PR                  modified from    NU ( SY | IS )* CL × PR
+//        ⍻ CP × PR                  modified from    NU ( SY | IS )* CP × PR
+//        ⍻ ( NU | SY | IS ) × PO    modified from    NU ( SY | IS )* × PO
+//        ⍻ ( NU | SY | IS ) × PR    modified from    NU ( SY | IS )* × PR
+//        ⍻ PO × OP                  modified from    PO × OP NU
+//        ⍻ PO × OP                  modified from    PO × OP IS NU
+//        ✔ PO × NU
+//        ⍻ PR × OP                  modified from    PR × OP NU
+//        ⍻ PR × OP                  modified from    PR × OP IS NU
+//        ✔ PR × NU
+//        ✔ HY × NU
+//        ✔ IS × NU
+//        ⍻ ( NU | SY | IS ) × NU    modified from    NU ( SY | IS )* × NU
+//        Most were simplified because the cases this additionally allows don't matter much here.
+//
+// Korean syllable blocks
+// LB26:  Do not break a Korean syllable.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+// LB27:  Treat a Korean Syllable Block the same as ID.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+//
+// Finally, join alphabetic letters into words and break everything else.
+// LB28:  Do not break between alphabetics ("at").
+//        ✔ (AL | HL) × (AL | HL)
+// LB28a: Do not break inside the orthographic syllables of Brahmic scripts.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+// LB29:  Do not break between numeric punctuation and alphabetics ("e.g.").
+//        ✔ IS × (AL | HL)
+// LB30:  Do not break between letters, numbers, or ordinary symbols and opening or closing parentheses.
+//        ✔ (AL | HL | NU) × [OP-$EastAsian]
+//        ✔ [CP-$EastAsian] × (AL | HL | NU)
+// LB30a: Break between two regional indicator symbols if and only if there are an even number of regional indicators preceding the position of the break.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+// LB30b: Do not break between an emoji base (or potential emoji) and an emoji modifier.
+//        ❌ Our ucd_* functions never break within grapheme clusters.
+// LB31:  Break everywhere else.
+//        ❌ Our default behavior.
+int[][] joinRulesLineBreak =
+[
+    /* ↓ leading                    → trailing codepoint                                                                                                                                                                                                                                                                                                                                                                               */
+    /*                           |   Other  | WordJoiner | ZeroWidthSpace |   Glue   |   Space  | BreakAfter | BreakBefore | Hyphen   | ClosePunctuation | CloseParenthesis_EA | CloseParenthesis_NotEA | Exclamation | Inseparable | Nonstarter | OpenPunctuation_EA | OpenPunctuation_NotEA | Quotation | InfixNumericSeparator | Numeric  | PostfixNumeric | PrefixNumeric | SymbolsAllowingBreakAfter | Alphabetic | Ideographic | */
+    /* Other                     | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* WordJoiner                | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* ZeroWidthSpace            | */ [_ /* |  */, _ /*  |    */, _ /*    | */, _ /* | */, _ /* |  */, _ /*  |  */, _ /*   | */, _ /* |    */, _ /*      |      */, _ /*       |        */, _ /*        |  */, _ /*   |  */, _ /*   |  */, _ /*  |      */, _ /*      |        */, _ /*       | */, _ /*  |       */, _ /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, _ /*          | */, _ /*   |  */, _ /*   | */],
+    /* Glue                      | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* Space                     | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, _ /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, _ /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* BreakAfter                | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, _ /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* BreakBefore               | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* Hyphen                    | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, _ /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* ClosePunctuation          | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* CloseParenthesis_EA       | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* CloseParenthesis_NotEA    | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, _ /*   | */],
+    /* Exclamation               | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* Inseparable               | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* Nonstarter                | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* OpenPunctuation_EA        | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* OpenPunctuation_NotEA     | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* Quotation                 | */ [1 /* |  */, 1 /*  |    */, 1 /*    | */, 1 /* | */, 1 /* |  */, 1 /*  |  */, 1 /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* InfixNumericSeparator     | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, _ /*   | */],
+    /* Numeric                   | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, _ /*   | */],
+    /* PostfixNumeric            | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, 1 /*   |  */, _ /*   | */],
+    /* PrefixNumeric             | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, 1 /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, _ /*     |   */, _ /*    |         */, 1 /*          | */, 1 /*   |  */, 1 /*   | */],
+    /* SymbolsAllowingBreakAfter | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+    /* Alphabetic                | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, 1 /*       | */, 1 /*  |       */, 1 /*        | */, 1 /* |   */, 1 /*     |   */, 1 /*    |         */, 1 /*          | */, 1 /*   |  */, _ /*   | */],
+    /* Ideographic               | */ [_ /* |  */, 1 /*  |    */, _ /*    | */, 1 /* | */, _ /* |  */, 1 /*  |  */, _ /*   | */, 1 /* |    */, 1 /*      |      */, 1 /*       |        */, 1 /*        |  */, 1 /*   |  */, 1 /*   |  */, 1 /*  |      */, _ /*      |        */, _ /*       | */, 1 /*  |       */, 1 /*        | */, _ /* |   */, 1 /*     |   */, _ /*    |         */, 1 /*          | */, _ /*   |  */, _ /*   | */],
+];
+
+// @formatter:on
+
 if (args.Length != 1)
 {
    Console.WriteLine(
@@ -113,7 +302,7 @@ var ucd = ExtractValuesFromUcd(args[0]);
 // 4 still gives ~30% savings over 3 stages and going beyond 5 gives diminishing returns (<10%).
 var trie = BuildBestTrie(ucd.Values, 2, 8, 4);
 // The joinRules above has 2 bits per value. This packs it into 32-bit integers to save space.
-var rules = PrepareRulesTable(joinRules);
+var rules = joinRules.Select(table => PrepareRulesTable(table, 2, 3)).ToArray();
 // Each rules item has the same length. Each item is 32 bits = 4 bytes.
 var totalSize = trie.TotalSize + rules.Length * rules[0].Length * sizeof(TrieType);

@@ -134,9 +323,12 @@ foreach (var (expected, cp) in ucd.Values.Select((v, i) => (v, i)))

 // All the remaining code starting here simply generates the C++ output.
 var buf = new StringBuilder();
-buf.Append("// Generated by GraphemeTableGen\n");
-buf.Append($"// on {DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}, from {ucd.Description}, {totalSize} bytes\n");
-buf.Append("// clang-format off\n");
+buf.Append($$"""
+// Generated by GraphemeTableGen
+// on {{DateTime.UtcNow.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ssK")}}, from {{ucd.Description}}, {{totalSize}} bytes
+// clang-format off
+
+""");

 foreach (var stage in trie.Stages)
 {
@@ -147,7 +339,7 @@ foreach (var stage in trie.Stages)
        width = stage.Mask + 1;
    }

-    buf.Append($"static constexpr uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
+    buf.Append($"static const uint{stage.Bits}_t s_stage{stage.Index}[] = {{");
    foreach (var (value, j) in stage.Values.Select((v, j) => (v, j)))
    {
        if (j % width == 0)
@@ -159,7 +351,7 @@ foreach (var stage in trie.Stages)
    buf.Append("\n};\n");
 }

-buf.Append($"static constexpr uint32_t s_joinRules[{rules.Length}][{rules[0].Length}] = {{\n");
+buf.Append($"static const uint32_t s_join_rules[{rules.Length}][{rules[0].Length}] = {{\n");
 foreach (var table in rules)
 {
    buf.Append("    {\n");
@@ -171,11 +363,11 @@ foreach (var table in rules)
 }
 buf.Append("};\n");

-buf.Append("constexpr int ucdLookup(const char32_t cp) noexcept\n");
+buf.Append("inline int ucd_lookup(const uint32_t cp)\n");
 buf.Append("{\n");
 foreach (var stage in trie.Stages)
 {
-    buf.Append($"    const auto s{stage.Index} = s_stage{stage.Index}[");
+    buf.Append($"    const uint{stage.Bits}_t s{stage.Index} = s_stage{stage.Index}[");
    if (stage.Index == 0)
    {
        buf.Append($"cp >> {stage.Shift}");
@@ -190,21 +382,27 @@ foreach (var stage in trie.Stages)
 buf.Append($"    return s{trie.Stages.Count - 1};\n");
 buf.Append("}\n");

-buf.Append("constexpr int ucdGraphemeJoins(const int state, const int lead, const int trail) noexcept\n");
-buf.Append("{\n");
-buf.Append("    const auto l = lead & 15;\n");
-buf.Append("    const auto t = trail & 15;\n");
-buf.Append($"    return (s_joinRules[state][l] >> (t * 2)) & 3;\n");
-buf.Append("}\n");
-buf.Append("constexpr bool ucdGraphemeDone(const int state) noexcept\n");
-buf.Append("{\n");
-buf.Append($"    return state == 3;\n");
-buf.Append("}\n");
-buf.Append("constexpr int ucdToCharacterWidth(const int val) noexcept\n");
-buf.Append("{\n");
-buf.Append("    return val >> 6;\n");
-buf.Append("}\n");
-buf.Append("// clang-format on\n");
+buf.Append($$"""
+inline int ucd_grapheme_joins(const int state, const int lead, const int trail)
+{
+    const int l = lead & 15;
+    const int t = trail & 15;
+    return (s_join_rules[state][l] >> (t * 2)) & 3;
+}
+inline bool ucd_grapheme_done(const int state)
+{
+    return state == 3;
+}
+inline int ucd_to_character_width(const int val)
+{
+    return val >> 6;
+}
+inline int ucd_is_newline(const int val) // val: value returned by ucd_lookup
+{
+    return (val & 15) > {{(int)ClusterBreak.Control}}; // low 4 bits = cluster class; CR and LF are ordered after Control
+}
+// clang-format on
+""");

 Console.Write(buf);
 return;
@@ -224,6 +422,7 @@ static Ucd ExtractValuesFromUcd(string path)
    foreach (var group in doc.Root!.Descendants(ns + "group"))
    {
        var groupGeneralCategory = group.Attribute("gc")?.Value;
+        var groupLineBreak = group.Attribute("lb")?.Value;
        var groupGraphemeClusterBreak = group.Attribute("GCB")?.Value;
        var groupIndicConjunctBreak = group.Attribute("InCB")?.Value;
        var groupExtendedPictographic = group.Attribute("ExtPict")?.Value;
@@ -246,6 +445,7 @@ static Ucd ExtractValuesFromUcd(string path)
            }

            var generalCategory = ch.Attribute("gc")?.Value ?? groupGeneralCategory ?? "";
+            var lineBreak = ch.Attribute("lb")?.Value ?? groupLineBreak ?? "";
            var graphemeClusterBreak = ch.Attribute("GCB")?.Value ?? groupGraphemeClusterBreak ?? "";
            var indicConjunctBreak = ch.Attribute("InCB")?.Value ?? groupIndicConjunctBreak ?? "";
            var extendedPictographic = ch.Attribute("ExtPict")?.Value ?? groupExtendedPictographic ?? "";
@@ -257,7 +457,9 @@ static Ucd ExtractValuesFromUcd(string path)
                // We ignore GB3 which demands that CR × LF do not break apart, because
                // * these control characters won't normally reach our text storage
                // * otherwise we're in a raw write mode and historically conhost stores them in separate cells
-                "CR" or "LF" or "CN" => ClusterBreak.Control, // Carriage Return, Line Feed, Control
+                "CR" => ClusterBreak.CR, // Carriage Return
+                "LF" => ClusterBreak.LF, // Line Feed
+                "CN" => ClusterBreak.Control, // Control
                "EX" or "SM" => ClusterBreak.Extend, // Extend, SpacingMark
                "PP" => ClusterBreak.Prepend, // Prepend
                "ZWJ" => ClusterBreak.ZWJ, // Zero Width Joiner
@@ -296,7 +498,7 @@ static Ucd ExtractValuesFromUcd(string path)
            {
                "N" or "Na" or "H" => CharacterWidth.Narrow, // Half-width, Narrow, Neutral
                "F" or "W" => CharacterWidth.Wide, // Wide, Full-width
-                "A" => CharacterWidth.Ambiguous, // Ambiguous
+                "A" => CharacterWidth.Narrow, // Ambiguous
                _ => throw new Exception($"Unrecognized ea {eastAsian} for U+{firstCp:X4} to U+{lastCp:X4}")
            };

@@ -315,10 +517,44 @@ static Ucd ExtractValuesFromUcd(string path)
                    width = CharacterWidth.ZeroWidth;
                    break;
                case "Me" or "Mn" or "Cf":
-                    width = CharacterWidth.ZeroWidth;
+                width = CharacterWidth.ZeroWidth;
                    break;
            }

+            var lbEa = eastAsian is "F" or "W" or "H"; // East Asian Width F/W/H selects the *_EA variants of OP and CP below (cf. UAX #14 LB30)
+            var lb = lineBreak switch // NOTE(review): lb is computed here but not yet passed to TrieValue()/Fill() below
+            {
+                "WJ" => LineBreak.WordJoiner,
+                "ZW" => LineBreak.ZeroWidthSpace,
+                "GL" => LineBreak.Glue,
+                "SP" => LineBreak.Space,
+
+                "BA" => LineBreak.BreakAfter,
+                "BB" => LineBreak.BreakBefore,
+                "HY" => LineBreak.Hyphen,
+
+                "CL" => LineBreak.ClosePunctuation,
+                "CP" when lbEa => LineBreak.CloseParenthesis_EA,
+                "CP" => LineBreak.CloseParenthesis_NotEA,
+                "EX" => LineBreak.Exclamation,
+                "IN" => LineBreak.Inseparable,
+                "NS" => LineBreak.Nonstarter,
+                "OP" when lbEa => LineBreak.OpenPunctuation_EA,
+                "OP" => LineBreak.OpenPunctuation_NotEA,
+                "QU" => LineBreak.Quotation,
+
+                "IS" => LineBreak.InfixNumericSeparator,
+                "NU" => LineBreak.Numeric,
+                "PO" => LineBreak.PostfixNumeric,
+                "PR" => LineBreak.PrefixNumeric,
+                "SY" => LineBreak.SymbolsAllowingBreakAfter,
+
+                "AL" or "HL" => LineBreak.Alphabetic,
+                "ID" or "EB" or "EM" => LineBreak.Ideographic,
+
+                _ => LineBreak.Other, // every lb value not listed above (including a missing attribute, "") collapses into Other
+            };
+
            Fill(firstCp, lastCp, TrieValue(cb, width));
        }
    }
@@ -336,6 +572,12 @@ static Ucd ExtractValuesFromUcd(string path)
    // By default, CharacterWidth.Ambiguous, but by convention .Narrow in terminals.
    Fill(0x2500, 0x259F, TrieValue(ClusterBreak.Other, CharacterWidth.Narrow));

+    // U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
+    // By convention, this turns them from being ambiguous width (= narrow) into wide ones.
+    // We achieve this here by explicitly giving this codepoint a wide width.
+    // The code consuming the generated table is expected to clamp the summed cluster width back to <= 2.
+    Fill(0xFE0F, 0xFE0F, TrieValue(ClusterBreak.Extend, CharacterWidth.Wide));
+
    return new Ucd
    {
        Description = description,
@@ -354,38 +596,37 @@ static Ucd ExtractValuesFromUcd(string path)
    }
 }

-// Because each item in the list of 2D rule tables only uses 2 bits and not all 8 in each byte,
+// Because each item in the list of 2D rule tables only uses 1-2 bits,
 // this function packs them into chunks of 32-bit integers to save space.
-static uint[][] PrepareRulesTable(byte[][][] rules)
+static uint[] PrepareRulesTable(int[][] rules, int bitWidth, int nonJoinerValue)
 {
-    var compressed = new uint[rules.Length][];
-    for (var i = 0; i < compressed.Length; i++)
-    {
-        compressed[i] = new uint[16];
-    }
+    var compressed = new uint[rules.Length];

-    foreach (var (table, prevIndex) in rules.Select((v, i) => (v, i)))
+    foreach (var lead in Enumerable.Range(0, rules.Length))
    {
-        foreach (var (row, lead) in table.Select((v, i) => (v, i)))
+        var row = rules[lead];
+        uint nextIndices = 0;
+
+        if (row.Length > 32 / bitWidth)
        {
-            if (table[lead].Length > 16)
-            {
-                throw new Exception("Can't pack row into 32 bits");
-            }
-
-            uint nextIndices = 0;
-            foreach (var (nextIndex, trail) in row.Select((v, i) => (v, i)))
-            {
-                if (nextIndex > 3)
-                {
-                    throw new Exception("Can't pack table index into 2 bits");
-                }
-
-                nextIndices |= (uint)(nextIndex << (trail * 2));
-            }
-
-            compressed[prevIndex][lead] = nextIndices;
+            throw new Exception("Can't pack row into 32 bits");
        }
+
+        foreach (var trail in Enumerable.Range(0, row.Length)) // pack one bitWidth-sized field per trailing class
+        {
+            var value = row[trail];
+            if (value < 0) // negative entries are the `_` ("does not join") marker
+            {
+                value = nonJoinerValue;
+            }
+            if (value > (1 << bitWidth) - 1) // must fit into a single bitWidth-sized field
+            {
+                throw new Exception($"Can't pack table index into {bitWidth} bits");
+            }
+            nextIndices |= (uint)value << (trail * bitWidth); // shift as uint so the top field cannot overflow a signed int
+        }
+
+        compressed[lead] = nextIndices;
    }

    return compressed;
@@ -549,7 +790,6 @@ internal enum CharacterWidth
 internal enum ClusterBreak
 {
    Other,         // GB999
-    Control,       // GB3, GB4, GB5 -- includes CR, LF
    Extend,        // GB9, GB9a -- includes SpacingMark
    RI,            // GB12, GB13
    Prepend,       // GB9b
@@ -562,6 +802,50 @@ internal enum ClusterBreak
    InCBConsonant, // GB9c
    ExtPic,        // GB11
    ZWJ,           // GB9, GB11
+
+    // These are intentionally ordered last, as this allows
+    // us to simplify the ucd_is_newline implementation.
+    Control,       // GB4, GB5
+    CR,            // GB3, GB4, GB5
+    LF,            // GB3, GB4, GB5
+}
+
+internal enum LineBreak // Condensed line break classes, mapped from the UCD "lb" attribute in ExtractValuesFromUcd
+{ // NOTE(review): member order presumably has to match the rows/columns of the line break rule table -- confirm
+    Other, // Any lb value without a dedicated member below
+
+    // Non-tailorable Line Breaking Classes
+    WordJoiner, // WJ
+    ZeroWidthSpace, // ZW
+    Glue, // GL
+    Space, // SP
+
+    // Break Opportunities
+    BreakAfter, // BA
+    BreakBefore, // BB
+    Hyphen, // HY
+
+    // Characters Prohibiting Certain Breaks
+    ClosePunctuation, // CL
+    CloseParenthesis_EA, // CP with East Asian Width F, W or H
+    CloseParenthesis_NotEA, // CP with any other East Asian Width
+    Exclamation, // EX
+    Inseparable, // IN
+    Nonstarter, // NS
+    OpenPunctuation_EA, // OP with East Asian Width F, W or H
+    OpenPunctuation_NotEA, // OP with any other East Asian Width
+    Quotation, // QU
+
+    // Numeric Context
+    InfixNumericSeparator, // IS
+    Numeric, // NU
+    PostfixNumeric, // PO
+    PrefixNumeric, // PR
+    SymbolsAllowingBreakAfter, // SY
+
+    // Other Characters
+    Alphabetic, // AL and HL are merged
+    Ideographic, // ID, EB and EM are merged
 }

 internal class Ucd
--- a/src/types/CodepointWidthDetector.cpp
+++ b/src/types/CodepointWidthDetector.cpp
@@ -846,16 +846,6 @@ bool CodepointWidthDetector::_graphemeNext(GraphemeState& s, const std::wstring_
                {
                    w = _ambiguousWidth;
                }
-
-                // U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
-                // By convention, this turns them from being ambiguous width (= narrow) into wide ones.
-                // We achieve this here by explicitly giving this codepoint a wide width.
-                // Later down below we'll clamp width back to <= 2.
-                if (cp == 0xFE0F)
-                {
-                    w = 2;
-                }
-
                width += w;
            }

@@ -943,16 +933,6 @@ bool CodepointWidthDetector::_graphemePrev(GraphemeState& s, const std::wstring_
                {
                    w = _ambiguousWidth;
                }
-
-                // U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
-                // By convention, this turns them from being ambiguous width (= narrow) into wide ones.
-                // We achieve this here by explicitly giving this codepoint a wide width.
-                // Later down below we'll clamp width back to <= 2.
-                if (cp == 0xFE0F)
-                {
-                    w = 2;
-                }
-
                width += w;
            }

@@ -1100,7 +1080,6 @@ bool CodepointWidthDetector::_graphemePrevWcswidth(GraphemeState& s, const std::
            {
                w = _ambiguousWidth;
            }
-
            width += w;

            const auto hasWidth = width != 0;