From 3d231207720919659d414d0e116a5bd9fc5ddf46 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 03:38:04 +0900 Subject: [PATCH 01/13] Move InCB pattern lookup to bottom --- src/grapheme.js | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index c640f54..eeb3c5e 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -82,21 +82,10 @@ export function* graphemeSegments(input) { /** Beginning category of a segment */ let _catBegin = catBefore; - /** Memoize the beginnig code point a the segment. */ + /** Memoize the beginnig code point of the segment. */ let _hd = cp; while (cursor < len) { - // Note: Lazily update `consonant` and `linker` state - // which is a extra overhead only for Hindi text. - if (cp >= 2325) { - if (!consonant && catBefore === 0) { - consonant = isIndicConjunctConsonant(cp); - } else if (catBefore === 3 /* Extend */) { - // Note: \p{InCB=Linker} is a subset of \p{Extend} - linker = isIndicConjunctLinker(cp); - } - } - cp = /** @type {number} */ (input.codePointAt(cursor)); catAfter = cat(cp); @@ -110,7 +99,7 @@ export function* graphemeSegments(input) { ) { emoji = true; - } else if (catAfter === 0 /* Any */ && cp >= 2325) { + } else if (catAfter === 0 /* Any */) { // Note: Put GB9c rule checking here to reduce. incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp)); // It cannot be both a linker and a consonant. @@ -134,6 +123,12 @@ export function* graphemeSegments(input) { index = cursor; _catBegin = catAfter; _hd = cp; + } else if (cp >= 2325 && cp <= 3386) { + // Update InCB state only when continuing within a segment + if (!consonant && catBefore === 0) + consonant = isIndicConjunctConsonant(_hd); + if (catAfter === 3 /* Extend */) + linker = linker || isIndicConjunctLinker(cp); } cursor += cp <= BMP_MAX ? 
1 : 2; From 7e7ac3e9d44fdca3ea2aaf619887819d484b10b5 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:20:36 +0900 Subject: [PATCH 02/13] test --- src/grapheme.js | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index eeb3c5e..dbd1b90 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -74,9 +74,6 @@ export function* graphemeSegments(input) { /** InCB=Linker */ let linker = false; - /** InCB=Consonant InCB=Linker x InCB=Consonant */ - let incb = false; - let index = 0; /** Beginning category of a segment */ @@ -86,6 +83,9 @@ export function* graphemeSegments(input) { let _hd = cp; while (cursor < len) { + /** InCB=Consonant InCB=Linker x InCB=Consonant */ + let incb = false; + cp = /** @type {number} */ (input.codePointAt(cursor)); catAfter = cat(cp); @@ -100,10 +100,8 @@ export function* graphemeSegments(input) { emoji = true; } else if (catAfter === 0 /* Any */) { - // Note: Put GB9c rule checking here to reduce. - incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp)); - // It cannot be both a linker and a consonant. - linker = linker && !consonant; + incb = consonant && linker && isIndicConjunctConsonant(cp); + linker = false; } } @@ -119,16 +117,19 @@ export function* graphemeSegments(input) { // flush emoji = false; - incb = false; + consonant = false; + linker = false; index = cursor; _catBegin = catAfter; _hd = cp; - } else if (cp >= 2325 && cp <= 3386) { + + // Note: Avoid InCB state checking much as possible + } else if (_hd >= 2325 && _hd <= 3386) { // Update InCB state only when continuing within a segment - if (!consonant && catBefore === 0) + if (!consonant) consonant = isIndicConjunctConsonant(_hd); if (catAfter === 3 /* Extend */) - linker = linker || isIndicConjunctLinker(cp); + linker = isIndicConjunctLinker(cp); } cursor += cp <= BMP_MAX ? 
1 : 2; From 5e31ffe3fede5eca87dc47a8250ef92ed29329f3 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:35:53 +0900 Subject: [PATCH 03/13] test --- src/grapheme.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index dbd1b90..b0c0361 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -123,12 +123,13 @@ export function* graphemeSegments(input) { _catBegin = catAfter; _hd = cp; - // Note: Avoid InCB state checking much as possible - } else if (_hd >= 2325 && _hd <= 3386) { + } else if (cp >= 2325) { + // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment if (!consonant) consonant = isIndicConjunctConsonant(_hd); - if (catAfter === 3 /* Extend */) + + if (catAfter === 3) linker = isIndicConjunctLinker(cp); } From 60beb45080853715a48cd233874451f0901b3b04 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:40:07 +0900 Subject: [PATCH 04/13] test --- src/grapheme.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index b0c0361..ff465c6 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -99,9 +99,8 @@ export function* graphemeSegments(input) { ) { emoji = true; - } else if (catAfter === 0 /* Any */) { + } else { incb = consonant && linker && isIndicConjunctConsonant(cp); - linker = false; } } @@ -118,7 +117,6 @@ export function* graphemeSegments(input) { // flush emoji = false; consonant = false; - linker = false; index = cursor; _catBegin = catAfter; _hd = cp; @@ -126,11 +124,13 @@ export function* graphemeSegments(input) { } else if (cp >= 2325) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment - if (!consonant) + if (!consonant && catBefore === 0) consonant = isIndicConjunctConsonant(_hd); - if (catAfter === 3) + if (consonant && catAfter === 3) linker = isIndicConjunctLinker(cp); + else 
if (catAfter === 0) + linker = false; } cursor += cp <= BMP_MAX ? 1 : 2; From 7b60cfef4d47945a4d978cbdfbfad545e666441d Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:49:59 +0900 Subject: [PATCH 05/13] test --- src/grapheme.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index ff465c6..9f78bfc 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -74,6 +74,9 @@ export function* graphemeSegments(input) { /** InCB=Linker */ let linker = false; + /** InCB=Consonant InCB=Linker x InCB=Consonant */ + let incb = false; + let index = 0; /** Beginning category of a segment */ @@ -83,9 +86,6 @@ export function* graphemeSegments(input) { let _hd = cp; while (cursor < len) { - /** InCB=Consonant InCB=Linker x InCB=Consonant */ - let incb = false; - cp = /** @type {number} */ (input.codePointAt(cursor)); catAfter = cat(cp); @@ -116,12 +116,12 @@ export function* graphemeSegments(input) { // flush emoji = false; - consonant = false; + incb = false; index = cursor; _catBegin = catAfter; _hd = cp; - } else if (cp >= 2325) { + } else { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment if (!consonant && catBefore === 0) From 304360072ce5987320fbdf5eb795d2a570570988 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:53:14 +0900 Subject: [PATCH 06/13] test --- src/grapheme.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grapheme.js b/src/grapheme.js index 9f78bfc..5f765ce 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -99,7 +99,7 @@ export function* graphemeSegments(input) { ) { emoji = true; - } else { + } else if (catAfter === 0) { incb = consonant && linker && isIndicConjunctConsonant(cp); } } From d79d4de12c18d26be2634bfb11328e5428814b9c Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 04:58:35 +0900 Subject: [PATCH 07/13] test --- src/grapheme.js | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grapheme.js b/src/grapheme.js index 5f765ce..793dee2 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -121,7 +121,7 @@ export function* graphemeSegments(input) { _catBegin = catAfter; _hd = cp; - } else { + } else if (cp >= 2325) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment if (!consonant && catBefore === 0) From c0613a5d610a3a019e522290f7c1cf1fa4725a2c Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 05:02:56 +0900 Subject: [PATCH 08/13] test --- src/grapheme.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index 793dee2..c49906c 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -101,6 +101,7 @@ export function* graphemeSegments(input) { } else if (catAfter === 0) { incb = consonant && linker && isIndicConjunctConsonant(cp); + linker = false; } } @@ -121,7 +122,7 @@ export function* graphemeSegments(input) { _catBegin = catAfter; _hd = cp; - } else if (cp >= 2325) { + } else if (cp >= 2325 && cp <= 3386) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment if (!consonant && catBefore === 0) @@ -129,8 +130,6 @@ export function* graphemeSegments(input) { if (consonant && catAfter === 3) linker = isIndicConjunctLinker(cp); - else if (catAfter === 0) - linker = false; } cursor += cp <= BMP_MAX ? 
1 : 2; From be2618f7f937d8491d2d6400783d9c78f650139f Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 05:09:02 +0900 Subject: [PATCH 09/13] test --- src/grapheme.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index c49906c..08aab97 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -101,7 +101,6 @@ export function* graphemeSegments(input) { } else if (catAfter === 0) { incb = consonant && linker && isIndicConjunctConsonant(cp); - linker = false; } } @@ -125,11 +124,13 @@ export function* graphemeSegments(input) { } else if (cp >= 2325 && cp <= 3386) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment - if (!consonant && catBefore === 0) + if (catBefore === 0) consonant = isIndicConjunctConsonant(_hd); if (consonant && catAfter === 3) linker = isIndicConjunctLinker(cp); + else if (catAfter === 0) + linker = false; } cursor += cp <= BMP_MAX ? 1 : 2; From 9aebd489fb760c42b0a65820ee53140ba1162e36 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 05:17:00 +0900 Subject: [PATCH 10/13] test --- src/grapheme.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/grapheme.js b/src/grapheme.js index 08aab97..95951ed 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -121,10 +121,10 @@ export function* graphemeSegments(input) { _catBegin = catAfter; _hd = cp; - } else if (cp >= 2325 && cp <= 3386) { + } else if (_hd >= 2325) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment - if (catBefore === 0) + if (!consonant && catBefore === 0) consonant = isIndicConjunctConsonant(_hd); if (consonant && catAfter === 3) From b71675ed21ac4a397747ab40f8fbaf5feff64d75 Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 05:19:45 +0900 Subject: [PATCH 11/13] test --- src/grapheme.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/src/grapheme.js b/src/grapheme.js index 95951ed..793dee2 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -121,7 +121,7 @@ export function* graphemeSegments(input) { _catBegin = catAfter; _hd = cp; - } else if (_hd >= 2325) { + } else if (cp >= 2325) { // Note: Avoid InCB state checking much as possible // Update InCB state only when continuing within a segment if (!consonant && catBefore === 0) From 8f2283d9a9b5214fdc28c162c4e7d8f631618a5e Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Mon, 15 Dec 2025 05:39:25 +0900 Subject: [PATCH 12/13] update bundle stats --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5fb4691..758f1d7 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:| -| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,704 | 12,554 | 5,308 | 3,958 | 5,010 | +| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,685 | 12,549 | 5,314 | 3,952 | 5,012 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 | | `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 | @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 26,309 | 13,811 | +| `unicode-segmenter/grapheme` | 26,278 | 13,797 | | `graphemer` | 134,089 | 31,766 | | `grapheme-splitter` | 63,946 | 19,162 | From 095744b7de0b42963bfda00a40f9b210b8b53e08 Mon Sep 17 00:00:00 2001 From: 
Hyeseong Kim Date: Mon, 15 Dec 2025 06:02:39 +0900 Subject: [PATCH 13/13] changeset --- .changeset/shiny-buttons-bathe.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .changeset/shiny-buttons-bathe.md diff --git a/.changeset/shiny-buttons-bathe.md b/.changeset/shiny-buttons-bathe.md new file mode 100644 index 0000000..3b5ce63 --- /dev/null +++ b/.changeset/shiny-buttons-bathe.md @@ -0,0 +1,8 @@ +--- +"unicode-segmenter": patch +--- + +Move GB9c rule checking to be _after_ the main boundary checking. +This avoids unnecessary work as much as possible. + +No noticeable changes, but performance seems to improve by ~2% in most cases.