diff --git a/.changeset/shiny-buttons-bathe.md b/.changeset/shiny-buttons-bathe.md new file mode 100644 index 0000000..3b5ce63 --- /dev/null +++ b/.changeset/shiny-buttons-bathe.md @@ -0,0 +1,8 @@ +--- +"unicode-segmenter": patch +--- + +Move GB9c rule checking to be _after_ the main boundary checking, +to avoid unnecessary work as much as possible. + +No noticeable changes, but perf seems to be improved by ~2% for most cases. diff --git a/README.md b/README.md index 5fb4691..758f1d7 100644 --- a/README.md +++ b/README.md @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:| -| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,704 | 12,554 | 5,308 | 3,958 | 5,010 | +| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,685 | 12,549 | 5,314 | 3,952 | 5,012 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 | | `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 | @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 26,309 | 13,811 | +| `unicode-segmenter/grapheme` | 26,278 | 13,797 | | `graphemer` | 134,089 | 31,766 | | `grapheme-splitter` | 63,946 | 19,162 | diff --git a/src/grapheme.js b/src/grapheme.js index c640f54..793dee2 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -82,21 +82,10 @@ export function* graphemeSegments(input) { /** Beginning category of a segment */ let _catBegin = catBefore; - /** Memoize the beginnig code point a the 
segment. */ + /** Memoize the beginning code point of the segment. */ let _hd = cp; while (cursor < len) { - // Note: Lazily update `consonant` and `linker` state - // which is a extra overhead only for Hindi text. - if (cp >= 2325) { - if (!consonant && catBefore === 0) { - consonant = isIndicConjunctConsonant(cp); - } else if (catBefore === 3 /* Extend */) { - // Note: \p{InCB=Linker} is a subset of \p{Extend} - linker = isIndicConjunctLinker(cp); - } - } - cp = /** @type {number} */ (input.codePointAt(cursor)); catAfter = cat(cp); @@ -110,11 +99,8 @@ export function* graphemeSegments(input) { ) { emoji = true; - } else if (catAfter === 0 /* Any */ && cp >= 2325) { - // Note: Put GB9c rule checking here to reduce. - incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp)); - // It cannot be both a linker and a consonant. - linker = linker && !consonant; + } else if (catAfter === 0) { + incb = consonant && linker && isIndicConjunctConsonant(cp); } } @@ -134,6 +120,17 @@ export function* graphemeSegments(input) { index = cursor; _catBegin = catAfter; _hd = cp; + + } else if (cp >= 2325) { + // Note: Avoid InCB state checking as much as possible + // Update InCB state only when continuing within a segment + if (!consonant && catBefore === 0) + consonant = isIndicConjunctConsonant(_hd); + + if (consonant && catAfter === 3) + linker = isIndicConjunctLinker(cp); + else if (catAfter === 0) + linker = false; } cursor += cp <= BMP_MAX ? 1 : 2;