Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changeset/shiny-buttons-bathe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"unicode-segmenter": patch
---

Move GB9c rule checking to _after_ the main boundary checking,
to avoid unnecessary work as much as possible.

No noticeable changes, but perf seems to be improved by ~2% for most cases.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,704 | 12,554 | 5,308 | 3,958 | 5,010 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 16,685 | 12,549 | 5,314 | 3,952 | 5,012 |
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
Expand All @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 26,309 | 13,811 |
| `unicode-segmenter/grapheme` | 26,278 | 13,797 |
| `graphemer` | 134,089 | 31,766 |
| `grapheme-splitter` | 63,946 | 19,162 |

Expand Down
31 changes: 14 additions & 17 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -82,21 +82,10 @@ export function* graphemeSegments(input) {
/** Beginning category of a segment */
let _catBegin = catBefore;

/** Memoize the beginnig code point a the segment. */
/** Memoize the beginning code point of the segment. */
let _hd = cp;

while (cursor < len) {
// Note: Lazily update `consonant` and `linker` state
// which is a extra overhead only for Hindi text.
if (cp >= 2325) {
if (!consonant && catBefore === 0) {
consonant = isIndicConjunctConsonant(cp);
} else if (catBefore === 3 /* Extend */) {
// Note: \p{InCB=Linker} is a subset of \p{Extend}
linker = isIndicConjunctLinker(cp);
}
}

cp = /** @type {number} */ (input.codePointAt(cursor));
catAfter = cat(cp);

Expand All @@ -110,11 +99,8 @@ export function* graphemeSegments(input) {
) {
emoji = true;

} else if (catAfter === 0 /* Any */ && cp >= 2325) {
// Note: Put GB9c rule checking here to reduce.
incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp));
// It cannot be both a linker and a consonant.
linker = linker && !consonant;
} else if (catAfter === 0) {
incb = consonant && linker && isIndicConjunctConsonant(cp);
}
}

Expand All @@ -134,6 +120,17 @@ export function* graphemeSegments(input) {
index = cursor;
_catBegin = catAfter;
_hd = cp;

} else if (cp >= 2325) {
// Note: Avoid InCB state checking as much as possible.
// Update InCB state only when continuing within a segment.
if (!consonant && catBefore === 0)
consonant = isIndicConjunctConsonant(_hd);

if (consonant && catAfter === 3)
linker = isIndicConjunctLinker(cp);
else if (catAfter === 0)
linker = false;
}

cursor += cp <= BMP_MAX ? 1 : 2;
Expand Down