Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changeset/four-apples-show.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
"unicode-segmenter": patch
---

Inlined the grapheme boundary check
to avoid unnecessary function calls in the hot path and to consolidate internal state.

This improves runtime performance by about 2% and slightly reduces the bundle size.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | Size (min+zstd) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|----------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,937 | 6,743 | 3,401 | 2,770 | 3,520 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 10,774 | 6,675 | 3,368 | 2,755 | 3,497 |
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | 15,911 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,254 | 23,682 | 7,852 | 4,802 | 6,753 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,510 | 369,673 | 72,273 | 49,530 | 68,027 |
Expand All @@ -231,7 +231,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 20,446 | 11,561 |
| `unicode-segmenter/grapheme` | 20,295 | 11,420 |
| `graphemer` | 134,089 | 31,766 |
| `grapheme-splitter` | 63,946 | 19,162 |

Expand Down
187 changes: 76 additions & 111 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,52 +59,86 @@ export function* graphemeSegments(input) {
/** Category of codepoint immediately preceding cursor */
let catBefore = cat(cp);

/** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor. */
let catAfter = null;
/** @type {GraphemeCategoryNum} Category of codepoint immediately following cursor. */
let catAfter = 0;

/** The number of RIS codepoints preceding `cursor`. */
let risCount = 0;

/** Emoji state */
/**
* Emoji state for GB11: tracks if we've seen Extended_Pictographic followed by Extend* ZWJ
* Only relevant when catBefore === ZWJ && catAfter === Extended_Pictographic
*/
let emoji = false;

/** InCB=Consonant */
/** InCB=Consonant - segment started with Indic consonant */
let consonant = false;

/** InCB=Linker */
/** InCB=Linker - seen a linker after consonant */
let linker = false;

/** InCB=Consonant InCB=Linker x InCB=Consonant */
let incb = false;

let index = 0;

/** Beginning category of a segment */
let _catBegin = catBefore;

/** Memoize the beginnig code point of the segment. */
/** Memoize the beginning code point of the segment. */
let _hd = cp;

while (cursor < len) {
cp = /** @type {number} */ (input.codePointAt(cursor));
catAfter = cat(cp);

if (catBefore === 10 /* Regional_Indicator */) {
risCount++;
} else {
risCount = 0;
if (
catAfter === 14 /* ZWJ */
&& (catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */)
) {
emoji = true;
let boundary = true;

} else if (catAfter === 0) {
incb = consonant && linker && isIndicConjunctConsonant(cp);
}
// GB3: CR × LF
if (catBefore === 1) {
boundary = catAfter !== 6;
}
// GB4: (Control | CR | LF) ÷
else if (catBefore === 2 || catBefore === 6) {
boundary = true;
}
// GB5: ÷ (Control | CR | LF)
else if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
boundary = true;
}
// GB9, GB9a: × (Extend | ZWJ | SpacingMark) - most common no-break case
else if (catAfter === 3 || catAfter === 14 || catAfter === 11) {
boundary = false;
}
// GB9b: Prepend ×
else if (catBefore === 9) {
boundary = false;
}
// GB11: ExtPic Extend* ZWJ × ExtPic
else if (catBefore === 14 && catAfter === 4) {
boundary = !emoji;
}
// GB12, GB13: RI × RI (odd count means no break)
else if (catBefore === 10 && catAfter === 10) {
// risCount is count BEFORE current RI, so odd means this is 2nd, 4th, etc.
boundary = risCount++ % 2 === 1;
}
// GB6: L × (L | V | LV | LVT)
else if (catBefore === 5) {
boundary = !(catAfter === 5 || catAfter === 13 || catAfter === 7 || catAfter === 8);
}
// GB7: (LV | V) × (V | T)
else if ((catBefore === 7 || catBefore === 13) && (catAfter === 13 || catAfter === 12)) {
boundary = false;
}
// GB8: (LVT | T) × T
else if ((catBefore === 8 || catBefore === 12) && catAfter === 12) {
boundary = false;
}
// GB9c: InCB=Consonant InCB=Extend* InCB=Linker InCB=Extend* × InCB=Consonant
else if (catAfter === 0 && consonant && linker && isIndicConjunctConsonant(cp)) {
boundary = false;
}
// else GB999: ÷ Any

if (isBoundary(catBefore, catAfter, risCount, emoji, incb)) {
if (boundary) {
yield {
segment: input.slice(index, cursor),
index,
Expand All @@ -114,23 +148,30 @@ export function* graphemeSegments(input) {
_catEnd: catBefore,
};

// flush
// Reset segment state
emoji = false;
incb = false;
risCount = 0;
index = cursor;
_catBegin = catAfter;
_hd = cp;

} else if (cp >= 2325) {
// Note: Avoid InCB state checking much as possible
// Update InCB state only when continuing within a segment
if (!consonant && catBefore === 0)
consonant = isIndicConjunctConsonant(_hd);

if (consonant && catAfter === 3)
linker = isIndicConjunctLinker(cp);
else if (catAfter === 0)
linker = false;
}
// Update state for continuing segment
else {
// emoji state for GB11
if (catAfter === 14 && (catBefore === 3 || catBefore === 4)) {
emoji = true;
}
// InCB state for GB9c
else if (cp >= 2325) {
if (!consonant && catBefore === 0) {
consonant = isIndicConjunctConsonant(_hd);
}
if (consonant && catAfter === 3) {
linker = linker || isIndicConjunctLinker(cp);
} else {
linker = false;
}
}
}

cursor += cp <= BMP_MAX ? 1 : 2;
Expand Down Expand Up @@ -313,79 +354,3 @@ function isIndicConjunctLinker(cp) {
cp === 3405 /* 0x0D4D */
);
}

/**
 * Decide whether a grapheme cluster boundary lies between two adjacent
 * code points, given their categories and the caller-tracked segment state.
 *
 * Rules are evaluated in a fixed precedence; the first matching rule wins
 * and anything left unmatched is a boundary (GB999).
 *
 * @param {GraphemeCategoryNum} catBefore Category of the code point before the position
 * @param {GraphemeCategoryNum} catAfter Category of the code point after the position
 * @param {number} risCount Regional_Indicator state; an even value breaks an RI x RI pair
 * @param {boolean} emoji Extended_Pictographic state consulted by GB11
 * @param {boolean} incb Indic_Conjunct_Break state consulted by GB9c
 * @return {boolean} `true` when there is a boundary, `false` when the pair must stay joined
 *
 * @see https://www.unicode.org/reports/tr29/tr29-43.html#Grapheme_Cluster_Boundary_Rules
 */
function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
  const hardBreakBefore = catBefore === 1 || catBefore === 2 || catBefore === 6;
  const hardBreakAfter = catAfter === 1 || catAfter === 2 || catAfter === 6;

  // GB4 / GB5: always break around Control, CR and LF ...
  if (hardBreakBefore || hardBreakAfter) {
    // ... except GB3: never split a CR LF pair.
    return !(catBefore === 1 && catAfter === 6);
  }

  // GB9 / GB9a: never break before Extend, ZWJ or SpacingMark.
  // Checked early because it is the most frequent no-break case.
  if (catAfter === 3 || catAfter === 11 || catAfter === 14) {
    return false;
  }

  switch (catBefore) {
    case 5:
      // GB6: L x (L | V | LV | LVT)
      return catAfter !== 5 && catAfter !== 7 && catAfter !== 8 && catAfter !== 13;

    case 7:
    case 13:
      // GB7: (LV | V) x (V | T)
      if (catAfter === 12 || catAfter === 13) {
        return false;
      }
      break;

    case 8:
    case 12:
      // GB8: (LVT | T) x T
      if (catAfter === 12) {
        return false;
      }
      break;

    case 9:
      // GB9b: Prepend x
      return false;

    case 14:
      // GB11: join ZWJ x Extended_Pictographic only inside an emoji sequence
      if (catAfter === 4) {
        return !emoji;
      }
      break;

    case 10:
      // GB12 / GB13: pair up Regional_Indicator code points
      if (catAfter === 10) {
        return risCount % 2 === 0;
      }
      break;

    default:
      break;
  }

  // GB9c joins an Indic conjunct; everything else falls through to GB999 (break).
  return !(catAfter === 0 && incb);
}