Skip to content

Commit 97dea30

Browse files
eendebakpt and claude authored
gh-150889: Improve performance of unicodedata.normalize() (GH-150890)
Scan the nfc_first/nfc_last reindex tables comparing only .start, range-check the candidate once, and terminate on a sentinel above every codepoint, so each entry costs a single comparison. ~2x faster on non-Latin and combining-heavy NFC/NFKC input; no new data tables. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 2452449 commit 97dea30

4 files changed

Lines changed: 24 additions & 13 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Speed up :func:`unicodedata.normalize` for the NFC and NFKC forms of non-ASCII text by up to a factor of 2.

Modules/unicodedata.c

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -785,15 +785,19 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
785785
static int
786786
find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
787787
{
788-
unsigned int index;
789-
for (index = 0; nfc[index].start; index++) {
790-
unsigned int start = nfc[index].start;
791-
if (code < start)
792-
return -1;
793-
if (code <= start + nfc[index].count) {
794-
unsigned int delta = code - start;
795-
return nfc[index].index + delta;
796-
}
788+
/* The table is sorted by .start ascending with disjoint [start, start+count]
789+
ranges and ends with a sentinel whose .start exceeds every codepoint, so
790+
a single .start <= code test per entry also stops at the sentinel. Find
791+
the first entry past code, then range-check the candidate (entry i - 1). */
792+
unsigned int i;
793+
for (i = 0; (Py_UCS4)nfc[i].start <= code; i++) {
794+
}
795+
if (i == 0) {
796+
return -1;
797+
}
798+
unsigned int start = nfc[i - 1].start;
799+
if (code <= start + nfc[i - 1].count) {
800+
return nfc[i - 1].index + (code - start);
797801
}
798802
return -1;
799803
}

Modules/unicodedata_db.h

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Tools/unicode/makeunicodedata.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -342,15 +342,21 @@ def makeunicodedata(unicode, trace):
342342
fprint("#define TOTAL_FIRST",total_first)
343343
fprint("#define TOTAL_LAST",total_last)
344344
fprint("struct reindex{int start;short count,index;};")
345+
# The reindex tables are read only by find_nfc_index(), which scans
346+
# forward while .start <= code. The trailing sentinel's .start must
347+
# exceed every codepoint (so the scan stops with a single comparison)
348+
# and fit the signed int .start field.
349+
nfc_sentinel = 0x7fffffff
350+
assert sys.maxunicode < nfc_sentinel <= 0x7fffffff
345351
fprint("static struct reindex nfc_first[] = {")
346352
for start,end in comp_first_ranges:
347353
fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
348-
fprint(" {0,0,0}")
354+
fprint(" {0x%x, 0, 0}" % nfc_sentinel)
349355
fprint("};\n")
350356
fprint("static struct reindex nfc_last[] = {")
351357
for start,end in comp_last_ranges:
352358
fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
353-
fprint(" {0,0,0}")
359+
fprint(" {0x%x, 0, 0}" % nfc_sentinel)
354360
fprint("};\n")
355361

356362
# FIXME: <fl> the following tables could be made static, and

0 commit comments

Comments
 (0)