Skip to content

Commit 6b505d1

Browse files
miss-islingtonsethmlarsonch4n3-yoonStanFromIrelandpicnixz
authored
[3.14] gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() (GH-149080)
Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. (cherry picked from commit 991224b) Co-authored-by: Seth Larson <seth@python.org> Co-authored-by: ch4n3-yoon <ch4n3.yoon@gmail.com> Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com> Co-authored-by: Stan Ulbrych <stan@python.org> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Petr Viktorin <encukou@gmail.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
1 parent 8ee3bcf commit 6b505d1

3 files changed

Lines changed: 150 additions & 26 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,34 @@ def test_issue10254(self):
580580
b = 'C\u0338' * 20 + '\xC7'
581581
self.assertEqual(self.db.normalize('NFC', a), b)
582582

583+
def test_long_combining_mark_run(self):
584+
# gh-149079: avoid quadratic canonical ordering.
585+
payload = "a" + ("\u0300\u0327" * 32)
586+
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
587+
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
588+
589+
self.assertEqual(self.db.normalize("NFD", payload), nfd)
590+
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
591+
self.assertEqual(self.db.normalize("NFC", payload), nfc)
592+
self.assertEqual(self.db.normalize("NFKC", payload), nfc)
593+
594+
def test_combining_mark_run_fast_paths(self):
595+
# gh-149079: cover short runs and already-sorted long runs.
596+
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
597+
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
598+
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
599+
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
600+
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
601+
602+
self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
603+
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
604+
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
605+
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
606+
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
607+
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
608+
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
609+
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
610+
583611
def test_issue29456(self):
584612
# Fix #29456
585613
u1176_str_a = '\u1100\u1176\u11a8'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix a potential denial of service in :func:`unicodedata.normalize`. The
2+
canonical ordering step of Unicode normalization used a quadratic-time insertion
3+
sort for reordering combining characters, which could be exploited with
4+
crafted input containing many combining characters in non-canonical order.
5+
Long runs are now sorted with a linear-time counting sort; short runs still use insertion sort.

Modules/unicodedata.c

Lines changed: 117 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -510,19 +510,80 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
510510
(*index)++;
511511
}
512512

513+
/* Threshold (in code points) for choosing the canonical-ordering sort:
   combining runs shorter than this use insertion sort, which is usually
   cheaper for small runs; runs of this length or more use counting sort. */
514+
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
515+
516+
static void
517+
canonical_ordering_sort_insertion(int kind, void *data,
518+
Py_ssize_t start, Py_ssize_t end)
519+
{
520+
for (Py_ssize_t i = start + 1; i < end; i++) {
521+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
522+
unsigned char combining = _getrecord_ex(code)->combining;
523+
Py_ssize_t j = i;
524+
525+
while (j > start) {
526+
Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
527+
if (_getrecord_ex(previous)->combining <= combining) {
528+
break;
529+
}
530+
PyUnicode_WRITE(kind, data, j, previous);
531+
j--;
532+
}
533+
if (j != i) {
534+
PyUnicode_WRITE(kind, data, j, code);
535+
}
536+
}
537+
}
538+
539+
static void
540+
canonical_ordering_sort_counting(int kind, void *data,
541+
Py_ssize_t start, Py_ssize_t end,
542+
Py_UCS4 *sortbuf)
543+
{
544+
Py_ssize_t counts[256] = {0};
545+
Py_ssize_t run_length = end - start;
546+
Py_ssize_t total = 0;
547+
548+
for (Py_ssize_t i = start; i < end; i++) {
549+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
550+
unsigned char combining = _getrecord_ex(code)->combining;
551+
counts[combining]++;
552+
}
553+
554+
for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
555+
Py_ssize_t count = counts[i];
556+
counts[i] = total;
557+
total += count;
558+
}
559+
560+
/* Reuse counts[] as the next output slot for each CCC. */
561+
for (Py_ssize_t i = start; i < end; i++) {
562+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
563+
unsigned char combining = _getrecord_ex(code)->combining;
564+
sortbuf[counts[combining]++] = code;
565+
}
566+
for (Py_ssize_t i = 0; i < run_length; i++) {
567+
PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
568+
}
569+
}
570+
513571
static PyObject*
514572
nfd_nfkd(PyObject *self, PyObject *input, int k)
515573
{
516574
PyObject *result;
517575
Py_UCS4 *output;
518576
Py_ssize_t i, o, osize;
519-
int kind;
520-
const void *data;
577+
int input_kind, result_kind;
578+
const void *input_data;
579+
void *result_data;
521580
/* Longest decomposition in Unicode 3.2: U+FDFA */
522581
Py_UCS4 stack[20];
523582
Py_ssize_t space, isize;
524583
int index, prefix, count, stackptr;
525584
unsigned char prev, cur;
585+
Py_UCS4 *sortbuf = NULL;
586+
Py_ssize_t sortbuflen = 0;
526587

527588
stackptr = 0;
528589
isize = PyUnicode_GET_LENGTH(input);
@@ -542,11 +603,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
542603
return NULL;
543604
}
544605
i = o = 0;
545-
kind = PyUnicode_KIND(input);
546-
data = PyUnicode_DATA(input);
606+
input_kind = PyUnicode_KIND(input);
607+
input_data = PyUnicode_DATA(input);
547608

548609
while (i < isize) {
549-
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
610+
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
550611
while(stackptr) {
551612
Py_UCS4 code = stack[--stackptr];
552613
/* Hangul Decomposition adds three characters in
@@ -614,34 +675,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
614675
if (!result)
615676
return NULL;
616677

617-
kind = PyUnicode_KIND(result);
618-
data = PyUnicode_DATA(result);
678+
result_kind = PyUnicode_KIND(result);
679+
result_data = PyUnicode_DATA(result);
619680

620-
/* Sort canonically. */
681+
/* Sort each consecutive combining-character run canonically. */
621682
i = 0;
622-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
623-
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
624-
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
625-
if (prev == 0 || cur == 0 || prev <= cur) {
626-
prev = cur;
683+
while (i < o) {
684+
Py_ssize_t run_length, run_start;
685+
int needs_sort = 0;
686+
687+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
688+
prev = _getrecord_ex(ch)->combining;
689+
if (prev == 0) {
690+
i++;
627691
continue;
628692
}
629-
/* Non-canonical order. Need to switch *i with previous. */
630-
o = i - 1;
631-
while (1) {
632-
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
633-
PyUnicode_WRITE(kind, data, o+1,
634-
PyUnicode_READ(kind, data, o));
635-
PyUnicode_WRITE(kind, data, o, tmp);
636-
o--;
637-
if (o < 0)
638-
break;
639-
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
640-
if (prev == 0 || prev <= cur)
693+
694+
run_start = i++;
695+
while (i < o) {
696+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
697+
cur = _getrecord_ex(ch)->combining;
698+
if (cur == 0) {
641699
break;
700+
}
701+
if (prev > cur) {
702+
needs_sort = 1;
703+
}
704+
prev = cur;
705+
i++;
706+
}
707+
if (!needs_sort) {
708+
continue;
709+
}
710+
711+
run_length = i - run_start;
712+
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
713+
canonical_ordering_sort_insertion(result_kind, result_data,
714+
run_start, i);
715+
continue;
642716
}
643-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
717+
718+
if (run_length > sortbuflen) {
719+
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
720+
Py_UCS4,
721+
run_length);
722+
if (new_sortbuf == NULL) {
723+
PyErr_NoMemory();
724+
PyMem_Free(sortbuf);
725+
Py_DECREF(result);
726+
return NULL;
727+
}
728+
sortbuf = new_sortbuf;
729+
sortbuflen = run_length;
730+
}
731+
732+
canonical_ordering_sort_counting(result_kind, result_data,
733+
run_start, i, sortbuf);
644734
}
735+
PyMem_Free(sortbuf);
645736
return result;
646737
}
647738

0 commit comments

Comments
 (0)