From 7083a2344aa5643c9e2afe49899814928e8bf25f Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 7 Dec 2025 16:20:48 +0300 Subject: [PATCH 1/3] regression test --- inst/tests/tests.Rraw | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 591dcea54..f0631331d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21876,3 +21876,12 @@ test(2347, DT[i, .(result = all(is.na(grp) == is.na(a))), by = grp][,all(result) DT = data.table(a = as.Date("2010-01-01"), b = 1L) test(2348.1, tryCatch(DT[a == as.Date("20100101")], error=conditionCall)[[1L]], quote(charToDate)) test(2348.2, tryCatch(DT[a == as.Date("20100101") | b == 2L], error=conditionCall)[[1L]], quote(charToDate)) + +# rbindlist did not protect the temporary UTF-8 strings, #7452 +DTn = apply(matrix(as.raw(rep(0xa1:0xff, length.out = 100)), 10), 2, rawToChar) +Encoding(DTn) = 'latin1' # will need conversion to UTF-8 +DTl = lapply(DTn, function(n) setNames(list(42), n))[c(1, rep(2:length(DTn), length.out = 3e5), 1)] +DT = suppressMessages(rbindlist(DTl)) # used to crash +test(2349.1, dim(DT), c(300002L, 1L)) +test(2349.2, DT[[1]], rep(42, nrow(DT))) +rm(DTn, DTl, DT) From 5944324546eb3820b0d1a8858fa97e62c0d5002e Mon Sep 17 00:00:00 2001 From: Ivan K Date: Sun, 7 Dec 2025 16:35:48 +0300 Subject: [PATCH 2/3] rbindlist(): pre-convert names to UTF-8 Fixes: #7452 --- src/rbindlist.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 2265f434a..b557f7fd4 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -68,6 +68,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor int *colMap=NULL; // maps each column in final result to the column of each list item if (usenames==TRUE || usenames==NA_LOGICAL) { + // zeroth pass - convert all names to UTF-8 + SEXP cnl = PROTECT(allocVector(VECSXP, XLENGTH(l))); + for (R_xlen_t i = 0; i < XLENGTH(l); ++i) { + const SEXP cn = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol); + if (xlength(cn)) SET_VECTOR_ELT(cnl, i, coerceUtf8IfNeeded(cn)); + } + const SEXP *cnlp = SEXPPTR_RO(cnl); // here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch // when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item // first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names)))) @@ -81,11 +88,11 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor SEXP li = VECTOR_ELT(l, i); int thisncol=LENGTH(li); if (isNull(li) || !LENGTH(li)) continue; - const SEXP cn = getAttrib(li, R_NamesSymbol); + const SEXP cn = cnlp[i]; if (!length(cn)) continue; const SEXP *cnp = STRING_PTR_RO(cn); for (int j=0; j0) savetl(s); uniq[nuniq++] = s; @@ -110,12 +117,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor SEXP li = VECTOR_ELT(l, i); int thisncol=length(li); if (thisncol==0) continue; - const SEXP cn = getAttrib(li, R_NamesSymbol); + const SEXP cn = cnlp[i]; if (!length(cn)) continue; const SEXP *cnp = STRING_PTR_RO(cn); memset(counts, 0, nuniq*sizeof(*counts)); for (int j=0; j Date: Sun, 7 Dec 2025 16:53:37 +0300 Subject: [PATCH 3/3] NEWS item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 994b54be4..2dca63ab0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -350,6 +350,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 25. By-group operations on missing rows (e.g. `foo[c(i, NA), bar, by=grp]`) now avoid leaving in data from the previous groups, [#7442](https://github.com/Rdatatable/data.table/issues/7442). Thanks @aitap for the report and the fix. +26. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix. + ### NOTES 1. The following in-progress deprecations have proceeded: