From c13642fcce88cce0a3fc0f519a02892295960f97 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 09:28:18 +0000 Subject: [PATCH 1/3] applied the changes as requested --- NEWS.md | 2 ++ R/data.table.R | 6 ++++++ R/onLoad.R | 3 ++- R/utils.R | 24 ++++++++++++++++++++++++ inst/tests/tests.Rraw | 19 +++++++++++++++++++ man/data.table-options.Rd | 3 +++ 6 files changed, 56 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4e64f3faa0..0e0966c6c1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ 5. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `depth=1L`, [#2606](https://github.com/Rdatatable/data.table/issues/2606). Thanks @MichaelChirico for the report and @manmita for the PR +6. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/data.table.R b/R/data.table.R index a989538b14..bf10b2229c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2943,6 +2943,12 @@ setnames = function(x,old,new,skip_absent=FALSE) { if (!length(new)) return(invisible(x)) # no changes if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov } + + full_names = names(x) + full_names[i] = new + full_names = process_name_policy(full_names) + new = full_names[i] + # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) w = which(!is.na(m)) diff --git a/R/onLoad.R b/R/onLoad.R index b72fee4d1b..581d30692a 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -98,7 +98,8 @@ datatable.auto.index=TRUE, # DT[col=="val"] to auto add index so 2nd time faster datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 - datatable.old.matrix.autoname=FALSE # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.unique.names = NULL ) opts = opts[!names(opts) %chin% names(options())] options(opts) diff --git a/R/utils.R b/R/utils.R index 9d89f6f0a4..cbb9377e67 100644 --- a/R/utils.R +++ b/R/utils.R @@ -35,6 +35,30 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) { table_name, brackify(duplicate_names), domain=NA) } +process_name_policy = function(names_vec) { + # Applies option 'datatable.unique.names' to names_vec: NULL or 'off' returns it unchanged; otherwise duplicates trigger a warning ('warn'), an error ('error') or make.unique() ('rename') + policy = getOption("datatable.unique.names") + if (is.null(policy)) return(names_vec) + + # the option is set by the user, so anything other than a single allowed string warns and is treated as 'off' + allowed = c("off", "warn", "error", "rename") + if (!is.character(policy) || length(policy) != 1L || !policy %in% allowed) { + warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. 
Allowed values are: 'off', 'warn', 'error', 'rename'.", toString(policy)) + return(names_vec) + } + if (policy == "off" || !anyDuplicated(names_vec)) return(names_vec) + + dups = unique(names_vec[duplicated(names_vec)]) + # the names are passed through %s so that a '%' inside a column name is never read as a format directive + fmt = "Duplicate column names created: %s. This may cause ambiguity." + switch(policy, + warn = warningf(fmt, brackify(dups)), + error = stopf(fmt, brackify(dups)), + rename = return(make.unique(names_vec)) + ) + names_vec +} + duplicated_values = function(x) { # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case if (!anyDuplicated(x)) return(vector(typeof(x))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b73b2767a8..8cfd225530 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21577,3 +21577,22 @@ close(con) file.create(f <- tempfile()) test(2367.6, fread(file(f)), data.table(), warning="Connection has size 0.") unlink(f) + +#4044 +DT = as.data.table(iris) +test(2368.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), + options = list(datatable.unique.names = "off")) + +test(2368.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), + warning = "Duplicate column names created", + options = list(datatable.unique.names = "warn")) + +test(2368.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), + error = "Duplicate column names created", + options = list(datatable.unique.names = "error")) + +test(2368.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), + options = list(datatable.unique.names = "rename")) diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd index 439e88ef2f..5d198e5f87 100644 --- a/man/data.table-options.Rd +++ 
b/man/data.table-options.Rd @@ -105,6 +105,9 @@ \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. If set to a function (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then be "enlisted" into columns in the result.} + \item{\code{datatable.unique.names}}{A character string, default \code{NULL} (same as \code{"off"}). + Controls the behavior when operations (\bold{currently only \code{setnames}}) + would result in duplicate column names: \code{"warn"} issues a warning, \code{"error"} stops with an error, and \code{"rename"} makes the names unique with \code{make.unique}.} } } From 488b87377044c2a5667fddc0447e1f49417bc13d Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 09:46:28 +0000 Subject: [PATCH 2/3] added test --- inst/tests/tests.Rraw | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8cfd225530..1678a1006a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21583,16 +21583,17 @@ DT = as.data.table(iris) test(2368.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), options = list(datatable.unique.names = "off")) - test(2368.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), warning = "Duplicate column names created", options = list(datatable.unique.names = "warn")) - test(2368.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), error = "Duplicate column names created", options = list(datatable.unique.names = "error")) - test(2368.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), options = list(datatable.unique.names = "rename")) +test(2368.5, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), + warning = "Invalid value for 'datatable.unique.names'", + options = 
list(datatable.unique.names = "invalid_option_name")) From 471ab203fbfaee65509c7d805b7ddf69fcac872c Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 10:20:49 +0000 Subject: [PATCH 3/3] lintr --- R/onLoad.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/onLoad.R b/R/onLoad.R index 581d30692a..10136da1b3 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -99,7 +99,7 @@ datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change - datatable.unique.names = NULL + datatable.unique.names = NULL ) opts = opts[!names(opts) %chin% names(options())] options(opts)