diff --git a/NEWS.md b/NEWS.md index 4e64f3faa..0e0966c6c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,8 @@ 5. `tables()` can now optionally report `data.table` objects stored one level deep inside list objects when `depth=1L`, [#2606](https://github.com/Rdatatable/data.table/issues/2606). Thanks @MichaelChirico for the report and @manmita for the PR +6. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/data.table.R b/R/data.table.R
index a989538b1..bf10b2229 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -2943,6 +2943,12 @@ setnames = function(x,old,new,skip_absent=FALSE) {
     if (!length(new)) return(invisible(x)) # no changes
     if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov
   }
+
+  # apply the datatable.unique.names policy, #4044; only the names being set (positions i) may be altered by it
+  full_names = names(x)
+  full_names[i] = new
+  new = process_name_policy(full_names, i)[i]
+
   # update the key if the column name being change is in the key
   m = chmatch(names(x)[i], key(x))
   w = which(!is.na(m))
diff --git a/R/onLoad.R b/R/onLoad.R
index b72fee4d1..10136da1b 100644
--- a/R/onLoad.R
+++ b/R/onLoad.R
@@ -98,7 +98,8 @@
     datatable.auto.index=TRUE, # DT[col=="val"] to auto add index so 2nd time faster
     datatable.use.index=TRUE, # global switch to address #1422
     datatable.prettyprint.char=NULL, # FR #1091
-    datatable.old.matrix.autoname=FALSE # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+    datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change
+    datatable.unique.names=NULL # #4044: policy applied when setnames() would create duplicate column names; NULL behaves like "off"
   )
   opts = opts[!names(opts) %chin% names(options())]
   options(opts)
diff --git a/R/utils.R b/R/utils.R
index 9d89f6f0a..cbb9377e6 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -35,6 +35,43 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
         table_name, brackify(duplicate_names), domain=NA)
 }
 
+# Apply the policy held in option 'datatable.unique.names' to a proposed set of column names, #4044.
+#   names_vec: character vector of all column names as they would be after the change.
+#   idx: optional positions in names_vec that the caller is setting. Only duplicates involving these
+#     positions are reported, and only these positions are altered under "rename". NULL (default)
+#     treats every position as being set.
+# Returns names_vec, altered only under "rename". NULL/"off" do nothing, "warn"/"error" signal a
+#   warning/error when a duplicate is found, and any other value warns and is then treated as "off".
+process_name_policy = function(names_vec, idx=NULL) {
+  policy = getOption("datatable.unique.names")
+  if (is.null(policy) || identical(policy, "off")) return(names_vec)
+
+  # anything but a single non-NA string cannot be used as an if() condition below, so it gets the same warning as an unknown string
+  valid = is.character(policy) && length(policy) == 1L && !is.na(policy) && policy %chin% c("warn", "error", "rename")
+  if (!valid) {
+    warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", toString(policy))
+    return(names_vec)
+  }
+
+  changed = if (is.null(idx)) rep(TRUE, length(names_vec)) else seq_along(names_vec) %in% idx
+  fixed = names_vec[!changed]
+  proposed = names_vec[changed]
+  is_dup = duplicated(proposed) | proposed %chin% fixed
+  if (!any(is_dup)) return(names_vec)
+
+  if (policy == "rename") {
+    # untouched names go first and only the trailing part is kept, so untouched columns always keep their names
+    names_vec[changed] = make.unique(c(fixed, proposed))[length(fixed) + seq_along(proposed)]
+    return(names_vec)
+  }
+
+  # the names are passed as an argument rather than pasted into the format, so a '%' in a column name is printed as-is
+  dups = brackify(unique(proposed[is_dup]))
+  if (policy == "error") stopf("Duplicate column names created: %s. This may cause ambiguity.", dups)
+  warningf("Duplicate column names created: %s. This may cause ambiguity.", dups)
+  names_vec
+}
+
 duplicated_values = function(x) {
   # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
   if (!anyDuplicated(x)) return(vector(typeof(x)))
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index b73b2767a..1678a1006 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21577,3 +21577,33 @@ close(con)
 file.create(f <- tempfile())
 test(2367.6, fread(file(f)), data.table(), warning="Connection has size 0.")
 unlink(f)
+
+#4044
+DT = as.data.table(iris)
+test(2368.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")),
+     c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"),
+     options = list(datatable.unique.names = "off"))
+test(2368.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")),
+     c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"),
+     warning = "Duplicate column names created",
+     options = list(datatable.unique.names = "warn"))
+test(2368.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"),
+     error = "Duplicate column names created",
+     options = list(datatable.unique.names = "error"))
+test(2368.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")),
+     c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"),
+     options = list(datatable.unique.names = "rename"))
+test(2368.5, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")),
+     c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"),
+     warning = "Invalid value for 'datatable.unique.names'",
+     options = list(datatable.unique.names = "invalid_option_name"))
+# "rename" must alter the name being set, not the untouched column that already holds it
+test(2368.6, names(setnames(copy(DT), "Sepal.Length", "Petal.Length")),
+     c("Petal.Length.1", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"),
+     options = list(datatable.unique.names = "rename"))
+# a '%' in a column name must not be interpreted as a format specifier
+DT = data.table(a=1L, `b%d`=2L)
+test(2368.7, names(setnames(copy(DT), "a", "b%d")),
+     c("b%d", "b%d"),
+     warning = "Duplicate column names created",
+     options = list(datatable.unique.names = "warn"))
diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd
index 439e88ef2..5d198e5f8 100644
--- a/man/data.table-options.Rd
+++ b/man/data.table-options.Rd
@@ -105,6 +105,12 @@
       \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. If set to a function
         (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then
         be "enlisted" into columns in the result.}
+      \item{\code{datatable.unique.names}}{A character string, default \code{NULL} (same as \code{"off"}).
+        Controls the behavior when operations (\bold{currently only \code{setnames}})
+        would result in duplicate column names: \code{"off"} allows them silently,
+        \code{"warn"} issues a warning, \code{"error"} raises an error, and \code{"rename"}
+        makes the names being set unique by appending a numeric suffix (see \code{\link[base]{make.unique}}).
+        Any other value triggers a warning and is then treated as \code{"off"}.}
     }
 }
 