From 89cc83cdb01abe9643ca0b9a11aff919b086d8c5 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 09:28:18 +0000 Subject: [PATCH 1/8] applied the changes as requested --- NEWS.md | 2 ++ R/data.table.R | 6 ++++++ R/onLoad.R | 3 ++- R/utils.R | 24 ++++++++++++++++++++++++ inst/tests/tests.Rraw | 19 +++++++++++++++++++ man/data.table-options.Rd | 3 +++ 6 files changed, 56 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d87656c9f7..069b3b76d2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,6 +32,8 @@ 6. `yearqtr()` and `yearmon()` now gain an optional format specifier [#7694](https://github.com/Rdatatable/data.table/issues/7694). 'numeric' is the default, which preserves the original behavior, but 'character' formats `yearqtr()` as YYYYQ# (e.g. 2025Q2) and `yearmon()` as YYYYM## (e.g. 2025M02, 2025M10). Thanks to @jan-swissre for the report and @LunaticSage218 for the implementation. +7. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/data.table.R b/R/data.table.R index a3ea551faa..1e5cbcdcc4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2950,6 +2950,12 @@ setnames = function(x,old,new,skip_absent=FALSE) { if (!length(new)) return(invisible(x)) # no changes if (length(i) != length(new)) internal_error("length(i)!=length(new)") # nocov } + + full_names = names(x) + full_names[i] = new + full_names = process_name_policy(full_names) + new = full_names[i] + # update the key if the column name being change is in the key m = chmatch(names(x)[i], key(x)) w = which(!is.na(m)) diff --git a/R/onLoad.R b/R/onLoad.R index b72fee4d1b..581d30692a 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -98,7 +98,8 @@ datatable.auto.index=TRUE, # DT[col=="val"] to auto add index so 2nd time faster datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 - datatable.old.matrix.autoname=FALSE # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change + datatable.unique.names = NULL ) opts = opts[!names(opts) %chin% names(options())] options(opts) diff --git a/R/utils.R b/R/utils.R index 27df811832..a182a4b533 100644 --- a/R/utils.R +++ b/R/utils.R @@ -46,6 +46,30 @@ check_duplicate_key = function(x) { invisible() } +process_name_policy = function(names_vec) { + policy = getOption("datatable.unique.names") + if (is.null(policy) || policy == "off") return(names_vec) + + allowed = c("warn", "error", "rename") + if (!policy %in% allowed) { + warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", as.character(policy)) + return(names_vec) + } + + if (anyDuplicated(names_vec)) { + dups = unique(names_vec[duplicated(names_vec)]) + # Use paste0 to avoid sprintf issues with column names containing '%' + msg = paste0("Duplicate column names created: ", brackify(dups), ". 
This may cause ambiguity.") + + switch(policy, + warn = warningf(msg), + error = stopf(msg), + rename = return(make.unique(names_vec)) + ) + } + names_vec +} + duplicated_values = function(x) { # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case if (!anyDuplicated(x)) return(vector(typeof(x))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 59c2218e6d..7ba98cac87 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21641,3 +21641,22 @@ test(2374.08, key(DT[, .(a, a)]), NULL) test(2374.09, key(subset(DT, select=c(a, a))), NULL) DT = data.table(a=1:2, a.1=3:4, val=10:11) test(2374.10, key(DT[, .(a.1, sum(val)), keyby=.(a, a)]), NULL) + +#4044 +DT = as.data.table(iris) +test(2375.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), + options = list(datatable.unique.names = "off")) + +test(2375.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), + warning = "Duplicate column names created", + options = list(datatable.unique.names = "warn")) + +test(2375.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), + error = "Duplicate column names created", + options = list(datatable.unique.names = "error")) + +test(2375.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), + options = list(datatable.unique.names = "rename")) diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd index 439e88ef2f..5d198e5f87 100644 --- a/man/data.table-options.Rd +++ b/man/data.table-options.Rd @@ -105,6 +105,9 @@ \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. 
If set to a function (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then be "enlisted" into columns in the result.} + \item{\code{datatable.unique.names}}{A character string, default \code{NULL} (same as \code{"off"}). + Controls the behavior when operations (\bold{currently only \code{setnames}}) + would result in duplicate column names.} } } From f4c41eb5ee9fa04a53d64dc1f0df42e087979ddc Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 09:46:28 +0000 Subject: [PATCH 2/8] added test --- inst/tests/tests.Rraw | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7ba98cac87..473011c1e6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21647,16 +21647,17 @@ DT = as.data.table(iris) test(2375.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), options = list(datatable.unique.names = "off")) - -test(2375.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), +test(2368.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), warning = "Duplicate column names created", options = list(datatable.unique.names = "warn")) - -test(2375.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), +test(2368.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), error = "Duplicate column names created", options = list(datatable.unique.names = "error")) - -test(2375.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), +test(2368.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), options = list(datatable.unique.names = "rename")) +test(2368.5, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), + c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), 
+ warning = "Invalid value for 'datatable.unique.names'", + options = list(datatable.unique.names = "invalid_option_name")) From 4245c16d3111ebdc5d6e43ece0a4506f8a2b9719 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Fri, 20 Mar 2026 10:20:49 +0000 Subject: [PATCH 3/8] lintr --- R/onLoad.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/onLoad.R b/R/onLoad.R index 581d30692a..10136da1b3 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -99,7 +99,7 @@ datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change - datatable.unique.names = NULL + datatable.unique.names = NULL ) opts = opts[!names(opts) %chin% names(options())] options(opts) From 3cc071ad813e8a4782443d8fa318895b9fa55bcf Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sun, 31 May 2026 19:53:36 +0000 Subject: [PATCH 4/8] incorporated suggestions --- NEWS.md | 2 +- R/utils.R | 9 ++++----- inst/tests/tests.Rraw | 27 ++++++++------------------- man/data.table-options.Rd | 12 +++++++++--- 4 files changed, 22 insertions(+), 28 deletions(-) diff --git a/NEWS.md b/NEWS.md index 069b3b76d2..b3567d9b5a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,7 +32,7 @@ 6. `yearqtr()` and `yearmon()` now gain an optional format specifier [#7694](https://github.com/Rdatatable/data.table/issues/7694). 'numeric' is the default, which preserves the original behavior, but 'character' formats `yearqtr()` as YYYYQ# (e.g. 2025Q2) and `yearmon()` as YYYYM## (e.g. 2025M02, 2025M10). Thanks to @jan-swissre for the report and @LunaticSage218 for the implementation. -7. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`. 
This addresses long-standing ambiguity issues when duplicate names were created silently, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. +7. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. ### BUG FIXES diff --git a/R/utils.R b/R/utils.R index a182a4b533..b58c188f8d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -47,8 +47,8 @@ check_duplicate_key = function(x) { } process_name_policy = function(names_vec) { - policy = getOption("datatable.unique.names") - if (is.null(policy) || policy == "off") return(names_vec) + policy = getOption("datatable.unique.names", "off") + if (policy == "off") return(names_vec) allowed = c("warn", "error", "rename") if (!policy %in% allowed) { @@ -58,12 +58,11 @@ process_name_policy = function(names_vec) { if (anyDuplicated(names_vec)) { dups = unique(names_vec[duplicated(names_vec)]) - # Use paste0 to avoid sprintf issues with column names containing '%' msg = paste0("Duplicate column names created: ", brackify(dups), ". 
This may cause ambiguity.") switch(policy, - warn = warningf(msg), - error = stopf(msg), + warn = warningf("%s", msg), + error = stopf("%s", msg), rename = return(make.unique(names_vec)) ) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 473011c1e6..0d0c4c2a8c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21642,22 +21642,11 @@ test(2374.09, key(subset(DT, select=c(a, a))), NULL) DT = data.table(a=1:2, a.1=3:4, val=10:11) test(2374.10, key(DT[, .(a.1, sum(val)), keyby=.(a, a)]), NULL) -#4044 -DT = as.data.table(iris) -test(2375.1, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), - c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), - options = list(datatable.unique.names = "off")) -test(2368.2, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), - c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), - warning = "Duplicate column names created", - options = list(datatable.unique.names = "warn")) -test(2368.3, setnames(copy(DT), "Petal.Length", "Sepal.Length"), - error = "Duplicate column names created", - options = list(datatable.unique.names = "error")) -test(2368.4, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), - c("Sepal.Length", "Sepal.Width", "Sepal.Length.1", "Petal.Width", "Species"), - options = list(datatable.unique.names = "rename")) -test(2368.5, names(setnames(copy(DT), "Petal.Length", "Sepal.Length")), - c("Sepal.Length", "Sepal.Width", "Sepal.Length", "Petal.Width", "Species"), - warning = "Invalid value for 'datatable.unique.names'", - options = list(datatable.unique.names = "invalid_option_name")) +# PR #7674: Add datatable.unique.names option to control duplicate names in setnames() +DT = data.table(a=1, b=2) +test(2375.1, names(setnames(copy(DT), "b", "a")), c("a", "a"), options = list(datatable.unique.names = "off")) +test(2375.2, names(setnames(copy(DT), "b", "a")), c("a", "a"), warning = "Duplicate column names created", options = 
list(datatable.unique.names = "warn")) +test(2375.3, setnames(copy(DT), "b", "a"), error = "Duplicate column names created", options = list(datatable.unique.names = "error")) +test(2375.4, names(setnames(copy(DT), "b", "a")), c("a", "a.1"), options = list(datatable.unique.names = "rename")) +DT_pct = data.table("a%d"=1, b=2) +test(2375.5, names(setnames(copy(DT_pct), "b", "a%d")), c("a%d", "a%d"), warning = "Duplicate column names created", options = list(datatable.unique.names = "warn")) diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd index 5d198e5f87..a7d0c70843 100644 --- a/man/data.table-options.Rd +++ b/man/data.table-options.Rd @@ -105,9 +105,15 @@ \item{\code{datatable.enlist}}{Experimental feature. Default is \code{NULL}. If set to a function (e.g., \code{list}), the \code{j} expression can return a \code{list}, which will then be "enlisted" into columns in the result.} - \item{\code{datatable.unique.names}}{A character string, default \code{NULL} (same as \code{"off"}). - Controls the behavior when operations (\bold{currently only \code{setnames}}) - would result in duplicate column names.} + \item{\code{datatable.unique.names}}{A character string, default \code{"off"}. + Controls the behavior when \code{setnames} would result in duplicate column names. + Possible values: + \itemize{ + \item \code{"off"}: (default) Duplicates are allowed silently. + \item \code{"warn"}: A warning is issued if duplicates are created. + \item \code{"error"}: An error is raised, preventing the change. + \item \code{"rename"}: Duplicate names are made unique via \code{make.unique}. 
+ }} } } From 34ba933113b5f7a8fca7ef3505659eb99a3e3410 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sun, 31 May 2026 20:06:50 +0000 Subject: [PATCH 5/8] added test --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0d0c4c2a8c..d667c8fc1e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21650,3 +21650,4 @@ test(2375.3, setnames(copy(DT), "b", "a"), error = "Duplicate column names creat test(2375.4, names(setnames(copy(DT), "b", "a")), c("a", "a.1"), options = list(datatable.unique.names = "rename")) DT_pct = data.table("a%d"=1, b=2) test(2375.5, names(setnames(copy(DT_pct), "b", "a%d")), c("a%d", "a%d"), warning = "Duplicate column names created", options = list(datatable.unique.names = "warn")) +test(2375.6, names(setnames(copy(DT), "b", "a")), c("a", "a"), warning = "Invalid value for 'datatable.unique.names'", options = list(datatable.unique.names = "wrong_choice")) From 628e7e20345a28d24dccc0b8a92825da83cbc1bd Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sun, 31 May 2026 20:07:48 +0000 Subject: [PATCH 6/8] removed news --- NEWS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index b3567d9b5a..d87656c9f7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,8 +32,6 @@ 6. `yearqtr()` and `yearmon()` now gain an optional format specifier [#7694](https://github.com/Rdatatable/data.table/issues/7694). 'numeric' is the default, which preserves the original behavior, but 'character' formats `yearqtr()` as YYYYQ# (e.g. 2025Q2) and `yearmon()` as YYYYM## (e.g. 2025M02, 2025M10). Thanks to @jan-swissre for the report and @LunaticSage218 for the implementation. -7. `setnames()` now supports a global option `datatable.unique.names` to control the creation of duplicate column names. Users can choose between `"off"` (default), `"warn"`, `"error"`, or `"rename"`, [#4044](https://github.com/Rdatatable/data.table/issues/4044). Thanks to @venom1204 for the PR. 
- ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. From 7578680f5451eea4f910468ad8bed58eae74a49e Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sun, 31 May 2026 20:54:09 +0000 Subject: [PATCH 7/8] changed placement of the function --- R/data.table.R | 23 +++++++++++++++++++++++ R/onLoad.R | 2 +- R/utils.R | 23 ----------------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1e5cbcdcc4..1a5a1d2ac5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2893,6 +2893,29 @@ setattr = function(x,name,value) { invisible(x) } +process_name_policy = function(names_vec) { + policy = getOption("datatable.unique.names", "off") + if (policy == "off") return(names_vec) + + allowed = c("warn", "error", "rename") + if (!policy %in% allowed) { + warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", as.character(policy)) + return(names_vec) + } + + if (anyDuplicated(names_vec)) { + dups = unique(names_vec[duplicated(names_vec)]) + msg = paste0("Duplicate column names created: ", brackify(dups), ". This may cause ambiguity.") + + switch(policy, + warn = warningf("%s", msg), + error = stopf("%s", msg), + rename = return(make.unique(names_vec)) + ) + } + names_vec +} + setnames = function(x,old,new,skip_absent=FALSE) { # Sets by reference, maintains truelength, no copy of table at all. 
# But also more convenient than names(DT)[i]="newname" because we can also do setnames(DT,"oldname","newname") diff --git a/R/onLoad.R b/R/onLoad.R index 10136da1b3..ba404305b4 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -99,7 +99,7 @@ datatable.use.index=TRUE, # global switch to address #1422 datatable.prettyprint.char=NULL, # FR #1091 datatable.old.matrix.autoname=FALSE, # #7145: how data.table(x=1, matrix(1)) is auto-named set to change - datatable.unique.names = NULL + datatable.unique.names = "off" ) opts = opts[!names(opts) %chin% names(options())] options(opts) diff --git a/R/utils.R b/R/utils.R index b58c188f8d..27df811832 100644 --- a/R/utils.R +++ b/R/utils.R @@ -46,29 +46,6 @@ check_duplicate_key = function(x) { invisible() } -process_name_policy = function(names_vec) { - policy = getOption("datatable.unique.names", "off") - if (policy == "off") return(names_vec) - - allowed = c("warn", "error", "rename") - if (!policy %in% allowed) { - warningf("Invalid value for 'datatable.unique.names': [%s]. Falling back to 'off'. Allowed values are: 'off', 'warn', 'error', 'rename'.", as.character(policy)) - return(names_vec) - } - - if (anyDuplicated(names_vec)) { - dups = unique(names_vec[duplicated(names_vec)]) - msg = paste0("Duplicate column names created: ", brackify(dups), ". This may cause ambiguity.") - - switch(policy, - warn = warningf("%s", msg), - error = stopf("%s", msg), - rename = return(make.unique(names_vec)) - ) - } - names_vec -} - duplicated_values = function(x) { # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case if (!anyDuplicated(x)) return(vector(typeof(x))) From 22b14b7977cc6e9c2b659fa7ffdede9418215428 Mon Sep 17 00:00:00 2001 From: venom1204 Date: Sun, 31 May 2026 21:06:29 +0000 Subject: [PATCH 8/8] .. 
--- R/data.table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 1a5a1d2ac5..648271f8e4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2906,7 +2906,6 @@ process_name_policy = function(names_vec) { if (anyDuplicated(names_vec)) { dups = unique(names_vec[duplicated(names_vec)]) msg = paste0("Duplicate column names created: ", brackify(dups), ". This may cause ambiguity.") - switch(policy, warn = warningf("%s", msg), error = stopf("%s", msg),