Vitek-Lab · tonywu1999 · May 14, 2026 · May 15, 2026 · May 15, 2026 · Jun 2, 2026
diff --git a/R/dataProcess.R b/R/dataProcess.R
@@ -401,7 +401,7 @@ MSstatsSummarizeSingleLinear = function(single_protein,
 
     if (impute & any(single_protein[["censored"]])) {
         fit_data = if (is_labeled_reference) {
-            single_protein[is_labeled_ref == FALSE, cols, with = FALSE]
+            single_protein[(!is_labeled_ref), cols, with = FALSE]
         } else {
             single_protein[, cols, with = FALSE]
         }
@@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein,
         }]
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & !is_labeled_ref), predicted := NA]
+            single_protein[(censored) & !is_labeled_ref,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored), newABUNDANCE := predicted]
         }
-
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {
         survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
         survival[, predicted := NA]
     }
-    
+
     if (all(!is.na(single_protein$ANOMALYSCORES))) {
         single_protein[, weights :=
             anomaly_weights_z_vec(ANOMALYSCORES),
@@ -547,7 +547,7 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         converged = TRUE
 
         fit_data = if (is_labeled_reference) {
-            single_protein[is_labeled_ref == FALSE, cols, with = FALSE]
+            single_protein[(!is_labeled_ref), cols, with = FALSE]
         } else {
             single_protein[, cols, with = FALSE]
         }
@@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         }
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & !is_labeled_ref), predicted := NA]
+            single_protein[(censored) & !is_labeled_ref,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored),
+                           newABUNDANCE := predicted]
         }
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {

diff --git a/R/utils_checks.R b/R/utils_checks.R
@@ -170,7 +170,7 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     input = data.table::as.data.table(unclass(input))
 
     if (!"AnomalyScores" %in% colnames(input)){
-      input$AnomalyScores = NA
+      data.table::set(input, j = "AnomalyScores", value = NA)
     }
 
     cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence",
@@ -200,7 +200,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
 
     if (!is.numeric(input$INTENSITY)) {	
         suppressWarnings({
-            input$INTENSITY = as.numeric(as.character(input$INTENSITY))
+            data.table::set(input, j = "INTENSITY",
+                            value = as.numeric(as.character(input$INTENSITY)))
         })
     }
 
@@ -211,12 +212,15 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     cols = toupper(cols)
     cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
                      colnames(input))
-    input = input[, cols, with = FALSE]
-
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, 
-                          input$PRECURSORCHARGE, sep = "_")
-    input$TRANSITION = paste(input$FRAGMENTION, 
-                             input$PRODUCTCHARGE, sep = "_")
+    drop_cols = setdiff(colnames(input), cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+
+    data.table::set(input, j = "PEPTIDE",
+                    value = paste(input$PEPTIDESEQUENCE,
+                                  input$PRECURSORCHARGE, sep = "_"))
+    data.table::set(input, j = "TRANSITION",
+                    value = paste(input$FRAGMENTION,
+                                  input$PRODUCTCHARGE, sep = "_"))
 
     if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) {
         getOption("MSstatsLog")(
@@ -226,7 +230,8 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
         stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.")
     }
 
-    input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE)
+    data.table::set(input, j = "ISOTOPELABELTYPE",
+                    value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE))
     input
 }
 
@@ -242,9 +247,14 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
         data.table::setnames(
             input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE")
     }
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
-    input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
-    input$ISOTOPELABELTYPE <- .mapIsotopeLabelType(input$ISOTOPELABELTYPE)
+    data.table::set(input, j = "PEPTIDE",
+                    value = paste(input$PEPTIDESEQUENCE,
+                                  input$PRECURSORCHARGE, sep = "_"))
+    data.table::set(input, j = "TRANSITION",
+                    value = paste(input$FRAGMENTION,
+                                  input$PRODUCTCHARGE, sep = "_"))
+    data.table::set(input, j = "ISOTOPELABELTYPE",
+                    value = .mapIsotopeLabelType(input$ISOTOPELABELTYPE))
     input
 }
 
@@ -322,8 +332,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
     input[, PROTEIN := factor(PROTEIN)]
     input[, PEPTIDE := factor(PEPTIDE)]
     input[, TRANSITION := factor(TRANSITION)]
-    input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
-                        RUN, PROTEIN, PEPTIDE, TRANSITION), ]
+    data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                         RUN, PROTEIN, PEPTIDE, TRANSITION)
     input[, GROUP := factor(GROUP)]
     input[, SUBJECT := factor(SUBJECT)]
     input[, FEATURE := factor(FEATURE)]

diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
@@ -74,28 +74,22 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
 #' @return data.table
 #' @keywords internal
 .selectHighQualityFeatures = function(input, min_feature_count) {
-    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
-    is_obs = log2inty = LABEL = NULL
-
-    cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", 
-             "ABUNDANCE", "censored")
-    cols = intersect(cols, colnames(input))
-    input = input[, cols, with = FALSE]
-    if (!("censored" %in% cols)) {
-        input$censored = FALSE
-    } 
-    data.table::setnames(input, "censored", "is_censored")
+    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
+    is_obs = log2inty = is_censored = LABEL = NULL
+
+    has_censored = is.element("censored", colnames(input))
     input = input[, list(protein = as.character(PROTEIN),
                          peptide = as.character(PEPTIDE),
                          feature = as.character(FEATURE),
                          run = as.character(originalRUN),
                          label = as.character(LABEL),
-                         log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
-                                           ABUNDANCE, NA),
-                         is_censored)]
-    input[, is_obs := !(is.na(log2inty) | is_censored)]
+                         log2inty = ABUNDANCE,
+                         is_censored = if (has_censored) censored else FALSE)]
+    # Censored or missing intensities are not observations.
+    input[is_censored | is.na(log2inty), log2inty := NA]
+    input[, is_obs := !is.na(log2inty)]
     input[, is_censored := NULL]
-    
+
     features_quality = data.table::rbindlist(lapply(split(input, input$label),
                                                     .flagUninformativeSingleLabel,
                                                     min_feature_count = min_feature_count))

diff --git a/R/utils_normalize.R b/R/utils_normalize.R
@@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
           by = "FRACTION"]
     input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
-    input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
-                  with = FALSE]
+    data.table::set(input, j = "ABUNDANCE_RUN", value = NULL)
+    data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL)
     getOption("MSstatsLog")("Normalization based on median: OK")
     input
 }
@@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction]
 
     getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay")
-    input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE]
+    data.table::set(input, j = "median_by_run", value = NULL)
+    data.table::set(input, j = "median_by_fraction", value = NULL)
+    input
 }
 
 
@@ -282,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
 #' 
 MSstatsMergeFractions = function(input) {
     ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL
-    originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL
+    originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL
     ncount = FEATURE = NULL
 
     input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0
@@ -338,29 +340,45 @@ MSstatsMergeFractions = function(input) {
                 getOption("MSstatsLog")("ERROR", msg)
                 stop(msg)
             } else {
-                match_runs[, merged := "merged"]
-                match_runs[, newRun := do.call(paste, c(.SD, sep = "_")), 
-                           .SDcols = c(1:3, ncol(match_runs))]
-                match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
-                                                      SUBJECT_ORIGINAL,
-                                                      newRun)])
-
-                input = merge(input, match_runs,
-                              by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
-                              all.x = TRUE)
+                # dcast pivoted fractions into columns, so the fraction columns
+                # are everything that is not a sample-identifier column.
+                fraction_cols = setdiff(colnames(match_runs),
+                                        c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"))
+                # Use the first fraction's run as the sample's representative run.
+                first_fraction_run = match_runs[[fraction_cols[1]]]
+                # Build the merged-run name: <group>_<subject>_<run>_merged.
+                match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                             first_fraction_run, "merged",
+                                             sep = "_")]
+                # Reduce to a (group, subject) -> merged-run lookup table.
+                match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                               newRun)]
+                # For each input row, find its sample's row in the lookup.
+                nr_idx = match_runs[input,
+                                    on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+                                    which = TRUE, mult = "first"]
+                # Write that merged-run name onto every input row.
+                data.table::set(input, j = "newRun",
+                                value = match_runs$newRun[nr_idx])
+                # Count, per feature/fraction, the rows observed above zero.
                 select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
+                # Keep only feature/fraction combinations seen at least once.
                 select_fraction = select_fraction[ncount != 0]
-                select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
-                input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
-                input = input[tmp %in% select_fraction$tmp, ]
-                input$originalRUN = input$newRun
-                input$RUN = input$originalRUN
-                input$RUN = factor(input$RUN, levels = unique(input$RUN), 
-                                   labels = seq_along(unique(input$RUN)))
-                input = input[, !(colnames(input) %in% c('tmp','newRun')), 
-                              with = FALSE]
+                # Mark which input rows belong to a kept combination (NA = none).
+                keep_idx = select_fraction[input,
+                                           on = c("FEATURE", "FRACTION"),
+                                           which = TRUE, mult = "first"]
+                # Drop rows whose feature/fraction was never observed.
+                input = input[!is.na(keep_idx)]
+                # The merged run replaces the original per-fraction run.
+                input[, originalRUN := newRun]
+                # Renumber RUN as a factor, one level per merged run.
+                input[, RUN := factor(newRun, levels = unique(newRun),
+                                      labels = seq_along(unique(newRun)))]
+                # Drop the temporary newRun helper column.
+                data.table::set(input, j = "newRun", value = NULL)
             }
         }
     }