Vitek-Lab · tonywu1999 · May 14, 2026 · May 15, 2026 · May 15, 2026 · Jun 2, 2026
diff --git a/R/dataProcess.R b/R/dataProcess.R
@@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein,
         }]
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored), newABUNDANCE := predicted]
         }
-
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {
         survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
         survival[, predicted := NA]
     }
-    
+
     if (all(!is.na(single_protein$ANOMALYSCORES))) {
         single_protein[, weights :=
             anomaly_weights_z_vec(ANOMALYSCORES),
@@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         }
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored),
+                           newABUNDANCE := predicted]
         }
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {

diff --git a/R/utils_checks.R b/R/utils_checks.R
@@ -211,9 +211,10 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     cols = toupper(cols)
     cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
                      colnames(input))
-    input = input[, cols, with = FALSE]
-
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, 
+    drop_cols = setdiff(colnames(input), cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+    data.table::setcolorder(input, cols) # same column order as input[, cols]
+    input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
                           input$PRECURSORCHARGE, sep = "_")
     input$TRANSITION = paste(input$FRAGMENTION, 
                              input$PRODUCTCHARGE, sep = "_")
@@ -322,8 +323,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
     input[, PROTEIN := factor(PROTEIN)]
     input[, PEPTIDE := factor(PEPTIDE)]
     input[, TRANSITION := factor(TRANSITION)]
-    input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
-                        RUN, PROTEIN, PEPTIDE, TRANSITION), ]
+    data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN, PROTEIN,
+                         PEPTIDE, TRANSITION, na.last = TRUE) # NAs last, like order()
     input[, GROUP := factor(GROUP)]
     input[, SUBJECT := factor(SUBJECT)]
     input[, FEATURE := factor(FEATURE)]

diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
@@ -74,29 +74,22 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
 #' @return data.table
 #' @keywords internal
 .selectHighQualityFeatures = function(input, min_feature_count) {
-    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
+    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
     is_obs = log2inty = LABEL = NULL
-
-    cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", 
-             "ABUNDANCE", "censored")
-    cols = intersect(cols, colnames(input))
-    input = input[, cols, with = FALSE]
-    if (!("censored" %in% cols)) {
-        input$censored = FALSE
-    } 
-    data.table::setnames(input, "censored", "is_censored")
-    input = input[, list(protein = as.character(PROTEIN),
-                         peptide = as.character(PEPTIDE),
-                         feature = as.character(FEATURE),
-                         run = as.character(originalRUN),
-                         label = as.character(LABEL),
-                         log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
-                                           ABUNDANCE, NA),
-                         is_censored)]
-    input[, is_obs := !(is.na(log2inty) | is_censored)]
-    input[, is_censored := NULL]
-
-    features_quality = data.table::rbindlist(lapply(split(input, input$label),
+
+    has_censored = is.element("censored", colnames(input))
+    work = input[, list(protein = as.character(PROTEIN),
+                        peptide = as.character(PEPTIDE),
+                        feature = as.character(FEATURE),
+                        run = as.character(originalRUN),
+                        label = as.character(LABEL),
+                        log2inty = ifelse(!(is.na(ABUNDANCE) |
+                                           if (has_censored) censored else FALSE),
+                                          ABUNDANCE, NA),
+                        is_obs = FALSE)]
+    work[, is_obs := !is.na(log2inty)]
+
+    features_quality = data.table::rbindlist(lapply(split(work, work$label),
                                                     .flagUninformativeSingleLabel,
                                                     min_feature_count = min_feature_count))
     features_quality

diff --git a/R/utils_normalize.R b/R/utils_normalize.R
@@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
           by = "FRACTION"]
     input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
-    input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
-                  with = FALSE]
+    data.table::set(input, j = "ABUNDANCE_RUN", value = NULL)
+    data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL)
     getOption("MSstatsLog")("Normalization based on median: OK")
     input
 }
@@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction]
 
     getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay")
-    input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE]
+    data.table::set(input, j = "median_by_run", value = NULL)
+    data.table::set(input, j = "median_by_fraction", value = NULL)
+    input
 }
 
 
@@ -344,23 +346,25 @@ MSstatsMergeFractions = function(input) {
                 match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
                                                       SUBJECT_ORIGINAL,
                                                       newRun)])
-
-                input = merge(input, match_runs,
-                              by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
-                              all.x = TRUE)
+
+                nr_idx = match_runs[input,
+                                    on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+                                    which = TRUE, mult = "first"]
+                data.table::set(input, j = "newRun",
+                                value = match_runs$newRun[nr_idx])
                 select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
                 select_fraction = select_fraction[ncount != 0]
-                select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
-                input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
-                input = input[tmp %in% select_fraction$tmp, ]
+                keep_idx = select_fraction[input,
+                                           on = c("FEATURE", "FRACTION"),
+                                           which = TRUE, mult = "first"]
+                input = input[!is.na(keep_idx)]
                 input$originalRUN = input$newRun
                 input$RUN = input$originalRUN
-                input$RUN = factor(input$RUN, levels = unique(input$RUN), 
+                input$RUN = factor(input$RUN, levels = unique(input$RUN),
                                    labels = seq_along(unique(input$RUN)))
-                input = input[, !(colnames(input) %in% c('tmp','newRun')), 
-                              with = FALSE]
+                data.table::set(input, j = "newRun", value = NULL)
             }
         }
     }

diff --git a/R/utils_output.R b/R/utils_output.R
@@ -34,13 +34,20 @@
 #' output = output = MSstatsSummarizationOutput(input, summarized, processed,
 #' method, impute, cens)
 #' 
-MSstatsSummarizationOutput = function(input, summarized, processed, 
+MSstatsSummarizationOutput = function(input, summarized, processed,
                                       method, impute, censored_symbol) {
     LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL
-
-    input = .finalizeInput(input, summarized, method, impute, censored_symbol)
-    summarized = lapply(summarized, function(x) x[[1]])
-    summarized = data.table::rbindlist(summarized, fill = TRUE)
+
+    predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
+                                                fill = TRUE)
+    for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
+    input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
+    rm(predicted_survival)
+    protein_summaries = lapply(summarized, function(x) x[[1]])
+    rm(summarized)
+    summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
+    rm(protein_summaries)
+
     if (inherits(summarized, "try-error")) {
         msg = paste("*** error : can't summarize per subplot with ",
                     method, ".")
@@ -82,18 +89,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                               "originalRUN", "censored", "INTENSITY", "ABUNDANCE",
                               "newABUNDANCE", "predicted", "feature_quality",
                               "is_outlier", "remove", "is_labeled_ref"), colnames(input))
-    input = input[, output_cols, with = FALSE]
-
+    drop_cols = setdiff(colnames(input), output_cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+    data.table::setcolorder(input, output_cols) # keep the former column order
     if (is.element("remove", colnames(processed))) {
-        processed = processed[(remove), 
-                              intersect(output_cols, 
+        processed = processed[(remove),
+                              intersect(output_cols,
                                         colnames(processed)), with = FALSE]
         input = rbind(input, processed, fill = TRUE)
     }
-    list(FeatureLevelData = as.data.frame(input), 
-         ProteinLevelData = as.data.frame(rqall), 
+    data.table::setDF(input)
+    data.table::setDF(rqall)
+    list(FeatureLevelData = input,
+         ProteinLevelData = rqall,
          SummaryMethod = method)
-    
+
 }
 
 
@@ -104,9 +114,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' @param impute if TRUE, censored missing values were imputed
 #' @param censored_symbol censored missing value indicator
 #' @keywords internal
-.finalizeInput = function(input, summarized, method, impute, censored_symbol) {
+.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) {
     # if (method == "TMP") {
-    input = .finalizeTMP(input, censored_symbol, impute, summarized)
+    input = .finalizeTMP(input, censored_symbol, impute, predicted_survival)
     # } else {
     #     input = .finalizeLinear(input, censored_symbol)
     # }
@@ -117,21 +127,23 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' Summary statistics for output of TMP-based summarization
 #' @inheritParams .finalizeInput
 #' @keywords internal
-.finalizeTMP = function(input, censored_symbol, impute, summarized) {
+.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) {
     NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL
     total_features = more50missing = nonmissing_orig = censored = NULL
     INTENSITY = newABUNDANCE = NumImputedFeature = NULL
-
-    survival_predictions = lapply(summarized, function(x) x[[2]])
-    predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE)
+
     if (impute) {
-        cols = intersect(colnames(input), c("newABUNDANCE",
-                                            "cen", "RUN",
-                                            "FEATURE", "ref_covariate", "LABEL"))
-        input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
-                      predicted_survival,
-                      by = setdiff(cols, "newABUNDANCE"),
-                      all.x = TRUE)
+        join_cols = intersect(intersect(colnames(input),
+                                        colnames(predicted_survival)),
+                              c("cen", "RUN", "FEATURE", "ref_covariate", "LABEL"))
+        # merge() returned rows sorted by the join keys; keep that row order.
+        data.table::setorderv(input, join_cols)
+        data.table::set(input, j = "newABUNDANCE", value = NULL)
+        idx = predicted_survival[input, on = join_cols, which = TRUE, mult = "first"]
+        data.table::set(input, j = "newABUNDANCE",
+                        value = predicted_survival$newABUNDANCE[idx])
+        data.table::set(input, j = "predicted",
+                        value = predicted_survival$predicted[idx])
     }
     input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
     input[, NumMeasuredFeature := sum(NonMissingStats),
@@ -144,7 +156,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         if (impute) {
             input[, NumImputedFeature := sum(!nonmissing_orig),
                   by = c("PROTEIN", "RUN", "LABEL")]
@@ -175,7 +187,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         input[, NumImputedFeature := 0]
     }
     input