Vitek-Lab · tonywu1999 · May 14, 2026 · May 15, 2026 · May 15, 2026 · Jun 2, 2026
diff --git a/R/dataProcess.R b/R/dataProcess.R
@@ -414,19 +414,19 @@ MSstatsSummarizeSingleLinear = function(single_protein,
         }]
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored), newABUNDANCE := predicted]
         }
-
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {
         survival = single_protein[, intersect(c(cols, "LABEL"), colnames(single_protein)), with = FALSE]
         survival[, predicted := NA]
     }
-    
+
     if (all(!is.na(single_protein$ANOMALYSCORES))) {
         single_protein[, weights :=
             anomaly_weights_z_vec(ANOMALYSCORES),
@@ -569,11 +569,13 @@ MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
         }
 
         if (is_labeled_reference) {
-            single_protein[, predicted := ifelse(censored & is_labeled_ref == FALSE, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored & is_labeled_ref == FALSE, predicted, newABUNDANCE)]
+            single_protein[!(censored & is_labeled_ref == FALSE), predicted := NA]
+            single_protein[(censored) & is_labeled_ref == FALSE,
+                           newABUNDANCE := predicted]
         } else {
-            single_protein[, predicted := ifelse(censored, predicted, NA)]
-            single_protein[, newABUNDANCE := ifelse(censored, predicted, newABUNDANCE)]
+            single_protein[!(censored), predicted := NA]
+            single_protein[(censored),
+                           newABUNDANCE := predicted]
         }
         survival = single_protein[, intersect(c(cols, "LABEL", "predicted"), colnames(single_protein)), with = FALSE]
     } else {

diff --git a/R/utils_checks.R b/R/utils_checks.R
@@ -211,9 +211,10 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
     cols = toupper(cols)
     cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
                      colnames(input))
-    input = input[, cols, with = FALSE]
-
-    input$PEPTIDE = paste(input$PEPTIDESEQUENCE, 
+    drop_cols = setdiff(colnames(input), cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+
+    input$PEPTIDE = paste(input$PEPTIDESEQUENCE,
                           input$PRECURSORCHARGE, sep = "_")
     input$TRANSITION = paste(input$FRAGMENTION, 
                              input$PRODUCTCHARGE, sep = "_")
@@ -322,8 +323,8 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
     input[, PROTEIN := factor(PROTEIN)]
     input[, PEPTIDE := factor(PEPTIDE)]
     input[, TRANSITION := factor(TRANSITION)]
-    input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
-                        RUN, PROTEIN, PEPTIDE, TRANSITION), ]
+    data.table::setorder(input, LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                         RUN, PROTEIN, PEPTIDE, TRANSITION)
     input[, GROUP := factor(GROUP)]
     input[, SUBJECT := factor(SUBJECT)]
     input[, FEATURE := factor(FEATURE)]

diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
@@ -74,28 +74,22 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
 #' @return data.table
 #' @keywords internal
 .selectHighQualityFeatures = function(input, min_feature_count) {
-    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
-    is_obs = log2inty = LABEL = NULL
-
-    cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL", 
-             "ABUNDANCE", "censored")
-    cols = intersect(cols, colnames(input))
-    input = input[, cols, with = FALSE]
-    if (!("censored" %in% cols)) {
-        input$censored = FALSE
-    } 
-    data.table::setnames(input, "censored", "is_censored")
+    PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = censored = NULL
+    is_obs = log2inty = is_censored = LABEL = NULL
+
+    has_censored = is.element("censored", colnames(input))
     input = input[, list(protein = as.character(PROTEIN),
                          peptide = as.character(PEPTIDE),
                          feature = as.character(FEATURE),
                          run = as.character(originalRUN),
                          label = as.character(LABEL),
-                         log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
-                                           ABUNDANCE, NA),
-                         is_censored)]
-    input[, is_obs := !(is.na(log2inty) | is_censored)]
+                         log2inty = ABUNDANCE,
+                         is_censored = if (has_censored) censored else FALSE)]
+    # Censored or missing intensities are not observations.
+    input[is_censored | is.na(log2inty), log2inty := NA]
+    input[, is_obs := !is.na(log2inty)]
     input[, is_censored := NULL]
-    
+
     features_quality = data.table::rbindlist(lapply(split(input, input$label),
                                                     .flagUninformativeSingleLabel,
                                                     min_feature_count = min_feature_count))

diff --git a/R/utils_normalize.R b/R/utils_normalize.R
@@ -61,8 +61,8 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
           by = "FRACTION"]
     input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
-    input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
-                  with = FALSE]
+    data.table::set(input, j = "ABUNDANCE_RUN", value = NULL)
+    data.table::set(input, j = "ABUNDANCE_FRACTION", value = NULL)
     getOption("MSstatsLog")("Normalization based on median: OK")
     input
 }
@@ -255,7 +255,9 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
     input[, ABUNDANCE := ABUNDANCE - median_by_run + median_by_fraction]
 
     getOption("MSstatsLog")("INFO", "Normalization : normalization with global standards protein - okay")
-    input[ , !(colnames(input) %in% c("median_by_run", "median_by_fraction")), with = FALSE]
+    data.table::set(input, j = "median_by_run", value = NULL)
+    data.table::set(input, j = "median_by_fraction", value = NULL)
+    input
 }
 
 
@@ -282,7 +284,7 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
 #' 
 MSstatsMergeFractions = function(input) {
     ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL
-    originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL
+    originalRUN = FRACTION = TECHREPLICATE = tmp = newRun = NULL
     ncount = FEATURE = NULL
 
     input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0
@@ -338,29 +340,45 @@ MSstatsMergeFractions = function(input) {
                 getOption("MSstatsLog")("ERROR", msg)
                 stop(msg)
             } else {
-                match_runs[, merged := "merged"]
-                match_runs[, newRun := do.call(paste, c(.SD, sep = "_")), 
-                           .SDcols = c(1:3, ncol(match_runs))]
-                match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
-                                                      SUBJECT_ORIGINAL,
-                                                      newRun)])
-
-                input = merge(input, match_runs,
-                              by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
-                              all.x = TRUE)
+                # dcast pivoted fractions into columns, so the fraction columns
+                # are everything that is not a sample-identifier column.
+                fraction_cols = setdiff(colnames(match_runs),
+                                        c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"))
+                # Use the first fraction's run as the sample's representative run.
+                first_fraction_run = match_runs[[fraction_cols[1]]]
+                # Build the merged-run name: <group>_<subject>_<run>_merged.
+                match_runs[, newRun := paste(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                             first_fraction_run, "merged",
+                                             sep = "_")]
+                # Reduce to a (group, subject) -> merged-run lookup table.
+                match_runs = match_runs[, list(GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+                                               newRun)]
+                # For each input row, find its sample's row in the lookup.
+                nr_idx = match_runs[input,
+                                    on = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+                                    which = TRUE, mult = "first"]
+                # Write that merged-run name onto every input row.
+                data.table::set(input, j = "newRun",
+                                value = match_runs$newRun[nr_idx])
+                # Count, per feature/fraction, the rows observed above zero.
                 select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
                                         list(ncount = .N),
                                         by = c("FEATURE", "FRACTION")]
+                # Keep only feature/fraction combinations seen at least once.
                 select_fraction = select_fraction[ncount != 0]
-                select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
-                input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
-                input = input[tmp %in% select_fraction$tmp, ]
-                input$originalRUN = input$newRun
-                input$RUN = input$originalRUN
-                input$RUN = factor(input$RUN, levels = unique(input$RUN), 
-                                   labels = seq_along(unique(input$RUN)))
-                input = input[, !(colnames(input) %in% c('tmp','newRun')), 
-                              with = FALSE]
+                # Mark which input rows belong to a kept combination (NA = none).
+                keep_idx = select_fraction[input,
+                                           on = c("FEATURE", "FRACTION"),
+                                           which = TRUE, mult = "first"]
+                # Drop rows whose feature/fraction was never observed.
+                input = input[!is.na(keep_idx)]
+                # The merged run replaces the original per-fraction run.
+                input[, originalRUN := newRun]
+                # Renumber RUN as a factor, one level per merged run.
+                input[, RUN := factor(newRun, levels = unique(newRun),
+                                      labels = seq_along(unique(newRun)))]
+                # Drop the temporary newRun helper column.
+                data.table::set(input, j = "newRun", value = NULL)
             }
         }
     }

diff --git a/R/utils_output.R b/R/utils_output.R
@@ -34,22 +34,31 @@
 #' output = output = MSstatsSummarizationOutput(input, summarized, processed,
 #' method, impute, cens)
 #' 
-MSstatsSummarizationOutput = function(input, summarized, processed, 
+MSstatsSummarizationOutput = function(input, summarized, processed,
                                       method, impute, censored_symbol) {
     LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL
-
-    input = .finalizeInput(input, summarized, method, impute, censored_symbol)
-    summarized = lapply(summarized, function(x) x[[1]])
-    summarized = data.table::rbindlist(summarized, fill = TRUE)
-    if (inherits(summarized, "try-error")) {
+
+    # Summarization failure is signalled by a NULL `summarized` (the caller
+    # wraps the per-subplot summarization in tryCatch(error = ...) returning
+    # NULL). Guard before unpacking, otherwise .finalizeInput() below joins on
+    # an empty predicted_survival and errors.
+    if (is.null(summarized)) {
         msg = paste("*** error : can't summarize per subplot with ",
                     method, ".")
         getOption("MSstatsLog")("ERROR", msg)
         getOption("MSstatsMsg")("ERROR", msg)
         rqall = NULL
-        rqmodelqc = NULL
-        workpred = NULL
     } else {
+        predicted_survival = data.table::rbindlist(lapply(summarized, function(x) x[[2]]),
+                                                    fill = TRUE)
+        for (i in seq_along(summarized)) summarized[[i]][[2]] = NULL
+        input = .finalizeInput(input, predicted_survival, method, impute, censored_symbol)
+        rm(predicted_survival)
+        protein_summaries = lapply(summarized, function(x) x[[1]])
+        rm(summarized)
+        summarized = data.table::rbindlist(protein_summaries, fill = TRUE)
+        rm(protein_summaries)
+
         input[, TotalGroupMeasurements := uniqueN(.SD),
               by = c("PROTEIN", "GROUP", "LABEL"),
               .SDcols = c("FEATURE", "originalRUN")]
@@ -67,10 +76,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                       by.y = c(merge_col, "PROTEIN", "LABEL"))
         data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
                              c("GROUP", "SUBJECT"), skip_absent = TRUE)
-        
+
         rqall$GROUP = factor(as.character(rqall$GROUP))
         rqall$Protein = factor(rqall$Protein)
-        rqmodelqc = summarized$ModelQC
     }
 
     if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) {
@@ -82,18 +90,21 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
                               "originalRUN", "censored", "INTENSITY", "ABUNDANCE",
                               "newABUNDANCE", "predicted", "feature_quality",
                               "is_outlier", "remove", "is_labeled_ref"), colnames(input))
-    input = input[, output_cols, with = FALSE]
-
+    drop_cols = setdiff(colnames(input), output_cols)
+    for (col in drop_cols) data.table::set(input, j = col, value = NULL)
+
     if (is.element("remove", colnames(processed))) {
-        processed = processed[(remove), 
-                              intersect(output_cols, 
+        processed = processed[(remove),
+                              intersect(output_cols,
                                         colnames(processed)), with = FALSE]
         input = rbind(input, processed, fill = TRUE)
     }
-    list(FeatureLevelData = as.data.frame(input), 
-         ProteinLevelData = as.data.frame(rqall), 
+    data.table::setDF(input)
+    if (!is.null(rqall)) data.table::setDF(rqall)
+    list(FeatureLevelData = input,
+         ProteinLevelData = rqall,
          SummaryMethod = method)
-    
+
 }
 
 
@@ -104,9 +115,9 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' @param impute if TRUE, censored missing values were imputed
 #' @param censored_symbol censored missing value indicator
 #' @keywords internal
-.finalizeInput = function(input, summarized, method, impute, censored_symbol) {
+.finalizeInput = function(input, predicted_survival, method, impute, censored_symbol) {
     # if (method == "TMP") {
-    input = .finalizeTMP(input, censored_symbol, impute, summarized)
+    input = .finalizeTMP(input, censored_symbol, impute, predicted_survival)
     # } else {
     #     input = .finalizeLinear(input, censored_symbol)
     # }
@@ -117,21 +128,23 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
 #' Summary statistics for output of TMP-based summarization
 #' @inheritParams .finalizeInput
 #' @keywords internal
-.finalizeTMP = function(input, censored_symbol, impute, summarized) {
+.finalizeTMP = function(input, censored_symbol, impute, predicted_survival) {
     NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL
     total_features = more50missing = nonmissing_orig = censored = NULL
     INTENSITY = newABUNDANCE = NumImputedFeature = NULL
-
-    survival_predictions = lapply(summarized, function(x) x[[2]])
-    predicted_survival = data.table::rbindlist(survival_predictions, fill = TRUE)
+
     if (impute) {
-        cols = intersect(colnames(input), c("newABUNDANCE",
-                                            "cen", "RUN",
-                                            "FEATURE", "ref_covariate", "LABEL"))
-        input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
-                      predicted_survival,
-                      by = setdiff(cols, "newABUNDANCE"),
-                      all.x = TRUE)
+        join_cols = intersect(intersect(colnames(input),
+                                        colnames(predicted_survival)),
+                              c("cen", "RUN", "FEATURE", "ref_covariate",
+                                "LABEL"))
+        data.table::set(input, j = "newABUNDANCE", value = NULL)
+        idx = predicted_survival[input, on = join_cols, which = TRUE,
+                                 mult = "first"]
+        data.table::set(input, j = "newABUNDANCE",
+                        value = predicted_survival$newABUNDANCE[idx])
+        data.table::set(input, j = "predicted",
+                        value = predicted_survival$predicted[idx])
     }
     input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
     input[, NumMeasuredFeature := sum(NonMissingStats),
@@ -144,7 +157,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         if (impute) {
             input[, NumImputedFeature := sum(!nonmissing_orig),
                   by = c("PROTEIN", "RUN", "LABEL")]
@@ -175,7 +188,7 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
         } else {
             input[, nonmissing_orig := !is.na(INTENSITY)]
         }
-        input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+        input[is.na(newABUNDANCE), nonmissing_orig := TRUE]
         input[, NumImputedFeature := 0]
     }
     input