nf-core · pinin4fjords · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/modules/nf-core/tximeta/tximport/templates/tximport.r b/modules/nf-core/tximeta/tximport/templates/tximport.r
@@ -81,10 +81,11 @@ write_se_table <- function(params, prefix) {
 #' @param gene_id_col Column name for gene IDs.
 #' @param gene_name_col Column name for gene names.
 #'
-#' @return A list containing three elements:
+#' @return A list containing four elements:
 #' - `transcript`: A data frame with transcript IDs, gene IDs, and gene names, indexed by transcript IDs.
 #' - `gene`: A data frame with unique gene IDs and gene names.
 #' - `tx2gene`: A data frame mapping transcript IDs to gene IDs.
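+#' - `extra`: A character vector of transcript IDs found in quantification output but missing from the tx2gene file. These IDs are still appended to `transcript`, `gene`, and `tx2gene`, with the transcript ID used as a placeholder gene ID and gene name.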
 
 read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col){
     info <- file.info(tinfo_path)
@@ -104,13 +105,23 @@ read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col)
     )
 
     extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
+    if (length(extra) > 0) {
+        warning(
+            length(extra), " transcripts found in quantification output but missing from ",
+            "the tx2gene mapping (GTF). These will be included in transcript-level outputs ",
+            "but excluded from gene-level summaries. This usually means the transcript FASTA ",
+            "and GTF are from different sources or versions. First 5: ",
+            paste(head(extra, 5), collapse = ", ")
+        )
+    }
     transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra, check.names = FALSE))
     transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
     rownames(transcript_info) <- transcript_info[["tx"]]
 
     list(transcript = transcript_info,
         gene = unique(transcript_info[,2:3]),
-        tx2gene = transcript_info[,1:2])
+        tx2gene = transcript_info[,1:2],
+        extra = extra)
 }
 
 #' Create a SummarizedExperiment Object
@@ -206,6 +217,21 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info\$tx2gene))
     gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
     gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
 
+    # Remove placeholder gene rows created for unmapped transcripts (their gene_id
+    # was set to the transcript ID in read_transcript_info). These would break downstream
+    # processes like SummarizedExperiment that try to match gene IDs against the tx2gene mapping.
+    if (length(transcript_info\$extra) > 0) {
+        real_genes <- setdiff(rownames(gi[[1]]), transcript_info\$extra)
+        filter_rows <- function(txi_list, genes) {
+            lapply(txi_list, function(x) {
+                if (is.matrix(x)) x[genes, , drop = FALSE] else x
+            })
+        }
+        gi    <- filter_rows(gi, real_genes)
+        gi.ls <- filter_rows(gi.ls, real_genes)
+        gi.s  <- filter_rows(gi.s, real_genes)
+    }
+
     gene_info <- transcript_info\$gene[match(rownames(gi[[1]]), transcript_info\$gene[["gene_id"]]),]
     rownames(gene_info) <- NULL
     col_data_frame <- DataFrame(coldata)

diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test b/modules/nf-core/tximeta/tximport/tests/main.nf.test
@@ -214,6 +214,81 @@ nextflow_process {
 
     }
 
+    test("saccharomyces_cerevisiae - salmon - gtf - mismatched_transcripts") {
+
+        setup {
+            run("UNTAR") {
+                script "../../../untar/main.nf"
+                process {
+                    """
+                    input[0] = Channel.of([
+                        [ id:'test'], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/salmon_results.tar.gz', checkIfExists: true)
+                    ])
+                    """
+                }
+            }
+            run("CUSTOM_TX2GENE") {
+                script "../../../custom/tx2gene/main.nf"
+                process {
+                    """
+                    input[0] = Channel.of([
+                        [ id:'test'], // meta map
+                        file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true)
+                    ])
+                    input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] }
+                    input[2] = 'salmon'
+                    input[3] = 'gene_id'
+                    input[4] = 'gene_name'
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] }
+                // Remove half the entries from tx2gene to simulate FASTA/GTF mismatch
+                input[1] = CUSTOM_TX2GENE.out.tx2gene.map { meta, tx2gene ->
+                    def lines = tx2gene.readLines()
+                    def header = lines[0]
+                    def kept = lines[1..Math.max(1, (int)(lines.size() / 2))]
+                    def modified = file("\${workDir}/truncated_tx2gene.tsv")
+                    modified.text = ([header] + kept).join('\\n') + '\\n'
+                    [meta, modified]
+                }
+                input[2] = 'salmon'
+                """
+            }
+        }
+
+        then {
+            // The truncated tx2gene covers only a subset of transcripts.
+            // Without filtering, unmapped transcripts would appear as fake
+            // genes (gene_id = transcript_id), inflating the gene count to
+            // match the transcript count (yeast has ~1:1 gene:transcript).
+            // With filtering, gene rows should be strictly fewer.
+            def gene_counts = path(process.out.counts_gene[0][1])
+            def gene_lines = gene_counts.readLines()
+            def tx_lines = path(process.out.counts_transcript[0][1]).readLines()
+            def n_genes = gene_lines.size() - 1        // subtract header
+            def n_transcripts = tx_lines.size() - 1
+
+            // Read .command.err from the work directory to check for warning
+            def commandErr = gene_counts.parent.resolve('.command.err').text
+
+            assertAll(
+                { assert process.success },
+                { assert n_genes > 0 },
+                { assert n_genes < n_transcripts :
+                    "Gene count rows ($n_genes) should be fewer than transcript rows ($n_transcripts) after filtering unmapped transcripts" },
+                { assert commandErr.contains("transcripts found in quantification output but missing from") :
+                    "Expected warning about unmapped transcripts in .command.err" }
+            )
+        }
+    }
+
     test("saccharomyces_cerevisiae - salmon - gtf - stub") {
 
         options "-stub"

diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap b/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap
@@ -138,11 +138,11 @@
                 ]
             }
         ],
+        "timestamp": "2026-04-09T11:02:50.661043856",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:26:21.185522"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - salmon - gtf": {
         "content": [
@@ -214,11 +214,11 @@
                 "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
             ]
         ],
+        "timestamp": "2026-04-09T11:03:02.256857186",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:27:19.912432"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - salmon - gtf - stub": {
         "content": [
@@ -359,11 +359,11 @@
                 ]
             }
         ],
+        "timestamp": "2026-04-09T11:03:28.936272282",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:28:42.817097"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - kallisto - gtf - custom_column_names": {
         "content": [
@@ -435,11 +435,11 @@
                 "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
             ]
         ],
+        "timestamp": "2026-04-09T11:04:06.859776382",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:29:35.461954"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - rsem - gtf - stub": {
         "content": [
@@ -580,11 +580,11 @@
                 ]
             }
         ],
+        "timestamp": "2026-04-09T11:03:55.191304215",
         "meta": {
-            "nf-test": "0.9.3",
-            "nextflow": "25.10.3"
-        },
-        "timestamp": "2026-02-11T10:59:46.025734857"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - kallisto - gtf": {
         "content": [
@@ -656,11 +656,11 @@
                 "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
             ]
         ],
+        "timestamp": "2026-04-09T11:02:24.289256682",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:23:53.102679"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - rsem - gtf": {
         "content": [
@@ -732,11 +732,11 @@
                 "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
             ]
         ],
+        "timestamp": "2026-04-09T11:03:40.511252751",
         "meta": {
-            "nf-test": "0.9.3",
-            "nextflow": "25.10.3"
-        },
-        "timestamp": "2026-02-11T10:59:31.386695001"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     },
     "saccharomyces_cerevisiae - kallisto - gtf - extra_attributes": {
         "content": [
@@ -808,10 +808,10 @@
                 "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
             ]
         ],
+        "timestamp": "2026-04-09T11:02:35.957379387",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.10.0"
-        },
-        "timestamp": "2025-12-09T09:24:47.963548"
+            "nf-test": "0.9.5",
+            "nextflow": "25.10.4"
+        }
     }
 }