diff --git a/modules/nf-core/tximeta/tximport/templates/tximport.r b/modules/nf-core/tximeta/tximport/templates/tximport.r index 58fd048bd2d..36b870a07e6 100755 --- a/modules/nf-core/tximeta/tximport/templates/tximport.r +++ b/modules/nf-core/tximeta/tximport/templates/tximport.r @@ -81,10 +81,11 @@ write_se_table <- function(params, prefix) { #' @param gene_id_col Column name for gene IDs. #' @param gene_name_col Column name for gene names. #' -#' @return A list containing three elements: +#' @return A list containing four elements: #' - `transcript`: A data frame with transcript IDs, gene IDs, and gene names, indexed by transcript IDs. #' - `gene`: A data frame with unique gene IDs and gene names. #' - `tx2gene`: A data frame mapping transcript IDs to gene IDs. +#' - `extra`: A character vector of transcript IDs found in quantification output but missing from the tx2gene file. read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col){ info <- file.info(tinfo_path) @@ -104,13 +105,23 @@ read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col) ) extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + if (length(extra) > 0) { + warning( + length(extra), " transcripts found in quantification output but missing from ", + "the tx2gene mapping (GTF). These will be included in transcript-level outputs ", + "but excluded from gene-level summaries. This usually means the transcript FASTA ", + "and GTF are from different sources or versions. First 5: ", + paste(head(extra, 5), collapse = ", ") + ) + } transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra, check.names = FALSE)) transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] rownames(transcript_info) <- transcript_info[["tx"]] list(transcript = transcript_info, gene = unique(transcript_info[,2:3]), - tx2gene = transcript_info[,1:2]) + tx2gene = transcript_info[,1:2], + extra = extra) } #' Create a SummarizedExperiment Object @@ -206,6 +217,21 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info\$tx2gene)) gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") + # Remove fake gene entries created by unmapped transcripts (where gene_id + # was set to the transcript ID). These would break downstream processes + # like SummarizedExperiment that try to match gene IDs against the tx2gene. + if (length(transcript_info\$extra) > 0) { + real_genes <- setdiff(rownames(gi[[1]]), transcript_info\$extra) + filter_rows <- function(txi_list, genes) { + lapply(txi_list, function(x) { + if (is.matrix(x)) x[genes, , drop = FALSE] else x + }) + } + gi <- filter_rows(gi, real_genes) + gi.ls <- filter_rows(gi.ls, real_genes) + gi.s <- filter_rows(gi.s, real_genes) + } + gene_info <- transcript_info\$gene[match(rownames(gi[[1]]), transcript_info\$gene[["gene_id"]]),] rownames(gene_info) <- NULL col_data_frame <- DataFrame(coldata) diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test b/modules/nf-core/tximeta/tximport/tests/main.nf.test index 49787366bfd..fbd59752610 100644 --- a/modules/nf-core/tximeta/tximport/tests/main.nf.test +++ b/modules/nf-core/tximeta/tximport/tests/main.nf.test @@ -214,6 +214,81 @@ nextflow_process { } + test("saccharomyces_cerevisiae - salmon - gtf - mismatched_transcripts") { + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/salmon_results.tar.gz', checkIfExists: true) + ]) + """ + } + } + run("CUSTOM_TX2GENE") { + script "../../../custom/tx2gene/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'salmon' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + } + + when { + process { + """ + input[0] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + // Remove half the entries from tx2gene to simulate FASTA/GTF mismatch + input[1] = CUSTOM_TX2GENE.out.tx2gene.map { meta, tx2gene -> + def lines = tx2gene.readLines() + def header = lines[0] + def kept = lines[1..Math.max(1, (int)(lines.size() / 2))] + def modified = file("\${workDir}/truncated_tx2gene.tsv") + modified.text = ([header] + kept).join('\\n') + '\\n' + [meta, modified] + } + input[2] = 'salmon' + """ + } + } + + then { + // The truncated tx2gene covers only a subset of transcripts. + // Without filtering, unmapped transcripts would appear as fake + // genes (gene_id = transcript_id), inflating the gene count to + // match the transcript count (yeast has ~1:1 gene:transcript). + // With filtering, gene rows should be strictly fewer. + def gene_counts = path(process.out.counts_gene[0][1]) + def gene_lines = gene_counts.readLines() + def tx_lines = path(process.out.counts_transcript[0][1]).readLines() + def n_genes = gene_lines.size() - 1 // subtract header + def n_transcripts = tx_lines.size() - 1 + + // Read .command.err from the work directory to check for warning + def commandErr = gene_counts.parent.resolve('.command.err').text + + assertAll( + { assert process.success }, + { assert n_genes > 0 }, + { assert n_genes < n_transcripts : + "Gene count rows ($n_genes) should be fewer than transcript rows ($n_transcripts) after filtering unmapped transcripts" }, + { assert commandErr.contains("transcripts found in quantification output but missing from") : + "Expected warning about unmapped transcripts in .command.err" } + ) + } + } + test("saccharomyces_cerevisiae - salmon - gtf - stub") { options "-stub" diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap b/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap index 322d018585b..0e07111b887 100644 --- a/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap +++ b/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap @@ -138,11 +138,11 @@ ] } ], + "timestamp": "2026-04-09T11:02:50.661043856", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:26:21.185522" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - salmon - gtf": { "content": [ @@ -214,11 +214,11 @@ "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" ] ], + "timestamp": "2026-04-09T11:03:02.256857186", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:27:19.912432" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - salmon - gtf - stub": { "content": [ @@ -359,11 +359,11 @@ ] } ], + "timestamp": "2026-04-09T11:03:28.936272282", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:28:42.817097" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - kallisto - gtf - custom_column_names": { "content": [ @@ -435,11 +435,11 @@ "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" ] ], + "timestamp": "2026-04-09T11:04:06.859776382", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:29:35.461954" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - rsem - gtf - stub": { "content": [ @@ -580,11 +580,11 @@ ] } ], + "timestamp": "2026-04-09T11:03:55.191304215", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - "timestamp": "2026-02-11T10:59:46.025734857" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - kallisto - gtf": { "content": [ @@ -656,11 +656,11 @@ "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" ] ], + "timestamp": "2026-04-09T11:02:24.289256682", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:23:53.102679" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - rsem - gtf": { "content": [ @@ -732,11 +732,11 @@ "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" ] ], + "timestamp": "2026-04-09T11:03:40.511252751", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - "timestamp": "2026-02-11T10:59:31.386695001" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } }, "saccharomyces_cerevisiae - kallisto - gtf - extra_attributes": { "content": [ @@ -808,10 +808,10 @@ "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" ] ], + "timestamp": "2026-04-09T11:02:35.957379387", "meta": { - "nf-test": "0.9.2", - "nextflow": "25.10.0" - }, - "timestamp": "2025-12-09T09:24:47.963548" + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } } } \ No newline at end of file