Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions modules/nf-core/tximeta/tximport/templates/tximport.r
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,11 @@ write_se_table <- function(params, prefix) {
#' @param gene_id_col Column name for gene IDs.
#' @param gene_name_col Column name for gene names.
#'
#' @return A list containing three elements:
#' @return A list containing four elements:
#' - `transcript`: A data frame with transcript IDs, gene IDs, and gene names, indexed by transcript IDs.
#' - `gene`: A data frame with unique gene IDs and gene names.
#' - `tx2gene`: A data frame mapping transcript IDs to gene IDs.
#' - `extra`: A character vector of transcript IDs found in quantification output but missing from the tx2gene file.

read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col){
info <- file.info(tinfo_path)
Expand All @@ -104,13 +105,23 @@ read_transcript_info <- function(tinfo_path, tx_col, gene_id_col, gene_name_col)
)

extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
if (length(extra) > 0) {
warning(
length(extra), " transcripts found in quantification output but missing from ",
"the tx2gene mapping (GTF). These will be included in transcript-level outputs ",
"but excluded from gene-level summaries. This usually means the transcript FASTA ",
"and GTF are from different sources or versions. First 5: ",
paste(head(extra, 5), collapse = ", ")
)
}
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra, check.names = FALSE))
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
rownames(transcript_info) <- transcript_info[["tx"]]

list(transcript = transcript_info,
gene = unique(transcript_info[,2:3]),
tx2gene = transcript_info[,1:2])
tx2gene = transcript_info[,1:2],
extra = extra)
}

#' Create a SummarizedExperiment Object
Expand Down Expand Up @@ -206,6 +217,21 @@ if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info\$tx2gene))
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")

# Remove fake gene entries created by unmapped transcripts (where gene_id
# was set to the transcript ID). These would break downstream processes
# like SummarizedExperiment that try to match gene IDs against the tx2gene.
#
# A gene is treated as real only when at least one transcript that was present
# in the tx2gene mapping points to it. Building the keep-list from the mapping,
# instead of subtracting the unmapped transcript IDs from the gene row names,
# avoids dropping a genuine gene whose ID happens to be identical to the ID of
# an unmapped transcript.
if (length(transcript_info\$extra) > 0) {
    tx_table <- transcript_info\$transcript
    is_mapped <- !(tx_table[["tx"]] %in% transcript_info\$extra)
    real_gene_ids <- unique(as.character(tx_table[["gene_id"]][is_mapped]))

    # intersect() keeps the order of its first argument, so the surviving rows
    # stay in the order produced by summarizeToGene().
    real_genes <- intersect(rownames(gi[[1]]), real_gene_ids)

    # Subset every matrix element of a summarizeToGene() result to `genes`;
    # non-matrix elements are returned unchanged.
    filter_rows <- function(txi_list, genes) {
        lapply(txi_list, function(x) {
            if (is.matrix(x)) x[genes, , drop = FALSE] else x
        })
    }

    gi <- filter_rows(gi, real_genes)
    gi.ls <- filter_rows(gi.ls, real_genes)
    gi.s <- filter_rows(gi.s, real_genes)
}

gene_info <- transcript_info\$gene[match(rownames(gi[[1]]), transcript_info\$gene[["gene_id"]]),]
rownames(gene_info) <- NULL
col_data_frame <- DataFrame(coldata)
Expand Down
75 changes: 75 additions & 0 deletions modules/nf-core/tximeta/tximport/tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,81 @@ nextflow_process {

}

test("saccharomyces_cerevisiae - salmon - gtf - mismatched_transcripts") {
// Regression test: runs the process with a tx2gene table that has been cut
// roughly in half, then checks that (1) the process still succeeds, (2) the
// gene-level table has fewer rows than the transcript-level table, and
// (3) the expected warning text appears in .command.err.

setup {
// Dependency 1: run UNTAR on the salmon results archive.
run("UNTAR") {
script "../../../untar/main.nf"
process {
"""
input[0] = Channel.of([
[ id:'test'], // meta map
file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/salmon_results.tar.gz', checkIfExists: true)
])
"""
}
}
// Dependency 2: run CUSTOM_TX2GENE on the GTF plus the untarred quantification
// files; its tx2gene output is truncated in the `when` block below.
run("CUSTOM_TX2GENE") {
script "../../../custom/tx2gene/main.nf"
process {
"""
input[0] = Channel.of([
[ id:'test'], // meta map
file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true)
])
input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] }
input[2] = 'salmon'
input[3] = 'gene_id'
input[4] = 'gene_name'
"""
}
}
}

when {
// Inputs for the process under test: the full set of quantification files,
// a tx2gene file keeping only the first line plus about half of the
// remaining lines, and the quantifier name.
// NOTE(review): the first line is treated as a header here — confirm that
// the tx2gene file really has one; otherwise it is simply the first mapping row.
process {
"""
input[0] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] }
// Remove half the entries from tx2gene to simulate FASTA/GTF mismatch
input[1] = CUSTOM_TX2GENE.out.tx2gene.map { meta, tx2gene ->
def lines = tx2gene.readLines()
def header = lines[0]
def kept = lines[1..Math.max(1, (int)(lines.size() / 2))]
def modified = file("\${workDir}/truncated_tx2gene.tsv")
modified.text = ([header] + kept).join('\\n') + '\\n'
[meta, modified]
}
input[2] = 'salmon'
"""
}
}

then {
// The truncated tx2gene covers only a subset of transcripts.
// Without filtering, unmapped transcripts would appear as fake
// genes (gene_id = transcript_id), inflating the gene count to
// match the transcript count (yeast has ~1:1 gene:transcript).
// With filtering, gene rows should be strictly fewer.
def gene_counts = path(process.out.counts_gene[0][1])
def gene_lines = gene_counts.readLines()
def tx_lines = path(process.out.counts_transcript[0][1]).readLines()
def n_genes = gene_lines.size() - 1 // subtract header
def n_transcripts = tx_lines.size() - 1

// Read .command.err from the work directory to check for warning
// (assumes the gene counts output path sits directly in the task work
// directory, next to .command.err — TODO confirm).
def commandErr = gene_counts.parent.resolve('.command.err').text

// No snapshot is used: the assertions target row counts and the warning text.
assertAll(
{ assert process.success },
{ assert n_genes > 0 },
{ assert n_genes < n_transcripts :
"Gene count rows ($n_genes) should be fewer than transcript rows ($n_transcripts) after filtering unmapped transcripts" },
{ assert commandErr.contains("transcripts found in quantification output but missing from") :
"Expected warning about unmapped transcripts in .command.err" }
)
}
}

test("saccharomyces_cerevisiae - salmon - gtf - stub") {

options "-stub"
Expand Down
64 changes: 32 additions & 32 deletions modules/nf-core/tximeta/tximport/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,11 @@
]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless I'm being blind, I don't see the new test represented in this snapshot?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new test doesn't use snapshots; it asserts directly on the results, e.g. that we get the warning message we expect.

}
],
"timestamp": "2026-04-09T11:02:50.661043856",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:26:21.185522"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - salmon - gtf": {
"content": [
Expand Down Expand Up @@ -214,11 +214,11 @@
"versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
]
],
"timestamp": "2026-04-09T11:03:02.256857186",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:27:19.912432"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - salmon - gtf - stub": {
"content": [
Expand Down Expand Up @@ -359,11 +359,11 @@
]
}
],
"timestamp": "2026-04-09T11:03:28.936272282",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:28:42.817097"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - kallisto - gtf - custom_column_names": {
"content": [
Expand Down Expand Up @@ -435,11 +435,11 @@
"versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
]
],
"timestamp": "2026-04-09T11:04:06.859776382",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:29:35.461954"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - rsem - gtf - stub": {
"content": [
Expand Down Expand Up @@ -580,11 +580,11 @@
]
}
],
"timestamp": "2026-04-09T11:03:55.191304215",
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.3"
},
"timestamp": "2026-02-11T10:59:46.025734857"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - kallisto - gtf": {
"content": [
Expand Down Expand Up @@ -656,11 +656,11 @@
"versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
]
],
"timestamp": "2026-04-09T11:02:24.289256682",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:23:53.102679"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - rsem - gtf": {
"content": [
Expand Down Expand Up @@ -732,11 +732,11 @@
"versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
]
],
"timestamp": "2026-04-09T11:03:40.511252751",
"meta": {
"nf-test": "0.9.3",
"nextflow": "25.10.3"
},
"timestamp": "2026-02-11T10:59:31.386695001"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
},
"saccharomyces_cerevisiae - kallisto - gtf - extra_attributes": {
"content": [
Expand Down Expand Up @@ -808,10 +808,10 @@
"versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b"
]
],
"timestamp": "2026-04-09T11:02:35.957379387",
"meta": {
"nf-test": "0.9.2",
"nextflow": "25.10.0"
},
"timestamp": "2025-12-09T09:24:47.963548"
"nf-test": "0.9.5",
"nextflow": "25.10.4"
}
}
}
Loading