diff --git a/modules/nf-core/masurca/environment.yml b/modules/nf-core/masurca/environment.yml
new file mode 100644
index 00000000000..4f7c43533bc
--- /dev/null
+++ b/modules/nf-core/masurca/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::masurca=4.1.4=ha5bb246_1"
diff --git a/modules/nf-core/masurca/main.nf b/modules/nf-core/masurca/main.nf
new file mode 100644
index 00000000000..1b98d175925
--- /dev/null
+++ b/modules/nf-core/masurca/main.nf
@@ -0,0 +1,172 @@
+// Assemble Illumina reads (optionally with jump, PacBio and/or Nanopore reads) with MaSuRCA.
+// A config file is generated from the inputs, `masurca` turns it into assemble.sh, and the
+// scaffolds produced by assemble.sh are emitted gzip-compressed.
+process MASURCA {
+    tag "${meta.id}"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community.wave.seqera.io/v2/library/masurca/blobs/sha256:70f76916a4e082b75bb01e08aeb139631056907cdf1bb0012dc10298a37694c0' :
+        'community.wave.seqera.io/library/masurca:4.1.4--6c6f779135534097' }"
+
+    input:
+    tuple val(meta), path(illumina), path(jump), path(pacbio), path(nanopore)
+    val fragment_mean
+    val fragment_stdev
+    val jump_mean
+    val jump_stdev
+    val extend_jump_reads
+    val graph_kmer_size
+    val use_linking_mates
+    val lhe_coverage
+    val mega_reads_one_pass
+    val limit_jump_coverage
+    val ca_parameters
+    val close_gaps
+    val jf_size
+
+    output:
+    tuple val(meta), path("assemble.sh"), emit: script
+    tuple val(meta), path("*scaffolds.fa.gz"), emit: scaffolds
+    tuple val(meta), path("*_masurca_config.txt"), emit: config
+    tuple val(meta), path("*-masurca.log"), emit: log
+    tuple val("${task.process}"), val('masurca'), eval("masurca --version | sed 's/version //g'"), topic: versions, emit: versions_masurca
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // task.ext.args is forwarded to the `masurca` script generator below
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // Input reads - illumina is mandatory, jump/pacbio/nanopore are optional (empty when not supplied)
+    def illumina_reads = [illumina].flatten()
+    def jump_reads = jump ? [jump].flatten() : []
+    def pacbio_name = pacbio ? pacbio.name : ""
+    def nanopore_name = nanopore ? nanopore.name : ""
+
+    // Build the config file, one entry per line
+    def config_lines = []
+
+    // DATA section
+    config_lines << "DATA"
+    config_lines << "#Illumina paired end reads supplied as "
+    config_lines << "#if single-end, do not specify "
+    config_lines << "#MUST HAVE Illumina paired end reads to use MaSuRCA"
+    config_lines << "PE= pe ${fragment_mean} ${fragment_stdev} ${illumina_reads.collect{ fq -> "\$PWD/${fq}"}.join(" ") }"
+
+    // Jump/mate pair reads (optional)
+    if (jump_reads) {
+        config_lines << "#Illumina mate pair reads supplied as "
+        config_lines << "JUMP= sh ${jump_mean} ${jump_stdev} ${jump_reads.collect{ fq -> "\$PWD/${fq}" }.join(" ")}"
+    }
+
+    // PacBio and Nanopore reads handling: when both are given they are merged into a
+    // single file (see the cat command in the script below) and declared as NANOPORE
+    def long_reads_concat = ""
+    if (pacbio_name && nanopore_name) {
+        config_lines << "#if you have both PacBio and Nanopore, supply both as NANOPORE type"
+        long_reads_concat = "${prefix}_long_reads.fastq.gz"
+        config_lines << "NANOPORE= \$PWD/${long_reads_concat}"
+    } else if (pacbio_name) {
+        config_lines << "#PacBio/CCS reads must be in a single fasta or fastq file with absolute path"
+        config_lines << "PACBIO=\$PWD/${pacbio_name}"
+    } else if (nanopore_name) {
+        config_lines << "#Nanopore reads must be in a single fasta or fastq file with absolute path"
+        config_lines << "NANOPORE=\$PWD/${nanopore_name}"
+    }
+
+    config_lines << "END"
+    config_lines << ""
+
+    // PARAMETERS section
+    config_lines << "PARAMETERS"
+    config_lines << "#set this to 1 if your Illumina jumping library reads are shorter than 100bp"
+    config_lines << "EXTEND_JUMP_READS=${extend_jump_reads}"
+    config_lines << "#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content"
+    config_lines << "GRAPH_KMER_SIZE = ${graph_kmer_size}"
+    config_lines << "#set this to 1 for all Illumina-only assemblies"
+    config_lines << "#set this to 0 if you have more than 15x coverage by long reads (Pacbio or Nanopore) or any other long reads/mate pairs (Illumina MP, Sanger, 454, etc)"
+    config_lines << "USE_LINKING_MATES = ${use_linking_mates}"
+    config_lines << "#use at most this much coverage by the longest Pacbio or Nanopore reads, discard the rest of the reads"
+    config_lines << "#can increase this to 30 or 35 if your reads are short (N50<7000bp)"
+    config_lines << "LHE_COVERAGE=${lhe_coverage}"
+    config_lines << "#set to 0 (default) to do two passes of mega-reads for slower, but higher quality assembly, otherwise set to 1"
+    config_lines << "MEGA_READS_ONE_PASS=${mega_reads_one_pass}"
+    config_lines << "#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms"
+    config_lines << "LIMIT_JUMP_COVERAGE = ${limit_jump_coverage}"
+    config_lines << "#these are the additional parameters to Celera Assembler. do not worry about performance, number or processors or batch sizes -- these are computed automatically."
+    config_lines << "#CABOG ASSEMBLY ONLY: set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms."
+    config_lines << "CA_PARAMETERS = ${ca_parameters}"
+    config_lines << "#CABOG ASSEMBLY ONLY: whether to attempt to close gaps in scaffolds with Illumina or long read data"
+    config_lines << "CLOSE_GAPS=${close_gaps}"
+    config_lines << "#number of cpus to use, set this to the number of CPUs/threads per node you will be using"
+    config_lines << "NUM_THREADS = ${task.cpus}"
+    config_lines << "#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*20"
+    config_lines << "JF_SIZE = ${jf_size}"
+    config_lines << "#ILLUMINA ONLY. Set this to 1 to use SOAPdenovo contigging/scaffolding module."
+    config_lines << "#Assembly will be worse but will run faster. Useful for very large (>=8Gbp) genomes from Illumina-only data"
+    config_lines << "SOAP_ASSEMBLY=0"
+    config_lines << "#If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files)."
+    config_lines << "#Set this to 1 to use Flye assembler for final assembly of corrected mega-reads."
+    config_lines << "#A lot faster than CABOG, AND QUALITY IS THE SAME OR BETTER."
+    config_lines << "#Works well even when MEGA_READS_ONE_PASS is set to 1."
+    config_lines << "#DO NOT use if you have less than 15x coverage by long reads."
+    config_lines << "FLYE_ASSEMBLY=0"
+    config_lines << "END"
+
+    // Entries may span several lines; re-join them with the script block's indentation
+    def config_content = config_lines.collectMany{ cfg -> cfg.tokenize('\n') }.join('\n    ')
+
+    """
+    # Write the config file
+    cat > ${prefix}_masurca_config.txt <<-CONFIG_EOF
+    ${config_content}
+    CONFIG_EOF
+
+    # Concatenate long reads if both PacBio and Nanopore are present
+    ${long_reads_concat ? "cat \$PWD/${pacbio_name} \$PWD/${nanopore_name} > ${long_reads_concat}" : ""}
+
+    # Generate assembly script
+    masurca ${args} ${prefix}_masurca_config.txt
+
+    # Run the assembly; on failure replay the available logs on stderr before aborting
+    if ! ./assemble.sh > ${prefix}-masurca.log 2>&1; then
+        cat ${prefix}-masurca.log >&2
+        for f in CA*.log; do
+            # an unmatched glob stays literal, so skip anything that does not exist
+            [ -e "\$f" ] || continue
+            echo "=== \$f ==="
+            cat "\$f"
+        done >&2
+        find . -name "000001.out" | while IFS= read -r f; do
+            echo "=== \$f ==="
+            cat "\$f"
+        done >&2
+        exit 1
+    fi
+
+    # Resolve the CA* glob explicitly: a file test on a glob breaks when several paths match,
+    # and a missing assembly should fail here with a clear message
+    shopt -s nullglob
+    scaffolds=( CA*/primary.genome.scf.fasta )
+    shopt -u nullglob
+    if [ "\${#scaffolds[@]}" -ne 1 ]; then
+        echo "ERROR: expected exactly one CA*/primary.genome.scf.fasta but found \${#scaffolds[@]}" >&2
+        exit 1
+    fi
+    gzip -cn "\${scaffolds[0]}" > ${prefix}.scaffolds.fa.gz
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p CA
+    touch assemble.sh
+    touch ${prefix}_masurca_config.txt
+    echo | gzip > ${prefix}.scaffolds.fa.gz
+    touch ${prefix}-masurca.log
+    """
+}
diff --git a/modules/nf-core/masurca/meta.yml b/modules/nf-core/masurca/meta.yml
new file mode 100644
index 00000000000..f96165adb2f
--- /dev/null
+++ b/modules/nf-core/masurca/meta.yml
@@ -0,0 +1,167 @@
+name: "masurca"
+description: The MaSuRCA (Maryland Super Read Cabog Assembler) genome assembly
+  and analysis toolkit
+keywords:
+  - denovo
+  - assembly
+  - debruijn
+  - genomics
+tools:
+  - "masurca":
+      description: "MaSuRCA (Maryland Super-Read Celera Assembler) genome assembly software."
+      homepage: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      documentation: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      tool_dev_url: "https://github.com/alekseyzimin/masurca"
+      doi: "10.1101/gr.213405.116"
+      licence:
+        - "GPL v3"
+      identifier: biotools:masurca
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - illumina:
+        type: file
+        description: |
+          Illumina FastQ files: R1 and R2 for paired-end, or a single file for single-end.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - jump:
+        type: file
+        description: |
+          Jump/mate-pair FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - pacbio:
+        type: file
+        description: |
+          PacBio FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - nanopore:
+        type: file
+        description: |
+          Nanopore FastQ files.
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+  - fragment_mean:
+      type: integer
+      description: Mean fragment size for Illumina paired-end reads
+  - fragment_stdev:
+      type: integer
+      description: Standard deviation of fragment size for Illumina paired-end
+        reads
+  - jump_mean:
+      type: integer
+      description: Mean fragment size for jump/mate-pair reads
+  - jump_stdev:
+      type: integer
+      description: Standard deviation of fragment size for jump/mate-pair reads
+  - extend_jump_reads:
+      type: integer
+      description: "Whether to extend jump reads, given as 0 or 1 (default: 0 - false; use 1 for true)"
+  - graph_kmer_size:
+      type: string
+      description: "K-mer size for the de Bruijn graph (default: 'auto', it can be an
+        integer or 'auto')"
+  - use_linking_mates:
+      type: integer
+      description: "Whether to use linking mates, given as 0 or 1 (default: 0 - false; use 1 for true)"
+  - lhe_coverage:
+      type: integer
+      description: "LHE coverage (default: 25)"
+  - mega_reads_one_pass:
+      type: integer
+      description: "Whether to perform one pass of mega-reads, given as 0 or 1 (default: 0 - false; use
+        1 for true)"
+  - limit_jump_coverage:
+      type: integer
+      description: "Limit for jump read coverage (default: 300)"
+  - ca_parameters:
+      type: string
+      description: "Parameters for the Celera Assembler (default: 'cgwErrorRate=0.15')"
+  - close_gaps:
+      type: integer
+      description: "Whether to close gaps, given as 0 or 1 (default: 0 - false; use 1 for true)"
+  - jf_size:
+      type: integer
+      description: "Jellyfish hash size (default: 200000000)"
+output:
+  script:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "assemble.sh":
+          type: file
+          description: MaSuRCA assembly script
+          pattern: "assemble.sh"
+          ontologies: []
+  scaffolds:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*scaffolds.fa.gz":
+          type: file
+          description: Assembled scaffolds in FASTA format
+          pattern: "*.scaffolds.fa.gz"
+          ontologies:
+            - edam: http://edamontology.org/data_0925
+            - edam: http://edamontology.org/format_1929
+            - edam: http://edamontology.org/format_3989
+  config:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*_masurca_config.txt":
+          type: file
+          description: MaSuRCA configuration file
+          pattern: "*_masurca_config.txt"
+          ontologies: []
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*-masurca.log":
+          type: file
+          description: MaSuRCA assembly log file
+          pattern: "*-masurca.log"
+          ontologies: []
+  versions_masurca:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@LiaOb21"
+maintainers:
+  - "@LiaOb21"
diff --git a/modules/nf-core/masurca/tests/main.nf.test b/modules/nf-core/masurca/tests/main.nf.test
new file mode 100644
index 00000000000..dffba62d12a
--- /dev/null
+++ b/modules/nf-core/masurca/tests/main.nf.test
@@ -0,0 +1,384 @@
+nextflow_process {
+
+    name "Test Process MASURCA"
+    script "../main.nf"
+    process "MASURCA"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "masurca"
+
+    test("homo_sapiens - illumina - single_end") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz",
checkIfExists: true)],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - illumina - paired_end") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("homo_sapiens - illumina - paired_end - with_jump") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test2_germline_1.fq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/fastq/test2_germline_2.fq.gz", checkIfExists: true)
+                    ],
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 3600 // jump_mean
+                input[4] = 200 // jump_stdev
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_pacbio") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/HiFi-Col-0_test_data.fastq.gz", checkIfExists: true)],
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_nanopore") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/ONT-Col-0_test_data.fastq.gz", checkIfExists: true)]
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("genomeassembler - hybrid - illumina_pacbio_nanopore") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_1.fastq.gz", checkIfExists: true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/SR_Col-0_test_data_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/HiFi-Col-0_test_data.fastq.gz", checkIfExists: true)],
+                    [file("https://raw.githubusercontent.com/nf-core/test-datasets/genomeassembler/A_thaliana_Col-0_2mb/ONT-Col-0_test_data.fastq.gz", checkIfExists: true)]
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('AAA') } },
+                { assert path(process.out.scaffolds[0][1]).linesGzip.any { it.contains('CCC') } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("# assemble.sh generated by masurca") } },
+                { assert path(process.out.script[0][1]).readLines().any { it.contains("head -n 1 ESTIMATED_GENOME_SIZE.txt") } },
+                { assert path(process.out.config[0][1]).readLines().last().contains("END") },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("All done") } },
+                { assert path(process.out.log[0][1]).readLines().any { it.contains("primary.genome.scf.fasta") } },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+    test("sarscov2 - illumina - paired_end - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test'],
+                    [
+                        file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true),
+                        file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true)
+                    ],
+                    [], // no jump reads
+                    [], // no pacbio
+                    [] // no nanopore
+                ]
+                input[1] = 500 // fragment_mean
+                input[2] = 50 // fragment_stdev
+                input[3] = 0 // jump_mean (not used)
+                input[4] = 0 // jump_stdev (not used)
+                input[5] = 0 // extend_jump_reads
+                input[6] = 'auto' // graph_kmer_size
+                input[7] = 0 // use_linking_mates
+                input[8] = 25 // lhe_coverage
+                input[9] = 0 // mega_reads_one_pass
+                input[10] = 300 // limit_jump_coverage
+                input[11] = 'cgwErrorRate=0.15' // ca_parameters
+                input[12] = 0 // close_gaps
+                input[13] = 200000000 // jf_size
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    file(process.out.scaffolds[0][1]).name,
+                    file(process.out.script[0][1]).name,
+                    file(process.out.config[0][1]).name,
+                    file(process.out.log[0][1]).name,
+                    process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/masurca/tests/main.nf.test.snap b/modules/nf-core/masurca/tests/main.nf.test.snap
new file mode 100644
index 00000000000..be7eb9380be
--- /dev/null
+++ b/modules/nf-core/masurca/tests/main.nf.test.snap
@@ -0,0 +1,156 @@
+{
+    "homo_sapiens - illumina - single_end": {
+        "content": [
+            "test.scaffolds.fa.gz",
+            "assemble.sh",
+            "test_masurca_config.txt",
+            "test-masurca.log",
+            {
+                "versions_masurca": [
+                    [
+                        "MASURCA",
+                        "masurca",
+
"4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:02:36.723413139", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "genomeassembler - hybrid - illumina_pacbio_nanopore": { + "content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:12:19.186769917", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 - illumina - paired_end - stub": { + "content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T20:15:28.52290314", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "genomeassembler - hybrid - illumina_pacbio": { + "content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:06:10.293289505", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "homo_sapiens - illumina - paired_end": { + "content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:23:14.003329552", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "homo_sapiens - illumina - paired_end - with_jump": { + "content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:33:32.958497714", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "genomeassembler - hybrid - illumina_nanopore": { + 
"content": [ + "test.scaffolds.fa.gz", + "assemble.sh", + "test_masurca_config.txt", + "test-masurca.log", + { + "versions_masurca": [ + [ + "MASURCA", + "masurca", + "4.1.4" + ] + ] + } + ], + "timestamp": "2026-04-02T17:08:18.247500418", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file