nf-core · LiaOb21 · Mar 20, 2026 · Mar 23, 2026 · Mar 24, 2026 · Mar 25, 2026
diff --git a/modules/nf-core/masurca/environment.yml b/modules/nf-core/masurca/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:  # conda channels listed for this environment
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::masurca=4.1.4=ha5bb246_1"  # MaSuRCA 4.1.4 from bioconda, pinned to build ha5bb246_1
diff --git a/modules/nf-core/masurca/main.nf b/modules/nf-core/masurca/main.nf
@@ -0,0 +1,197 @@
+// MASURCA: assemble a genome with MaSuRCA from Illumina reads, optionally
+// combined with jump/mate-pair, PacBio and/or Nanopore reads.
+//
+// The process writes a MaSuRCA config file built from its inputs, runs `masurca`
+// on it to generate `assemble.sh`, executes that script and gzips the
+// primary.genome.scf.fasta found in the CA* directory into <prefix>.scaffolds.fa.gz.
+//
+// Inputs:
+//   meta, illumina, jump, pacbio, nanopore - sample map and read files; illumina is
+//       always written to the config, the other three only when non-empty.
+//   fragment_mean, fragment_stdev - values written on the PE= line.
+//   jump_mean, jump_stdev - values written on the JUMP= line.
+//   remaining val inputs - values of the PARAMETERS section; the four 0/1 switches
+//       (extend_jump_reads, use_linking_mates, mega_reads_one_pass, close_gaps) may
+//       also be given as Groovy booleans.
+//   task.ext.args - extra options placed before the config file on the masurca call.
+//   task.ext.prefix - output file prefix, meta.id when unset.
+//
+// Outputs: assemble.sh, the gzipped scaffolds, the config file, the assembly log
+// and the MaSuRCA version (sent to the versions topic).
+process MASURCA {
+    tag "${meta.id}"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community.wave.seqera.io/v2/library/masurca/blobs/sha256:70f76916a4e082b75bb01e08aeb139631056907cdf1bb0012dc10298a37694c0' :
+        'community.wave.seqera.io/library/masurca:4.1.4--6c6f779135534097' }"
+
+    input:
+    tuple val(meta), path(illumina), path(jump), path(pacbio), path(nanopore)
+    val fragment_mean
+    val fragment_stdev
+    val jump_mean
+    val jump_stdev
+    val extend_jump_reads
+    val graph_kmer_size
+    val use_linking_mates
+    val lhe_coverage
+    val mega_reads_one_pass
+    val limit_jump_coverage
+    val ca_parameters
+    val close_gaps
+    val jf_size
+
+    output:
+    tuple val(meta), path("assemble.sh"), emit: script
+    tuple val(meta), path("*scaffolds.fa.gz"), emit: scaffolds
+    tuple val(meta), path("*_masurca_config.txt"), emit: config
+    tuple val(meta), path("*-masurca.log"), emit: log
+    tuple val("${task.process}"), val('masurca'), eval("masurca --version | sed 's/version //g'"), topic: versions, emit: versions_masurca
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    // MaSuRCA expects 0/1 switches: translate Groovy booleans and pass any
+    // other value (0, 1, '0', '1', ...) through unchanged
+    def as_flag = { value -> value instanceof Boolean ? (value ? 1 : 0) : value }
+
+    // Input reads - illumina are mandatory, jump/pacbio/nanopore are optional
+    def illumina_reads = [illumina].flatten()
+    def jump_reads = jump ? [jump].flatten() : []
+    // NOTE(review): `.name` assumes one file per long-read type - confirm callers never stage several
+    def pacbio_name = pacbio ? pacbio.name : ""
+    def nanopore_name = nanopore ? nanopore.name : ""
+
+    // Build the config file
+    def config_lines = []
+
+    // DATA section
+    config_lines << "DATA"
+    config_lines << "#Illumina paired end reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>"
+    config_lines << "#if single-end, do not specify <reverse_reads>"
+    config_lines << "#MUST HAVE Illumina paired end reads to use MaSuRCA"
+    config_lines << "PE= pe ${fragment_mean} ${fragment_stdev} ${illumina_reads.collect{ fq -> "\$PWD/${fq}"}.join(" ") }"
+
+    // Jump/mate pair reads (optional)
+    if (jump_reads) {
+        config_lines << "#Illumina mate pair reads supplied as <two-character prefix> <fragment mean> <fragment stdev> <forward_reads> <reverse_reads>"
+        config_lines << "JUMP= sh ${jump_mean} ${jump_stdev} ${jump_reads.collect{ fq -> "\$PWD/${fq}" }.join(" ")}"
+    }
+
+    // PacBio and Nanopore reads handling
+    def long_reads_concat = ""
+    if (pacbio_name && nanopore_name) {
+        config_lines << "#if you have both PacBio and Nanopore, supply both as NANOPORE type"
+        long_reads_concat = "${prefix}_long_reads.fastq.gz"
+        config_lines << "NANOPORE= \$PWD/${long_reads_concat}"
+    } else if (pacbio_name) {
+        config_lines << "#PacBio/CCS reads must be in a single fasta or fastq file with absolute path"
+        config_lines << "PACBIO=\$PWD/${pacbio_name}"
+    } else if (nanopore_name) {
+        config_lines << "#Nanopore reads must be in a single fasta or fastq file with absolute path"
+        config_lines << "NANOPORE=\$PWD/${nanopore_name}"
+    }
+
+    config_lines << "END"
+    config_lines << ""
+
+    // PARAMETERS section
+    config_lines << "PARAMETERS"
+    config_lines << "#set this to 1 if your Illumina jumping library reads are shorter than 100bp"
+    config_lines << "EXTEND_JUMP_READS=${as_flag(extend_jump_reads)}"
+    config_lines << "#this is k-mer size for deBruijn graph values between 25 and 127 are supported, auto will compute the optimal size based on the read data and GC content"
+    config_lines << "GRAPH_KMER_SIZE = ${graph_kmer_size}"
+    config_lines << "#set this to 1 for all Illumina-only assemblies"
+    config_lines << "#set this to 0 if you have more than 15x coverage by long reads (Pacbio or Nanopore) or any other long reads/mate pairs (Illumina MP, Sanger, 454, etc)"
+    config_lines << "USE_LINKING_MATES = ${as_flag(use_linking_mates)}"
+    config_lines << "#use at most this much coverage by the longest Pacbio or Nanopore reads, discard the rest of the reads"
+    config_lines << "#can increase this to 30 or 35 if your reads are short (N50<7000bp)"
+    config_lines << "LHE_COVERAGE=${lhe_coverage}"
+    config_lines << "#set to 0 (default) to do two passes of mega-reads for slower, but higher quality assembly, otherwise set to 1"
+    config_lines << "MEGA_READS_ONE_PASS=${as_flag(mega_reads_one_pass)}"
+    config_lines << "#this parameter is useful if you have too many Illumina jumping library mates. Typically set it to 60 for bacteria and 300 for the other organisms"
+    config_lines << "LIMIT_JUMP_COVERAGE = ${limit_jump_coverage}"
+    config_lines << "#these are the additional parameters to Celera Assembler.  do not worry about performance, number or processors or batch sizes -- these are computed automatically."
+    config_lines << "#CABOG ASSEMBLY ONLY: set cgwErrorRate=0.25 for bacteria and 0.1<=cgwErrorRate<=0.15 for other organisms."
+    config_lines << "CA_PARAMETERS = ${ca_parameters}"
+    config_lines << "#CABOG ASSEMBLY ONLY: whether to attempt to close gaps in scaffolds with Illumina  or long read data"
+    config_lines << "CLOSE_GAPS=${as_flag(close_gaps)}"
+    config_lines << "#number of cpus to use, set this to the number of CPUs/threads per node you will be using"
+    config_lines << "NUM_THREADS = ${task.cpus}"
+    config_lines << "#this is mandatory jellyfish hash size -- a safe value is estimated_genome_size*20"
+    config_lines << "JF_SIZE = ${jf_size}"
+    config_lines << "#ILLUMINA ONLY. Set this to 1 to use SOAPdenovo contigging/scaffolding module."
+    config_lines << "#Assembly will be worse but will run faster. Useful for very large (>=8Gbp) genomes from Illumina-only data"
+    config_lines << "SOAP_ASSEMBLY=0"
+    config_lines << "#If you are doing Hybrid Illumina paired end + Nanopore/PacBio assembly ONLY (no Illumina mate pairs or OTHER frg files)."
+    config_lines << "#Set this to 1 to use Flye assembler for final assembly of corrected mega-reads."
+    config_lines << "#A lot faster than CABOG, AND QUALITY IS THE SAME OR BETTER."
+    config_lines << "#Works well even when MEGA_READS_ONE_PASS is set to 1."
+    config_lines << "#DO NOT use if you have less than 15x coverage by long reads."
+    config_lines << "FLYE_ASSEMBLY=0"
+    config_lines << "END"
+
+    // Entries are split on newlines (tokenize also drops the empty entry) and re-joined
+    // with the four-space indent used inside the script block below
+    def config_content = config_lines.collectMany{ cfg -> cfg.tokenize('\n') }.join('\n    ')
+
+    // Shell command merging both long-read files; empty unless both types were supplied.
+    // NOTE(review): plain `cat` into a .fastq.gz name presumes both inputs are gzipped FASTQ - confirm
+    def concat_cmd = long_reads_concat ? "cat \$PWD/${pacbio_name} \$PWD/${nanopore_name} > ${long_reads_concat}" : ""
+
+    """
+    # Write the config file
+    cat > ${prefix}_masurca_config.txt <<-CONFIG_EOF
+    ${config_content}
+    CONFIG_EOF
+
+    # Concatenate long reads if both PacBio and Nanopore are present
+    ${concat_cmd}
+
+    # Generate assembly script
+    masurca ${args} ${prefix}_masurca_config.txt
+
+    # Run the assembly; on failure replay the main log, CA*.log and 000001.out files on stderr
+    if ! ./assemble.sh > ${prefix}-masurca.log 2>&1; then
+        {
+            cat ${prefix}-masurca.log
+            for f in CA*.log; do
+                [ -e "\$f" ] || continue
+                echo "=== \$f ==="
+                cat "\$f"
+            done
+            find . -name "000001.out" | while IFS= read -r f; do
+                echo "=== \$f ==="
+                cat "\$f"
+            done
+        } >&2
+        exit 1
+    fi
+
+    # Exactly one scaffold FASTA is expected: stop with a clear message otherwise
+    shopt -s nullglob
+    scaffold_fastas=( CA*/primary.genome.scf.fasta )
+    shopt -u nullglob
+    if [ "\${#scaffold_fastas[@]}" -ne 1 ]; then
+        echo "ERROR: expected exactly one CA*/primary.genome.scf.fasta, found \${#scaffold_fastas[@]}" >&2
+        exit 1
+    fi
+    gzip -cn "\${scaffold_fastas[0]}" > ${prefix}.scaffolds.fa.gz
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    mkdir -p CA
+    touch assemble.sh
+    touch ${prefix}_masurca_config.txt
+    echo | gzip > ${prefix}.scaffolds.fa.gz
+    touch ${prefix}-masurca.log
+    """
+}
diff --git a/modules/nf-core/masurca/meta.yml b/modules/nf-core/masurca/meta.yml
@@ -0,0 +1,167 @@
+name: "masurca"
+description: The MaSuRCA (Maryland Super Read Cabog Assembler) genome assembly
+  and analysis toolkit
+keywords:
+  - denovo
+  - assembly
+  - debruijn
+  - genomics
+tools:
+  - "masurca":
+      description: "MaSuRCA (Maryland Super-Read Celera Assembler) genome assembly software."
+      homepage: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      documentation: "https://github.com/alekseyzimin/masurca/blob/v4.1.4/README.md"
+      tool_dev_url: "https://github.com/alekseyzimin/masurca"
+      doi: "10.1101/gr.213405.116"
+      licence:
+        - "GPL v3"
+      identifier: biotools:masurca
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`
+    - illumina:
+        type: file
+        description: |
+          Input paired-end FastQ files (R1 and R2).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - jump:
+        type: file
+        description: |
+          Jump/mate-pair FastQ files (optional).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - pacbio:
+        type: file
+        description: |
+          PacBio reads in a single FastQ file (optional).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+    - nanopore:
+        type: file
+        description: |
+          Nanopore reads in a single FastQ file (optional).
+        pattern: "*.{fastq.gz,fastq,fq.gz,fq}"
+        ontologies:
+          - edam: http://edamontology.org/format_1930
+  - fragment_mean:
+      type: integer
+      description: Mean fragment size for Illumina paired-end reads
+  - fragment_stdev:
+      type: integer
+      description: Standard deviation of fragment size for Illumina paired-end
+        reads
+  - jump_mean:
+      type: integer
+      description: Mean fragment size for jump/mate-pair reads
+  - jump_stdev:
+      type: integer
+      description: Standard deviation of fragment size for jump/mate-pair reads
+  - extend_jump_reads:
+      type: integer  # MaSuRCA config switch: 0 or 1
+      description: "Whether to extend jump reads (default: 0 - false; use 1 for true)"
+  - graph_kmer_size:
+      type: string
+      description: "K-mer size for the de Bruijn graph (default: 'auto', it can be an
+        integer or 'auto')"
+  - use_linking_mates:
+      type: integer  # MaSuRCA config switch: 0 or 1
+      description: "Whether to use linking mates (default: 0 - false; use 1 for true)"
+  - lhe_coverage:
+      type: integer
+      description: "LHE coverage (default: 25)"
+  - mega_reads_one_pass:
+      type: integer  # MaSuRCA config switch: 0 or 1
+      description: "Whether to perform one pass of mega-reads (default: 0 - false; use
+        1 for true)"
+  - limit_jump_coverage:
+      type: integer
+      description: "Limit for jump read coverage (default: 300)"
+  - ca_parameters:
+      type: string
+      description: "Parameters for the Celera Assembler (default: 'cgwErrorRate=0.15')"
+  - close_gaps:
+      type: integer  # MaSuRCA config switch: 0 or 1
+      description: "Whether to close gaps (default: 0 - false; use 1 for true)"
+  - jf_size:
+      type: integer
+      description: "Jellyfish hash size (default: 200000000)"
+output:
+  script:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "assemble.sh":
+          type: file
+          description: MaSuRCA assembly script
+          pattern: "assemble.sh"
+          ontologies: []
+  scaffolds:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*scaffolds.fa.gz":
+          type: file
+          description: Assembled scaffolds in FASTA format
+          pattern: "*scaffolds.fa.gz"  # file is written as <prefix>.scaffolds.fa.gz
+          ontologies:
+            - edam: http://edamontology.org/data_0925
+            - edam: http://edamontology.org/format_1929
+            - edam: http://edamontology.org/format_3989
+  config:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*_masurca_config.txt":
+          type: file
+          description: MaSuRCA configuration file
+          pattern: "*_masurca_config.txt"
+          ontologies: []
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`
+      - "*-masurca.log":
+          type: file
+          description: MaSuRCA assembly log file
+          pattern: "*-masurca.log"
+          ontologies: []
+  versions_masurca:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - masurca:
+          type: string
+          description: The name of the tool
+      - masurca --version | sed 's/version //g':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@LiaOb21"
+maintainers:
+  - "@LiaOb21"