-
Notifications
You must be signed in to change notification settings - Fork 1k
Add snpclustering subworkflow #11059
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
6af3ab8
dea109e
8aac674
7f27d21
dabc976
81b57d1
b796acd
09473bc
b35034a
d6fdd58
c56e54c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
// Compute cluster-quality metrics (and a k sweep) from PCA scores + cluster
// assignments, via an external Python script.
process CLUSTER_METRICS {
    tag "${meta.id}"
    label 'process_medium'
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/metrics", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info), path(clusters)
    path cluster_metrics_script

    output:
    tuple val(meta), path("${meta.id}_metrics.tsv"),   emit: metrics
    tuple val(meta), path("${meta.id}_k_sweep.csv"),   emit: k_sweep
    tuple val(meta), path("${meta.id}_selected.json"), emit: selected
    path "versions.yml",                               emit: versions

    script:
    // Bounds of the k range swept by the metrics script (defaults: 2..12).
    def kLow  = params.k_min ?: 2
    def kHigh = params.k_max ?: 12
    """
    python3 ${cluster_metrics_script} \\
        --features ${pca_scores} \\
        --clusters ${clusters} \\
        --k-min ${kLow} \\
        --k-max ${kHigh} \\
        --out-k-sweep ${meta.id}_k_sweep.csv \\
        --out-selected ${meta.id}_selected.json \\
        --out-prefix ${meta.id}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
// Produce UMAP / t-SNE / PCA visualisations of the clustered samples,
// via an external Python script.
process CLUSTER_VIZ {
    tag "${meta.id}"
    label 'process_medium'
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/viz", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info), path(clusters)
    path cluster_viz_script

    output:
    tuple val(meta), path("${meta.id}_umap.png"),
                     path("${meta.id}_tsne.png"),
                     path("${meta.id}_pca.png"),  emit: plots
    tuple val(meta), path("${meta.id}_umap.tsv"), emit: umap_tsv
    tuple val(meta), path("${meta.id}_tsne.tsv"), emit: tsne_tsv
    path "versions.yml",                          emit: versions

    script:
    // Embedding hyper-parameters, overridable via params.
    def nNeighbors = params.viz_umap_neighbors ?: 15
    def minDist    = params.viz_umap_min_dist ?: 0.1
    def perplexity = params.viz_perplexity ?: 30
    def tsneIters  = params.viz_tsne_iter ?: 1000
    // NOTE(review): --features and --pca-scores both receive ${pca_scores};
    // presumably intentional, but worth confirming against the script's CLI.
    """
    export NUMBA_DISABLE_JIT=1

    python3 ${cluster_viz_script} \\
        --features ${pca_scores} \\
        --clusters ${clusters} \\
        --pca-scores ${pca_scores} \\
        --umap-neighbors ${nNeighbors} \\
        --umap-min-dist ${minDist} \\
        --tsne-perplexity ${perplexity} \\
        --tsne-iter ${tsneIters} \\
        --out-umap-tsv ${meta.id}_umap.tsv \\
        --out-tsne-tsv ${meta.id}_tsne.tsv \\
        --out-umap-png ${meta.id}_umap.png \\
        --out-tsne-png ${meta.id}_tsne.png \\
        --out-pca-png ${meta.id}_pca.png

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
// Cluster samples in PCA space (k-means or DBSCAN), via an external
// Python script.
process CLUSTERING {
    tag "${meta.id}"
    label 'process_medium'
    // Fall back like the sibling processes (CLUSTER_METRICS / CLUSTER_VIZ)
    // so the module still runs when params.python_container is unset.
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/clustering", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info)
    path clustering_script

    output:
    tuple val(meta), path('clusters.csv'),         emit: clusters
    tuple val(meta), path('clustering_info.json'), emit: clustering_info
    // Emit a versions file for pipeline provenance, matching the other
    // processes in this subworkflow.
    path 'versions.yml',                           emit: versions

    script:
    def algorithm = params.algorithm ?: 'kmeans'
    def n_init    = params.n_init ?: 100
    def init_meth = params.init_method ?: 'random'

    // Build algorithm-specific CLI arguments.
    def extra_args = ''
    if( algorithm == 'kmeans' ) {
        def k = params.n_clusters ?: 3
        extra_args = "--k ${k} --n_init ${n_init} --init-method ${init_meth}"
    }
    else if( algorithm == 'dbscan' ) {
        // Default eps/min_samples so an unset param does not render as the
        // literal string 'null' on the command line; defaults follow
        // scikit-learn's DBSCAN (eps=0.5, min_samples=5).
        def eps  = params.dbscan_eps ?: 0.5
        def mins = params.dbscan_min_samples ?: 5
        extra_args = "--dbscan-eps ${eps} --dbscan-min-samples ${mins}"
    }
    """
    python3 ${clustering_script} \\
        --features ${pca_scores} \\
        --algorithm ${algorithm} \\
        ${extra_args} \\
        --out-clusters clusters.csv \\
        --out-info clustering_info.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
// Run flashpca on a PLINK bed/bim/fam trio and parse its raw output into
// TSV/JSON via an external Python helper script.
process PCA_FLASHPCA {
    tag "${meta.id}"
    cpus { params.threads as int }
    container params.flashpca_container
    publishDir "${params.outdir}/04_pca", mode: 'copy'

    input:
    tuple val(meta), path(bed), path(bim), path(fam)
    path parser_script

    output:
    tuple val(meta), path('features.tsv'), path('scaled.tsv'), path('pca_scores.tsv'), path('pca_info.json'), emit: pca
    path 'versions.yml', emit: versions

    script:
    def flashpca_bin = params.flashpca_bin ?: 'flashpca'
    // Number of principal components to compute (default 40), used for both
    // the flashpca run and the parser.
    def n_dims = params.n_pcs ?: 40
    """
    set -euo pipefail

    # Fail fast with a clear message if required tools are missing.
    command -v ${flashpca_bin} >/dev/null 2>&1 || { echo "ERROR: '${flashpca_bin}' not found in PATH" >&2; exit 127; }
    command -v python3 >/dev/null 2>&1 || { echo "ERROR: 'python3' not found" >&2; exit 127; }

    # flashpca takes the bed/bim/fam prefix, not individual file paths.
    prefix="${bed.baseName}"

    ${flashpca_bin} \\
        --bfile "\$prefix" \\
        --ndim ${n_dims} \\
        --numthreads ${task.cpus} \\
        --outpc outpc.raw \\
        --outmeansd scaled.tsv

    # Variant IDs (column 2 of the .bim) become the feature list.
    awk '{print \$2}' "${bim}" > features.tsv

    python3 ${parser_script} \\
        --outpc outpc.raw \\
        --n-pcs ${n_dims} \\
        --out-pca pca_scores.tsv \\
        --out-info pca_info.json \\
        --id-mode fid_iid

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        flashpca: \$(${flashpca_bin} --version 2>&1 | head -n 1 || true)
        python3: \$(python3 --version 2>&1)
    END_VERSIONS
    """
}
|
|
||
// Wrapper workflow: resolve the flashpca output parser script and run
// PCA_FLASHPCA over the incoming PLINK channel.
workflow pca_ch {
    take:
    plink_ch

    main:
    // Value channel so the script is staged alongside every task;
    // checkIfExists fails early if the repo layout is wrong.
    def parserScript = Channel.value(
        file("${projectDir}/subworkflows/nf-core/snpclustering/scripts/flashpca_outpc_to_tsv.py", checkIfExists: true)
    )

    PCA_FLASHPCA(plink_ch, parserScript)

    emit:
    pca      = PCA_FLASHPCA.out.pca
    versions = PCA_FLASHPCA.out.versions
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment. You already reuse plink/vcf; it might be worth building (or reusing, if it exists) the plink2/pgen2bed module. I would ask you to do that in a separate PR to keep the review load small :) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
// Convert a PLINK2 pgen/pvar/psam fileset to PLINK1 bed/bim/fam,
// keeping only biallelic variants (--max-alleles 2).
process PLINK2_PGEN2BED {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/plink2:2.00a5.10--h4ac6f70_0' :
        params.plink2_container }"

    input:
    tuple val(meta), path(pgen), path(pvar), path(psam)

    output:
    tuple val(meta), path("*.bed"), emit: bed
    tuple val(meta), path("*.bim"), emit: bim
    tuple val(meta), path("*.fam"), emit: fam
    path "versions.yml",            emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: meta.id
    // Reference the staged input files explicitly instead of `--pfile ${prefix}`:
    // the staged filenames are not guaranteed to match ${prefix}, so deriving
    // the fileset from `prefix` can point plink2 at files that do not exist.
    """
    plink2 \\
        --pgen ${pgen} \\
        --pvar ${pvar} \\
        --psam ${psam} \\
        --max-alleles 2 \\
        --make-bed \\
        ${args} \\
        --threads ${task.cpus} \\
        --out ${prefix}_bed

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        plink2: \$(plink2 --version 2>&1 | sed 's/^PLINK v//; s/ 64.*\$//')
    END_VERSIONS
    """
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot have a "local" module added to an nf-core subworkflow :) The module needs to be added following the nf-core guidelines, and then it can be used in the subworkflow!