-
Notifications
You must be signed in to change notification settings - Fork 1k
Add snpclustering subworkflow #11059
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
6af3ab8
dea109e
8aac674
7f27d21
dabc976
81b57d1
b796acd
09473bc
b35034a
d6fdd58
c56e54c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
// Compute cluster-quality metrics (and a k sweep) from PCA scores + cluster
// assignments, via an external Python script.
process CLUSTER_METRICS {
    tag "${meta.id}"
    label 'process_medium'
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/metrics", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info), path(clusters)
    path cluster_metrics_script

    output:
    tuple val(meta), path("${meta.id}_metrics.tsv"),   emit: metrics
    tuple val(meta), path("${meta.id}_k_sweep.csv"),   emit: k_sweep
    tuple val(meta), path("${meta.id}_selected.json"), emit: selected
    path "versions.yml",                               emit: versions

    script:
    // Bounds of the k range swept by the metrics script (defaults: 2..12).
    def kLow  = params.k_min ?: 2
    def kHigh = params.k_max ?: 12
    """
    python3 ${cluster_metrics_script} \\
        --features ${pca_scores} \\
        --clusters ${clusters} \\
        --k-min ${kLow} \\
        --k-max ${kHigh} \\
        --out-k-sweep ${meta.id}_k_sweep.csv \\
        --out-selected ${meta.id}_selected.json \\
        --out-prefix ${meta.id}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
// Produce UMAP / t-SNE / PCA visualisations of the clustered samples,
// via an external Python script.
process CLUSTER_VIZ {
    tag "${meta.id}"
    label 'process_medium'
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/viz", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info), path(clusters)
    path cluster_viz_script

    output:
    tuple val(meta), path("${meta.id}_umap.png"),
                     path("${meta.id}_tsne.png"),
                     path("${meta.id}_pca.png"),  emit: plots
    tuple val(meta), path("${meta.id}_umap.tsv"), emit: umap_tsv
    tuple val(meta), path("${meta.id}_tsne.tsv"), emit: tsne_tsv
    path "versions.yml",                          emit: versions

    script:
    // Embedding hyper-parameters, overridable via params.
    def nNeighbors = params.viz_umap_neighbors ?: 15
    def minDist    = params.viz_umap_min_dist ?: 0.1
    def perplexity = params.viz_perplexity ?: 30
    def tsneIters  = params.viz_tsne_iter ?: 1000
    // NOTE(review): --features and --pca-scores both receive ${pca_scores};
    // presumably intentional, but worth confirming against the script's CLI.
    """
    export NUMBA_DISABLE_JIT=1

    python3 ${cluster_viz_script} \\
        --features ${pca_scores} \\
        --clusters ${clusters} \\
        --pca-scores ${pca_scores} \\
        --umap-neighbors ${nNeighbors} \\
        --umap-min-dist ${minDist} \\
        --tsne-perplexity ${perplexity} \\
        --tsne-iter ${tsneIters} \\
        --out-umap-tsv ${meta.id}_umap.tsv \\
        --out-tsne-tsv ${meta.id}_tsne.tsv \\
        --out-umap-png ${meta.id}_umap.png \\
        --out-tsne-png ${meta.id}_tsne.png \\
        --out-pca-png ${meta.id}_pca.png

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
// Cluster samples in PCA space (k-means or DBSCAN), via an external
// Python script.
process CLUSTERING {
    tag "${meta.id}"
    label 'process_medium'
    // Fall back like the sibling processes (CLUSTER_METRICS / CLUSTER_VIZ)
    // so the module still runs when params.python_container is unset.
    container "${params.python_container ?: params.container_py ?: 'snpclustering-py:latest'}"
    publishDir "${params.outdir}/clustering", mode: 'copy'

    input:
    tuple val(meta), path(pca_scores), path(pca_info)
    path clustering_script

    output:
    tuple val(meta), path('clusters.csv'),         emit: clusters
    tuple val(meta), path('clustering_info.json'), emit: clustering_info
    // Emit a versions file for pipeline provenance, matching the other
    // processes in this subworkflow.
    path 'versions.yml',                           emit: versions

    script:
    def algorithm = params.algorithm ?: 'kmeans'
    def n_init    = params.n_init ?: 100
    def init_meth = params.init_method ?: 'random'

    // Build algorithm-specific CLI arguments.
    def extra_args = ''
    if( algorithm == 'kmeans' ) {
        def k = params.n_clusters ?: 3
        extra_args = "--k ${k} --n_init ${n_init} --init-method ${init_meth}"
    }
    else if( algorithm == 'dbscan' ) {
        // Default eps/min_samples so an unset param does not render as the
        // literal string 'null' on the command line; defaults follow
        // scikit-learn's DBSCAN (eps=0.5, min_samples=5).
        def eps  = params.dbscan_eps ?: 0.5
        def mins = params.dbscan_min_samples ?: 5
        extra_args = "--dbscan-eps ${eps} --dbscan-min-samples ${mins}"
    }
    """
    python3 ${clustering_script} \\
        --features ${pca_scores} \\
        --algorithm ${algorithm} \\
        ${extra_args} \\
        --out-clusters clusters.csv \\
        --out-info clustering_info.json

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        scikit-learn: \$(python3 -c "import sklearn; print(sklearn.__version__)")
    END_VERSIONS
    """
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
// Run flashpca on a PLINK bed/bim/fam trio and parse its raw output into
// TSV/JSON via an external Python helper script.
process PCA_FLASHPCA {
    tag "${meta.id}"
    cpus { params.threads as int }
    container params.flashpca_container
    publishDir "${params.outdir}/04_pca", mode: 'copy'

    input:
    tuple val(meta), path(bed), path(bim), path(fam)
    path parser_script

    output:
    tuple val(meta), path('features.tsv'), path('scaled.tsv'), path('pca_scores.tsv'), path('pca_info.json'), emit: pca
    path 'versions.yml', emit: versions

    script:
    def flashpca_bin = params.flashpca_bin ?: 'flashpca'
    // Number of principal components to compute (default 40), used for both
    // the flashpca run and the parser.
    def n_dims = params.n_pcs ?: 40
    """
    set -euo pipefail

    # Fail fast with a clear message if required tools are missing.
    command -v ${flashpca_bin} >/dev/null 2>&1 || { echo "ERROR: '${flashpca_bin}' not found in PATH" >&2; exit 127; }
    command -v python3 >/dev/null 2>&1 || { echo "ERROR: 'python3' not found" >&2; exit 127; }

    # flashpca takes the bed/bim/fam prefix, not individual file paths.
    prefix="${bed.baseName}"

    ${flashpca_bin} \\
        --bfile "\$prefix" \\
        --ndim ${n_dims} \\
        --numthreads ${task.cpus} \\
        --outpc outpc.raw \\
        --outmeansd scaled.tsv

    # Variant IDs (column 2 of the .bim) become the feature list.
    awk '{print \$2}' "${bim}" > features.tsv

    python3 ${parser_script} \\
        --outpc outpc.raw \\
        --n-pcs ${n_dims} \\
        --out-pca pca_scores.tsv \\
        --out-info pca_info.json \\
        --id-mode fid_iid

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        flashpca: \$(${flashpca_bin} --version 2>&1 | head -n 1 || true)
        python3: \$(python3 --version 2>&1)
    END_VERSIONS
    """
}
|
|
||
// Wrapper workflow: resolve the flashpca output parser script and run
// PCA_FLASHPCA over the incoming PLINK channel.
workflow pca_ch {
    take:
    plink_ch

    main:
    // Value channel so the script is staged alongside every task;
    // checkIfExists fails early if the repo layout is wrong.
    def parserScript = Channel.value(
        file("${projectDir}/subworkflows/nf-core/snpclustering/scripts/flashpca_outpc_to_tsv.py", checkIfExists: true)
    )

    PCA_FLASHPCA(plink_ch, parserScript)

    emit:
    pca      = PCA_FLASHPCA.out.pca
    versions = PCA_FLASHPCA.out.versions
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See above comment. You already reuse plink/vcf; it might be worth building (or reusing, if it exists) the plink2/pgen2bed module. I would ask you to do that in a separate PR to keep the review load small :) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
// Convert a PLINK2 pgen/pvar/psam fileset to PLINK1 bed/bim/fam,
// keeping only biallelic variants (--max-alleles 2).
process PLINK2_PGEN2BED {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/plink2:2.00a5.10--h4ac6f70_0' :
        params.plink2_container }"

    input:
    tuple val(meta), path(pgen), path(pvar), path(psam)

    output:
    tuple val(meta), path("*.bed"), emit: bed
    tuple val(meta), path("*.bim"), emit: bim
    tuple val(meta), path("*.fam"), emit: fam
    path "versions.yml",            emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args   = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: meta.id
    // Reference the staged input files explicitly instead of `--pfile ${prefix}`:
    // the staged filenames are not guaranteed to match ${prefix}, so deriving
    // the fileset from `prefix` can point plink2 at files that do not exist.
    """
    plink2 \\
        --pgen ${pgen} \\
        --pvar ${pvar} \\
        --psam ${psam} \\
        --max-alleles 2 \\
        --make-bed \\
        ${args} \\
        --threads ${task.cpus} \\
        --out ${prefix}_bed

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        plink2: \$(plink2 --version 2>&1 | sed 's/^PLINK v//; s/ 64.*\$//')
    END_VERSIONS
    """
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot have a "local" module added to an nf-core subworkflow :) The module needs to be added following the nf-core guidelines, and then it can be used in the subworkflow!