diff --git a/deps/libvgio b/deps/libvgio index 3026f7d28e..aec1d10380 160000 --- a/deps/libvgio +++ b/deps/libvgio @@ -1 +1 @@ -Subproject commit 3026f7d28ef1576982968aff4eed7adf5a10f262 +Subproject commit aec1d1038064dc99f6a0fd68389cbaa545466fe4 diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 4a618a8ad2..af842fa5d3 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -884,7 +884,7 @@ class MinimizerMapper : public AlignerClient { * If we do gapless extension, turn good full-length gapless extensions into alignments and return them in alignments * Gapless extensions are considered good enough if they have fewer than default_max_extension_mismatches mismatches */ - void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, + void do_chaining_on_trees(const Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_rec_counts, std::vector& chain_source_tree, @@ -896,7 +896,7 @@ class MinimizerMapper : public AlignerClient { /** * Collect stats about the best chains for annotating the final alignment */ - void get_best_chain_stats( Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, + void get_best_chain_stats(const Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const std::vector>& chains, const std::vector& chain_source_tree, @@ -906,7 +906,7 @@ class MinimizerMapper : public AlignerClient { double& best_chain_average_jump, size_t& best_chain_anchors, size_t& best_chain_anchor_length, Funnel& funnel) const ; - void do_alignment_on_chains(Alignment& aln, const std::vector& seeds, + void do_alignment_on_chains(const Alignment& aln, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, const std::vector>& chains, @@ -925,7 +925,7 @@ class MinimizerMapper : public AlignerClient { * * If no alignments have a positive score, responsible for creating an unmapped mapping. */ - void pick_mappings_from_alignments(Alignment& aln, const std::vector& alignments, + void pick_mappings_from_alignments(const Alignment& aln, const std::vector& alignments, const std::vector& multiplicity_by_alignment, const std::vector& alignments_to_source, const std::vector& chain_score_estimates, std::vector& mappings, diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index e1bfd77441..b90800839e 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -718,6 +718,28 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { return aln.sequence(); }); + // Create a new alignment object to get rid of old annotations. + { + Alignment temp; + temp.set_sequence(aln.sequence()); + temp.set_name(aln.name()); + temp.set_quality(aln.quality()); + if (has_annotation(aln, "tags")) { + // Preserve any BAM tags, which might really have come in as FASTQ + // comments and which we want to keep. + // TODO: What if these came from a previous Giraffe run though??? + set_annotation(temp, "tags", get_annotation(aln, "tags")); + } + aln = std::move(temp); + } + + // Annotate the read with metadata + if (!sample_name.empty()) { + aln.set_sample_name(sample_name); + } + if (!read_group.empty()) { + aln.set_read_group(read_group); + } // Minimizers sorted by position std::vector minimizers_in_read = this->find_minimizers(aln.sequence(), funnel); @@ -1138,7 +1160,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { return mappings; } -void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, +void MinimizerMapper::do_chaining_on_trees(const Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, std::vector>& chains, std::vector>& chain_rec_flags, @@ -1811,7 +1833,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& -void MinimizerMapper::get_best_chain_stats(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, +void MinimizerMapper::get_best_chain_stats(const Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const std::vector>& chains, const std::vector& chain_source_tree, @@ -1872,7 +1894,7 @@ void MinimizerMapper::get_best_chain_stats(Alignment& aln, const ZipCodeForest& } -void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector& seeds, +void MinimizerMapper::do_alignment_on_chains(const Alignment& aln, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, const std::vector>& chains, @@ -1897,23 +1919,6 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector minimizer_kept_count(minimizers.size(), 0); #endif - // Create a new alignment object to get rid of old annotations. - { - Alignment temp; - temp.set_sequence(aln.sequence()); - temp.set_name(aln.name()); - temp.set_quality(aln.quality()); - aln = std::move(temp); - } - - // Annotate the read with metadata - if (!sample_name.empty()) { - aln.set_sample_name(sample_name); - } - if (!read_group.empty()) { - aln.set_read_group(read_group); - } - // Compute lower limit on chain score to actually investigate int chain_min_score = (int) (min_chain_score_per_base * aln.sequence().size()); // Apply the max in chain score limit @@ -2226,7 +2231,7 @@ void MinimizerMapper::do_alignment_on_chains(Alignment& aln, const std::vector& alignments, +void MinimizerMapper::pick_mappings_from_alignments(const Alignment& aln, const std::vector& alignments, const std::vector& multiplicity_by_alignment, const std::vector& alignments_to_source, const std::vector& chain_score_estimates, diff --git a/test/t/15_vg_surject.t b/test/t/15_vg_surject.t index 4bf8ee6a6e..a3f9e1940f 100644 --- a/test/t/15_vg_surject.t +++ b/test/t/15_vg_surject.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 75 +plan tests 77 vg construct -r small/x.fa >j.vg vg index -x j.xg j.vg @@ -125,9 +125,17 @@ is "$(cat surjected.sam | grep '@RG' | grep 'RG1' | grep 'Sample1' | wc -l)" "1" # a uniform random sequence printf "@read TG:Z:val\nGGCGACGTACTAGGGACTACAGTCCTTCGTCTTTCTCTCTCGACTCCGAA\n+\nHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n" > x.fq -is $(vg map -d x -t 1 -f x.fq -5 bam --comments-as-tags | samtools view -f 4 | grep "TG:Z:val" | wc -l | sed 's/^[[:space:]]*//') 1 "Tags are preserved on unmapped reads" +vg map -d x -t 1 -f x.fq --comments-as-tags >x.gam +is $(vg surject -p x -x x.xg --bam-output x.gam | samtools view -f 4 | grep "TG:Z:val" | wc -l | sed 's/^[[:space:]]*//') 1 "Tags are preserved on unmapped reads" +vg map -d x -t 1 -f x.fq --comments-as-tags --gaf >x.gaf +is $(vg surject -p x -x x.xg --bam-output --gaf-input x.gaf | samtools view -f 4 | grep "TG:Z:val" | wc -l | sed 's/^[[:space:]]*//') 1 "Tags are preserved on unmapped reads in GAF" -rm -rf j.vg x.vg j.gam x.gam x.idx j.xg x.xg x.gcsa read.gam reads.gam surjected.sam x.fq +# A sequence that should map +printf "@read TG:Z:val\nACAAGTTAGTTAATCTCTCTGAACTTCAGTTTAATTATCTCTAATATGGA\n+\nHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n" > x2.fq +vg map -d x -t 1 -f x2.fq --comments-as-tags --gaf >x2.gaf +is $(vg surject -p x -x x.xg --bam-output --gaf-input x2.gaf | samtools view | grep "TG:Z:val" | wc -l | sed 's/^[[:space:]]*//') 1 "Tags are preserved on mapped reads in GAF" + +rm -rf j.vg x.vg j.gam x.gam x.gaf x.idx j.xg x.xg x.gcsa read.gam reads.gam surjected.sam x.fq vg mod -c graphs/fail.vg >f.vg vg index -k 11 -g f.gcsa -x f.xg f.vg diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 56c07d4021..6badb11b55 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 83 +plan tests 87 vg construct -a -r small/x.fa -v small/x.vcf.gz >x.vg vg index -x x.xg x.vg @@ -116,12 +116,28 @@ vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq -f tagged2.fq --comments-as-tags -o BAM > t3.bam vg giraffe -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o GAF > t1.gaf +vg giraffe -b chaining-sr -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o BAM > t1_chaining.bam +vg giraffe -b chaining-sr -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o GAM > t1_chaining.gam +vg giraffe -b chaining-sr -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f tagged1.fq --comments-as-tags -o GAF > t1_chaining.gaf + is "$(samtools view t1.bam | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved on read 1" is "$(samtools view t2.bam | grep T4 | grep T5 | grep T6 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved on read 2" is "$(samtools view t3.bam | grep T1 | grep T2 | grep T3 | grep read1 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved on paired read 1" is "$(samtools view t3.bam | grep T4 | grep T5 | grep T6 | grep read2 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved on paired read 2" is "$(cat t1.gaf | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "GAF tags are preserved on read 1" +is "$(samtools view t1_chaining.bam | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are preserved in chaining mode" +is "$(vg view -aj t1_chaining.gam | jq -r '.annotation.tags' | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "BAM tags are present in GAM in chaining mode" +is "$(cat t1_chaining.gaf | grep T1 | grep T2 | grep T3 | wc -l | sed 's/^[[:space:]]*//')" "1" "GAF tags are preserved in chaining mode" + +# a uniform random sequence +printf "@read TG:Z:val\nGGCGACGTACTAGGGACTACAGTCCTTCGTCTTTCTCTCTCGACTCCGAA\n+\nHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH\n" > x.fq +vg giraffe -b chaining-sr -x x.xg -H x.gbwt -m x.shortread.withzip.min -z x.shortread.zipcodes -d x.dist -f x.fq --comments-as-tags -o GAF >x.gaf +is "$(cat x.gaf | grep 'TG:Z' | wc -l | sed 's/^[[:space:]]*//')" "1" "GAF tags are preserved in chaining mode for unmapped reads" + + +rm t1.bam t2.bam t3.bam t1_chaining.bam t1_chaining.gam t1_chaining.gaf t1.gaf tagged1.fq tagged2.fq x.fq +rm -f read.fq read.gam # Attempts to use mismatched files fail vg gbwt -G graphs/components_walks.gfa -g wrong.gbz @@ -129,9 +145,6 @@ vg giraffe -Z wrong.gbz -d x.dist -m x.shortread.withzip.min -z x.shortread.zipc is "${?}" "1" "mapping with mismatched GBZ and minimizer index fails" is "$(grep -c 'error.*are not compatible' log.txt)" "1" "appropriate error message is printed" -rm t1.bam t2.bam t3.bam t1.gaf tagged1.fq tagged2.fq -rm -f read.fq read.gam - rm -rf explanation_* vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.indel.multi.fq --show-work --track-position -b chaining-sr > /dev/null 2>&1