diff --git a/primertrim/align.py b/primertrim/align.py index 0164ecd..030f0f7 100644 --- a/primertrim/align.py +++ b/primertrim/align.py @@ -1,4 +1,4 @@ -import os.path +import logging import subprocess import tempfile @@ -124,7 +124,9 @@ def _call(self, query_fp, output_fp, min_id=0.85, threads=None): if threads is not None: threads_arg = "{:d}".format(threads) args.extend(["--threads", threads_arg]) + logging.info(f"Starting vsearch with arguments: {args}") subprocess.check_call(args, stderr=self.stderr) + logging.info("Finished vsearch") def parse(self, f): for line in f: diff --git a/primertrim/command.py b/primertrim/command.py index d6ff5fd..4b47ef1 100644 --- a/primertrim/command.py +++ b/primertrim/command.py @@ -1,4 +1,5 @@ import argparse +import logging import os import sys import tempfile @@ -97,18 +98,35 @@ def main(argv=None): "(default: %(default)s)" ), ) + + p.add_argument( + "--log_level", + type=int, + help="Sets the log level, default is 20 for info, use 10 for debug (Default: 20)", + default=20, + ) + args = p.parse_args(argv) + logging.basicConfig() + logging.getLogger().setLevel(args.log_level) + logging.info(f"Starting primertrim") if args.input_fastq is None: + logging.info("Looking for input from stdin") args.input_fastq = sys.stdin if args.output_fastq is None: + logging.info("Writing output to stdout") args.output_fastq = sys.stdout queryset = [] for ambiguous_primer in args.primer: for unambiguous_primer in deambiguate(ambiguous_primer): queryset.append(unambiguous_primer) + if len(queryset) > 50: + logging.info(f"Number of queries: {len(queryset)}") + else: + logging.info(f"Query set: {queryset}") matchers = [ CompleteMatcher(queryset, args.mismatches, not args.no_revcomp), @@ -123,12 +141,14 @@ def main(argv=None): else: temp_alignment_dir = tempfile.TemporaryDirectory() alignment_dir = temp_alignment_dir.name + logging.info(f"Using alignment directory: {alignment_dir}") am = AlignmentMatcher(queryset, alignment_dir, args.align_id, args.threads) matchers.append(am) trimmable_reads = TrimmableReads.from_fastq(args.input_fastq) for m in matchers: + logging.info(f"Working with {type(m).__name__}") unmatched_seqs = trimmable_reads.get_unmatched_seqs() matches_found = m.find_in_seqs(unmatched_seqs) for read_id, matchobj in matches_found: @@ -143,11 +163,13 @@ def main(argv=None): def write_fastq(f, reads): + logging.info("Writing fastq") for desc, seq, qual in reads: f.write("@{0}\n{1}\n+\n{2}\n".format(desc, seq, qual)) def write_log(f, loginfo, colnames=None): + logging.info("Writing log") if colnames: f.write("\t".join(colnames)) f.write("\n") diff --git a/primertrim/matcher.py b/primertrim/matcher.py index c7db111..58b4dc3 100644 --- a/primertrim/matcher.py +++ b/primertrim/matcher.py @@ -1,7 +1,9 @@ import abc import collections import itertools +import logging import os.path +import tqdm from .dna import ( AMBIGUOUS_BASES_COMPLEMENT, @@ -59,24 +61,27 @@ def _mismatched_queries(self, n_mismatch): def _iter_mismatched_queries(self, n_mismatch): # This algorithm is terrible unless the number of mismatches is very small - assert n_mismatch in [0, 1, 2, 3] - for query in self.queryset: - idx_sets = itertools.combinations(range(len(query)), n_mismatch) - for idx_set in idx_sets: - # Replace base at each position with a literal "N", to match - # ambiguous bases in other sequences - yield replace_with_n(query, idx_set) - # Change to list because strings are immutable - qchars = list(query) - # Replace the base at each mismatch position with an - # ambiguous base specifying all possibilities BUT the one - # we see. - for idx in idx_set: - qchars[idx] = AMBIGUOUS_BASES_COMPLEMENT[qchars[idx]] - # Expand to all possibilities for mismatching at this - # particular set of positions - for query_with_mismatches in deambiguate(qchars): - yield query_with_mismatches + assert n_mismatch in [0, 1, 2, 3], f"Mismatch number {n_mismatch} is too high" + logging.info(f"Iterating over {n_mismatch} mismatch query set") + with tqdm.tqdm(total=len(self.queryset)) as pbar: + for query in self.queryset: + pbar.update(1) + idx_sets = itertools.combinations(range(len(query)), n_mismatch) + for idx_set in idx_sets: + # Replace base at each position with a literal "N", to match + # ambiguous bases in other sequences + yield replace_with_n(query, idx_set) + # Change to list because strings are immutable + qchars = list(query) + # Replace the base at each mismatch position with an + # ambiguous base specifying all possibilities BUT the one + # we see. + for idx in idx_set: + qchars[idx] = AMBIGUOUS_BASES_COMPLEMENT[qchars[idx]] + # Expand to all possibilities for mismatching at this + # particular set of positions + for query_with_mismatches in deambiguate(qchars): + yield query_with_mismatches def find_match(self, seq): for n_mismatches, queryset in enumerate(self.mismatched_queryset): diff --git a/setup.py b/setup.py index 214a831..cb69331 100644 --- a/setup.py +++ b/setup.py @@ -13,4 +13,5 @@ "ptrim=primertrim.command:main", ], }, + install_requires=["tqdm"], )