Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion primertrim/align.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os.path
import logging
import subprocess
import tempfile

Expand Down Expand Up @@ -124,7 +124,9 @@ def _call(self, query_fp, output_fp, min_id=0.85, threads=None):
if threads is not None:
threads_arg = "{:d}".format(threads)
args.extend(["--threads", threads_arg])
logging.info(f"Starting vsearch with arguments: {args}")
subprocess.check_call(args, stderr=self.stderr)
logging.info("Finished vsearch")

def parse(self, f):
for line in f:
Expand Down
22 changes: 22 additions & 0 deletions primertrim/command.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import logging
import os
import sys
import tempfile
Expand Down Expand Up @@ -97,18 +98,35 @@ def main(argv=None):
"(default: %(default)s)"
),
)

p.add_argument(
"--log_level",
type=int,
help="Sets the log level, default is 20 for info, use 10 for debug (Default: 20)",
default=20,
)

args = p.parse_args(argv)
logging.basicConfig()
logging.getLogger().setLevel(args.log_level)
logging.info(f"Starting primertrim")

if args.input_fastq is None:
logging.info("Looking for input from stdin")
args.input_fastq = sys.stdin

if args.output_fastq is None:
logging.info("Writing output to stdout")
args.output_fastq = sys.stdout

queryset = []
for ambiguous_primer in args.primer:
for unambiguous_primer in deambiguate(ambiguous_primer):
queryset.append(unambiguous_primer)
if len(queryset) > 50:
logging.info(f"Number of queries: {len(queryset)}")
else:
logging.info(f"Query set: {queryset}")

matchers = [
CompleteMatcher(queryset, args.mismatches, not args.no_revcomp),
Expand All @@ -123,12 +141,14 @@ def main(argv=None):
else:
temp_alignment_dir = tempfile.TemporaryDirectory()
alignment_dir = temp_alignment_dir.name
logging.info(f"Using alignment directory: {alignment_dir}")
am = AlignmentMatcher(queryset, alignment_dir, args.align_id, args.threads)
matchers.append(am)

trimmable_reads = TrimmableReads.from_fastq(args.input_fastq)

for m in matchers:
logging.info(f"Working with {type(m).__name__}")
unmatched_seqs = trimmable_reads.get_unmatched_seqs()
matches_found = m.find_in_seqs(unmatched_seqs)
for read_id, matchobj in matches_found:
Expand All @@ -143,11 +163,13 @@ def main(argv=None):


def write_fastq(f, reads):
logging.info("Writing fastq")
for desc, seq, qual in reads:
f.write("@{0}\n{1}\n+\n{2}\n".format(desc, seq, qual))


def write_log(f, loginfo, colnames=None):
logging.info("Writing log")
if colnames:
f.write("\t".join(colnames))
f.write("\n")
Expand Down
41 changes: 23 additions & 18 deletions primertrim/matcher.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import abc
import collections
import itertools
import logging
import os.path
import tqdm

from .dna import (
AMBIGUOUS_BASES_COMPLEMENT,
Expand Down Expand Up @@ -59,24 +61,27 @@ def _mismatched_queries(self, n_mismatch):

def _iter_mismatched_queries(self, n_mismatch):
# This algorithm is terrible unless the number of mismatches is very small
assert n_mismatch in [0, 1, 2, 3]
for query in self.queryset:
idx_sets = itertools.combinations(range(len(query)), n_mismatch)
for idx_set in idx_sets:
# Replace base at each position with a literal "N", to match
# ambiguous bases in other sequences
yield replace_with_n(query, idx_set)
# Change to list because strings are immutable
qchars = list(query)
# Replace the base at each mismatch position with an
# ambiguous base specifying all possibilities BUT the one
# we see.
for idx in idx_set:
qchars[idx] = AMBIGUOUS_BASES_COMPLEMENT[qchars[idx]]
# Expand to all possibilities for mismatching at this
# particular set of positions
for query_with_mismatches in deambiguate(qchars):
yield query_with_mismatches
assert n_mismatch in [0, 1, 2, 3], f"Mismatch number {n_mismatch} is too high"
logging.info(f"Iterating over {n_mismatch} mismatch query set")
with tqdm.tqdm(total=len(self.queryset)) as pbar:
for query in self.queryset:
pbar.update(1)
idx_sets = itertools.combinations(range(len(query)), n_mismatch)
for idx_set in idx_sets:
# Replace base at each position with a literal "N", to match
# ambiguous bases in other sequences
yield replace_with_n(query, idx_set)
# Change to list because strings are immutable
qchars = list(query)
# Replace the base at each mismatch position with an
# ambiguous base specifying all possibilities BUT the one
# we see.
for idx in idx_set:
qchars[idx] = AMBIGUOUS_BASES_COMPLEMENT[qchars[idx]]
# Expand to all possibilities for mismatching at this
# particular set of positions
for query_with_mismatches in deambiguate(qchars):
yield query_with_mismatches

def find_match(self, seq):
for n_mismatches, queryset in enumerate(self.mismatched_queryset):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
"ptrim=primertrim.command:main",
],
},
install_requires=["tqdm"],
)