From 1881d9f6a221d4d2cd48aea72634e7cc344126e7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 15 Jun 2020 15:31:14 +0100 Subject: [PATCH 1/2] Update to the depth filtering for mpileup. We can now count the depth anywhere along the read rather than only at the left end. It's not buffering reads up so it doesn't know what's about to come, so for any individual column it'll still favour reads that end near a column than start near a column, but by counting at the right hand end we avoid the boom & bust approach to before where the coverage could plumet to zero immediately following a large spike. With iter->depth_pos = 1.0 the coverage ends up being a minimum of maxcnt (assuming the true coverage is higher), rather than a maximum of maxcnt. So we go over-depth, but avoid the drop outs. --- htslib/sam.h | 23 ++++++++++++++++++ sam.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 7d6b983f9..b0206ae6b 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1650,6 +1650,17 @@ typedef struct __bam_mplp_t *bam_mplp_t; HTSLIB_EXPORT void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); + /** + * bam_plp_set_cntpos() - indicate which end of reach to apply maxcnt. + * @plp: The bam_plp_t initialised using bam_plp_init. + * @end: A fraction along the read from 0.0 (start) to 1.0 (end). + * Defaults to 0.0 which makes maxcnt a true upper-bound value. + * Using 1.0 makes maxcnt a lower-bound, with upper-bound + * varying based on length and coordinate distribution. + */ + HTSLIB_EXPORT + void bam_plp_set_cntpos(bam_plp_t iter, double end); + HTSLIB_EXPORT void bam_plp_reset(bam_plp_t iter); @@ -1712,6 +1723,18 @@ typedef struct __bam_mplp_t *bam_mplp_t; HTSLIB_EXPORT void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); + /** + * bam_mplp_set_cntpos() - indicate which end of reach to apply maxcnt. + * @mplp: The bam_mplp_t initialised using bam_mplp_init. + * @end: A fraction along the read from 0.0 (start) to 1.0 (end). + * Defaults to 0.0 which makes maxcnt a true upper-bound value. + * Using 1.0 makes maxcnt a lower-bound, with upper-bound + * varying based on length and coordinate distribution. + */ + HTSLIB_EXPORT + void bam_mplp_set_cntpos(bam_mplp_t iter, double end); + + HTSLIB_EXPORT int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); diff --git a/sam.c b/sam.c index 74dab1f54..434c8b53c 100644 --- a/sam.c +++ b/sam.c @@ -4139,6 +4139,8 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) typedef khash_t(olap_hash) olap_hash_t; +#define MAX_AHEAD 4096 + struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail; @@ -4153,6 +4155,10 @@ struct __bam_plp_t { void *data; olap_hash_t *overlaps; + uint32_t depth[MAX_AHEAD]; + uint64_t last_depth_pos; + double depth_pos_fract; + // For notification of creation and destruction events // and associated client-owned pointer. int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd); @@ -4172,6 +4178,9 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) iter->data = data; iter->b = bam_init1(); } + memset(iter->depth, 0, MAX_AHEAD * sizeof(*iter->depth)); + iter->depth_pos_fract = 0; // fraction between 0 = left and 1 = right + iter->last_depth_pos = 0; return iter; } @@ -4494,21 +4503,60 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_ int bam_plp_push(bam_plp_t iter, const bam1_t *b) { + uint64_t endpos = 0; + if (iter->error) return -1; if (b) { if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } // Skip only unmapped reads here, any additional filtering must be done in iter->func if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } - if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) - { + if (iter->depth_pos_fract == 0 + && iter->tid == b->core.tid + && iter->pos == b->core.pos + && iter->mp->cnt > iter->maxcnt) { + // Removal from left-end depth; mp->cnt overlap_remove(iter, b); return 0; + } else if (iter->depth_pos_fract > 0 + && iter->tid == b->core.tid + && iter->pos == b->core.pos) { + // Removal from depth somewhere else along read + endpos = bam_endpos(b); + uint32_t len = (uint32_t)(endpos - b->core.pos - 1); + len = len * iter->depth_pos_fract + 0.4999; + if (b->core.pos + len < iter->last_depth_pos + && iter->depth[(b->core.pos+len)%MAX_AHEAD] > iter->maxcnt) { + // NB this means read longer than MAX_AHEAD (after downsizing + // by depth_pos fraction) will be retained as by definition + // it'll have zero coverage that far out. + overlap_remove(iter, b); + return 0; + } + + // Zero newly observed iter depth elemtns. + uint64_t end_clipped = endpos, p; + if (end_clipped > iter->pos + MAX_AHEAD) + end_clipped = iter->pos + MAX_AHEAD; + if (iter->last_depth_pos < end_clipped) { + //iter->last_depth_pos = end_clipped; + if (iter->last_depth_pos < end_clipped-MAX_AHEAD) + iter->last_depth_pos = end_clipped-MAX_AHEAD; + for (p = iter->last_depth_pos; p < end_clipped; p++) + iter->depth[p % MAX_AHEAD] = 0; + iter->last_depth_pos = end_clipped; + } + + // Increment depth + for (p = b->core.pos; p < end_clipped; p++) + iter->depth[p % MAX_AHEAD]++; } if (bam_copy1(&iter->tail->b, b) == NULL) return -1; iter->tail->b.id = iter->id++; iter->tail->beg = b->core.pos; - iter->tail->end = bam_endpos(b); + if (!endpos) + endpos = bam_endpos(b); + iter->tail->end = endpos; iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t if (b->core.tid < iter->max_tid) { hts_log_error("The input is not sorted (chromosomes out of order)"); @@ -4602,6 +4650,11 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) iter->maxcnt = maxcnt; } +void bam_plp_set_cntpos(bam_plp_t iter, double end) +{ + iter->depth_pos_fract = end; +} + /************************ *** Mpileup iterator *** ************************/ @@ -4651,6 +4704,13 @@ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) iter->iter[i]->maxcnt = maxcnt; } +void bam_mplp_set_cntpos(bam_mplp_t iter, double end) +{ + int i; + for (i = 0; i < iter->n; ++i) + iter->iter[i]->depth_pos_fract = end; +} + void bam_mplp_destroy(bam_mplp_t iter) { int i; From 46105c989de51413a70b483edf9ed42a98a36ba8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 16 Jun 2020 09:45:46 +0100 Subject: [PATCH 2/2] Revised the necessity to only remove duplicates with identical start coord. It's not so vital with the revised formula, so within 10 works good enough and prevents some more excessive depth raises. Also (ifdef-ed out) implemented the "other" more complex method, for comparison. Cull whichever we wish before merging. --- sam.c | 46 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/sam.c b/sam.c index 434c8b53c..c3e66071b 100644 --- a/sam.c +++ b/sam.c @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" @@ -4139,7 +4140,7 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) typedef khash_t(olap_hash) olap_hash_t; -#define MAX_AHEAD 4096 +#define MAX_AHEAD (65536*8) struct __bam_plp_t { mempool_t *mp; @@ -4155,14 +4156,16 @@ struct __bam_plp_t { void *data; olap_hash_t *overlaps; - uint32_t depth[MAX_AHEAD]; - uint64_t last_depth_pos; - double depth_pos_fract; - // For notification of creation and destruction events // and associated client-owned pointer. int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd); int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd); + + // Field used for automatic depth reduction + uint32_t depth[MAX_AHEAD]; // circular depth buffer from b->core.pos on. + uint64_t last_depth_pos; // array filed out to this pos + uint64_t max_depth_pos; // >= max depth up to this pos + double depth_pos_fract; // (Simple method) }; bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) @@ -4519,10 +4522,13 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) return 0; } else if (iter->depth_pos_fract > 0 && iter->tid == b->core.tid - && iter->pos == b->core.pos) { + && b->core.pos - iter->pos <= b->core.l_qseq / 16 + ) { // Removal from depth somewhere else along read endpos = bam_endpos(b); uint32_t len = (uint32_t)(endpos - b->core.pos - 1); + +#if 1 len = len * iter->depth_pos_fract + 0.4999; if (b->core.pos + len < iter->last_depth_pos && iter->depth[(b->core.pos+len)%MAX_AHEAD] > iter->maxcnt) { @@ -4532,6 +4538,27 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) overlap_remove(iter, b); return 0; } +#else + len = len * iter->depth_pos_fract + 0.4999; + if (len >= MAX_AHEAD) len = MAX_AHEAD-1; + int64_t pos_delta = (b->core.pos + len) - iter->max_depth_pos -1; + int last_depth = endpos > iter->last_depth_pos + ? 0 + : iter->depth[(iter->last_depth_pos-1)%MAX_AHEAD]; + int pos_depth = iter->depth[b->core.pos % MAX_AHEAD]; + +#ifndef MIN +#define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + if (drand48() > + //(double)h / 0xffffffff > + MIN(1.0, pow((double)pos_delta / (len+.01), 2)) + * MIN(1.0, pow((1-(last_depth+.1)/iter->maxcnt), 2)) + * MIN(1.0, pow((iter->maxcnt+.1)/pos_depth, 2)) *2) { + overlap_remove(iter, b); + return 0; + } +#endif // Zero newly observed iter depth elemtns. uint64_t end_clipped = endpos, p; @@ -4547,8 +4574,11 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) } // Increment depth - for (p = b->core.pos; p < end_clipped; p++) - iter->depth[p % MAX_AHEAD]++; + for (p = b->core.pos; p < end_clipped; p++) { + if (++iter->depth[p % MAX_AHEAD] >= iter->maxcnt) + if (iter->max_depth_pos < p) + iter->max_depth_pos = p; + } } if (bam_copy1(&iter->tail->b, b) == NULL) return -1;