Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions htslib/sam.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */

#include <errno.h>
#include <stdint.h>
#include <sys/types.h>
#include "hts.h"
#include "hts_endian.h"

Expand Down Expand Up @@ -1108,23 +1109,23 @@ int bam_set_qname(bam1_t *b, const char *qname);
@param in [in] pointer to the source string
@param end [out] address of the pointer to the new end of the input string
can be NULL
@param a_cigar [out] address of the destination uint32_t buffer
@param a_cigar [in/out] address of the destination uint32_t buffer
@param a_mem [in/out] address of the allocated number of buffer elements
@return number of processed CIGAR operators; 0 if error
@return number of processed CIGAR operators; -1 on error
*/
HTSLIB_EXPORT
size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem);
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem);

/*! @function
@abstract Parse a CIGAR string into a bam1_t struct
@param in [in] pointer to the source string
@param end [out] address of the pointer to the new end of the input string
can be NULL
@param b [in/out] address of the destination bam1_t struct
@return number of processed CIGAR operators; 0 if error
@return number of processed CIGAR operators; -1 on error
*/
HTSLIB_EXPORT
size_t bam_parse_cigar(const char *in, char **end, bam1_t *b);
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b);

/*************************
*** BAM/CRAM indexing ***
Expand Down
44 changes: 26 additions & 18 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -2150,8 +2150,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
if (*p != '*') {
uint32_t *cigar = NULL;
int old_l_data = b->l_data;
uint32_t n_cigar = bam_parse_cigar(p, &p, b);
if (!n_cigar || *p++ != '\t') goto err_ret;
int n_cigar = bam_parse_cigar(p, &p, b);
if (n_cigar < 1 || *p++ != '\t') goto err_ret;
cigar = (uint32_t *)(b->data + old_l_data);
c->n_cigar = n_cigar;

Expand Down Expand Up @@ -2319,9 +2319,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
return -2;
}

static uint32_t read_ncigar(const char *in) {
static uint32_t read_ncigar(const char *q) {
uint32_t n_cigar = 0;
char *q = (char *)in;
for (; *q && *q != '\t'; ++q)
if (!isdigit_c(*q)) ++n_cigar;
if (!n_cigar) {
Expand All @@ -2340,16 +2339,16 @@ static uint32_t read_ncigar(const char *in) {
@abstract Parse a CIGAR string into preallocated a uint32_t array
@param in [in] pointer to the source string
@param a_cigar [out] address of the destination uint32_t buffer
@return number of processed input characters; 0 if error
@return number of processed input characters; 0 on error
*/
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
int i, overflow = 0;
char *p, *q = (char *)in;
const char *p = in;
for (i = 0; i < n_cigar; i++) {
uint32_t len;
int op;
p = q;
len = hts_str2uint(q, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
char *q;
len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
if (q == p) {
hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
return 0;
Expand All @@ -2358,7 +2357,8 @@ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
return 0;
}
op = bam_cigar_table[(unsigned char)*q++];
p = q;
op = bam_cigar_table[(unsigned char)*p++];
if (op < 0) {
hts_log_error("Unrecognized CIGAR operator");
return 0;
Expand All @@ -2367,19 +2367,23 @@ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
a_cigar[i] |= op;
}

return q-in;
return p-in;
}

size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem) {
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
size_t n_cigar = 0;
int diff;

if (!in || !a_cigar || !a_mem) {
hts_log_error("NULL pointer arguments");
return 0;
return -1;
}
if (end) *end = (char *)in;

if (*in == '*') {
if (end) (*end)++;
return 0;
}
n_cigar = read_ncigar(in);
if (!n_cigar) return 0;
if (n_cigar > *a_mem) {
Expand All @@ -2389,34 +2393,38 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t
*a_mem = n_cigar;
} else {
hts_log_error("Memory allocation error");
return 0;
return -1;
}
}

if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return 0;
if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
if (end) *end = (char *)in+diff;

return n_cigar;
}

size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
size_t n_cigar = 0;
int diff;

if (!in || !b) {
hts_log_error("NULL pointer arguments");
return 0;
return -1;
}
if (end) *end = (char *)in;

if (*in == '*') {
if (end) (*end)++;
return 0;
}
n_cigar = read_ncigar(in);
if (!n_cigar) return 0;
if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) {
hts_log_error("Memory allocation error");
return 0;
return -1;
}

if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0;
if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1;
b->l_data += (n_cigar * sizeof(uint32_t));
if (end) *end = (char *)in+diff;

Expand Down
23 changes: 23 additions & 0 deletions test/sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */
#include <math.h>
#include <assert.h>
#include <unistd.h>
#include <sys/types.h>

// Suppress message for faidx_fetch_nseq(), which we're intentionally testing
#include "../htslib/hts_defs.h"
Expand Down Expand Up @@ -2149,6 +2150,27 @@ static void test_bam_set1_write_and_read_back()
ks_free(&ks);
}

static void test_cigar_api(void)
{
uint32_t *buf = NULL;
char *cig = "*";
char *end;
size_t m = 0;
int n;
n = sam_parse_cigar(cig, &end, &buf, &m);
VERIFY(n == 0 && m == 0 && (end-cig) == 1, "failed to parse undefined CIGAR");
cig = "2M3X1I10M5D";
n = sam_parse_cigar(cig, &end, &buf, &m);
VERIFY(n == 5 && m > 0 && (end-cig) == 11, "failed to parse CIGAR string: 2M3X1I10M5D");
n = sam_parse_cigar("722M15D187217376188323783284M67I", NULL, &buf, &m);
Copy link
Copy Markdown
Contributor

@jkbonfield jkbonfield Nov 26, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Edit: ignore me! Sorry it's validating it can't parse it. Good one. :-)

I'm not sure what the point is of this test. 187217376188323783284 is 68 bits long.

Cigars are held inside htslib as uint32_t with 4 bits being the type, so the maximum length of cigar op we currently handle is 268435455 (2^28-1). CRAM could handle longer lengths (if #976 got merged), but I think we rejected that idea as the in-memory htslib struct would need fixing first.

I'm sure we discussed changing the cigar type when we were doing all of our sweeping ABI changes and it was rejected, so I think we're committed to having excessively long cigar lengths as a series of concatenated smaller lengths using the same op type. (I don't see a reason to support that though until we get reads long enough and accurate enough to generate such data. We're considerably far off that still.)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does raise the question though why the test isn't:

VERIFY(n == -1, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I");

The documentation for sam_parse_cigar says -1 on error and "number of processed CIGAR operators" otherwise. If the CIGAR has been rejected (even if syntacically valid) then it should probably return -1 in this case.

VERIFY(n == -1, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I");
n = sam_parse_cigar("53I722MD8X", NULL, &buf, &m);
VERIFY(n == -1, "failed to flag CIGAR string with no op length: 53I722MD8X");

cleanup:
free(buf);
}

int main(int argc, char **argv)
{
int i;
Expand Down Expand Up @@ -2186,6 +2208,7 @@ int main(int argc, char **argv)
test_bam_set1_validate_cigar();
test_bam_set1_validate_size_limits();
test_bam_set1_write_and_read_back();
test_cigar_api();

return status;
}