sam.c 54 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <string.h>
  4. #include <errno.h>
  5. #include <ctype.h>
  6. #include <zlib.h>
  7. #include "htslib/sam.h"
  8. #include "htslib/bgzf.h"
  9. #include "cram/cram.h"
  10. #include "htslib/hfile.h"
  11. #include "htslib/khash.h"
  12. KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
  13. typedef khash_t(s2i) sdict_t;
  14. /**********************
  15. *** BAM header I/O ***
  16. **********************/
  17. bam_hdr_t *bam_hdr_init()
  18. {
  19. return (bam_hdr_t*)calloc(1, sizeof(bam_hdr_t));
  20. }
  21. void bam_hdr_destroy(bam_hdr_t *h)
  22. {
  23. int32_t i;
  24. if (h == NULL) return;
  25. if (h->target_name) {
  26. for (i = 0; i < h->n_targets; ++i)
  27. free(h->target_name[i]);
  28. free(h->target_name);
  29. free(h->target_len);
  30. }
  31. free(h->text); free(h->cigar_tab);
  32. if (h->sdict) kh_destroy(s2i, (sdict_t*)h->sdict);
  33. free(h);
  34. }
  35. bam_hdr_t *bam_hdr_dup(const bam_hdr_t *h0)
  36. {
  37. if (h0 == NULL) return NULL;
  38. bam_hdr_t *h;
  39. if ((h = bam_hdr_init()) == NULL) return NULL;
  40. // copy the simple data
  41. h->n_targets = h0->n_targets;
  42. h->ignore_sam_err = h0->ignore_sam_err;
  43. h->l_text = h0->l_text;
  44. // Then the pointery stuff
  45. h->cigar_tab = NULL;
  46. h->sdict = NULL;
  47. h->text = (char*)calloc(h->l_text + 1, 1);
  48. memcpy(h->text, h0->text, h->l_text);
  49. h->target_len = (uint32_t*)calloc(h->n_targets, 4);
  50. h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
  51. int i;
  52. for (i = 0; i < h->n_targets; ++i) {
  53. h->target_len[i] = h0->target_len[i];
  54. h->target_name[i] = strdup(h0->target_name[i]);
  55. }
  56. return h;
  57. }
  58. static bam_hdr_t *hdr_from_dict(sdict_t *d)
  59. {
  60. bam_hdr_t *h;
  61. khint_t k;
  62. h = bam_hdr_init();
  63. h->sdict = d;
  64. h->n_targets = kh_size(d);
  65. h->target_len = (uint32_t*)malloc(4 * h->n_targets);
  66. h->target_name = (char**)malloc(sizeof(char*) * h->n_targets);
  67. for (k = kh_begin(d); k != kh_end(d); ++k) {
  68. if (!kh_exist(d, k)) continue;
  69. h->target_name[kh_val(d, k)>>32] = (char*)kh_key(d, k);
  70. h->target_len[kh_val(d, k)>>32] = kh_val(d, k)<<32>>32;
  71. kh_val(d, k) >>= 32;
  72. }
  73. return h;
  74. }
  75. bam_hdr_t *bam_hdr_read(BGZF *fp)
  76. {
  77. bam_hdr_t *h;
  78. char buf[4];
  79. int magic_len, has_EOF;
  80. int32_t i = 1, name_len;
  81. // check EOF
  82. has_EOF = bgzf_check_EOF(fp);
  83. if (has_EOF < 0) {
  84. perror("[W::sam_hdr_read] bgzf_check_EOF");
  85. } else if (has_EOF == 0 && hts_verbose >= 2)
  86. fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
  87. // read "BAM1"
  88. magic_len = bgzf_read(fp, buf, 4);
  89. if (magic_len != 4 || strncmp(buf, "BAM\1", 4)) {
  90. if (hts_verbose >= 1) fprintf(stderr, "[E::%s] invalid BAM binary header\n", __func__);
  91. return 0;
  92. }
  93. h = bam_hdr_init();
  94. // read plain text and the number of reference sequences
  95. bgzf_read(fp, &h->l_text, 4);
  96. if (fp->is_be) ed_swap_4p(&h->l_text);
  97. h->text = (char*)malloc(h->l_text + 1);
  98. h->text[h->l_text] = 0; // make sure it is NULL terminated
  99. bgzf_read(fp, h->text, h->l_text);
  100. bgzf_read(fp, &h->n_targets, 4);
  101. if (fp->is_be) ed_swap_4p(&h->n_targets);
  102. // read reference sequence names and lengths
  103. h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
  104. h->target_len = (uint32_t*)calloc(h->n_targets, 4);
  105. for (i = 0; i != h->n_targets; ++i) {
  106. bgzf_read(fp, &name_len, 4);
  107. if (fp->is_be) ed_swap_4p(&name_len);
  108. h->target_name[i] = (char*)calloc(name_len, 1);
  109. bgzf_read(fp, h->target_name[i], name_len);
  110. bgzf_read(fp, &h->target_len[i], 4);
  111. if (fp->is_be) ed_swap_4p(&h->target_len[i]);
  112. }
  113. return h;
  114. }
  115. int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
  116. {
  117. char buf[4];
  118. int32_t i, name_len, x;
  119. // write "BAM1"
  120. strncpy(buf, "BAM\1", 4);
  121. bgzf_write(fp, buf, 4);
  122. // write plain text and the number of reference sequences
  123. if (fp->is_be) {
  124. x = ed_swap_4(h->l_text);
  125. bgzf_write(fp, &x, 4);
  126. if (h->l_text) bgzf_write(fp, h->text, h->l_text);
  127. x = ed_swap_4(h->n_targets);
  128. bgzf_write(fp, &x, 4);
  129. } else {
  130. bgzf_write(fp, &h->l_text, 4);
  131. if (h->l_text) bgzf_write(fp, h->text, h->l_text);
  132. bgzf_write(fp, &h->n_targets, 4);
  133. }
  134. // write sequence names and lengths
  135. for (i = 0; i != h->n_targets; ++i) {
  136. char *p = h->target_name[i];
  137. name_len = strlen(p) + 1;
  138. if (fp->is_be) {
  139. x = ed_swap_4(name_len);
  140. bgzf_write(fp, &x, 4);
  141. } else bgzf_write(fp, &name_len, 4);
  142. bgzf_write(fp, p, name_len);
  143. if (fp->is_be) {
  144. x = ed_swap_4(h->target_len[i]);
  145. bgzf_write(fp, &x, 4);
  146. } else bgzf_write(fp, &h->target_len[i], 4);
  147. }
  148. bgzf_flush(fp);
  149. return 0;
  150. }
  151. int bam_name2id(bam_hdr_t *h, const char *ref)
  152. {
  153. sdict_t *d = (sdict_t*)h->sdict;
  154. khint_t k;
  155. if (h->sdict == 0) {
  156. int i, absent;
  157. d = kh_init(s2i);
  158. for (i = 0; i < h->n_targets; ++i) {
  159. k = kh_put(s2i, d, h->target_name[i], &absent);
  160. kh_val(d, k) = i;
  161. }
  162. h->sdict = d;
  163. }
  164. k = kh_get(s2i, d, ref);
  165. return k == kh_end(d)? -1 : kh_val(d, k);
  166. }
  167. /*************************
  168. *** BAM alignment I/O ***
  169. *************************/
  170. bam1_t *bam_init1()
  171. {
  172. return (bam1_t*)calloc(1, sizeof(bam1_t));
  173. }
  174. void bam_destroy1(bam1_t *b)
  175. {
  176. if (b == 0) return;
  177. free(b->data); free(b);
  178. }
  179. bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
  180. {
  181. uint8_t *data = bdst->data;
  182. int m_data = bdst->m_data; // backup data and m_data
  183. if (m_data < bsrc->l_data) { // double the capacity
  184. m_data = bsrc->l_data; kroundup32(m_data);
  185. data = (uint8_t*)realloc(data, m_data);
  186. }
  187. memcpy(data, bsrc->data, bsrc->l_data); // copy var-len data
  188. *bdst = *bsrc; // copy the rest
  189. // restore the backup
  190. bdst->m_data = m_data;
  191. bdst->data = data;
  192. return bdst;
  193. }
  194. bam1_t *bam_dup1(const bam1_t *bsrc)
  195. {
  196. if (bsrc == NULL) return NULL;
  197. bam1_t *bdst = bam_init1();
  198. if (bdst == NULL) return NULL;
  199. return bam_copy1(bdst, bsrc);
  200. }
  201. int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
  202. {
  203. int k, l;
  204. for (k = l = 0; k < n_cigar; ++k)
  205. if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
  206. l += bam_cigar_oplen(cigar[k]);
  207. return l;
  208. }
  209. int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
  210. {
  211. int k, l;
  212. for (k = l = 0; k < n_cigar; ++k)
  213. if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
  214. l += bam_cigar_oplen(cigar[k]);
  215. return l;
  216. }
  217. int32_t bam_endpos(const bam1_t *b)
  218. {
  219. if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0)
  220. return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
  221. else
  222. return b->core.pos + 1;
  223. }
  224. static inline int aux_type2size(uint8_t type)
  225. {
  226. switch (type) {
  227. case 'A': case 'c': case 'C':
  228. return 1;
  229. case 's': case 'S':
  230. return 2;
  231. case 'i': case 'I': case 'f':
  232. return 4;
  233. case 'd':
  234. return 8;
  235. case 'Z': case 'H': case 'B':
  236. return type;
  237. default:
  238. return 0;
  239. }
  240. }
  241. static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
  242. {
  243. uint8_t *s;
  244. uint32_t *cigar = (uint32_t*)(data + c->l_qname);
  245. uint32_t i, n;
  246. s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
  247. for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
  248. while (s < data + l_data) {
  249. int size;
  250. s += 2; // skip key
  251. size = aux_type2size(*s); ++s; // skip type
  252. switch (size) {
  253. case 1: ++s; break;
  254. case 2: ed_swap_2p(s); s += 2; break;
  255. case 4: ed_swap_4p(s); s += 4; break;
  256. case 8: ed_swap_8p(s); s += 8; break;
  257. case 'Z':
  258. case 'H':
  259. while (*s) ++s;
  260. ++s;
  261. break;
  262. case 'B':
  263. size = aux_type2size(*s); ++s;
  264. if (is_host) memcpy(&n, s, 4), ed_swap_4p(s);
  265. else ed_swap_4p(s), memcpy(&n, s, 4);
  266. s += 4;
  267. switch (size) {
  268. case 1: s += n; break;
  269. case 2: for (i = 0; i < n; ++i, s += 2) ed_swap_2p(s); break;
  270. case 4: for (i = 0; i < n; ++i, s += 4) ed_swap_4p(s); break;
  271. case 8: for (i = 0; i < n; ++i, s += 8) ed_swap_8p(s); break;
  272. }
  273. break;
  274. }
  275. }
  276. }
  277. int bam_read1(BGZF *fp, bam1_t *b)
  278. {
  279. bam1_core_t *c = &b->core;
  280. int32_t block_len, ret, i;
  281. uint32_t x[8];
  282. if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
  283. if (ret == 0) return -1; // normal end-of-file
  284. else return -2; // truncated
  285. }
  286. if (bgzf_read(fp, x, 32) != 32) return -3;
  287. if (fp->is_be) {
  288. ed_swap_4p(&block_len);
  289. for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
  290. }
  291. c->tid = x[0]; c->pos = x[1];
  292. c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
  293. c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
  294. c->l_qseq = x[4];
  295. c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
  296. b->l_data = block_len - 32;
  297. if (b->l_data < 0 || c->l_qseq < 0) return -4;
  298. if ((char *)bam_get_aux(b) - (char *)b->data > b->l_data)
  299. return -4;
  300. if (b->m_data < b->l_data) {
  301. b->m_data = b->l_data;
  302. kroundup32(b->m_data);
  303. b->data = (uint8_t*)realloc(b->data, b->m_data);
  304. if (!b->data)
  305. return -4;
  306. }
  307. if (bgzf_read(fp, b->data, b->l_data) != b->l_data) return -4;
  308. //b->l_aux = b->l_data - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
  309. if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
  310. return 4 + block_len;
  311. }
  312. int bam_write1(BGZF *fp, const bam1_t *b)
  313. {
  314. const bam1_core_t *c = &b->core;
  315. uint32_t x[8], block_len = b->l_data + 32, y;
  316. int i, ok;
  317. x[0] = c->tid;
  318. x[1] = c->pos;
  319. x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
  320. x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
  321. x[4] = c->l_qseq;
  322. x[5] = c->mtid;
  323. x[6] = c->mpos;
  324. x[7] = c->isize;
  325. ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
  326. if (fp->is_be) {
  327. for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
  328. y = block_len;
  329. if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
  330. swap_data(c, b->l_data, b->data, 1);
  331. } else {
  332. if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
  333. }
  334. if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
  335. if (ok) ok = (bgzf_write(fp, b->data, b->l_data) >= 0);
  336. if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
  337. return ok? 4 + block_len : -1;
  338. }
  339. /********************
  340. *** BAM indexing ***
  341. ********************/
  342. static hts_idx_t *bam_index(BGZF *fp, int min_shift)
  343. {
  344. int n_lvls, i, fmt;
  345. bam1_t *b;
  346. hts_idx_t *idx;
  347. bam_hdr_t *h;
  348. h = bam_hdr_read(fp);
  349. if (min_shift > 0) {
  350. int64_t max_len = 0, s;
  351. for (i = 0; i < h->n_targets; ++i)
  352. if (max_len < h->target_len[i]) max_len = h->target_len[i];
  353. max_len += 256;
  354. for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
  355. fmt = HTS_FMT_CSI;
  356. } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
  357. idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp), min_shift, n_lvls);
  358. bam_hdr_destroy(h);
  359. b = bam_init1();
  360. while (bam_read1(fp, b) >= 0) {
  361. int l, ret;
  362. l = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
  363. if (l == 0) l = 1; // no zero-length records
  364. ret = hts_idx_push(idx, b->core.tid, b->core.pos, b->core.pos + l, bgzf_tell(fp), !(b->core.flag&BAM_FUNMAP));
  365. if (ret < 0)
  366. {
  367. // unsorted
  368. bam_destroy1(b);
  369. hts_idx_destroy(idx);
  370. return NULL;
  371. }
  372. }
  373. hts_idx_finish(idx, bgzf_tell(fp));
  374. bam_destroy1(b);
  375. return idx;
  376. }
  377. int bam_index_build(const char *fn, int min_shift)
  378. {
  379. hts_idx_t *idx;
  380. htsFile *fp;
  381. int ret = 0;
  382. if ((fp = hts_open(fn, "r")) == 0) return -1;
  383. if (fp->is_cram) {
  384. ret = cram_index_build(fp->fp.cram, fn);
  385. } else {
  386. idx = bam_index(fp->fp.bgzf, min_shift);
  387. if ( !idx )
  388. {
  389. hts_close(fp);
  390. return -1;
  391. }
  392. hts_idx_save(idx, fn, min_shift > 0
  393. ? HTS_FMT_CSI : HTS_FMT_BAI);
  394. hts_idx_destroy(idx);
  395. }
  396. hts_close(fp);
  397. return ret;
  398. }
  399. static int bam_readrec(BGZF *fp, void *ignored, void *bv, int *tid, int *beg, int *end)
  400. {
  401. bam1_t *b = bv;
  402. int ret;
  403. if ((ret = bam_read1(fp, b)) >= 0) {
  404. *tid = b->core.tid; *beg = b->core.pos;
  405. *end = b->core.pos + (b->core.n_cigar? bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)) : 1);
  406. }
  407. return ret;
  408. }
  409. // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
  410. static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end)
  411. {
  412. htsFile *fp = fpv;
  413. bam1_t *b = bv;
  414. return cram_get_bam_seq(fp->fp.cram, &b);
  415. }
  416. // This is used only with read_rest=1 iterators, so need not set tid/beg/end.
  417. static int sam_bam_cram_readrec(BGZF *bgzfp, void *fpv, void *bv, int *tid, int *beg, int *end)
  418. {
  419. htsFile *fp = fpv;
  420. bam1_t *b = bv;
  421. if (fp->is_bin) return bam_read1(bgzfp, b);
  422. else if (fp->is_cram) return cram_get_bam_seq(fp->fp.cram, &b);
  423. else {
  424. // TODO Need headers available to implement this for SAM files
  425. fprintf(stderr, "[sam_bam_cram_readrec] Not implemented for SAM files -- Exiting\n");
  426. abort();
  427. }
  428. }
  429. // The CRAM implementation stores the loaded index within the cram_fd rather
  430. // than separately as is done elsewhere in htslib. So if p is a pointer to
  431. // an hts_idx_t with p->fmt == HTS_FMT_CRAI, then it actually points to an
  432. // hts_cram_idx_t and should be cast accordingly.
  433. typedef struct hts_cram_idx_t {
  434. int fmt;
  435. cram_fd *cram;
  436. } hts_cram_idx_t;
  437. hts_idx_t *sam_index_load(samFile *fp, const char *fn)
  438. {
  439. if (fp->is_bin) return bam_index_load(fn);
  440. else if (fp->is_cram) {
  441. if (cram_index_load(fp->fp.cram, fn) < 0) return NULL;
  442. // Cons up a fake "index" just pointing at the associated cram_fd:
  443. hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
  444. if (idx == NULL) return NULL;
  445. idx->fmt = HTS_FMT_CRAI;
  446. idx->cram = fp->fp.cram;
  447. return (hts_idx_t *) idx;
  448. }
  449. else return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
  450. }
  451. static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
  452. {
  453. const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
  454. hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
  455. if (iter == NULL) return NULL;
  456. // Cons up a dummy iterator for which hts_itr_next() will simply invoke
  457. // the readrec function:
  458. iter->read_rest = 1;
  459. iter->off = NULL;
  460. iter->bins.a = NULL;
  461. iter->readrec = readrec;
  462. if (tid >= 0) {
  463. cram_range r = { tid, beg+1, end };
  464. if (cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r) != 0) { free(iter); return NULL; }
  465. iter->curr_off = 0;
  466. // The following fields are not required by hts_itr_next(), but are
  467. // filled in in case user code wants to look at them.
  468. iter->tid = tid;
  469. iter->beg = beg;
  470. iter->end = end;
  471. }
  472. else switch (tid) {
  473. case HTS_IDX_REST:
  474. iter->curr_off = 0;
  475. break;
  476. case HTS_IDX_NONE:
  477. iter->curr_off = 0;
  478. iter->finished = 1;
  479. break;
  480. default:
  481. fprintf(stderr, "[cram_itr_query] tid=%d not implemented for CRAM files -- Exiting\n", tid);
  482. abort();
  483. break;
  484. }
  485. return iter;
  486. }
  487. hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
  488. {
  489. const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
  490. if (idx == NULL)
  491. return hts_itr_query(NULL, tid, beg, end, sam_bam_cram_readrec);
  492. else if (cidx->fmt == HTS_FMT_CRAI)
  493. return cram_itr_query(idx, tid, beg, end, cram_readrec);
  494. else
  495. return hts_itr_query(idx, tid, beg, end, bam_readrec);
  496. }
  497. static int cram_name2id(void *fdv, const char *ref)
  498. {
  499. cram_fd *fd = (cram_fd *) fdv;
  500. return sam_hdr_name2ref(fd->header, ref);
  501. }
  502. hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
  503. {
  504. const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
  505. if (cidx->fmt == HTS_FMT_CRAI)
  506. return hts_itr_querys(idx, region, cram_name2id, cidx->cram, cram_itr_query, cram_readrec);
  507. else
  508. return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, hts_itr_query, bam_readrec);
  509. }
  510. /**********************
  511. *** SAM header I/O ***
  512. **********************/
  513. #include "htslib/kseq.h"
  514. #include "htslib/kstring.h"
  515. bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
  516. {
  517. const char *q, *r, *p;
  518. khash_t(s2i) *d;
  519. d = kh_init(s2i);
  520. for (p = text; *p; ++p) {
  521. if (strncmp(p, "@SQ", 3) == 0) {
  522. char *sn = 0;
  523. int ln = -1;
  524. for (q = p + 4;; ++q) {
  525. if (strncmp(q, "SN:", 3) == 0) {
  526. q += 3;
  527. for (r = q; *r != '\t' && *r != '\n'; ++r);
  528. sn = (char*)calloc(r - q + 1, 1);
  529. strncpy(sn, q, r - q);
  530. q = r;
  531. } else if (strncmp(q, "LN:", 3) == 0)
  532. ln = strtol(q + 3, (char**)&q, 10);
  533. while (*q != '\t' && *q != '\n') ++q;
  534. if (*q == '\n') break;
  535. }
  536. p = q;
  537. if (sn && ln >= 0) {
  538. khint_t k;
  539. int absent;
  540. k = kh_put(s2i, d, sn, &absent);
  541. if (!absent) {
  542. if (hts_verbose >= 2)
  543. fprintf(stderr, "[W::%s] duplicated sequence '%s'\n", __func__, sn);
  544. free(sn);
  545. } else kh_val(d, k) = (int64_t)(kh_size(d) - 1)<<32 | ln;
  546. }
  547. }
  548. while (*p != '\n') ++p;
  549. }
  550. return hdr_from_dict(d);
  551. }
  552. bam_hdr_t *sam_hdr_read(htsFile *fp)
  553. {
  554. if (fp->is_bin) {
  555. return bam_hdr_read(fp->fp.bgzf);
  556. } else if (fp->is_cram) {
  557. return cram_header_to_bam(fp->fp.cram->header);
  558. } else {
  559. kstring_t str;
  560. bam_hdr_t *h;
  561. int has_SQ = 0;
  562. str.l = str.m = 0; str.s = 0;
  563. while (hts_getline(fp, KS_SEP_LINE, &fp->line) >= 0) {
  564. if (fp->line.s[0] != '@') break;
  565. if (fp->line.l > 3 && strncmp(fp->line.s,"@SQ",3) == 0) has_SQ = 1;
  566. kputsn(fp->line.s, fp->line.l, &str);
  567. kputc('\n', &str);
  568. }
  569. if (! has_SQ && fp->fn_aux) {
  570. char line[2048];
  571. FILE *f = fopen(fp->fn_aux, "r");
  572. if (f == NULL) return NULL;
  573. while (fgets(line, sizeof line, f)) {
  574. const char *name = strtok(line, "\t");
  575. const char *length = strtok(NULL, "\t");
  576. ksprintf(&str, "@SQ\tSN:%s\tLN:%s\n", name, length);
  577. }
  578. fclose(f);
  579. }
  580. if (str.l == 0) kputsn("", 0, &str);
  581. h = sam_hdr_parse(str.l, str.s);
  582. h->l_text = str.l; h->text = str.s;
  583. return h;
  584. }
  585. }
  586. int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
  587. {
  588. if (fp->is_bin) {
  589. bam_hdr_write(fp->fp.bgzf, h);
  590. } else if (fp->is_cram) {
  591. cram_fd *fd = fp->fp.cram;
  592. if (cram_set_header(fd, bam_header_to_cram((bam_hdr_t *)h)) < 0) return -1;
  593. if (fp->fn_aux)
  594. cram_load_reference(fd, fp->fn_aux);
  595. if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
  596. } else {
  597. char *p;
  598. hputs(h->text, fp->fp.hfile);
  599. p = strstr(h->text, "@SQ\t"); // FIXME: we need a loop to make sure "@SQ\t" does not match something unwanted!!!
  600. if (p == 0) {
  601. int i;
  602. for (i = 0; i < h->n_targets; ++i) {
  603. fp->line.l = 0;
  604. kputsn("@SQ\tSN:", 7, &fp->line); kputs(h->target_name[i], &fp->line);
  605. kputsn("\tLN:", 4, &fp->line); kputw(h->target_len[i], &fp->line); kputc('\n', &fp->line);
  606. if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
  607. }
  608. }
  609. if ( hflush(fp->fp.hfile) != 0 ) return -1;
  610. }
  611. return 0;
  612. }
  613. /**********************
  614. *** SAM record I/O ***
  615. **********************/
  616. int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
  617. {
  618. #define _read_token(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); if (*(_p) != '\t') goto err_ret; *(_p)++ = 0
  619. #define _read_token_aux(_p) (_p); for (; *(_p) && *(_p) != '\t'; ++(_p)); *(_p)++ = 0 // this is different in that it does not test *(_p)=='\t'
  620. #define _get_mem(type_t, _x, _s, _l) ks_resize((_s), (_s)->l + (_l)); *(_x) = (type_t*)((_s)->s + (_s)->l); (_s)->l += (_l)
  621. #define _parse_err(cond, msg) do { if ((cond) && hts_verbose >= 1) { fprintf(stderr, "[E::%s] " msg "\n", __func__); goto err_ret; } } while (0)
  622. #define _parse_warn(cond, msg) if ((cond) && hts_verbose >= 2) fprintf(stderr, "[W::%s] " msg "\n", __func__)
  623. uint8_t *t;
  624. char *p = s->s, *q;
  625. int i;
  626. kstring_t str;
  627. bam1_core_t *c = &b->core;
  628. str.l = b->l_data = 0;
  629. str.s = (char*)b->data; str.m = b->m_data;
  630. memset(c, 0, 32);
  631. if (h->cigar_tab == 0) {
  632. h->cigar_tab = (int8_t*) malloc(128);
  633. for (i = 0; i < 128; ++i)
  634. h->cigar_tab[i] = -1;
  635. for (i = 0; BAM_CIGAR_STR[i]; ++i)
  636. h->cigar_tab[(int)BAM_CIGAR_STR[i]] = i;
  637. }
  638. // qname
  639. q = _read_token(p);
  640. kputsn_(q, p - q, &str);
  641. c->l_qname = p - q;
  642. // flag
  643. c->flag = strtol(p, &p, 0);
  644. if (*p++ != '\t') goto err_ret; // malformated flag
  645. // chr
  646. q = _read_token(p);
  647. if (strcmp(q, "*")) {
  648. _parse_err(h->n_targets == 0, "missing SAM header");
  649. c->tid = bam_name2id(h, q);
  650. _parse_warn(c->tid < 0, "urecognized reference name; treated as unmapped");
  651. } else c->tid = -1;
  652. // pos
  653. c->pos = strtol(p, &p, 10) - 1;
  654. if (*p++ != '\t') goto err_ret;
  655. if (c->pos < 0 && c->tid >= 0) {
  656. _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
  657. c->tid = -1;
  658. }
  659. if (c->tid < 0) c->flag |= BAM_FUNMAP;
  660. // mapq
  661. c->qual = strtol(p, &p, 10);
  662. if (*p++ != '\t') goto err_ret;
  663. // cigar
  664. if (*p != '*') {
  665. uint32_t *cigar;
  666. size_t n_cigar = 0;
  667. for (q = p; *p && *p != '\t'; ++p)
  668. if (!isdigit(*p)) ++n_cigar;
  669. if (*p++ != '\t') goto err_ret;
  670. _parse_err(n_cigar >= 65536, "too many CIGAR operations");
  671. c->n_cigar = n_cigar;
  672. _get_mem(uint32_t, &cigar, &str, c->n_cigar<<2);
  673. for (i = 0; i < c->n_cigar; ++i, ++q) {
  674. int op;
  675. cigar[i] = strtol(q, &q, 10)<<BAM_CIGAR_SHIFT;
  676. op = (uint8_t)*q >= 128? -1 : h->cigar_tab[(int)*q];
  677. _parse_err(op < 0, "unrecognized CIGAR operator");
  678. cigar[i] |= op;
  679. }
  680. i = bam_cigar2rlen(c->n_cigar, cigar);
  681. } else {
  682. _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
  683. c->flag |= BAM_FUNMAP;
  684. q = _read_token(p);
  685. i = 1;
  686. }
  687. c->bin = hts_reg2bin(c->pos, c->pos + i, 14, 5);
  688. // mate chr
  689. q = _read_token(p);
  690. if (strcmp(q, "=") == 0) c->mtid = c->tid;
  691. else if (strcmp(q, "*") == 0) c->mtid = -1;
  692. else c->mtid = bam_name2id(h, q);
  693. // mpos
  694. c->mpos = strtol(p, &p, 10) - 1;
  695. if (*p++ != '\t') goto err_ret;
  696. if (c->mpos < 0 && c->mtid >= 0) {
  697. _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
  698. c->mtid = -1;
  699. }
  700. // tlen
  701. c->isize = strtol(p, &p, 10);
  702. if (*p++ != '\t') goto err_ret;
  703. // seq
  704. q = _read_token(p);
  705. if (strcmp(q, "*")) {
  706. c->l_qseq = p - q - 1;
  707. i = bam_cigar2qlen(c->n_cigar, (uint32_t*)(str.s + c->l_qname));
  708. _parse_err(c->n_cigar && i != c->l_qseq, "CIGAR and query sequence are of different length");
  709. i = (c->l_qseq + 1) >> 1;
  710. _get_mem(uint8_t, &t, &str, i);
  711. memset(t, 0, i);
  712. for (i = 0; i < c->l_qseq; ++i)
  713. t[i>>1] |= seq_nt16_table[(int)q[i]] << ((~i&1)<<2);
  714. } else c->l_qseq = 0;
  715. // qual
  716. q = _read_token_aux(p);
  717. _get_mem(uint8_t, &t, &str, c->l_qseq);
  718. if (strcmp(q, "*")) {
  719. _parse_err(p - q - 1 != c->l_qseq, "SEQ and QUAL are of different length");
  720. for (i = 0; i < c->l_qseq; ++i) t[i] = q[i] - 33;
  721. } else memset(t, 0xff, c->l_qseq);
  722. // aux
  723. // Note that (like the bam1_core_t fields) this aux data in b->data is
  724. // stored in host endianness; so there is no byte swapping needed here.
  725. while (p < s->s + s->l) {
  726. uint8_t type;
  727. q = _read_token_aux(p); // FIXME: can be accelerated for long 'B' arrays
  728. _parse_err(p - q - 1 < 6, "incomplete aux field");
  729. kputsn_(q, 2, &str);
  730. q += 3; type = *q++; ++q; // q points to value
  731. if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
  732. kputc_('A', &str);
  733. kputc_(*q, &str);
  734. } else if (type == 'i' || type == 'I') {
  735. long x;
  736. x = strtol(q, &q, 10);
  737. if (x < 0) {
  738. if (x >= INT8_MIN) {
  739. kputc_('c', &str); kputc_(x, &str);
  740. } else if (x >= INT16_MIN) {
  741. int16_t y = x;
  742. kputc_('s', &str); kputsn_((char*)&y, 2, &str);
  743. } else {
  744. int32_t y = x;
  745. kputc_('i', &str); kputsn_(&y, 4, &str);
  746. }
  747. } else {
  748. if (x <= UINT8_MAX) {
  749. kputc_('C', &str); kputc_(x, &str);
  750. } else if (x <= UINT16_MAX) {
  751. uint16_t y = x;
  752. kputc_('S', &str); kputsn_(&y, 2, &str);
  753. } else {
  754. uint32_t y = x;
  755. kputc_('I', &str); kputsn_(&y, 4, &str);
  756. }
  757. }
  758. } else if (type == 'f') {
  759. float x;
  760. x = strtod(q, &q);
  761. kputc_('f', &str); kputsn_(&x, 4, &str);
  762. } else if (type == 'd') {
  763. double x;
  764. x = strtod(q, &q);
  765. kputc_('d', &str); kputsn_(&x, 8, &str);
  766. } else if (type == 'Z' || type == 'H') {
  767. kputc_(type, &str);kputsn_(q, p - q, &str); // note that this include the trailing NULL
  768. } else if (type == 'B') {
  769. int32_t n;
  770. char *r;
  771. _parse_err(p - q - 1 < 3, "incomplete B-typed aux field");
  772. type = *q++; // q points to the first ',' following the typing byte
  773. for (r = q, n = 0; *r; ++r)
  774. if (*r == ',') ++n;
  775. kputc_('B', &str); kputc_(type, &str); kputsn_(&n, 4, &str);
  776. // FIXME: to evaluate which is faster: a) aligned array and then memmove(); b) unaligned array; c) kputsn_()
  777. if (type == 'c') while (q + 1 < p) { int8_t x = strtol(q + 1, &q, 0); kputc_(x, &str); }
  778. else if (type == 'C') while (q + 1 < p) { uint8_t x = strtoul(q + 1, &q, 0); kputc_(x, &str); }
  779. else if (type == 's') while (q + 1 < p) { int16_t x = strtol(q + 1, &q, 0); kputsn_(&x, 2, &str); }
  780. else if (type == 'S') while (q + 1 < p) { uint16_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 2, &str); }
  781. else if (type == 'i') while (q + 1 < p) { int32_t x = strtol(q + 1, &q, 0); kputsn_(&x, 4, &str); }
  782. else if (type == 'I') while (q + 1 < p) { uint32_t x = strtoul(q + 1, &q, 0); kputsn_(&x, 4, &str); }
  783. else if (type == 'f') while (q + 1 < p) { float x = strtod(q + 1, &q); kputsn_(&x, 4, &str); }
  784. else _parse_err(1, "unrecognized type");
  785. } else _parse_err(1, "unrecognized type");
  786. }
  787. b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
  788. return 0;
  789. #undef _parse_warn
  790. #undef _parse_err
  791. #undef _get_mem
  792. #undef _read_token_aux
  793. #undef _read_token
  794. err_ret:
  795. b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m;
  796. return -2;
  797. }
  798. int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
  799. {
  800. if (fp->is_bin) {
  801. int r = bam_read1(fp->fp.bgzf, b);
  802. if (r >= 0) {
  803. if (b->core.tid >= h->n_targets || b->core.tid < -1 ||
  804. b->core.mtid >= h->n_targets || b->core.mtid < -1)
  805. return -3;
  806. }
  807. return r;
  808. } else if (fp->is_cram) {
  809. return cram_get_bam_seq(fp->fp.cram, &b);
  810. } else {
  811. int ret;
  812. err_recover:
  813. if (fp->line.l == 0) {
  814. ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
  815. if (ret < 0) return -1;
  816. }
  817. ret = sam_parse1(&fp->line, h, b);
  818. fp->line.l = 0;
  819. if (ret < 0) {
  820. if (hts_verbose >= 1)
  821. fprintf(stderr, "[W::%s] parse error at line %lld\n", __func__, (long long)fp->lineno);
  822. if (h->ignore_sam_err) goto err_recover;
  823. }
  824. return ret;
  825. }
  826. }
  827. int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
  828. {
  829. int i;
  830. uint8_t *s;
  831. const bam1_core_t *c = &b->core;
  832. str->l = 0;
  833. kputsn(bam_get_qname(b), c->l_qname-1, str); kputc('\t', str); // query name
  834. kputw(c->flag, str); kputc('\t', str); // flag
  835. if (c->tid >= 0) { // chr
  836. kputs(h->target_name[c->tid] , str);
  837. kputc('\t', str);
  838. } else kputsn("*\t", 2, str);
  839. kputw(c->pos + 1, str); kputc('\t', str); // pos
  840. kputw(c->qual, str); kputc('\t', str); // qual
  841. if (c->n_cigar) { // cigar
  842. uint32_t *cigar = bam_get_cigar(b);
  843. for (i = 0; i < c->n_cigar; ++i) {
  844. kputw(bam_cigar_oplen(cigar[i]), str);
  845. kputc(bam_cigar_opchr(cigar[i]), str);
  846. }
  847. } else kputc('*', str);
  848. kputc('\t', str);
  849. if (c->mtid < 0) kputsn("*\t", 2, str); // mate chr
  850. else if (c->mtid == c->tid) kputsn("=\t", 2, str);
  851. else {
  852. kputs(h->target_name[c->mtid], str);
  853. kputc('\t', str);
  854. }
  855. kputw(c->mpos + 1, str); kputc('\t', str); // mate pos
  856. kputw(c->isize, str); kputc('\t', str); // template len
  857. if (c->l_qseq) { // seq and qual
  858. uint8_t *s = bam_get_seq(b);
  859. for (i = 0; i < c->l_qseq; ++i) kputc("=ACMGRSVTWYHKDBN"[bam_seqi(s, i)], str);
  860. kputc('\t', str);
  861. s = bam_get_qual(b);
  862. if (s[0] == 0xff) kputc('*', str);
  863. else for (i = 0; i < c->l_qseq; ++i) kputc(s[i] + 33, str);
  864. } else kputsn("*\t*", 3, str);
  865. s = bam_get_aux(b); // aux
  866. while (s+4 <= b->data + b->l_data) {
  867. uint8_t type, key[2];
  868. key[0] = s[0]; key[1] = s[1];
  869. s += 2; type = *s++;
  870. kputc('\t', str); kputsn((char*)key, 2, str); kputc(':', str);
  871. if (type == 'A') {
  872. kputsn("A:", 2, str);
  873. kputc(*s, str);
  874. ++s;
  875. } else if (type == 'C') {
  876. kputsn("i:", 2, str);
  877. kputw(*s, str);
  878. ++s;
  879. } else if (type == 'c') {
  880. kputsn("i:", 2, str);
  881. kputw(*(int8_t*)s, str);
  882. ++s;
  883. } else if (type == 'S') {
  884. if (s+2 <= b->data + b->l_data) {
  885. kputsn("i:", 2, str);
  886. kputw(*(uint16_t*)s, str);
  887. s += 2;
  888. } else return -1;
  889. } else if (type == 's') {
  890. if (s+2 <= b->data + b->l_data) {
  891. kputsn("i:", 2, str);
  892. kputw(*(int16_t*)s, str);
  893. s += 2;
  894. } else return -1;
  895. } else if (type == 'I') {
  896. if (s+4 <= b->data + b->l_data) {
  897. kputsn("i:", 2, str);
  898. kputuw(*(uint32_t*)s, str);
  899. s += 4;
  900. } else return -1;
  901. } else if (type == 'i') {
  902. if (s+4 <= b->data + b->l_data) {
  903. kputsn("i:", 2, str);
  904. kputw(*(int32_t*)s, str);
  905. s += 4;
  906. } else return -1;
  907. } else if (type == 'f') {
  908. if (s+4 <= b->data + b->l_data) {
  909. ksprintf(str, "f:%g", *(float*)s);
  910. s += 4;
  911. } else return -1;
  912. } else if (type == 'd') {
  913. if (s+8 <= b->data + b->l_data) {
  914. ksprintf(str, "d:%g", *(double*)s);
  915. s += 8;
  916. } else return -1;
  917. } else if (type == 'Z' || type == 'H') {
  918. kputc(type, str); kputc(':', str);
  919. while (s < b->data + b->l_data && *s) kputc(*s++, str);
  920. if (s >= b->data + b->l_data)
  921. return -1;
  922. ++s;
  923. } else if (type == 'B') {
  924. uint8_t sub_type = *(s++);
  925. int32_t n;
  926. memcpy(&n, s, 4);
  927. s += 4; // no point to the start of the array
  928. if (s + n >= b->data + b->l_data)
  929. return -1;
  930. kputsn("B:", 2, str); kputc(sub_type, str); // write the typing
  931. for (i = 0; i < n; ++i) { // FIXME: for better performance, put the loop after "if"
  932. kputc(',', str);
  933. if ('c' == sub_type) { kputw(*(int8_t*)s, str); ++s; }
  934. else if ('C' == sub_type) { kputw(*(uint8_t*)s, str); ++s; }
  935. else if ('s' == sub_type) { kputw(*(int16_t*)s, str); s += 2; }
  936. else if ('S' == sub_type) { kputw(*(uint16_t*)s, str); s += 2; }
  937. else if ('i' == sub_type) { kputw(*(int32_t*)s, str); s += 4; }
  938. else if ('I' == sub_type) { kputuw(*(uint32_t*)s, str); s += 4; }
  939. else if ('f' == sub_type) { ksprintf(str, "%g", *(float*)s); s += 4; }
  940. }
  941. }
  942. }
  943. return str->l;
  944. }
  945. int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
  946. {
  947. if (fp->is_bin) {
  948. return bam_write1(fp->fp.bgzf, b);
  949. } else if (fp->is_cram) {
  950. return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
  951. } else {
  952. if (sam_format1(h, b, &fp->line) < 0) return -1;
  953. kputc('\n', &fp->line);
  954. if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
  955. return fp->line.l;
  956. }
  957. }
  958. /************************
  959. *** Auxiliary fields ***
  960. ************************/
  961. void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
  962. {
  963. int ori_len = b->l_data;
  964. b->l_data += 3 + len;
  965. if (b->m_data < b->l_data) {
  966. b->m_data = b->l_data;
  967. kroundup32(b->m_data);
  968. b->data = (uint8_t*)realloc(b->data, b->m_data);
  969. }
  970. b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
  971. b->data[ori_len + 2] = type;
  972. memcpy(b->data + ori_len + 3, data, len);
  973. }
  974. static inline uint8_t *skip_aux(uint8_t *s)
  975. {
  976. int size = aux_type2size(*s); ++s; // skip type
  977. uint32_t n;
  978. switch (size) {
  979. case 'Z':
  980. case 'H':
  981. while (*s) ++s;
  982. return s + 1;
  983. case 'B':
  984. size = aux_type2size(*s); ++s;
  985. memcpy(&n, s, 4); s += 4;
  986. return s + size * n;
  987. case 0:
  988. abort();
  989. break;
  990. default:
  991. return s + size;
  992. }
  993. }
  994. uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
  995. {
  996. uint8_t *s;
  997. int y = tag[0]<<8 | tag[1];
  998. s = bam_get_aux(b);
  999. while (s < b->data + b->l_data) {
  1000. int x = (int)s[0]<<8 | s[1];
  1001. s += 2;
  1002. if (x == y) return s;
  1003. s = skip_aux(s);
  1004. }
  1005. return 0;
  1006. }
  1007. // s MUST BE returned by bam_aux_get()
  1008. int bam_aux_del(bam1_t *b, uint8_t *s)
  1009. {
  1010. uint8_t *p, *aux;
  1011. int l_aux = bam_get_l_aux(b);
  1012. aux = bam_get_aux(b);
  1013. p = s - 2;
  1014. s = skip_aux(s);
  1015. memmove(p, s, l_aux - (s - aux));
  1016. b->l_data -= s - p;
  1017. return 0;
  1018. }
  1019. int32_t bam_aux2i(const uint8_t *s)
  1020. {
  1021. int type;
  1022. type = *s++;
  1023. if (type == 'c') return (int32_t)*(int8_t*)s;
  1024. else if (type == 'C') return (int32_t)*(uint8_t*)s;
  1025. else if (type == 's') return (int32_t)*(int16_t*)s;
  1026. else if (type == 'S') return (int32_t)*(uint16_t*)s;
  1027. else if (type == 'i' || type == 'I') return *(int32_t*)s;
  1028. else return 0;
  1029. }
  1030. double bam_aux2f(const uint8_t *s)
  1031. {
  1032. int type;
  1033. type = *s++;
  1034. if (type == 'd') return *(double*)s;
  1035. else if (type == 'f') return *(float*)s;
  1036. else return 0.0;
  1037. }
  1038. char bam_aux2A(const uint8_t *s)
  1039. {
  1040. int type;
  1041. type = *s++;
  1042. if (type == 'A') return *(char*)s;
  1043. else return 0;
  1044. }
  1045. char *bam_aux2Z(const uint8_t *s)
  1046. {
  1047. int type;
  1048. type = *s++;
  1049. if (type == 'Z' || type == 'H') return (char*)s;
  1050. else return 0;
  1051. }
  1052. int sam_open_mode(char *mode, const char *fn, const char *format)
  1053. {
  1054. // TODO Parse "bam5" etc for compression level
  1055. if (format == NULL) {
  1056. // Try to pick a format based on the filename extension
  1057. const char *ext = fn? strrchr(fn, '.') : NULL;
  1058. if (ext == NULL || strchr(ext, '/')) return -1;
  1059. return sam_open_mode(mode, fn, ext+1);
  1060. }
  1061. else if (strcmp(format, "bam") == 0) strcpy(mode, "b");
  1062. else if (strcmp(format, "cram") == 0) strcpy(mode, "c");
  1063. else if (strcmp(format, "sam") == 0) strcpy(mode, "");
  1064. else return -1;
  1065. return 0;
  1066. }
  1067. #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
  1068. int bam_str2flag(const char *str)
  1069. {
  1070. char *end, *beg = (char*) str;
  1071. long int flag = strtol(str, &end, 0);
  1072. if ( end!=str ) return flag; // the conversion was successful
  1073. flag = 0;
  1074. while ( *str )
  1075. {
  1076. end = beg;
  1077. while ( *end && *end!=',' ) end++;
  1078. if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
  1079. else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
  1080. else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
  1081. else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
  1082. else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
  1083. else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
  1084. else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
  1085. else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
  1086. else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
  1087. else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
  1088. else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
  1089. else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
  1090. else return -1;
  1091. if ( !*end ) break;
  1092. beg = end + 1;
  1093. }
  1094. return flag;
  1095. }
  1096. char *bam_flag2str(int flag)
  1097. {
  1098. kstring_t str = {0,0,0};
  1099. if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
  1100. if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
  1101. if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
  1102. if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
  1103. if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
  1104. if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
  1105. if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
  1106. if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
  1107. if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
  1108. if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
  1109. if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
  1110. if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
  1111. if ( str.l == 0 ) kputsn("", 0, &str);
  1112. return str.s;
  1113. }
  1114. /**************************
  1115. *** Pileup and Mpileup ***
  1116. **************************/
  1117. #if !defined(BAM_NO_PILEUP)
  1118. #include <assert.h>
  1119. /*******************
  1120. *** Memory pool ***
  1121. *******************/
  1122. typedef struct {
  1123. int k, x, y, end;
  1124. } cstate_t;
  1125. static cstate_t g_cstate_null = { -1, 0, 0, 0 };
  1126. typedef struct __linkbuf_t {
  1127. bam1_t b;
  1128. int32_t beg, end;
  1129. cstate_t s;
  1130. struct __linkbuf_t *next;
  1131. } lbnode_t;
  1132. typedef struct {
  1133. int cnt, n, max;
  1134. lbnode_t **buf;
  1135. } mempool_t;
  1136. static mempool_t *mp_init(void)
  1137. {
  1138. mempool_t *mp;
  1139. mp = (mempool_t*)calloc(1, sizeof(mempool_t));
  1140. return mp;
  1141. }
  1142. static void mp_destroy(mempool_t *mp)
  1143. {
  1144. int k;
  1145. for (k = 0; k < mp->n; ++k) {
  1146. free(mp->buf[k]->b.data);
  1147. free(mp->buf[k]);
  1148. }
  1149. free(mp->buf);
  1150. free(mp);
  1151. }
  1152. static inline lbnode_t *mp_alloc(mempool_t *mp)
  1153. {
  1154. ++mp->cnt;
  1155. if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
  1156. else return mp->buf[--mp->n];
  1157. }
  1158. static inline void mp_free(mempool_t *mp, lbnode_t *p)
  1159. {
  1160. --mp->cnt; p->next = 0; // clear lbnode_t::next here
  1161. if (mp->n == mp->max) {
  1162. mp->max = mp->max? mp->max<<1 : 256;
  1163. mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
  1164. }
  1165. mp->buf[mp->n++] = p;
  1166. }
  1167. /**********************
  1168. *** CIGAR resolver ***
  1169. **********************/
  1170. /* s->k: the index of the CIGAR operator that has just been processed.
  1171. s->x: the reference coordinate of the start of s->k
  1172. s->y: the query coordiante of the start of s->k
  1173. */
  1174. static inline int resolve_cigar2(bam_pileup1_t *p, int32_t pos, cstate_t *s)
  1175. {
  1176. #define _cop(c) ((c)&BAM_CIGAR_MASK)
  1177. #define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
  1178. bam1_t *b = p->b;
  1179. bam1_core_t *c = &b->core;
  1180. uint32_t *cigar = bam_get_cigar(b);
  1181. int k;
  1182. // determine the current CIGAR operation
  1183. // fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
  1184. if (s->k == -1) { // never processed
  1185. if (c->n_cigar == 1) { // just one operation, save a loop
  1186. if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
  1187. } else { // find the first match or deletion
  1188. for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
  1189. int op = _cop(cigar[k]);
  1190. int l = _cln(cigar[k]);
  1191. if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break;
  1192. else if (op == BAM_CREF_SKIP) s->x += l;
  1193. else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
  1194. }
  1195. assert(k < c->n_cigar);
  1196. s->k = k;
  1197. }
  1198. } else { // the read has been processed before
  1199. int op, l = _cln(cigar[s->k]);
  1200. if (pos - s->x >= l) { // jump to the next operation
  1201. assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
  1202. op = _cop(cigar[s->k+1]);
  1203. if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
  1204. if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
  1205. s->x += l;
  1206. ++s->k;
  1207. } else { // find the next M/D/N/=/X
  1208. if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
  1209. s->x += l;
  1210. for (k = s->k + 1; k < c->n_cigar; ++k) {
  1211. op = _cop(cigar[k]), l = _cln(cigar[k]);
  1212. if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
  1213. else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
  1214. }
  1215. s->k = k;
  1216. }
  1217. assert(s->k < c->n_cigar); // otherwise a bug
  1218. } // else, do nothing
  1219. }
  1220. { // collect pileup information
  1221. int op, l;
  1222. op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
  1223. p->is_del = p->indel = p->is_refskip = 0;
  1224. if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
  1225. int op2 = _cop(cigar[s->k+1]);
  1226. int l2 = _cln(cigar[s->k+1]);
  1227. if (op2 == BAM_CDEL) p->indel = -(int)l2;
  1228. else if (op2 == BAM_CINS) p->indel = l2;
  1229. else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
  1230. int l3 = 0;
  1231. for (k = s->k + 2; k < c->n_cigar; ++k) {
  1232. op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
  1233. if (op2 == BAM_CINS) l3 += l2;
  1234. else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
  1235. }
  1236. if (l3 > 0) p->indel = l3;
  1237. }
  1238. }
  1239. if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
  1240. p->qpos = s->y + (pos - s->x);
  1241. } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
  1242. p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
  1243. p->is_refskip = (op == BAM_CREF_SKIP);
  1244. } // cannot be other operations; otherwise a bug
  1245. p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
  1246. }
  1247. return 1;
  1248. }
  1249. /***********************
  1250. *** Pileup iterator ***
  1251. ***********************/
  1252. // Dictionary of overlapping reads
  1253. KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
  1254. typedef khash_t(olap_hash) olap_hash_t;
  1255. struct __bam_plp_t {
  1256. mempool_t *mp;
  1257. lbnode_t *head, *tail, *dummy;
  1258. int32_t tid, pos, max_tid, max_pos;
  1259. int is_eof, max_plp, error, maxcnt;
  1260. uint64_t id;
  1261. bam_pileup1_t *plp;
  1262. // for the "auto" interface only
  1263. bam1_t *b;
  1264. bam_plp_auto_f func;
  1265. void *data;
  1266. olap_hash_t *overlaps;
  1267. };
  1268. bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
  1269. {
  1270. bam_plp_t iter;
  1271. iter = (bam_plp_t)calloc(1, sizeof(struct __bam_plp_t));
  1272. iter->mp = mp_init();
  1273. iter->head = iter->tail = mp_alloc(iter->mp);
  1274. iter->dummy = mp_alloc(iter->mp);
  1275. iter->max_tid = iter->max_pos = -1;
  1276. iter->maxcnt = 8000;
  1277. if (func) {
  1278. iter->func = func;
  1279. iter->data = data;
  1280. iter->b = bam_init1();
  1281. }
  1282. return iter;
  1283. }
  1284. void bam_plp_init_overlaps(bam_plp_t iter)
  1285. {
  1286. iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads
  1287. }
  1288. void bam_plp_destroy(bam_plp_t iter)
  1289. {
  1290. if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
  1291. mp_free(iter->mp, iter->dummy);
  1292. mp_free(iter->mp, iter->head);
  1293. if (iter->mp->cnt != 0)
  1294. fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
  1295. mp_destroy(iter->mp);
  1296. if (iter->b) bam_destroy1(iter->b);
  1297. free(iter->plp);
  1298. free(iter);
  1299. }
  1300. //---------------------------------
  1301. //--- Tweak overlapping reads
  1302. //---------------------------------
  1303. /**
  1304. * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index
  1305. * cigar_iref2iseq_next() - get the next CMATCH base
  1306. * @cigar: pointer to current cigar block (rw)
  1307. * @cigar_max: pointer just beyond the last cigar block
  1308. * @icig: position within the current cigar block (rw)
  1309. * @iseq: position in the sequence (rw)
  1310. * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
  1311. *
  1312. * Returns BAM_CMATCH or -1 when there is no more cigar to process or the requested position is not covered.
  1313. */
  1314. static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
  1315. {
  1316. int pos = *iref;
  1317. if ( pos < 0 ) return -1;
  1318. *icig = 0;
  1319. *iseq = 0;
  1320. *iref = 0;
  1321. while ( *cigar<cigar_max )
  1322. {
  1323. int cig = (**cigar) & BAM_CIGAR_MASK;
  1324. int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
  1325. if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
  1326. if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; }
  1327. if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
  1328. {
  1329. pos -= ncig;
  1330. if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
  1331. (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
  1332. continue;
  1333. }
  1334. if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
  1335. if ( cig==BAM_CDEL )
  1336. {
  1337. pos -= ncig;
  1338. if ( pos<0 ) pos = 0;
  1339. (*cigar)++; *icig = 0; *iref += ncig;
  1340. continue;
  1341. }
  1342. fprintf(stderr,"todo: cigar %d\n", cig);
  1343. assert(0);
  1344. }
  1345. *iseq = -1;
  1346. return -1;
  1347. }
  1348. static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref)
  1349. {
  1350. while ( *cigar < cigar_max )
  1351. {
  1352. int cig = (**cigar) & BAM_CIGAR_MASK;
  1353. int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
  1354. if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
  1355. {
  1356. if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; }
  1357. (*iseq)++; (*icig)++; (*iref)++;
  1358. return BAM_CMATCH;
  1359. }
  1360. if ( cig==BAM_CDEL ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; }
  1361. if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
  1362. if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
  1363. if ( cig==BAM_CHARD_CLIP ) { (*cigar)++; *icig = 0; continue; }
  1364. fprintf(stderr,"todo: cigar %d\n", cig);
  1365. assert(0);
  1366. }
  1367. *iseq = -1;
  1368. *iref = -1;
  1369. return -1;
  1370. }
  1371. static void tweak_overlap_quality(bam1_t *a, bam1_t *b)
  1372. {
  1373. uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar;
  1374. uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar;
  1375. int a_icig = 0, a_iseq = 0;
  1376. int b_icig = 0, b_iseq = 0;
  1377. uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
  1378. uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b);
  1379. int iref = b->core.pos;
  1380. int a_iref = iref - a->core.pos;
  1381. int b_iref = iref - b->core.pos;
  1382. int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
  1383. if ( a_ret<0 ) return; // no overlap
  1384. int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
  1385. if ( b_ret<0 ) return; // no overlap
  1386. #if DBG
  1387. fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar,
  1388. a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b)));
  1389. #endif
  1390. while ( 1 )
  1391. {
  1392. // Increment reference position
  1393. while ( a_iref>=0 && a_iref < iref - a->core.pos )
  1394. a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref);
  1395. if ( a_ret<0 ) break; // done
  1396. if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos;
  1397. while ( b_iref>=0 && b_iref < iref - b->core.pos )
  1398. b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref);
  1399. if ( b_ret<0 ) break; // done
  1400. if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos;
  1401. iref++;
  1402. if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels
  1403. if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) )
  1404. {
  1405. #if DBG
  1406. fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]);
  1407. #endif
  1408. // we are very confident about this base
  1409. int qual = a_qual[a_iseq] + b_qual[b_iseq];
  1410. a_qual[a_iseq] = qual>200 ? 200 : qual;
  1411. b_qual[b_iseq] = 0;
  1412. }
  1413. else
  1414. {
  1415. if ( a_qual[a_iseq] >= b_qual[b_iseq] )
  1416. {
  1417. #if DBG
  1418. fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower(seq_nt16_str[bam_seqi(b_seq,b_iseq)]));
  1419. #endif
  1420. a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch
  1421. b_qual[b_iseq] = 0;
  1422. }
  1423. else
  1424. {
  1425. #if DBG
  1426. fprintf(stderr,"[%c/%c]",tolower(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]);
  1427. #endif
  1428. b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
  1429. a_qual[a_iseq] = 0;
  1430. }
  1431. }
  1432. }
  1433. #if DBG
  1434. fprintf(stderr,"\n");
  1435. #endif
  1436. }
  1437. // Fix overlapping reads. Simple soft-clipping did not give good results.
  1438. // Lowering qualities of unwanted bases is more selective and works better.
  1439. //
  1440. static void overlap_push(bam_plp_t iter, lbnode_t *node)
  1441. {
  1442. if ( !iter->overlaps ) return;
  1443. // mapped mates and paired reads only
  1444. if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return;
  1445. // no overlap possible, unless some wild cigar
  1446. if ( abs(node->b.core.isize) >= 2*node->b.core.l_qseq ) return;
  1447. khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
  1448. if ( kitr==kh_end(iter->overlaps) )
  1449. {
  1450. int ret;
  1451. kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
  1452. kh_value(iter->overlaps, kitr) = node;
  1453. }
  1454. else
  1455. {
  1456. lbnode_t *a = kh_value(iter->overlaps, kitr);
  1457. tweak_overlap_quality(&a->b, &node->b);
  1458. kh_del(olap_hash, iter->overlaps, kitr);
  1459. assert(a->end-1 == a->s.end);
  1460. a->end = a->b.core.pos + bam_cigar2rlen(a->b.core.n_cigar, bam_get_cigar(&a->b));
  1461. a->s.end = a->end - 1;
  1462. }
  1463. }
  1464. static void overlap_remove(bam_plp_t iter, const bam1_t *b)
  1465. {
  1466. if ( !iter->overlaps ) return;
  1467. khiter_t kitr;
  1468. if ( b )
  1469. {
  1470. kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
  1471. if ( kitr!=kh_end(iter->overlaps) )
  1472. kh_del(olap_hash, iter->overlaps, kitr);
  1473. }
  1474. else
  1475. {
  1476. // remove all
  1477. for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
  1478. if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
  1479. }
  1480. }
  1481. // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
  1482. // pointer to the piled records if next position is ready or NULL if there is not enough records in the
  1483. // buffer yet (the current position is still the maximum position across all buffered reads).
  1484. const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
  1485. {
  1486. if (iter->error) { *_n_plp = -1; return 0; }
  1487. *_n_plp = 0;
  1488. if (iter->is_eof && iter->head->next == 0) return 0;
  1489. while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
  1490. int n_plp = 0;
  1491. lbnode_t *p, *q;
  1492. // write iter->plp at iter->pos
  1493. iter->dummy->next = iter->head;
  1494. for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) {
  1495. if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
  1496. overlap_remove(iter, &p->b);
  1497. q->next = p->next; mp_free(iter->mp, p); p = q;
  1498. } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
  1499. if (n_plp == iter->max_plp) { // then double the capacity
  1500. iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
  1501. iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
  1502. }
  1503. iter->plp[n_plp].b = &p->b;
  1504. if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
  1505. }
  1506. }
  1507. iter->head = iter->dummy->next; // dummy->next may be changed
  1508. *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
  1509. // update iter->tid and iter->pos
  1510. if (iter->head->next) {
  1511. if (iter->tid > iter->head->b.core.tid) {
  1512. fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
  1513. iter->error = 1;
  1514. *_n_plp = -1;
  1515. return 0;
  1516. }
  1517. }
  1518. if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
  1519. iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
  1520. } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
  1521. iter->pos = iter->head->beg; // jump to the next position
  1522. } else ++iter->pos; // scan contiguously
  1523. // return
  1524. if (n_plp) return iter->plp;
  1525. if (iter->is_eof && iter->head->next == 0) break;
  1526. }
  1527. return 0;
  1528. }
  1529. int bam_plp_push(bam_plp_t iter, const bam1_t *b)
  1530. {
  1531. if (iter->error) return -1;
  1532. if (b) {
  1533. if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
  1534. // Skip only unmapped reads here, any additional filtering must be done in iter->func
  1535. if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
  1536. if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
  1537. {
  1538. overlap_remove(iter, b);
  1539. return 0;
  1540. }
  1541. bam_copy1(&iter->tail->b, b);
  1542. overlap_push(iter, iter->tail);
  1543. #ifndef BAM_NO_ID
  1544. iter->tail->b.id = iter->id++;
  1545. #endif
  1546. iter->tail->beg = b->core.pos;
  1547. iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
  1548. iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
  1549. if (b->core.tid < iter->max_tid) {
  1550. fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
  1551. iter->error = 1;
  1552. return -1;
  1553. }
  1554. if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
  1555. fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
  1556. iter->error = 1;
  1557. return -1;
  1558. }
  1559. iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
  1560. if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
  1561. iter->tail->next = mp_alloc(iter->mp);
  1562. iter->tail = iter->tail->next;
  1563. }
  1564. } else iter->is_eof = 1;
  1565. return 0;
  1566. }
  1567. const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
  1568. {
  1569. const bam_pileup1_t *plp;
  1570. if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
  1571. if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
  1572. else { // no pileup line can be obtained; read alignments
  1573. *_n_plp = 0;
  1574. if (iter->is_eof) return 0;
  1575. int ret;
  1576. while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
  1577. if (bam_plp_push(iter, iter->b) < 0) {
  1578. *_n_plp = -1;
  1579. return 0;
  1580. }
  1581. if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
  1582. // otherwise no pileup line can be returned; read the next alignment.
  1583. }
  1584. if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
  1585. bam_plp_push(iter, 0);
  1586. if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
  1587. return 0;
  1588. }
  1589. }
  1590. void bam_plp_reset(bam_plp_t iter)
  1591. {
  1592. lbnode_t *p, *q;
  1593. iter->max_tid = iter->max_pos = -1;
  1594. iter->tid = iter->pos = 0;
  1595. iter->is_eof = 0;
  1596. for (p = iter->head; p->next;) {
  1597. overlap_remove(iter, NULL);
  1598. q = p->next;
  1599. mp_free(iter->mp, p);
  1600. p = q;
  1601. }
  1602. iter->head = iter->tail;
  1603. }
  1604. void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
  1605. {
  1606. iter->maxcnt = maxcnt;
  1607. }
  1608. /************************
  1609. *** Mpileup iterator ***
  1610. ************************/
  1611. struct __bam_mplp_t {
  1612. int n;
  1613. uint64_t min, *pos;
  1614. bam_plp_t *iter;
  1615. int *n_plp;
  1616. const bam_pileup1_t **plp;
  1617. };
  1618. bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
  1619. {
  1620. int i;
  1621. bam_mplp_t iter;
  1622. iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t));
  1623. iter->pos = (uint64_t*)calloc(n, 8);
  1624. iter->n_plp = (int*)calloc(n, sizeof(int));
  1625. iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
  1626. iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
  1627. iter->n = n;
  1628. iter->min = (uint64_t)-1;
  1629. for (i = 0; i < n; ++i) {
  1630. iter->iter[i] = bam_plp_init(func, data[i]);
  1631. iter->pos[i] = iter->min;
  1632. }
  1633. return iter;
  1634. }
  1635. void bam_mplp_init_overlaps(bam_mplp_t iter)
  1636. {
  1637. int i;
  1638. for (i = 0; i < iter->n; ++i)
  1639. bam_plp_init_overlaps(iter->iter[i]);
  1640. }
  1641. void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
  1642. {
  1643. int i;
  1644. for (i = 0; i < iter->n; ++i)
  1645. iter->iter[i]->maxcnt = maxcnt;
  1646. }
  1647. void bam_mplp_destroy(bam_mplp_t iter)
  1648. {
  1649. int i;
  1650. for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
  1651. free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
  1652. free(iter);
  1653. }
  1654. int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
  1655. {
  1656. int i, ret = 0;
  1657. uint64_t new_min = (uint64_t)-1;
  1658. for (i = 0; i < iter->n; ++i) {
  1659. if (iter->pos[i] == iter->min) {
  1660. int tid, pos;
  1661. iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
  1662. if ( iter->iter[i]->error ) return -1;
  1663. iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0;
  1664. }
  1665. if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
  1666. }
  1667. iter->min = new_min;
  1668. if (new_min == (uint64_t)-1) return 0;
  1669. *_tid = new_min>>32; *_pos = (uint32_t)new_min;
  1670. for (i = 0; i < iter->n; ++i) {
  1671. if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
  1672. n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
  1673. ++ret;
  1674. } else n_plp[i] = 0, plp[i] = 0;
  1675. }
  1676. return ret;
  1677. }
  1678. #endif // ~!defined(BAM_NO_PILEUP)