hts.c 38 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335
  1. #include <zlib.h>
  2. #include <ctype.h>
  3. #include <stdio.h>
  4. #include <string.h>
  5. #include <stdlib.h>
  6. #include <fcntl.h>
  7. #include <errno.h>
  8. #include <sys/stat.h>
  9. #include "htslib/bgzf.h"
  10. #include "htslib/hts.h"
  11. #include "cram/cram.h"
  12. #include "htslib/hfile.h"
  13. #include "version.h"
  14. #include "htslib/kseq.h"
  15. #define KS_BGZF 1
  16. #if KS_BGZF
  17. // bgzf now supports gzip-compressed files
  18. KSTREAM_INIT2(, BGZF*, bgzf_read, 65536)
  19. #else
  20. KSTREAM_INIT2(, gzFile, gzread, 16384)
  21. #endif
  22. #include "htslib/khash.h"
  23. KHASH_INIT2(s2i,, kh_cstr_t, int64_t, 1, kh_str_hash_func, kh_str_hash_equal)
  24. int hts_verbose = 3;
  25. const char *hts_version()
  26. {
  27. return HTS_VERSION;
  28. }
  29. const unsigned char seq_nt16_table[256] = {
  30. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  31. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  32. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  33. 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
  34. 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
  35. 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
  36. 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
  37. 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15,
  38. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  39. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  40. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  41. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  42. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  43. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  44. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
  45. 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
  46. };
  47. const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
  48. /**********************
  49. *** Basic file I/O ***
  50. **********************/
  51. // Decompress up to ten or so bytes by peeking at the file, which must be
  52. // positioned at the start of a GZIP block.
  53. static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize)
  54. {
  55. // Typically at most a couple of hundred bytes of input are required
  56. // to get a few bytes of output from inflate(), so hopefully this buffer
  57. // size suffices in general.
  58. unsigned char buffer[512];
  59. z_stream zs;
  60. ssize_t npeek = hpeek(fp, buffer, sizeof buffer);
  61. if (npeek < 0) return 0;
  62. zs.zalloc = NULL;
  63. zs.zfree = NULL;
  64. zs.next_in = buffer;
  65. zs.avail_in = npeek;
  66. zs.next_out = dest;
  67. zs.avail_out = destsize;
  68. if (inflateInit2(&zs, 31) != Z_OK) return 0;
  69. while (zs.total_out < destsize)
  70. if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break;
  71. destsize = zs.total_out;
  72. inflateEnd(&zs);
  73. return destsize;
  74. }
  75. // Returns whether the block contains any control characters, i.e.,
  76. // characters less than SPACE other than whitespace etc (ASCII BEL..CR).
  77. static int is_binary(unsigned char *s, size_t n)
  78. {
  79. size_t i;
  80. for (i = 0; i < n; i++)
  81. if (s[i] < 0x07 || (s[i] >= 0x0e && s[i] < 0x20)) return 1;
  82. return 0;
  83. }
  84. htsFile *hts_open(const char *fn, const char *mode)
  85. {
  86. htsFile *fp = NULL;
  87. hFILE *hfile = hopen(fn, mode);
  88. if (hfile == NULL) goto error;
  89. fp = (htsFile*)calloc(1, sizeof(htsFile));
  90. if (fp == NULL) goto error;
  91. fp->fn = strdup(fn);
  92. fp->is_be = ed_is_big();
  93. if (strchr(mode, 'r')) {
  94. unsigned char s[18];
  95. if (hpeek(hfile, s, 6) == 6 && memcmp(s, "CRAM", 4) == 0 &&
  96. s[4] >= 1 && s[4] <= 2 && s[5] <= 1) {
  97. fp->is_cram = 1;
  98. }
  99. else if (hpeek(hfile, s, 18) == 18 && s[0] == 0x1f && s[1] == 0x8b &&
  100. (s[3] & 4) && memcmp(&s[12], "BC\2\0", 4) == 0) {
  101. // The stream is BGZF-compressed. Decompress a few bytes to see
  102. // whether it's in a binary format (e.g., BAM or BCF, starting
  103. // with four bytes of magic including a control character) or is
  104. // a bgzipped SAM or VCF text file.
  105. fp->is_compressed = 1;
  106. if (is_binary(s, decompress_peek(hfile, s, 4))) fp->is_bin = 1;
  107. else fp->is_kstream = 1;
  108. }
  109. else if (hpeek(hfile, s, 2) == 2 && s[0] == 0x1f && s[1] == 0x8b) {
  110. // Plain GZIP header... so a gzipped text file.
  111. fp->is_compressed = 1;
  112. fp->is_kstream = 1;
  113. }
  114. else if (hpeek(hfile, s, 4) == 4 && is_binary(s, 4)) {
  115. // Binary format, but in a raw non-compressed form.
  116. fp->is_bin = 1;
  117. }
  118. else {
  119. fp->is_kstream = 1;
  120. }
  121. }
  122. else if (strchr(mode, 'w') || strchr(mode, 'a')) {
  123. fp->is_write = 1;
  124. if (strchr(mode, 'b')) fp->is_bin = 1;
  125. if (strchr(mode, 'c')) fp->is_cram = 1;
  126. if (strchr(mode, 'z')) fp->is_compressed = 1;
  127. else if (strchr(mode, 'u')) fp->is_compressed = 0;
  128. else fp->is_compressed = 2; // not set, default behaviour
  129. }
  130. else goto error;
  131. if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
  132. fp->fp.bgzf = bgzf_hopen(hfile, mode);
  133. if (fp->fp.bgzf == NULL) goto error;
  134. }
  135. else if (fp->is_cram) {
  136. fp->fp.cram = cram_dopen(hfile, fn, mode);
  137. if (fp->fp.cram == NULL) goto error;
  138. if (!fp->is_write)
  139. cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1);
  140. }
  141. else if (fp->is_kstream) {
  142. #if KS_BGZF
  143. BGZF *gzfp = bgzf_hopen(hfile, mode);
  144. #else
  145. // TODO Implement gzip hFILE adaptor
  146. hclose(hfile); // This won't work, especially for stdin
  147. gzFile gzfp = strcmp(fn, "-")? gzopen(fn, "rb") : gzdopen(fileno(stdin), "rb");
  148. #endif
  149. if (gzfp) fp->fp.voidp = ks_init(gzfp);
  150. else goto error;
  151. }
  152. else {
  153. fp->fp.hfile = hfile;
  154. }
  155. return fp;
  156. error:
  157. if (hts_verbose >= 2)
  158. fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fn);
  159. if (hfile)
  160. hclose_abruptly(hfile);
  161. if (fp) {
  162. free(fp->fn);
  163. free(fp->fn_aux);
  164. free(fp);
  165. }
  166. return NULL;
  167. }
  168. int hts_close(htsFile *fp)
  169. {
  170. int ret, save;
  171. if (fp->is_bin || (fp->is_write && fp->is_compressed==1)) {
  172. ret = bgzf_close(fp->fp.bgzf);
  173. } else if (fp->is_cram) {
  174. if (!fp->is_write) {
  175. switch (cram_eof(fp->fp.cram)) {
  176. case 0:
  177. fprintf(stderr, "[E::%s] Failed to decode sequence.\n", __func__);
  178. return -1;
  179. case 2:
  180. fprintf(stderr, "[W::%s] EOF marker is absent. The input is probably truncated.\n", __func__);
  181. break;
  182. default: /* case 1, expected EOF */
  183. break;
  184. }
  185. }
  186. ret = cram_close(fp->fp.cram);
  187. } else if (fp->is_kstream) {
  188. #if KS_BGZF
  189. BGZF *gzfp = ((kstream_t*)fp->fp.voidp)->f;
  190. ret = bgzf_close(gzfp);
  191. #else
  192. gzFile gzfp = ((kstream_t*)fp->fp.voidp)->f;
  193. ret = gzclose(gzfp);
  194. #endif
  195. ks_destroy((kstream_t*)fp->fp.voidp);
  196. } else {
  197. ret = hclose(fp->fp.hfile);
  198. }
  199. save = errno;
  200. free(fp->fn);
  201. free(fp->fn_aux);
  202. free(fp->line.s);
  203. free(fp);
  204. errno = save;
  205. return ret;
  206. }
  207. int hts_set_threads(htsFile *fp, int n)
  208. {
  209. // TODO Plug in CRAM and other threading
  210. if (fp->is_bin) {
  211. return bgzf_mt(fp->fp.bgzf, n, 256);
  212. }
  213. else return 0;
  214. }
  215. int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
  216. {
  217. free(fp->fn_aux);
  218. if (fn_aux) {
  219. fp->fn_aux = strdup(fn_aux);
  220. if (fp->fn_aux == NULL) return -1;
  221. }
  222. else fp->fn_aux = NULL;
  223. return 0;
  224. }
  225. // For VCF/BCF backward sweeper. Not exposing these functions because their
  226. // future is uncertain. Things will probably have to change with hFILE...
  227. BGZF *hts_get_bgzfp(htsFile *fp)
  228. {
  229. if ( fp->is_bin )
  230. return fp->fp.bgzf;
  231. else
  232. return ((kstream_t*)fp->fp.voidp)->f;
  233. }
  234. int hts_useek(htsFile *fp, long uoffset, int where)
  235. {
  236. if ( fp->is_bin )
  237. return bgzf_useek(fp->fp.bgzf, uoffset, where);
  238. else
  239. {
  240. ks_rewind((kstream_t*)fp->fp.voidp);
  241. ((kstream_t*)fp->fp.voidp)->seek_pos = uoffset;
  242. return bgzf_useek(((kstream_t*)fp->fp.voidp)->f, uoffset, where);
  243. }
  244. }
  245. long hts_utell(htsFile *fp)
  246. {
  247. if ( fp->is_bin )
  248. return bgzf_utell(fp->fp.bgzf);
  249. else
  250. return ((kstream_t*)fp->fp.voidp)->seek_pos;
  251. }
  252. int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
  253. {
  254. int ret, dret;
  255. ret = ks_getuntil((kstream_t*)fp->fp.voidp, delimiter, str, &dret);
  256. ++fp->lineno;
  257. return ret;
  258. }
  259. char **hts_readlist(const char *string, int is_file, int *_n)
  260. {
  261. int m = 0, n = 0, dret;
  262. char **s = 0;
  263. if ( is_file )
  264. {
  265. #if KS_BGZF
  266. BGZF *fp = bgzf_open(string, "r");
  267. #else
  268. gzFile fp = gzopen(string, "r");
  269. #endif
  270. if ( !fp ) return NULL;
  271. kstream_t *ks;
  272. kstring_t str;
  273. str.s = 0; str.l = str.m = 0;
  274. ks = ks_init(fp);
  275. while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0)
  276. {
  277. if (str.l == 0) continue;
  278. n++;
  279. hts_expand(char*,n,m,s);
  280. s[n-1] = strdup(str.s);
  281. }
  282. ks_destroy(ks);
  283. #if KS_BGZF
  284. bgzf_close(fp);
  285. #else
  286. gzclose(fp);
  287. #endif
  288. free(str.s);
  289. }
  290. else
  291. {
  292. const char *q = string, *p = string;
  293. while ( 1 )
  294. {
  295. if (*p == ',' || *p == 0)
  296. {
  297. n++;
  298. hts_expand(char*,n,m,s);
  299. s[n-1] = (char*)calloc(p - q + 1, 1);
  300. strncpy(s[n-1], q, p - q);
  301. q = p + 1;
  302. }
  303. if ( !*p ) break;
  304. p++;
  305. }
  306. }
  307. s = (char**)realloc(s, n * sizeof(char*));
  308. *_n = n;
  309. return s;
  310. }
  311. char **hts_readlines(const char *fn, int *_n)
  312. {
  313. int m = 0, n = 0, dret;
  314. char **s = 0;
  315. #if KS_BGZF
  316. BGZF *fp = bgzf_open(fn, "r");
  317. #else
  318. gzFile fp = gzopen(fn, "r");
  319. #endif
  320. if ( fp ) { // read from file
  321. kstream_t *ks;
  322. kstring_t str;
  323. str.s = 0; str.l = str.m = 0;
  324. ks = ks_init(fp);
  325. while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
  326. if (str.l == 0) continue;
  327. if (m == n) {
  328. m = m? m<<1 : 16;
  329. s = (char**)realloc(s, m * sizeof(char*));
  330. }
  331. s[n++] = strdup(str.s);
  332. }
  333. ks_destroy(ks);
  334. #if KS_BGZF
  335. bgzf_close(fp);
  336. #else
  337. gzclose(fp);
  338. #endif
  339. s = (char**)realloc(s, n * sizeof(char*));
  340. free(str.s);
  341. } else if (*fn == ':') { // read from string
  342. const char *q, *p;
  343. for (q = p = fn + 1;; ++p)
  344. if (*p == ',' || *p == 0) {
  345. if (m == n) {
  346. m = m? m<<1 : 16;
  347. s = (char**)realloc(s, m * sizeof(char*));
  348. }
  349. s[n] = (char*)calloc(p - q + 1, 1);
  350. strncpy(s[n++], q, p - q);
  351. q = p + 1;
  352. if (*p == 0) break;
  353. }
  354. } else return 0;
  355. s = (char**)realloc(s, n * sizeof(char*));
  356. *_n = n;
  357. return s;
  358. }
  359. int hts_file_type(const char *fname)
  360. {
  361. int len = strlen(fname);
  362. if ( !strcasecmp(".vcf.gz",fname+len-7) ) return FT_VCF_GZ;
  363. if ( !strcasecmp(".vcf",fname+len-4) ) return FT_VCF;
  364. if ( !strcasecmp(".bcf",fname+len-4) ) return FT_BCF_GZ;
  365. if ( !strcmp("-",fname) ) return FT_STDIN;
  366. // ... etc
  367. int fd = open(fname, O_RDONLY);
  368. if ( !fd ) return 0;
  369. uint8_t magic[5];
  370. if ( read(fd,magic,2)!=2 ) { close(fd); return 0; }
  371. if ( !strncmp((char*)magic,"##",2) ) { close(fd); return FT_VCF; }
  372. if ( !strncmp((char*)magic,"BCF",3) ) { close(fd); return FT_BCF; }
  373. close(fd);
  374. if ( magic[0]==0x1f && magic[1]==0x8b ) // compressed
  375. {
  376. BGZF *fp = bgzf_open(fname, "r");
  377. if ( !fp ) return 0;
  378. if ( bgzf_read(fp, magic, 3)!=3 ) { bgzf_close(fp); return 0; }
  379. bgzf_close(fp);
  380. if ( !strncmp((char*)magic,"##",2) ) return FT_VCF;
  381. if ( !strncmp((char*)magic,"BCF",3) ) return FT_BCF_GZ;
  382. }
  383. return 0;
  384. }
  385. /****************
  386. *** Indexing ***
  387. ****************/
  388. #define HTS_MIN_MARKER_DIST 0x10000
  389. // Finds the special meta bin
  390. // ((1<<(3 * n_lvls + 3)) - 1) / 7 + 1
  391. #define META_BIN(idx) ((idx)->n_bins + 1)
  392. #define pair64_lt(a,b) ((a).u < (b).u)
  393. #include "htslib/ksort.h"
  394. KSORT_INIT(_off, hts_pair64_t, pair64_lt)
  395. typedef struct {
  396. int32_t m, n;
  397. uint64_t loff;
  398. hts_pair64_t *list;
  399. } bins_t;
  400. #include "htslib/khash.h"
  401. KHASH_MAP_INIT_INT(bin, bins_t)
  402. typedef khash_t(bin) bidx_t;
  403. typedef struct {
  404. int32_t n, m;
  405. uint64_t *offset;
  406. } lidx_t;
  407. struct __hts_idx_t {
  408. int fmt, min_shift, n_lvls, n_bins;
  409. uint32_t l_meta;
  410. int32_t n, m;
  411. uint64_t n_no_coor;
  412. bidx_t **bidx;
  413. lidx_t *lidx;
  414. uint8_t *meta;
  415. struct {
  416. uint32_t last_bin, save_bin;
  417. int last_coor, last_tid, save_tid, finished;
  418. uint64_t last_off, save_off;
  419. uint64_t off_beg, off_end;
  420. uint64_t n_mapped, n_unmapped;
  421. } z; // keep internal states
  422. };
  423. static inline void insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end)
  424. {
  425. khint_t k;
  426. bins_t *l;
  427. int absent;
  428. k = kh_put(bin, b, bin, &absent);
  429. l = &kh_value(b, k);
  430. if (absent) {
  431. l->m = 1; l->n = 0;
  432. l->list = (hts_pair64_t*)calloc(l->m, 16);
  433. }
  434. if (l->n == l->m) {
  435. l->m <<= 1;
  436. l->list = (hts_pair64_t*)realloc(l->list, l->m * 16);
  437. }
  438. l->list[l->n].u = beg;
  439. l->list[l->n++].v = end;
  440. }
  441. static inline void insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift)
  442. {
  443. int i, beg, end;
  444. beg = _beg >> min_shift;
  445. end = (_end - 1) >> min_shift;
  446. if (l->m < end + 1) {
  447. int old_m = l->m;
  448. l->m = end + 1;
  449. kroundup32(l->m);
  450. l->offset = (uint64_t*)realloc(l->offset, l->m * 8);
  451. memset(l->offset + old_m, 0xff, 8 * (l->m - old_m)); // fill l->offset with (uint64_t)-1
  452. }
  453. if (beg == end) { // to save a loop in this case
  454. if (l->offset[beg] == (uint64_t)-1) l->offset[beg] = offset;
  455. } else {
  456. for (i = beg; i <= end; ++i)
  457. if (l->offset[i] == (uint64_t)-1) l->offset[i] = offset;
  458. }
  459. if (l->n < end + 1) l->n = end + 1;
  460. }
  461. hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
  462. {
  463. hts_idx_t *idx;
  464. idx = (hts_idx_t*)calloc(1, sizeof(hts_idx_t));
  465. if (idx == NULL) return NULL;
  466. idx->fmt = fmt;
  467. idx->min_shift = min_shift;
  468. idx->n_lvls = n_lvls;
  469. idx->n_bins = ((1<<(3 * n_lvls + 3)) - 1) / 7;
  470. idx->z.save_bin = idx->z.save_tid = idx->z.last_tid = idx->z.last_bin = 0xffffffffu;
  471. idx->z.save_off = idx->z.last_off = idx->z.off_beg = idx->z.off_end = offset0;
  472. idx->z.last_coor = 0xffffffffu;
  473. if (n) {
  474. idx->n = idx->m = n;
  475. idx->bidx = (bidx_t**)calloc(n, sizeof(bidx_t*));
  476. if (idx->bidx == NULL) { free(idx); return NULL; }
  477. idx->lidx = (lidx_t*) calloc(n, sizeof(lidx_t));
  478. if (idx->lidx == NULL) { free(idx->bidx); free(idx); return NULL; }
  479. }
  480. return idx;
  481. }
  482. static void update_loff(hts_idx_t *idx, int i, int free_lidx)
  483. {
  484. bidx_t *bidx = idx->bidx[i];
  485. lidx_t *lidx = &idx->lidx[i];
  486. khint_t k;
  487. int l;
  488. uint64_t offset0 = 0;
  489. if (bidx) {
  490. k = kh_get(bin, bidx, META_BIN(idx));
  491. if (k != kh_end(bidx))
  492. offset0 = kh_val(bidx, k).list[0].u;
  493. for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l)
  494. lidx->offset[l] = offset0;
  495. } else l = 1;
  496. for (; l < lidx->n; ++l) // fill missing values
  497. if (lidx->offset[l] == (uint64_t)-1)
  498. lidx->offset[l] = lidx->offset[l-1];
  499. if (bidx == 0) return;
  500. for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff
  501. if (kh_exist(bidx, k))
  502. {
  503. if ( kh_key(bidx, k) < idx->n_bins )
  504. {
  505. int bot_bin = hts_bin_bot(kh_key(bidx, k), idx->n_lvls);
  506. // disable linear index if bot_bin out of bounds
  507. kh_val(bidx, k).loff = bot_bin < lidx->n ? lidx->offset[bot_bin] : 0;
  508. }
  509. else
  510. kh_val(bidx, k).loff = 0;
  511. }
  512. if (free_lidx) {
  513. free(lidx->offset);
  514. lidx->m = lidx->n = 0;
  515. lidx->offset = 0;
  516. }
  517. }
  518. static void compress_binning(hts_idx_t *idx, int i)
  519. {
  520. bidx_t *bidx = idx->bidx[i];
  521. khint_t k;
  522. int l, m;
  523. if (bidx == 0) return;
  524. // merge a bin to its parent if the bin is too small
  525. for (l = idx->n_lvls; l > 0; --l) {
  526. unsigned start = hts_bin_first(l);
  527. for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
  528. bins_t *p, *q;
  529. if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins || kh_key(bidx, k) < start) continue;
  530. p = &kh_value(bidx, k);
  531. if (l < idx->n_lvls && p->n > 1) ks_introsort(_off, p->n, p->list);
  532. if ((p->list[p->n - 1].v>>16) - (p->list[0].u>>16) < HTS_MIN_MARKER_DIST) {
  533. khint_t kp;
  534. kp = kh_get(bin, bidx, hts_bin_parent(kh_key(bidx, k)));
  535. if (kp == kh_end(bidx)) continue;
  536. q = &kh_val(bidx, kp);
  537. if (q->n + p->n > q->m) {
  538. q->m = q->n + p->n;
  539. kroundup32(q->m);
  540. q->list = (hts_pair64_t*)realloc(q->list, q->m * 16);
  541. }
  542. memcpy(q->list + q->n, p->list, p->n * 16);
  543. q->n += p->n;
  544. free(p->list);
  545. kh_del(bin, bidx, k);
  546. }
  547. }
  548. }
  549. k = kh_get(bin, bidx, 0);
  550. if (k != kh_end(bidx)) ks_introsort(_off, kh_val(bidx, k).n, kh_val(bidx, k).list);
  551. // merge adjacent chunks that start from the same BGZF block
  552. for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
  553. bins_t *p;
  554. if (!kh_exist(bidx, k) || kh_key(bidx, k) >= idx->n_bins) continue;
  555. p = &kh_value(bidx, k);
  556. for (l = 1, m = 0; l < p->n; ++l) {
  557. if (p->list[m].v>>16 >= p->list[l].u>>16) {
  558. if (p->list[m].v < p->list[l].v) p->list[m].v = p->list[l].v;
  559. } else p->list[++m] = p->list[l];
  560. }
  561. p->n = m + 1;
  562. }
  563. }
  564. void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
  565. {
  566. int i;
  567. if (idx == NULL || idx->z.finished) return; // do not run this function on an empty index or multiple times
  568. if (idx->z.save_tid >= 0) {
  569. insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, final_offset);
  570. insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, final_offset);
  571. insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
  572. }
  573. for (i = 0; i < idx->n; ++i) {
  574. update_loff(idx, i, (idx->fmt == HTS_FMT_CSI));
  575. compress_binning(idx, i);
  576. }
  577. idx->z.finished = 1;
  578. }
  579. int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
  580. {
  581. int bin;
  582. if (tid >= idx->m) { // enlarge the index
  583. int32_t oldm = idx->m;
  584. idx->m = idx->m? idx->m<<1 : 2;
  585. idx->bidx = (bidx_t**)realloc(idx->bidx, idx->m * sizeof(bidx_t*));
  586. idx->lidx = (lidx_t*) realloc(idx->lidx, idx->m * sizeof(lidx_t));
  587. memset(&idx->bidx[oldm], 0, (idx->m - oldm) * sizeof(bidx_t*));
  588. memset(&idx->lidx[oldm], 0, (idx->m - oldm) * sizeof(lidx_t));
  589. }
  590. if (idx->n < tid + 1) idx->n = tid + 1;
  591. if (idx->z.finished) return 0;
  592. if (idx->z.last_tid != tid || (idx->z.last_tid >= 0 && tid < 0)) { // change of chromosome
  593. if ( tid>=0 && idx->n_no_coor )
  594. {
  595. if (hts_verbose >= 1) fprintf(stderr,"[E::%s] NO_COOR reads not in a single block at the end %d %d\n", __func__, tid,idx->z.last_tid);
  596. return -1;
  597. }
  598. if (tid>=0 && idx->bidx[tid] != 0)
  599. {
  600. if (hts_verbose >= 1) fprintf(stderr, "[E::%s] chromosome blocks not continuous\n", __func__);
  601. return -1;
  602. }
  603. idx->z.last_tid = tid;
  604. idx->z.last_bin = 0xffffffffu;
  605. } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order
  606. if (hts_verbose >= 1) fprintf(stderr, "[E::%s] unsorted positions\n", __func__);
  607. return -1;
  608. }
  609. if ( tid>=0 )
  610. {
  611. if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin);
  612. if ( is_mapped)
  613. insert_to_l(&idx->lidx[tid], beg, end, idx->z.last_off, idx->min_shift); // last_off points to the start of the current record
  614. }
  615. else idx->n_no_coor++;
  616. bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls);
  617. if ((int)idx->z.last_bin != bin) { // then possibly write the binning index
  618. if (idx->z.save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
  619. insert_to_b(idx->bidx[idx->z.save_tid], idx->z.save_bin, idx->z.save_off, idx->z.last_off);
  620. if (idx->z.last_bin == 0xffffffffu && idx->z.save_bin != 0xffffffffu) { // change of chr; keep meta information
  621. idx->z.off_end = idx->z.last_off;
  622. insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.off_beg, idx->z.off_end);
  623. insert_to_b(idx->bidx[idx->z.save_tid], META_BIN(idx), idx->z.n_mapped, idx->z.n_unmapped);
  624. idx->z.n_mapped = idx->z.n_unmapped = 0;
  625. idx->z.off_beg = idx->z.off_end;
  626. }
  627. idx->z.save_off = idx->z.last_off;
  628. idx->z.save_bin = idx->z.last_bin = bin;
  629. idx->z.save_tid = tid;
  630. if (tid < 0) { // come to the end of the records having coordinates
  631. hts_idx_finish(idx, offset);
  632. return 0;
  633. }
  634. }
  635. if (is_mapped) ++idx->z.n_mapped;
  636. else ++idx->z.n_unmapped;
  637. idx->z.last_off = offset;
  638. idx->z.last_coor = beg;
  639. return 0;
  640. }
  641. void hts_idx_destroy(hts_idx_t *idx)
  642. {
  643. khint_t k;
  644. int i;
  645. if (idx == 0) return;
  646. // For HTS_FMT_CRAI, idx actually points to a different type -- see sam.c
  647. if (idx->fmt == HTS_FMT_CRAI) { free(idx); return; }
  648. for (i = 0; i < idx->m; ++i) {
  649. bidx_t *bidx = idx->bidx[i];
  650. free(idx->lidx[i].offset);
  651. if (bidx == 0) continue;
  652. for (k = kh_begin(bidx); k != kh_end(bidx); ++k)
  653. if (kh_exist(bidx, k))
  654. free(kh_value(bidx, k).list);
  655. kh_destroy(bin, bidx);
  656. }
  657. free(idx->bidx); free(idx->lidx); free(idx->meta);
  658. free(idx);
  659. }
  660. static inline long idx_read(int is_bgzf, void *fp, void *buf, long l)
  661. {
  662. if (is_bgzf) return bgzf_read((BGZF*)fp, buf, l);
  663. else return (long)fread(buf, 1, l, (FILE*)fp);
  664. }
  665. static inline long idx_write(int is_bgzf, void *fp, const void *buf, long l)
  666. {
  667. if (is_bgzf) return bgzf_write((BGZF*)fp, buf, l);
  668. else return (long)fwrite(buf, 1, l, (FILE*)fp);
  669. }
  670. static inline void swap_bins(bins_t *p)
  671. {
  672. int i;
  673. for (i = 0; i < p->n; ++i) {
  674. ed_swap_8p(&p->list[i].u);
  675. ed_swap_8p(&p->list[i].v);
  676. }
  677. }
  678. static void hts_idx_save_core(const hts_idx_t *idx, void *fp, int fmt)
  679. {
  680. int32_t i, size, is_be;
  681. int is_bgzf = (fmt != HTS_FMT_BAI);
  682. is_be = ed_is_big();
  683. if (is_be) {
  684. uint32_t x = idx->n;
  685. idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
  686. } else idx_write(is_bgzf, fp, &idx->n, 4);
  687. if (fmt == HTS_FMT_TBI && idx->l_meta) idx_write(is_bgzf, fp, idx->meta, idx->l_meta);
  688. for (i = 0; i < idx->n; ++i) {
  689. khint_t k;
  690. bidx_t *bidx = idx->bidx[i];
  691. lidx_t *lidx = &idx->lidx[i];
  692. // write binning index
  693. size = bidx? kh_size(bidx) : 0;
  694. if (is_be) { // big endian
  695. uint32_t x = size;
  696. idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
  697. } else idx_write(is_bgzf, fp, &size, 4);
  698. if (bidx == 0) goto write_lidx;
  699. for (k = kh_begin(bidx); k != kh_end(bidx); ++k) {
  700. bins_t *p;
  701. if (!kh_exist(bidx, k)) continue;
  702. p = &kh_value(bidx, k);
  703. if (is_be) { // big endian
  704. uint32_t x;
  705. x = kh_key(bidx, k); idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
  706. if (fmt == HTS_FMT_CSI) {
  707. uint64_t y = kh_val(bidx, k).loff;
  708. idx_write(is_bgzf, fp, ed_swap_4p(&y), 8);
  709. }
  710. x = p->n; idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
  711. swap_bins(p);
  712. idx_write(is_bgzf, fp, p->list, 16 * p->n);
  713. swap_bins(p);
  714. } else {
  715. idx_write(is_bgzf, fp, &kh_key(bidx, k), 4);
  716. if (fmt == HTS_FMT_CSI) idx_write(is_bgzf, fp, &kh_val(bidx, k).loff, 8);
  717. //int j;for(j=0;j<p->n;++j)fprintf(stderr,"%d,%llx,%d,%llx:%llx\n",kh_key(bidx,k),kh_val(bidx, k).loff,j,p->list[j].u,p->list[j].v);
  718. idx_write(is_bgzf, fp, &p->n, 4);
  719. idx_write(is_bgzf, fp, p->list, p->n << 4);
  720. }
  721. }
  722. write_lidx:
  723. if (fmt != HTS_FMT_CSI) {
  724. if (is_be) {
  725. int32_t x = lidx->n;
  726. idx_write(is_bgzf, fp, ed_swap_4p(&x), 4);
  727. for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
  728. idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
  729. for (x = 0; x < lidx->n; ++x) ed_swap_8p(&lidx->offset[x]);
  730. } else {
  731. idx_write(is_bgzf, fp, &lidx->n, 4);
  732. idx_write(is_bgzf, fp, lidx->offset, lidx->n << 3);
  733. }
  734. }
  735. }
  736. if (is_be) { // write the number of reads without coordinates
  737. uint64_t x = idx->n_no_coor;
  738. idx_write(is_bgzf, fp, &x, 8);
  739. } else idx_write(is_bgzf, fp, &idx->n_no_coor, 8);
  740. }
  741. void hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
  742. {
  743. char *fnidx;
  744. fnidx = (char*)calloc(1, strlen(fn) + 5);
  745. strcpy(fnidx, fn);
  746. if (fmt == HTS_FMT_CSI) {
  747. BGZF *fp;
  748. uint32_t x[3];
  749. int is_be, i;
  750. is_be = ed_is_big();
  751. fp = bgzf_open(strcat(fnidx, ".csi"), "w");
  752. bgzf_write(fp, "CSI\1", 4);
  753. x[0] = idx->min_shift; x[1] = idx->n_lvls; x[2] = idx->l_meta;
  754. if (is_be) {
  755. for (i = 0; i < 3; ++i)
  756. bgzf_write(fp, ed_swap_4p(&x[i]), 4);
  757. } else bgzf_write(fp, &x, 12);
  758. if (idx->l_meta) bgzf_write(fp, idx->meta, idx->l_meta);
  759. hts_idx_save_core(idx, fp, HTS_FMT_CSI);
  760. bgzf_close(fp);
  761. } else if (fmt == HTS_FMT_TBI) {
  762. BGZF *fp;
  763. fp = bgzf_open(strcat(fnidx, ".tbi"), "w");
  764. bgzf_write(fp, "TBI\1", 4);
  765. hts_idx_save_core(idx, fp, HTS_FMT_TBI);
  766. bgzf_close(fp);
  767. } else if (fmt == HTS_FMT_BAI) {
  768. FILE *fp;
  769. fp = fopen(strcat(fnidx, ".bai"), "w");
  770. fwrite("BAI\1", 1, 4, fp);
  771. hts_idx_save_core(idx, fp, HTS_FMT_BAI);
  772. fclose(fp);
  773. } else abort();
  774. free(fnidx);
  775. }
  776. static int hts_idx_load_core(hts_idx_t *idx, void *fp, int fmt)
  777. {
  778. int32_t i, n, is_be;
  779. int is_bgzf = (fmt != HTS_FMT_BAI);
  780. is_be = ed_is_big();
  781. if (idx == NULL) return -4;
  782. for (i = 0; i < idx->n; ++i) {
  783. bidx_t *h;
  784. lidx_t *l = &idx->lidx[i];
  785. uint32_t key;
  786. int j, absent;
  787. bins_t *p;
  788. h = idx->bidx[i] = kh_init(bin);
  789. if (idx_read(is_bgzf, fp, &n, 4) != 4) return -1;
  790. if (is_be) ed_swap_4p(&n);
  791. for (j = 0; j < n; ++j) {
  792. khint_t k;
  793. if (idx_read(is_bgzf, fp, &key, 4) != 4) return -1;
  794. if (is_be) ed_swap_4p(&key);
  795. k = kh_put(bin, h, key, &absent);
  796. if (absent <= 0) return -3; // Duplicate bin number
  797. p = &kh_val(h, k);
  798. if (fmt == HTS_FMT_CSI) {
  799. if (idx_read(is_bgzf, fp, &p->loff, 8) != 8) return -1;
  800. if (is_be) ed_swap_8p(&p->loff);
  801. } else p->loff = 0;
  802. if (idx_read(is_bgzf, fp, &p->n, 4) != 4) return -1;
  803. if (is_be) ed_swap_4p(&p->n);
  804. p->m = p->n;
  805. p->list = (hts_pair64_t*)malloc(p->m * 16);
  806. if (p->list == NULL) return -2;
  807. if (idx_read(is_bgzf, fp, p->list, p->n<<4) != p->n<<4) return -1;
  808. if (is_be) swap_bins(p);
  809. }
  810. if (fmt != HTS_FMT_CSI) { // load linear index
  811. int j;
  812. if (idx_read(is_bgzf, fp, &l->n, 4) != 4) return -1;
  813. if (is_be) ed_swap_4p(&l->n);
  814. l->m = l->n;
  815. l->offset = (uint64_t*)malloc(l->n << 3);
  816. if (l->offset == NULL) return -2;
  817. if (idx_read(is_bgzf, fp, l->offset, l->n << 3) != l->n << 3) return -1;
  818. if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]);
  819. for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix
  820. if (l->offset[j] == 0) l->offset[j] = l->offset[j-1];
  821. update_loff(idx, i, 1);
  822. }
  823. }
  824. if (idx_read(is_bgzf, fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0;
  825. if (is_be) ed_swap_8p(&idx->n_no_coor);
  826. return 0;
  827. }
  828. hts_idx_t *hts_idx_load_local(const char *fn, int fmt)
  829. {
  830. uint8_t magic[4];
  831. int i, is_be;
  832. hts_idx_t *idx = NULL;
  833. is_be = ed_is_big();
  834. if (fmt == HTS_FMT_CSI) {
  835. BGZF *fp;
  836. uint32_t x[3], n;
  837. uint8_t *meta = 0;
  838. if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
  839. if (bgzf_read(fp, magic, 4) != 4) goto csi_fail;
  840. if (memcmp(magic, "CSI\1", 4) != 0) goto csi_fail;
  841. if (bgzf_read(fp, x, 12) != 12) goto csi_fail;
  842. if (is_be) for (i = 0; i < 3; ++i) ed_swap_4p(&x[i]);
  843. if (x[2]) {
  844. if ((meta = (uint8_t*)malloc(x[2])) == NULL) goto csi_fail;
  845. if (bgzf_read(fp, meta, x[2]) != x[2]) goto csi_fail;
  846. }
  847. if (bgzf_read(fp, &n, 4) != 4) goto csi_fail;
  848. if (is_be) ed_swap_4p(&n);
  849. if ((idx = hts_idx_init(n, fmt, 0, x[0], x[1])) == NULL) goto csi_fail;
  850. idx->l_meta = x[2];
  851. idx->meta = meta;
  852. meta = NULL;
  853. if (hts_idx_load_core(idx, fp, HTS_FMT_CSI) < 0) goto csi_fail;
  854. bgzf_close(fp);
  855. return idx;
  856. csi_fail:
  857. bgzf_close(fp);
  858. hts_idx_destroy(idx);
  859. free(meta);
  860. return NULL;
  861. } else if (fmt == HTS_FMT_TBI) {
  862. BGZF *fp;
  863. uint32_t x[8];
  864. if ((fp = bgzf_open(fn, "r")) == 0) return NULL;
  865. if (bgzf_read(fp, magic, 4) != 4) goto tbi_fail;
  866. if (memcmp(magic, "TBI\1", 4) != 0) goto tbi_fail;
  867. if (bgzf_read(fp, x, 32) != 32) goto tbi_fail;
  868. if (is_be) for (i = 0; i < 8; ++i) ed_swap_4p(&x[i]);
  869. if ((idx = hts_idx_init(x[0], fmt, 0, 14, 5)) == NULL) goto tbi_fail;
  870. idx->l_meta = 28 + x[7];
  871. if ((idx->meta = (uint8_t*)malloc(idx->l_meta)) == NULL) goto tbi_fail;
  872. memcpy(idx->meta, &x[1], 28);
  873. if (bgzf_read(fp, idx->meta + 28, x[7]) != x[7]) goto tbi_fail;
  874. if (hts_idx_load_core(idx, fp, HTS_FMT_TBI) < 0) goto tbi_fail;
  875. bgzf_close(fp);
  876. return idx;
  877. tbi_fail:
  878. bgzf_close(fp);
  879. hts_idx_destroy(idx);
  880. return NULL;
  881. } else if (fmt == HTS_FMT_BAI) {
  882. uint32_t n;
  883. FILE *fp;
  884. if ((fp = fopen(fn, "rb")) == 0) return NULL;
  885. if (fread(magic, 1, 4, fp) != 4) goto bai_fail;
  886. if (memcmp(magic, "BAI\1", 4) != 0) goto bai_fail;
  887. if (fread(&n, 4, 1, fp) != 1) goto bai_fail;
  888. if (is_be) ed_swap_4p(&n);
  889. idx = hts_idx_init(n, fmt, 0, 14, 5);
  890. if (hts_idx_load_core(idx, fp, HTS_FMT_BAI) < 0) goto bai_fail;
  891. fclose(fp);
  892. return idx;
  893. bai_fail:
  894. fclose(fp);
  895. hts_idx_destroy(idx);
  896. return NULL;
  897. } else abort();
  898. }
  899. void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
  900. {
  901. if (idx->meta) free(idx->meta);
  902. idx->l_meta = l_meta;
  903. if (is_copy) {
  904. idx->meta = (uint8_t*)malloc(l_meta);
  905. memcpy(idx->meta, meta, l_meta);
  906. } else idx->meta = meta;
  907. }
  908. uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
  909. {
  910. *l_meta = idx->l_meta;
  911. return idx->meta;
  912. }
  913. const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr)
  914. {
  915. if ( !idx->n )
  916. {
  917. *n = 0;
  918. return NULL;
  919. }
  920. int tid = 0, i;
  921. const char **names = (const char**) calloc(idx->n,sizeof(const char*));
  922. for (i=0; i<idx->n; i++)
  923. {
  924. bidx_t *bidx = idx->bidx[i];
  925. if ( !bidx ) continue;
  926. names[tid++] = getid(hdr,i);
  927. }
  928. *n = tid;
  929. return names;
  930. }
  931. int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped)
  932. {
  933. if ( idx->fmt == HTS_FMT_CRAI ) {
  934. *mapped = 0; *unmapped = 0;
  935. return -1;
  936. }
  937. bidx_t *h = idx->bidx[tid];
  938. khint_t k = kh_get(bin, h, META_BIN(idx));
  939. if (k != kh_end(h)) {
  940. *mapped = kh_val(h, k).list[1].u;
  941. *unmapped = kh_val(h, k).list[1].v;
  942. return 0;
  943. } else {
  944. *mapped = 0; *unmapped = 0;
  945. return -1;
  946. }
  947. }
  948. uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
  949. {
  950. return idx->n_no_coor;
  951. }
  952. /****************
  953. *** Iterator ***
  954. ****************/
  955. static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
  956. {
  957. int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
  958. if (beg >= end) return 0;
  959. if (end >= 1LL<<s) end = 1LL<<s;
  960. for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
  961. int b, e, n, i;
  962. b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
  963. if (itr->bins.n + n > itr->bins.m) {
  964. itr->bins.m = itr->bins.n + n;
  965. kroundup32(itr->bins.m);
  966. itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
  967. }
  968. for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
  969. }
  970. return itr->bins.n;
  971. }
  972. hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
  973. {
  974. int i, n_off, l, bin;
  975. hts_pair64_t *off;
  976. khint_t k;
  977. bidx_t *bidx;
  978. uint64_t min_off;
  979. hts_itr_t *iter = 0;
  980. if (tid < 0) {
  981. int finished0 = 0;
  982. uint64_t off0 = (uint64_t)-1;
  983. khint_t k;
  984. switch (tid) {
  985. case HTS_IDX_START:
  986. // Find the smallest offset, note that sequence ids may not be ordered sequentially
  987. for (i=0; i<idx->n; i++)
  988. {
  989. bidx = idx->bidx[i];
  990. k = kh_get(bin, bidx, META_BIN(idx));
  991. if (k == kh_end(bidx)) continue;
  992. if ( off0 > kh_val(bidx, k).list[0].u ) off0 = kh_val(bidx, k).list[0].u;
  993. }
  994. if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
  995. break;
  996. case HTS_IDX_NOCOOR:
  997. if ( idx->n>0 )
  998. {
  999. bidx = idx->bidx[idx->n - 1];
  1000. k = kh_get(bin, bidx, META_BIN(idx));
  1001. if (k != kh_end(bidx)) off0 = kh_val(bidx, k).list[0].v;
  1002. }
  1003. if ( off0==(uint64_t)-1 && idx->n_no_coor ) off0 = 0; // only no-coor reads in this bam
  1004. break;
  1005. case HTS_IDX_REST:
  1006. off0 = 0;
  1007. break;
  1008. case HTS_IDX_NONE:
  1009. finished0 = 1;
  1010. off0 = 0;
  1011. break;
  1012. default:
  1013. return 0;
  1014. }
  1015. if (off0 != (uint64_t)-1) {
  1016. iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
  1017. iter->read_rest = 1;
  1018. iter->finished = finished0;
  1019. iter->curr_off = off0;
  1020. iter->readrec = readrec;
  1021. return iter;
  1022. } else return 0;
  1023. }
  1024. if (beg < 0) beg = 0;
  1025. if (end < beg) return 0;
  1026. if ((bidx = idx->bidx[tid]) == 0) return 0;
  1027. iter = (hts_itr_t*)calloc(1, sizeof(hts_itr_t));
  1028. iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1;
  1029. iter->readrec = readrec;
  1030. // compute min_off
  1031. bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift);
  1032. do {
  1033. int first;
  1034. k = kh_get(bin, bidx, bin);
  1035. if (k != kh_end(bidx)) break;
  1036. first = (hts_bin_parent(bin)<<3) + 1;
  1037. if (bin > first) --bin;
  1038. else bin = hts_bin_parent(bin);
  1039. } while (bin);
  1040. if (bin == 0) k = kh_get(bin, bidx, bin);
  1041. min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;
  1042. // retrieve bins
  1043. reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
  1044. for (i = n_off = 0; i < iter->bins.n; ++i)
  1045. if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
  1046. n_off += kh_value(bidx, k).n;
  1047. if (n_off == 0) return iter;
  1048. off = (hts_pair64_t*)calloc(n_off, 16);
  1049. for (i = n_off = 0; i < iter->bins.n; ++i) {
  1050. if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) {
  1051. int j;
  1052. bins_t *p = &kh_value(bidx, k);
  1053. for (j = 0; j < p->n; ++j)
  1054. if (p->list[j].v > min_off) off[n_off++] = p->list[j];
  1055. }
  1056. }
  1057. if (n_off == 0) {
  1058. free(off); return iter;
  1059. }
  1060. ks_introsort(_off, n_off, off);
  1061. // resolve completely contained adjacent blocks
  1062. for (i = 1, l = 0; i < n_off; ++i)
  1063. if (off[l].v < off[i].v) off[++l] = off[i];
  1064. n_off = l + 1;
  1065. // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
  1066. for (i = 1; i < n_off; ++i)
  1067. if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
  1068. // merge adjacent blocks
  1069. for (i = 1, l = 0; i < n_off; ++i) {
  1070. if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
  1071. else off[++l] = off[i];
  1072. }
  1073. n_off = l + 1;
  1074. iter->n_off = n_off; iter->off = off;
  1075. return iter;
  1076. }
  1077. void hts_itr_destroy(hts_itr_t *iter)
  1078. {
  1079. if (iter) { free(iter->off); free(iter->bins.a); free(iter); }
  1080. }
  1081. const char *hts_parse_reg(const char *s, int *beg, int *end)
  1082. {
  1083. int i, k, l, name_end;
  1084. *beg = *end = -1;
  1085. name_end = l = strlen(s);
  1086. // determine the sequence name
  1087. for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
  1088. if (i >= 0) name_end = i;
  1089. if (name_end < l) { // check if this is really the end
  1090. int n_hyphen = 0;
  1091. for (i = name_end + 1; i < l; ++i) {
  1092. if (s[i] == '-') ++n_hyphen;
  1093. else if (!isdigit(s[i]) && s[i] != ',') break;
  1094. }
  1095. if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
  1096. }
  1097. // parse the interval
  1098. if (name_end < l) {
  1099. char *tmp;
  1100. tmp = (char*)alloca(l - name_end + 1);
  1101. for (i = name_end + 1, k = 0; i < l; ++i)
  1102. if (s[i] != ',') tmp[k++] = s[i];
  1103. tmp[k] = 0;
  1104. if ((*beg = strtol(tmp, &tmp, 10) - 1) < 0) *beg = 0;
  1105. *end = *tmp? strtol(tmp + 1, &tmp, 10) : 1<<29;
  1106. if (*beg > *end) name_end = l;
  1107. }
  1108. if (name_end == l) *beg = 0, *end = 1<<29;
  1109. return s + name_end;
  1110. }
  1111. hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
  1112. {
  1113. int tid, beg, end;
  1114. char *q, *tmp;
  1115. if (strcmp(reg, ".") == 0)
  1116. return itr_query(idx, HTS_IDX_START, 0, 1<<29, readrec);
  1117. else if (strcmp(reg, "*") != 0) {
  1118. q = (char*)hts_parse_reg(reg, &beg, &end);
  1119. tmp = (char*)alloca(q - reg + 1);
  1120. strncpy(tmp, reg, q - reg);
  1121. tmp[q - reg] = 0;
  1122. if ((tid = getid(hdr, tmp)) < 0)
  1123. tid = getid(hdr, reg);
  1124. if (tid < 0) return 0;
  1125. return itr_query(idx, tid, beg, end, readrec);
  1126. } else return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
  1127. }
  1128. int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
  1129. {
  1130. int ret, tid, beg, end;
  1131. if (iter == NULL || iter->finished) return -1;
  1132. if (iter->read_rest) {
  1133. if (iter->curr_off) { // seek to the start
  1134. bgzf_seek(fp, iter->curr_off, SEEK_SET);
  1135. iter->curr_off = 0; // only seek once
  1136. }
  1137. ret = iter->readrec(fp, data, r, &tid, &beg, &end);
  1138. if (ret < 0) iter->finished = 1;
  1139. return ret;
  1140. }
  1141. if (iter->off == 0) return -1;
  1142. for (;;) {
  1143. if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
  1144. if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
  1145. if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
  1146. bgzf_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
  1147. iter->curr_off = bgzf_tell(fp);
  1148. }
  1149. ++iter->i;
  1150. }
  1151. if ((ret = iter->readrec(fp, data, r, &tid, &beg, &end)) >= 0) {
  1152. iter->curr_off = bgzf_tell(fp);
  1153. if (tid != iter->tid || beg >= iter->end) { // no need to proceed
  1154. ret = -1; break;
  1155. } else if (end > iter->beg && iter->end > beg) return ret;
  1156. } else break; // end of file or error
  1157. }
  1158. iter->finished = 1;
  1159. return ret;
  1160. }
  1161. /**********************
  1162. *** Retrieve index ***
  1163. **********************/
  1164. static char *test_and_fetch(const char *fn)
  1165. {
  1166. FILE *fp;
  1167. // FIXME Use is_remote_scheme() helper that's true for ftp/http/irods/etc
  1168. if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) {
  1169. const int buf_size = 1 * 1024 * 1024;
  1170. hFILE *fp_remote;
  1171. uint8_t *buf;
  1172. int l;
  1173. const char *p;
  1174. for (p = fn + strlen(fn) - 1; p >= fn; --p)
  1175. if (*p == '/') break;
  1176. ++p; // p now points to the local file name
  1177. if ((fp_remote = hopen(fn, "r")) == 0) {
  1178. if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to open remote file '%s'\n", __func__, fn);
  1179. return 0;
  1180. }
  1181. if ((fp = fopen(p, "w")) == 0) {
  1182. if (hts_verbose >= 1) fprintf(stderr, "[E::%s] fail to create file '%s' in the working directory\n", __func__, p);
  1183. hclose_abruptly(fp_remote);
  1184. return 0;
  1185. }
  1186. if (hts_verbose >= 3) fprintf(stderr, "[M::%s] downloading file '%s' to local directory\n", __func__, fn);
  1187. buf = (uint8_t*)calloc(buf_size, 1);
  1188. while ((l = hread(fp_remote, buf, buf_size)) > 0) fwrite(buf, 1, l, fp);
  1189. free(buf);
  1190. fclose(fp);
  1191. if (hclose(fp_remote) != 0) fprintf(stderr, "[E::%s] fail to close remote file '%s'\n", __func__, fn);
  1192. return (char*)p;
  1193. } else {
  1194. if ((fp = fopen(fn, "rb")) == 0) return 0;
  1195. fclose(fp);
  1196. return (char*)fn;
  1197. }
  1198. }
  1199. char *hts_idx_getfn(const char *fn, const char *ext)
  1200. {
  1201. int i, l_fn, l_ext;
  1202. char *fnidx, *ret;
  1203. l_fn = strlen(fn); l_ext = strlen(ext);
  1204. fnidx = (char*)calloc(l_fn + l_ext + 1, 1);
  1205. strcpy(fnidx, fn); strcpy(fnidx + l_fn, ext);
  1206. if ((ret = test_and_fetch(fnidx)) == 0) {
  1207. for (i = l_fn - 1; i > 0; --i)
  1208. if (fnidx[i] == '.') break;
  1209. strcpy(fnidx + i, ext);
  1210. ret = test_and_fetch(fnidx);
  1211. }
  1212. if (ret == 0) {
  1213. free(fnidx);
  1214. return 0;
  1215. }
  1216. l_fn = strlen(ret);
  1217. memmove(fnidx, ret, l_fn + 1);
  1218. return fnidx;
  1219. }
  1220. hts_idx_t *hts_idx_load(const char *fn, int fmt)
  1221. {
  1222. char *fnidx;
  1223. hts_idx_t *idx;
  1224. fnidx = hts_idx_getfn(fn, ".csi");
  1225. if (fnidx) fmt = HTS_FMT_CSI;
  1226. else fnidx = hts_idx_getfn(fn, fmt == HTS_FMT_BAI? ".bai" : ".tbi");
  1227. if (fnidx == 0) return 0;
  1228. // Check that the index file is up to date, the main file might have changed
  1229. struct stat stat_idx,stat_main;
  1230. if ( !stat(fn, &stat_main) && !stat(fnidx, &stat_idx) )
  1231. {
  1232. if ( stat_idx.st_mtime < stat_main.st_mtime )
  1233. fprintf(stderr, "Warning: The index file is older than the data file: %s\n", fnidx);
  1234. }
  1235. idx = hts_idx_load_local(fnidx, fmt);
  1236. free(fnidx);
  1237. return idx;
  1238. }