tbx.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. #include <stdlib.h>
  2. #include <string.h>
  3. #include <ctype.h>
  4. #include <stdio.h>
  5. #include <assert.h>
  6. #include "htslib/tbx.h"
  7. #include "htslib/bgzf.h"
  8. #include "htslib/khash.h"
  9. KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
  10. tbx_conf_t tbx_conf_gff = { 0, 1, 4, 5, '#', 0 };
  11. tbx_conf_t tbx_conf_bed = { TBX_UCSC, 1, 2, 3, '#', 0 };
  12. tbx_conf_t tbx_conf_psltbl = { TBX_UCSC, 15, 17, 18, '#', 0 };
  13. tbx_conf_t tbx_conf_sam = { TBX_SAM, 3, 4, 0, '@', 0 };
  14. tbx_conf_t tbx_conf_vcf = { TBX_VCF, 1, 2, 0, '#', 0 };
  15. typedef struct {
  16. int64_t beg, end;
  17. char *ss, *se;
  18. int tid;
  19. } tbx_intv_t;
  20. static inline int get_tid(tbx_t *tbx, const char *ss, int is_add)
  21. {
  22. khint_t k;
  23. khash_t(s2i) *d;
  24. if (tbx->dict == 0) tbx->dict = kh_init(s2i);
  25. d = (khash_t(s2i)*)tbx->dict;
  26. if (is_add) {
  27. int absent;
  28. k = kh_put(s2i, d, ss, &absent);
  29. if (absent) {
  30. kh_key(d, k) = strdup(ss);
  31. kh_val(d, k) = kh_size(d) - 1;
  32. }
  33. } else k = kh_get(s2i, d, ss);
  34. return k == kh_end(d)? -1 : kh_val(d, k);
  35. }
  36. int tbx_name2id(tbx_t *tbx, const char *ss)
  37. {
  38. return get_tid(tbx, ss, 0);
  39. }
  40. int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv)
  41. {
  42. int i, b = 0, id = 1, ncols = 0;
  43. char *s;
  44. intv->ss = intv->se = 0; intv->beg = intv->end = -1;
  45. for (i = 0; i <= len; ++i) {
  46. if (line[i] == '\t' || line[i] == 0) {
  47. ++ncols;
  48. if (id == conf->sc) {
  49. intv->ss = line + b; intv->se = line + i;
  50. } else if (id == conf->bc) {
  51. // here ->beg is 0-based.
  52. intv->beg = intv->end = strtol(line + b, &s, 0);
  53. if ( s==line+b ) return -1; // expected int
  54. if (!(conf->preset&TBX_UCSC)) --intv->beg;
  55. else ++intv->end;
  56. if (intv->beg < 0) intv->beg = 0;
  57. if (intv->end < 1) intv->end = 1;
  58. } else {
  59. if ((conf->preset&0xffff) == TBX_GENERIC) {
  60. if (id == conf->ec)
  61. {
  62. intv->end = strtol(line + b, &s, 0);
  63. if ( s==line+b ) return -1; // expected int
  64. }
  65. } else if ((conf->preset&0xffff) == TBX_SAM) {
  66. if (id == 6) { // CIGAR
  67. int l = 0, op;
  68. char *t;
  69. for (s = line + b; s < line + i;) {
  70. long x = strtol(s, &t, 10);
  71. op = toupper(*t);
  72. if (op == 'M' || op == 'D' || op == 'N') l += x;
  73. s = t + 1;
  74. }
  75. if (l == 0) l = 1;
  76. intv->end = intv->beg + l;
  77. }
  78. } else if ((conf->preset&0xffff) == TBX_VCF) {
  79. if (id == 4) {
  80. if (b < i) intv->end = intv->beg + (i - b);
  81. } else if (id == 8) { // look for "END="
  82. int c = line[i];
  83. line[i] = 0;
  84. s = strstr(line + b, "END=");
  85. if (s == line + b) s += 4;
  86. else if (s) {
  87. s = strstr(line + b, ";END=");
  88. if (s) s += 5;
  89. }
  90. if (s) intv->end = strtol(s, &s, 0);
  91. line[i] = c;
  92. }
  93. }
  94. }
  95. b = i + 1;
  96. ++id;
  97. }
  98. }
  99. if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
  100. return 0;
  101. }
  102. static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_add)
  103. {
  104. if (tbx_parse1(&tbx->conf, str->l, str->s, intv) == 0) {
  105. int c = *intv->se;
  106. *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
  107. return (intv->tid >= 0 && intv->beg >= 0 && intv->end >= 0)? 0 : -1;
  108. } else {
  109. char *type = NULL;
  110. switch (tbx->conf.preset&0xffff)
  111. {
  112. case TBX_SAM: type = "TBX_SAM"; break;
  113. case TBX_VCF: type = "TBX_VCF"; break;
  114. case TBX_UCSC: type = "TBX_UCSC"; break;
  115. default: type = "TBX_GENERIC"; break;
  116. }
  117. fprintf(stderr, "[E::%s] failed to parse %s, was wrong -p [type] used?\nThe offending line was: \"%s\"\n", __func__, type, str->s);
  118. return -1;
  119. }
  120. }
  121. int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end)
  122. {
  123. tbx_t *tbx = (tbx_t *) tbxv;
  124. kstring_t *s = (kstring_t *) sv;
  125. int ret;
  126. if ((ret = bgzf_getline(fp, '\n', s)) >= 0) {
  127. tbx_intv_t intv;
  128. get_intv(tbx, s, &intv, 0);
  129. *tid = intv.tid; *beg = intv.beg; *end = intv.end;
  130. }
  131. return ret;
  132. }
  133. void tbx_set_meta(tbx_t *tbx)
  134. {
  135. int i, l = 0, l_nm;
  136. uint32_t x[7];
  137. char **name;
  138. uint8_t *meta;
  139. khint_t k;
  140. khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
  141. memcpy(x, &tbx->conf, 24);
  142. name = (char**)malloc(sizeof(char*) * kh_size(d));
  143. for (k = kh_begin(d), l = 0; k != kh_end(d); ++k) {
  144. if (!kh_exist(d, k)) continue;
  145. name[kh_val(d, k)] = (char*)kh_key(d, k);
  146. l += strlen(kh_key(d, k)) + 1; // +1 to include '\0'
  147. }
  148. l_nm = x[6] = l;
  149. meta = (uint8_t*)malloc(l_nm + 28);
  150. if (ed_is_big())
  151. for (i = 0; i < 7; ++i)
  152. x[i] = ed_swap_4(x[i]);
  153. memcpy(meta, x, 28);
  154. for (l = 28, i = 0; i < (int)kh_size(d); ++i) {
  155. int x = strlen(name[i]) + 1;
  156. memcpy(meta + l, name[i], x);
  157. l += x;
  158. }
  159. free(name);
  160. hts_idx_set_meta(tbx->idx, l, meta, 0);
  161. }
  162. tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf)
  163. {
  164. tbx_t *tbx;
  165. kstring_t str;
  166. int ret, first = 0, n_lvls, fmt;
  167. int64_t lineno = 0;
  168. uint64_t last_off = 0;
  169. tbx_intv_t intv;
  170. str.s = 0; str.l = str.m = 0;
  171. tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
  172. tbx->conf = *conf;
  173. if (min_shift > 0) n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3, fmt = HTS_FMT_CSI;
  174. else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI;
  175. while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) {
  176. ++lineno;
  177. if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) {
  178. last_off = bgzf_tell(fp);
  179. continue;
  180. }
  181. if (first == 0) {
  182. tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls);
  183. first = 1;
  184. }
  185. get_intv(tbx, &str, &intv, 1);
  186. ret = hts_idx_push(tbx->idx, intv.tid, intv.beg, intv.end, bgzf_tell(fp), 1);
  187. if (ret < 0)
  188. {
  189. free(str.s);
  190. tbx_destroy(tbx);
  191. return NULL;
  192. }
  193. }
  194. if ( !tbx->idx ) tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); // empty file
  195. if ( !tbx->dict ) tbx->dict = kh_init(s2i);
  196. hts_idx_finish(tbx->idx, bgzf_tell(fp));
  197. tbx_set_meta(tbx);
  198. free(str.s);
  199. return tbx;
  200. }
  201. void tbx_destroy(tbx_t *tbx)
  202. {
  203. khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
  204. if (d != NULL)
  205. {
  206. khint_t k;
  207. for (k = kh_begin(d); k != kh_end(d); ++k)
  208. if (kh_exist(d, k)) free((char*)kh_key(d, k));
  209. }
  210. hts_idx_destroy(tbx->idx);
  211. kh_destroy(s2i, d);
  212. free(tbx);
  213. }
  214. int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf)
  215. {
  216. tbx_t *tbx;
  217. BGZF *fp;
  218. if ( bgzf_is_bgzf(fn)!=1 ) { fprintf(stderr,"Not a BGZF file: %s\n", fn); return -1; }
  219. if ((fp = bgzf_open(fn, "r")) == 0) return -1;
  220. if ( !fp->is_compressed ) { bgzf_close(fp); return -1; }
  221. tbx = tbx_index(fp, min_shift, conf);
  222. bgzf_close(fp);
  223. if ( !tbx ) return -1;
  224. hts_idx_save(tbx->idx, fn, min_shift > 0? HTS_FMT_CSI : HTS_FMT_TBI);
  225. tbx_destroy(tbx);
  226. return 0;
  227. }
  228. tbx_t *tbx_index_load(const char *fn)
  229. {
  230. tbx_t *tbx;
  231. uint8_t *meta;
  232. char *nm, *p;
  233. uint32_t x[7];
  234. int l_meta, l_nm;
  235. tbx = (tbx_t*)calloc(1, sizeof(tbx_t));
  236. tbx->idx = hts_idx_load(fn, HTS_FMT_TBI);
  237. if ( !tbx->idx )
  238. {
  239. free(tbx);
  240. return NULL;
  241. }
  242. meta = hts_idx_get_meta(tbx->idx, &l_meta);
  243. memcpy(x, meta, 28);
  244. memcpy(&tbx->conf, x, 24);
  245. p = nm = (char*)meta + 28;
  246. l_nm = x[6];
  247. for (; p - nm < l_nm; p += strlen(p) + 1) get_tid(tbx, p, 1);
  248. return tbx;
  249. }
  250. const char **tbx_seqnames(tbx_t *tbx, int *n)
  251. {
  252. khash_t(s2i) *d = (khash_t(s2i)*)tbx->dict;
  253. if (d == NULL)
  254. {
  255. *n = 0;
  256. return NULL;
  257. }
  258. int tid, m = kh_size(d);
  259. const char **names = (const char**) calloc(m,sizeof(const char*));
  260. khint_t k;
  261. for (k=kh_begin(d); k<kh_end(d); k++)
  262. {
  263. if ( !kh_exist(d,k) ) continue;
  264. tid = kh_val(d,k);
  265. assert( tid<m );
  266. names[tid] = kh_key(d,k);
  267. }
  268. // sanity check: there should be no gaps
  269. for (tid=0; tid<m; tid++)
  270. assert(names[tid]);
  271. *n = m;
  272. return names;
  273. }