faidx.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. #include "config.h"
  2. #include <ctype.h>
  3. #include <string.h>
  4. #include <stdlib.h>
  5. #include <stdio.h>
  6. #include <stdint.h>
  7. #include "htslib/bgzf.h"
  8. #include "htslib/faidx.h"
  9. #include "htslib/khash.h"
  10. #ifdef _USE_KNETFILE
  11. #include "htslib/knetfile.h"
  12. #endif
  13. typedef struct {
  14. int32_t line_len, line_blen;
  15. int64_t len;
  16. uint64_t offset;
  17. } faidx1_t;
  18. KHASH_MAP_INIT_STR(s, faidx1_t)
  19. struct __faidx_t {
  20. BGZF *bgzf;
  21. int n, m;
  22. char **name;
  23. khash_t(s) *hash;
  24. };
  25. #ifndef kroundup32
  26. #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
  27. #endif
  28. static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
  29. {
  30. khint_t k;
  31. int ret;
  32. faidx1_t t;
  33. if (idx->n == idx->m) {
  34. idx->m = idx->m? idx->m<<1 : 16;
  35. idx->name = (char**)realloc(idx->name, sizeof(char*) * idx->m);
  36. }
  37. idx->name[idx->n] = strdup(name);
  38. k = kh_put(s, idx->hash, idx->name[idx->n], &ret);
  39. t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset;
  40. kh_value(idx->hash, k) = t;
  41. ++idx->n;
  42. }
  43. faidx_t *fai_build_core(BGZF *bgzf)
  44. {
  45. char c, *name;
  46. int l_name, m_name;
  47. int line_len, line_blen, state;
  48. int l1, l2;
  49. faidx_t *idx;
  50. uint64_t offset;
  51. int64_t len;
  52. idx = (faidx_t*)calloc(1, sizeof(faidx_t));
  53. idx->hash = kh_init(s);
  54. name = 0; l_name = m_name = 0;
  55. len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
  56. while ( (c=bgzf_getc(bgzf))>=0 ) {
  57. if (c == '\n') { // an empty line
  58. if (state == 1) {
  59. offset = bgzf_utell(bgzf);
  60. continue;
  61. } else if ((state == 0 && len < 0) || state == 2) continue;
  62. }
  63. if (c == '>') { // fasta header
  64. if (len >= 0)
  65. fai_insert_index(idx, name, len, line_len, line_blen, offset);
  66. l_name = 0;
  67. while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
  68. if (m_name < l_name + 2) {
  69. m_name = l_name + 2;
  70. kroundup32(m_name);
  71. name = (char*)realloc(name, m_name);
  72. }
  73. name[l_name++] = c;
  74. }
  75. name[l_name] = '\0';
  76. if ( c<0 ) {
  77. fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
  78. free(name); fai_destroy(idx);
  79. return 0;
  80. }
  81. if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
  82. state = 1; len = 0;
  83. offset = bgzf_utell(bgzf);
  84. } else {
  85. if (state == 3) {
  86. fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
  87. free(name); fai_destroy(idx);
  88. return 0;
  89. }
  90. if (state == 2) state = 3;
  91. l1 = l2 = 0;
  92. do {
  93. ++l1;
  94. if (isgraph(c)) ++l2;
  95. } while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
  96. if (state == 3 && l2) {
  97. fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
  98. free(name); fai_destroy(idx);
  99. return 0;
  100. }
  101. ++l1; len += l2;
  102. if (state == 1) line_len = l1, line_blen = l2, state = 0;
  103. else if (state == 0) {
  104. if (l1 != line_len || l2 != line_blen) state = 2;
  105. }
  106. }
  107. }
  108. fai_insert_index(idx, name, len, line_len, line_blen, offset);
  109. free(name);
  110. return idx;
  111. }
  112. void fai_save(const faidx_t *fai, FILE *fp)
  113. {
  114. khint_t k;
  115. int i;
  116. for (i = 0; i < fai->n; ++i) {
  117. faidx1_t x;
  118. k = kh_get(s, fai->hash, fai->name[i]);
  119. x = kh_value(fai->hash, k);
  120. #ifdef _WIN32
  121. fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len);
  122. #else
  123. fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len);
  124. #endif
  125. }
  126. }
  127. faidx_t *fai_read(FILE *fp)
  128. {
  129. faidx_t *fai;
  130. char *buf, *p;
  131. int len, line_len, line_blen;
  132. #ifdef _WIN32
  133. long offset;
  134. #else
  135. long long offset;
  136. #endif
  137. fai = (faidx_t*)calloc(1, sizeof(faidx_t));
  138. fai->hash = kh_init(s);
  139. buf = (char*)calloc(0x10000, 1);
  140. while (!feof(fp) && fgets(buf, 0x10000, fp)) {
  141. for (p = buf; *p && isgraph(*p); ++p);
  142. *p = 0; ++p;
  143. #ifdef _WIN32
  144. sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len);
  145. #else
  146. sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len);
  147. #endif
  148. fai_insert_index(fai, buf, len, line_len, line_blen, offset);
  149. }
  150. free(buf);
  151. return fai;
  152. }
  153. void fai_destroy(faidx_t *fai)
  154. {
  155. int i;
  156. for (i = 0; i < fai->n; ++i) free(fai->name[i]);
  157. free(fai->name);
  158. kh_destroy(s, fai->hash);
  159. if (fai->bgzf) bgzf_close(fai->bgzf);
  160. free(fai);
  161. }
  162. int fai_build(const char *fn)
  163. {
  164. char *str;
  165. BGZF *bgzf;
  166. FILE *fp;
  167. faidx_t *fai;
  168. str = (char*)calloc(strlen(fn) + 5, 1);
  169. sprintf(str, "%s.fai", fn);
  170. bgzf = bgzf_open(fn, "r");
  171. if ( !bgzf ) {
  172. fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
  173. free(str);
  174. return -1;
  175. }
  176. if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);
  177. fai = fai_build_core(bgzf);
  178. if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
  179. bgzf_close(bgzf);
  180. fp = fopen(str, "wb");
  181. if ( !fp ) {
  182. fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
  183. fai_destroy(fai); free(str);
  184. return -1;
  185. }
  186. fai_save(fai, fp);
  187. fclose(fp);
  188. free(str);
  189. fai_destroy(fai);
  190. return 0;
  191. }
  192. #ifdef _USE_KNETFILE
  193. FILE *download_and_open(const char *fn)
  194. {
  195. const int buf_size = 1 * 1024 * 1024;
  196. uint8_t *buf;
  197. FILE *fp;
  198. knetFile *fp_remote;
  199. const char *url = fn;
  200. const char *p;
  201. int l = strlen(fn);
  202. for (p = fn + l - 1; p >= fn; --p)
  203. if (*p == '/') break;
  204. fn = p + 1;
  205. // First try to open a local copy
  206. fp = fopen(fn, "r");
  207. if (fp)
  208. return fp;
  209. // If failed, download from remote and open
  210. fp_remote = knet_open(url, "rb");
  211. if (fp_remote == 0) {
  212. fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url);
  213. return NULL;
  214. }
  215. if ((fp = fopen(fn, "wb")) == 0) {
  216. fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn);
  217. knet_close(fp_remote);
  218. return NULL;
  219. }
  220. buf = (uint8_t*)calloc(buf_size, 1);
  221. while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
  222. fwrite(buf, 1, l, fp);
  223. free(buf);
  224. fclose(fp);
  225. knet_close(fp_remote);
  226. return fopen(fn, "r");
  227. }
  228. #endif
  229. faidx_t *fai_load(const char *fn)
  230. {
  231. char *str;
  232. FILE *fp;
  233. faidx_t *fai;
  234. str = (char*)calloc(strlen(fn) + 5, 1);
  235. sprintf(str, "%s.fai", fn);
  236. #ifdef _USE_KNETFILE
  237. if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)
  238. {
  239. fp = download_and_open(str);
  240. if ( !fp )
  241. {
  242. fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str);
  243. free(str);
  244. return 0;
  245. }
  246. }
  247. else
  248. #endif
  249. fp = fopen(str, "rb");
  250. if (fp == 0) {
  251. fprintf(stderr, "[fai_load] build FASTA index.\n");
  252. fai_build(fn);
  253. fp = fopen(str, "rb");
  254. if (fp == 0) {
  255. fprintf(stderr, "[fai_load] fail to open FASTA index.\n");
  256. free(str);
  257. return 0;
  258. }
  259. }
  260. fai = fai_read(fp);
  261. fclose(fp);
  262. fai->bgzf = bgzf_open(fn, "rb");
  263. free(str);
  264. if (fai->bgzf == 0) {
  265. fprintf(stderr, "[fai_load] fail to open FASTA file.\n");
  266. return 0;
  267. }
  268. if ( fai->bgzf->is_compressed==1 )
  269. {
  270. if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 )
  271. {
  272. fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn);
  273. fai_destroy(fai);
  274. return NULL;
  275. }
  276. }
  277. return fai;
  278. }
  279. char *fai_fetch(const faidx_t *fai, const char *str, int *len)
  280. {
  281. char *s, c;
  282. int i, l, k, name_end;
  283. khiter_t iter;
  284. faidx1_t val;
  285. khash_t(s) *h;
  286. int beg, end;
  287. beg = end = -1;
  288. h = fai->hash;
  289. name_end = l = strlen(str);
  290. s = (char*)malloc(l+1);
  291. // remove space
  292. for (i = k = 0; i < l; ++i)
  293. if (!isspace(str[i])) s[k++] = str[i];
  294. s[k] = 0; l = k;
  295. // determine the sequence name
  296. for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
  297. if (i >= 0) name_end = i;
  298. if (name_end < l) { // check if this is really the end
  299. int n_hyphen = 0;
  300. for (i = name_end + 1; i < l; ++i) {
  301. if (s[i] == '-') ++n_hyphen;
  302. else if (!isdigit(s[i]) && s[i] != ',') break;
  303. }
  304. if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
  305. s[name_end] = 0;
  306. iter = kh_get(s, h, s);
  307. if (iter == kh_end(h)) { // cannot find the sequence name
  308. iter = kh_get(s, h, str); // try str as the name
  309. if (iter == kh_end(h)) {
  310. *len = 0;
  311. free(s); return 0;
  312. } else s[name_end] = ':', name_end = l;
  313. }
  314. } else iter = kh_get(s, h, str);
  315. if(iter == kh_end(h)) {
  316. fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str);
  317. free(s);
  318. *len = -2;
  319. return 0;
  320. };
  321. val = kh_value(h, iter);
  322. // parse the interval
  323. if (name_end < l) {
  324. for (i = k = name_end + 1; i < l; ++i)
  325. if (s[i] != ',') s[k++] = s[i];
  326. s[k] = 0;
  327. beg = atoi(s + name_end + 1);
  328. for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
  329. end = i < k? atoi(s + i + 1) : val.len;
  330. if (beg > 0) --beg;
  331. } else beg = 0, end = val.len;
  332. if (beg >= val.len) beg = val.len;
  333. if (end >= val.len) end = val.len;
  334. if (beg > end) beg = end;
  335. free(s);
  336. // now retrieve the sequence
  337. int ret = bgzf_useek(fai->bgzf, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
  338. if ( ret<0 )
  339. {
  340. *len = -1;
  341. fprintf(stderr, "[fai_fetch] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
  342. return NULL;
  343. }
  344. l = 0;
  345. s = (char*)malloc(end - beg + 2);
  346. while ( (c=bgzf_getc(fai->bgzf))>=0 && l < end - beg )
  347. if (isgraph(c)) s[l++] = c;
  348. s[l] = '\0';
  349. *len = l;
  350. return s;
  351. }
  352. int faidx_fetch_nseq(const faidx_t *fai)
  353. {
  354. return fai->n;
  355. }
  356. char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len)
  357. {
  358. int l;
  359. char c;
  360. khiter_t iter;
  361. faidx1_t val;
  362. char *seq=NULL;
  363. // Adjust position
  364. iter = kh_get(s, fai->hash, c_name);
  365. if (iter == kh_end(fai->hash))
  366. {
  367. *len = -2;
  368. fprintf(stderr, "[fai_fetch_seq] The sequence \"%s\" not found\n", c_name);
  369. return NULL;
  370. }
  371. val = kh_value(fai->hash, iter);
  372. if(p_end_i < p_beg_i) p_beg_i = p_end_i;
  373. if(p_beg_i < 0) p_beg_i = 0;
  374. else if(val.len <= p_beg_i) p_beg_i = val.len - 1;
  375. if(p_end_i < 0) p_end_i = 0;
  376. else if(val.len <= p_end_i) p_end_i = val.len - 1;
  377. // Now retrieve the sequence
  378. int ret = bgzf_useek(fai->bgzf, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET);
  379. if ( ret<0 )
  380. {
  381. *len = -1;
  382. fprintf(stderr, "[fai_fetch_seq] Error: fai_fetch failed. (Seeking in a compressed, .gzi unindexed, file?)\n");
  383. return NULL;
  384. }
  385. l = 0;
  386. seq = (char*)malloc(p_end_i - p_beg_i + 2);
  387. while ( (c=bgzf_getc(fai->bgzf))>=0 && l < p_end_i - p_beg_i + 1)
  388. if (isgraph(c)) seq[l++] = c;
  389. seq[l] = '\0';
  390. *len = l;
  391. return seq;
  392. }