tabix.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <unistd.h>
  4. #include <string.h>
  5. #include <getopt.h>
  6. #include <sys/types.h>
  7. #include <sys/stat.h>
  8. #include <errno.h>
  9. #include "htslib/tbx.h"
  10. #include "htslib/sam.h"
  11. #include "htslib/vcf.h"
  12. #include "htslib/kseq.h"
  13. #include "htslib/bgzf.h"
  14. #include "htslib/hts.h"
  15. typedef struct
  16. {
  17. int min_shift;
  18. }
  19. args_t;
  20. static void error(const char *format, ...)
  21. {
  22. va_list ap;
  23. va_start(ap, format);
  24. vfprintf(stderr, format, ap);
  25. va_end(ap);
  26. exit(EXIT_FAILURE);
  27. }
  28. #define IS_GFF (1<<0)
  29. #define IS_BED (1<<1)
  30. #define IS_SAM (1<<2)
  31. #define IS_VCF (1<<3)
  32. #define IS_BCF (1<<4)
  33. #define IS_BAM (1<<5)
  34. #define IS_TXT (IS_GFF|IS_BED|IS_SAM|IS_VCF)
  35. int file_type(const char *fname)
  36. {
  37. int l = strlen(fname);
  38. int strcasecmp(const char *s1, const char *s2);
  39. if (l>=7 && strcasecmp(fname+l-7, ".gff.gz") == 0) return IS_GFF;
  40. else if (l>=7 && strcasecmp(fname+l-7, ".bed.gz") == 0) return IS_BED;
  41. else if (l>=7 && strcasecmp(fname+l-7, ".sam.gz") == 0) return IS_SAM;
  42. else if (l>=7 && strcasecmp(fname+l-7, ".vcf.gz") == 0) return IS_VCF;
  43. else if (l>=4 && strcasecmp(fname+l-4, ".bcf") == 0) return IS_BCF;
  44. else if (l>=4 && strcasecmp(fname+l-4, ".bam") == 0) return IS_BAM;
  45. return 0;
  46. }
  47. #define PRINT_HEADER 1
  48. #define HEADER_ONLY 2
  49. static int query_regions(char **argv, int argc, int mode)
  50. {
  51. char *fname = argv[0];
  52. int i, ftype = file_type(fname);
  53. if ( ftype & IS_TXT || !ftype )
  54. {
  55. htsFile *fp = hts_open(fname,"r");
  56. if ( !fp ) error("Could not read %s\n", fname);
  57. tbx_t *tbx = tbx_index_load(fname);
  58. if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
  59. kstring_t str = {0,0,0};
  60. if ( mode )
  61. {
  62. while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
  63. {
  64. if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
  65. puts(str.s);
  66. }
  67. }
  68. if ( mode!=HEADER_ONLY )
  69. {
  70. for (i=1; i<argc; i++)
  71. {
  72. hts_itr_t *itr = tbx_itr_querys(tbx, argv[i]);
  73. if ( !itr ) continue;
  74. while (tbx_itr_next(fp, tbx, itr, &str) >= 0) puts(str.s);
  75. tbx_itr_destroy(itr);
  76. }
  77. }
  78. free(str.s);
  79. if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
  80. tbx_destroy(tbx);
  81. }
  82. else if ( ftype==IS_BCF ) // output uncompressed VCF
  83. {
  84. htsFile *fp = hts_open(fname,"r");
  85. if ( !fp ) error("Could not read %s\n", fname);
  86. htsFile *out = hts_open("-","w");
  87. if ( !out ) error("Could not open stdout\n", fname);
  88. hts_idx_t *idx = bcf_index_load(fname);
  89. if ( !idx ) error("Could not load .csi index of %s\n", fname);
  90. bcf_hdr_t *hdr = bcf_hdr_read(fp);
  91. if ( !hdr ) error("Could not read the header: %s\n", fname);
  92. if ( mode )
  93. {
  94. bcf_hdr_write(out,hdr);
  95. }
  96. if ( mode!=HEADER_ONLY )
  97. {
  98. bcf1_t *rec = bcf_init();
  99. for (i=1; i<argc; i++)
  100. {
  101. hts_itr_t *itr = bcf_itr_querys(idx,hdr,argv[i]);
  102. if ( !itr ) continue;
  103. while ( bcf_itr_next(fp, itr, rec) >=0 ) bcf_write(out,hdr,rec);
  104. tbx_itr_destroy(itr);
  105. }
  106. bcf_destroy(rec);
  107. }
  108. if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);
  109. if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
  110. bcf_hdr_destroy(hdr);
  111. hts_idx_destroy(idx);
  112. }
  113. else if ( ftype==IS_BAM ) // todo: BAM
  114. error("Please use \"samtools view\" for querying BAM files.\n");
  115. return 0;
  116. }
  117. static int query_chroms(char *fname)
  118. {
  119. const char **seq;
  120. int i, nseq, ftype = file_type(fname);
  121. if ( ftype & IS_TXT || !ftype )
  122. {
  123. tbx_t *tbx = tbx_index_load(fname);
  124. if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
  125. seq = tbx_seqnames(tbx, &nseq);
  126. for (i=0; i<nseq; i++)
  127. printf("%s\n", seq[i]);
  128. free(seq);
  129. tbx_destroy(tbx);
  130. }
  131. else if ( ftype==IS_BCF )
  132. {
  133. htsFile *fp = hts_open(fname,"r");
  134. if ( !fp ) error("Could not read %s\n", fname);
  135. bcf_hdr_t *hdr = bcf_hdr_read(fp);
  136. if ( !hdr ) error("Could not read the header: %s\n", fname);
  137. hts_close(fp);
  138. hts_idx_t *idx = bcf_index_load(fname);
  139. if ( !idx ) error("Could not load .csi index of %s\n", fname);
  140. seq = bcf_index_seqnames(idx, hdr, &nseq);
  141. for (i=0; i<nseq; i++)
  142. printf("%s\n", seq[i]);
  143. free(seq);
  144. bcf_hdr_destroy(hdr);
  145. hts_idx_destroy(idx);
  146. }
  147. else if ( ftype==IS_BAM ) // todo: BAM
  148. error("BAM: todo\n");
  149. return 0;
  150. }
  151. int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
  152. {
  153. if ( ftype & IS_TXT || !ftype )
  154. {
  155. BGZF *fp = bgzf_open(fname,"r");
  156. if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) return -1;
  157. char *buffer = fp->uncompressed_block;
  158. int skip_until = 0;
  159. // Skip the header: find out the position of the data block
  160. if ( buffer[0]==conf->meta_char )
  161. {
  162. skip_until = 1;
  163. while (1)
  164. {
  165. if ( buffer[skip_until]=='\n' )
  166. {
  167. skip_until++;
  168. if ( skip_until>=fp->block_length )
  169. {
  170. if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
  171. skip_until = 0;
  172. }
  173. // The header has finished
  174. if ( buffer[skip_until]!=conf->meta_char ) break;
  175. }
  176. skip_until++;
  177. if ( skip_until>=fp->block_length )
  178. {
  179. if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
  180. skip_until = 0;
  181. }
  182. }
  183. }
  184. // Output the new header
  185. FILE *hdr = fopen(header,"r");
  186. if ( !hdr ) error("%s: %s", header,strerror(errno));
  187. int page_size = getpagesize();
  188. char *buf = valloc(page_size);
  189. BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
  190. ssize_t nread;
  191. while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
  192. {
  193. if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
  194. if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
  195. }
  196. if ( fclose(hdr) ) error("close failed: %s\n", header);
  197. // Output all remainig data read with the header block
  198. if ( fp->block_length - skip_until > 0 )
  199. {
  200. if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",fp->errcode);
  201. }
  202. if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
  203. while (1)
  204. {
  205. nread = bgzf_raw_read(fp, buf, page_size);
  206. if ( nread<=0 ) break;
  207. int count = bgzf_raw_write(bgzf_out, buf, nread);
  208. if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
  209. }
  210. if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
  211. if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
  212. }
  213. else
  214. error("todo: reheader BCF, BAM\n"); // BCF is difficult, records contain pointers to the header.
  215. return 0;
  216. }
  217. static int usage(void)
  218. {
  219. fprintf(stderr, "\n");
  220. fprintf(stderr, "Version: %s\n", hts_version());
  221. fprintf(stderr, "Usage: tabix [OPTIONS] [FILE] [REGION [...]]\n");
  222. fprintf(stderr, "Options:\n");
  223. fprintf(stderr, " -0, --zero-based coordinates are zero-based\n");
  224. fprintf(stderr, " -b, --begin INT column number for region start [4]\n");
  225. fprintf(stderr, " -c, --comment CHAR skip comment lines starting with CHAR [null]\n");
  226. fprintf(stderr, " -e, --end INT column number for region end (if no end, set INT to -b) [5]\n");
  227. fprintf(stderr, " -f, --force overwrite existing index without asking\n");
  228. fprintf(stderr, " -h, --print-header print also the header lines\n");
  229. fprintf(stderr, " -H, --only-header print only the header lines\n");
  230. fprintf(stderr, " -l, --list-chroms list chromosome names\n");
  231. fprintf(stderr, " -m, --min-shift INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n");
  232. fprintf(stderr, " -p, --preset STR gff, bed, sam, vcf, bcf, bam\n");
  233. fprintf(stderr, " -r, --reheader FILE replace the header with the content of FILE\n");
  234. fprintf(stderr, " -s, --sequence INT column number for sequence names (suppressed by -p) [1]\n");
  235. fprintf(stderr, " -S, --skip-lines INT skip first INT lines [0]\n");
  236. fprintf(stderr, "\n");
  237. return 1;
  238. }
  239. int main(int argc, char *argv[])
  240. {
  241. int c, min_shift = -1, is_force = 0, list_chroms = 0, mode = 0;
  242. tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
  243. char *reheader = NULL;
  244. static struct option loptions[] =
  245. {
  246. {"help",0,0,'h'},
  247. {"zero-based",0,0,'0'},
  248. {"print-header",0,0,'h'},
  249. {"only-header",0,0,'H'},
  250. {"begin",1,0,'b'},
  251. {"comment",1,0,'c'},
  252. {"end",1,0,'e'},
  253. {"force",0,0,'f'},
  254. {"preset",1,0,'p'},
  255. {"sequence",1,0,'s'},
  256. {"skip-lines",1,0,'S'},
  257. {"list-chroms",0,0,'l'},
  258. {"reheader",1,0,'r'},
  259. {0,0,0,0}
  260. };
  261. while ((c = getopt_long(argc, argv, "hH?0b:c:e:fm:p:s:S:lr:", loptions,NULL)) >= 0)
  262. {
  263. switch (c)
  264. {
  265. case 'r': reheader = optarg; break;
  266. case 'h': mode = PRINT_HEADER; break;
  267. case 'H': mode = HEADER_ONLY; break;
  268. case 'l': list_chroms = 1; break;
  269. case '0': conf.preset |= TBX_UCSC; break;
  270. case 'b': conf.bc = atoi(optarg); break;
  271. case 'e': conf.ec = atoi(optarg); break;
  272. case 'c': conf.meta_char = *optarg; break;
  273. case 'f': is_force = 1; break;
  274. case 'm': min_shift = atoi(optarg); break;
  275. case 'p':
  276. if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
  277. else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
  278. else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
  279. else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
  280. else error("The preset string not recognised: '%s'\n", optarg);
  281. break;
  282. case 's': conf.sc = atoi(optarg); break;
  283. case 'S': conf.line_skip = atoi(optarg); break;
  284. default: return usage();
  285. }
  286. }
  287. if ( optind==argc ) return usage();
  288. if ( list_chroms )
  289. return query_chroms(argv[optind]);
  290. if ( argc > optind+1 || mode==HEADER_ONLY )
  291. return query_regions(&argv[optind], argc-optind, mode);
  292. char *fname = argv[optind];
  293. int ftype = file_type(fname);
  294. if ( !conf_ptr ) // no preset given
  295. {
  296. if ( ftype==IS_GFF ) conf_ptr = &tbx_conf_gff;
  297. else if ( ftype==IS_BED ) conf_ptr = &tbx_conf_bed;
  298. else if ( ftype==IS_SAM ) conf_ptr = &tbx_conf_sam;
  299. else if ( ftype==IS_VCF ) conf_ptr = &tbx_conf_vcf;
  300. else if ( ftype==IS_BCF )
  301. {
  302. if ( min_shift <= 0 ) min_shift = 14;
  303. }
  304. else if ( ftype==IS_BAM )
  305. {
  306. if ( min_shift <= 0 ) min_shift = 14;
  307. }
  308. }
  309. if ( reheader )
  310. return reheader_file(fname, reheader, ftype, conf_ptr);
  311. if ( conf_ptr )
  312. conf = *conf_ptr;
  313. char *suffix = min_shift <= 0 ? ".tbi" : (ftype==IS_BAM ? ".bai" : ".csi");
  314. char *idx_fname = calloc(strlen(fname) + 5, 1);
  315. strcat(strcpy(idx_fname, fname), suffix);
  316. struct stat stat_tbi, stat_file;
  317. if ( !is_force && stat(idx_fname, &stat_tbi)==0 )
  318. {
  319. // Before complaining about existing index, check if the VCF file isn't
  320. // newer. This is a common source of errors, people tend not to notice
  321. // that tabix failed
  322. stat(fname, &stat_file);
  323. if ( stat_file.st_mtime <= stat_tbi.st_mtime )
  324. error("[tabix] the index file exists. Please use '-f' to overwrite.\n");
  325. }
  326. free(idx_fname);
  327. if ( min_shift > 0 ) // CSI index
  328. {
  329. if ( ftype==IS_BCF )
  330. {
  331. if ( bcf_index_build(fname, min_shift)!=0 ) error("bcf_index_build failed: %s\n", fname);
  332. return 0;
  333. }
  334. if ( ftype==IS_BAM )
  335. {
  336. if ( bam_index_build(fname, min_shift)!=0 ) error("bam_index_build failed: %s\n", fname);
  337. return 0;
  338. }
  339. if ( tbx_index_build(fname, min_shift, &conf)!=0 ) error("tbx_index_build failed: %s\n", fname);
  340. return 0;
  341. }
  342. else
  343. {
  344. if ( tbx_index_build(fname, min_shift, &conf) ) error("tbx_index_build failed: %s\n", fname);
  345. return 0;
  346. }
  347. return 0;
  348. }