vcf_sweep.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. #include "htslib/vcf_sweep.h"
  2. #include "htslib/bgzf.h"
  3. #define SW_FWD 0
  4. #define SW_BWD 1
  5. struct _bcf_sweep_t
  6. {
  7. htsFile *file;
  8. bcf_hdr_t *hdr;
  9. BGZF *fp;
  10. int direction; // to tell if the direction has changed
  11. int block_size; // the size of uncompressed data to hold in memory
  12. bcf1_t *rec; // bcf buffer
  13. int nrec, mrec; // number of used records; total size of the buffer
  14. int lrid, lpos, lnals, lals_len, mlals; // to check uniqueness of a record
  15. char *lals;
  16. uint64_t *idx; // uncompressed offsets of VCF/BCF records
  17. int iidx, nidx, midx; // i: current offset; n: used; m: allocated
  18. int idx_done; // the index is built during the first pass
  19. };
  20. BGZF *hts_get_bgzfp(htsFile *fp);
  21. int hts_useek(htsFile *file, long uoffset, int where);
  22. long hts_utell(htsFile *file);
  23. static inline int sw_rec_equal(bcf_sweep_t *sw, bcf1_t *rec)
  24. {
  25. if ( sw->lrid!=rec->rid ) return 0;
  26. if ( sw->lpos!=rec->pos ) return 0;
  27. if ( sw->lnals!=rec->n_allele ) return 0;
  28. char *t = rec->d.allele[sw->lnals-1];
  29. int len = t - rec->d.allele[0] + 1;
  30. while ( *t ) { t++; len++; }
  31. if ( sw->lals_len!=len ) return 0;
  32. if ( memcmp(sw->lals,rec->d.allele[0],len) ) return 0;
  33. return 1;
  34. }
  35. static void sw_rec_save(bcf_sweep_t *sw, bcf1_t *rec)
  36. {
  37. sw->lrid = rec->rid;
  38. sw->lpos = rec->pos;
  39. sw->lnals = rec->n_allele;
  40. char *t = rec->d.allele[sw->lnals-1];
  41. int len = t - rec->d.allele[0] + 1;
  42. while ( *t ) { t++; len++; }
  43. sw->lals_len = len;
  44. hts_expand(char, len, sw->mlals, sw->lals);
  45. memcpy(sw->lals, rec->d.allele[0], len);
  46. }
  47. static void sw_fill_buffer(bcf_sweep_t *sw)
  48. {
  49. if ( !sw->iidx ) return;
  50. sw->iidx--;
  51. int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
  52. assert( ret==0 );
  53. sw->nrec = 0;
  54. bcf1_t *rec = &sw->rec[sw->nrec];
  55. while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
  56. {
  57. bcf_unpack(rec, BCF_UN_STR);
  58. // if not in the last block, stop at the saved record
  59. if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;
  60. sw->nrec++;
  61. hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
  62. rec = &sw->rec[sw->nrec];
  63. }
  64. sw_rec_save(sw, &sw->rec[0]);
  65. }
  66. bcf_sweep_t *bcf_sweep_init(const char *fname)
  67. {
  68. bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
  69. sw->file = hts_open(fname, "r");
  70. sw->fp = hts_get_bgzfp(sw->file);
  71. bgzf_index_build_init(sw->fp);
  72. sw->hdr = bcf_hdr_read(sw->file);
  73. sw->mrec = 1;
  74. sw->rec = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
  75. sw->block_size = 1024*1024*3;
  76. sw->direction = SW_FWD;
  77. return sw;
  78. }
  79. void bcf_empty1(bcf1_t *v);
  80. void bcf_sweep_destroy(bcf_sweep_t *sw)
  81. {
  82. int i;
  83. for (i=0; i<sw->mrec; i++) bcf_empty1(&sw->rec[i]);
  84. free(sw->idx);
  85. free(sw->rec);
  86. free(sw->lals);
  87. bcf_hdr_destroy(sw->hdr);
  88. hts_close(sw->file);
  89. free(sw);
  90. }
  91. static void sw_seek(bcf_sweep_t *sw, int direction)
  92. {
  93. sw->direction = direction;
  94. if ( direction==SW_FWD )
  95. hts_useek(sw->file, sw->idx[0], 0);
  96. else
  97. {
  98. sw->iidx = sw->nidx;
  99. sw->nrec = 0;
  100. }
  101. }
  102. bcf1_t *bcf_sweep_fwd(bcf_sweep_t *sw)
  103. {
  104. if ( sw->direction==SW_BWD ) sw_seek(sw, SW_FWD);
  105. long pos = hts_utell(sw->file);
  106. bcf1_t *rec = &sw->rec[0];
  107. int ret = bcf_read1(sw->file, sw->hdr, rec);
  108. if ( ret!=0 ) // last record, get ready for sweeping backwards
  109. {
  110. sw->idx_done = 1;
  111. sw->fp->idx_build_otf = 0;
  112. sw_seek(sw, SW_BWD);
  113. return NULL;
  114. }
  115. if ( !sw->idx_done )
  116. {
  117. if ( !sw->nidx || pos - sw->idx[sw->nidx-1] > sw->block_size )
  118. {
  119. sw->nidx++;
  120. hts_expand(uint64_t, sw->nidx, sw->midx, sw->idx);
  121. sw->idx[sw->nidx-1] = pos;
  122. }
  123. }
  124. return rec;
  125. }
  126. bcf1_t *bcf_sweep_bwd(bcf_sweep_t *sw)
  127. {
  128. if ( sw->direction==SW_FWD ) sw_seek(sw, SW_BWD);
  129. if ( !sw->nrec ) sw_fill_buffer(sw);
  130. if ( !sw->nrec ) return NULL;
  131. return &sw->rec[ --sw->nrec ];
  132. }
  133. bcf_hdr_t *bcf_sweep_hdr(bcf_sweep_t *sw) { return sw->hdr; }