bam_io.cu 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. /*
  2. * nvbio
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #include "assembly_types.h"
  28. #include "bam_io.h"
  29. using namespace nvbio;
  30. /** ---- Read Functionality ---- **/
  31. HTSBAMReader::HTSBAMReader(const char *fname)
  32. {
  33. fp = sam_open(fname, "r");
  34. if (fp == NULL) {
  35. throw nvbio::runtime_error("Could not open %s", fname);
  36. }
  37. header = sam_hdr_read(fp);
  38. fp_offset = bgzf_tell(fp->fp.bgzf);
  39. if (header == NULL) {
  40. throw nvbio::runtime_error("Error parsing BAM file header");
  41. }
  42. }
  43. HTSBAMReader::~HTSBAMReader()
  44. {
  45. bam_hdr_destroy(header);
  46. sam_close(fp);
  47. }
  48. void HTSBAMReader::parse_aln_rec(BAM_alignment_batch_SoA& batch, bam1_t* b, H_VectorU32& cigar_temp) {
  49. if(batch.field_mask & BAM_POSITIONS) {
  50. batch.positions.push_back(b->core.pos);
  51. }
  52. if(batch.field_mask & BAM_REFIDS) {
  53. batch.refIDs.push_back(b->core.tid);
  54. }
  55. if(batch.field_mask & BAM_MAPQ) {
  56. batch.bin.push_back(b->core.bin);
  57. }
  58. if(batch.field_mask & BAM_MAPQ) {
  59. batch.mapq.push_back(b->core.qual);
  60. }
  61. if(batch.field_mask & BAM_FLAGS) {
  62. batch.flags.push_back(b->core.flag);
  63. }
  64. if(batch.field_mask == BAM_ALL) {
  65. batch.next_refIDs.push_back(b->core.mtid);
  66. batch.next_positions.push_back(b->core.mpos);
  67. }
  68. // data portion
  69. uint32 offset = 0;
  70. const uint64 read_name_off = batch.names.size();
  71. const uint32 read_name_len = b->core.l_qname;
  72. if(batch.field_mask & BAM_NAMES) {
  73. batch.names.resize(read_name_off + read_name_len + 1);
  74. memcpy(&batch.names[read_name_off], b->data, read_name_len);
  75. batch.names[read_name_off + read_name_len] = '\0';
  76. }
  77. offset += read_name_len;
  78. BAM_CRQ_index crq_index(batch.cigars.size(), b->core.n_cigar, batch.reads.size(), b->core.l_qseq, batch.qualities.size(), b->core.l_qseq);
  79. if(batch.field_mask & (BAM_CIGARS | BAM_QUALITIES | BAM_READS)) {
  80. batch.crq_index.push_back(crq_index);
  81. }
  82. const uint32 cigar_len = b->core.n_cigar;
  83. if(batch.field_mask & BAM_CIGARS) {
  84. cigar_temp.resize(cigar_len);
  85. if (cigar_len) {
  86. memcpy(&cigar_temp[0], b->data + offset, sizeof(uint32) * cigar_len);
  87. for(uint32 c = 0; c < cigar_len; c++) {
  88. cigar_op op;
  89. op.op = cigar_temp[c] & 0xf;
  90. op.len = cigar_temp[c] >> 4;
  91. batch.cigars.push_back(op);
  92. }
  93. }
  94. }
  95. if (cigar_len) {
  96. offset += sizeof(uint32)*cigar_len;
  97. }
  98. if(batch.field_mask & BAM_READS) {
  99. // figure out the length of the sequence data, rounded up to reach a dword boundary
  100. const uint32 padded_read_len_bp = ((b->core.l_qseq + 7) / 8) * 8;
  101. //const uint64 required_words = util::divide_ri(b->core.l_qseq, 8);
  102. // make sure we have enough memory, then read in the sequence
  103. batch.reads.resize(crq_index.read_start + padded_read_len_bp);
  104. uint64 *storage = (uint64 *)batch.reads.addrof(crq_index.read_start);
  105. memcpy(storage, b->data + offset, b->core.l_qseq / 2);
  106. }
  107. offset += b->core.l_qseq / 2;
  108. if(batch.field_mask & BAM_QUALITIES) {
  109. batch.qualities.resize(crq_index.qual_start + b->core.l_qseq);
  110. memcpy(&batch.qualities[crq_index.qual_start], b->data + offset, b->core.l_qseq);
  111. }
  112. offset += b->core.l_qseq;
  113. const uint32 aux_len = b->l_data - offset;
  114. BAM_NAUX_index idx(batch.aux_data.size(), aux_len, read_name_off);
  115. if(batch.field_mask & (BAM_AUX | BAM_NAMES)) {
  116. batch.aln_index.push_back(idx);
  117. }
  118. if(batch.field_mask & BAM_AUX) {
  119. const uint64 aux_start = batch.aux_data.size();
  120. batch.aux_data.resize(aux_start + aux_len);
  121. memcpy(&batch.aux_data[aux_start], b->data + offset, aux_len);
  122. }
  123. offset += aux_len;
  124. }
  125. // returns false if no records were read
  126. bool HTSBAMReader::read_aln_batch(BAM_alignment_batch_SoA& batch, const uint64 batch_size)
  127. {
  128. batch.reset();
  129. H_VectorU32 cigar_temp(64);
  130. bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
  131. for(uint64 aln_id = 0; aln_id < batch_size; aln_id++) {
  132. if (sam_read1(fp, header, b) < 0) break;
  133. parse_aln_rec(batch, b, cigar_temp);
  134. batch.num_alns++;
  135. if(batch.num_alns % 10000000 == 0) {
  136. printf("Loaded %llu records. \n", batch.num_alns);
  137. }
  138. }
  139. if (batch.num_alns == 0) {
  140. return false;
  141. }
  142. return true;
  143. }
  144. // loads reads overlapping the specified interval [s, e] (including the end points)
  145. // the interval is 0-based (note: the record position is 0-based)
  146. // this is a temporary hack -- should use bam index file instead
  147. // returns false if no records were read
  148. bool HTSBAMReader::read_aln_batch_intv(BAM_alignment_batch_SoA& batch, const uint32 contig, const uint64 start, const uint64 end)
  149. {
  150. long new_offset = -1;
  151. H_VectorU32 cigar_temp(64);
  152. bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
  153. while(1) {
  154. if (sam_read1(fp, header, b) < 0) break;
  155. if(b->core.tid > contig || (b->core.tid == contig && b->core.pos > end)) break; // reads one more record, should jump back
  156. if(b->core.flag & BAM_FUNMAP) continue;
  157. if(b->core.tid == contig && (bam_endpos(b) >= start)) {
  158. if(new_offset == -1) {
  159. new_offset = bgzf_tell(fp->fp.bgzf);
  160. }
  161. //printf("Read: %u %u %u\n", b->core.tid+1, b->core.pos+1, b->core.l_qseq);
  162. parse_aln_rec(batch, b, cigar_temp);
  163. batch.num_alns++;
  164. }
  165. }
  166. if(new_offset != -1) {
  167. fp_offset = new_offset;
  168. }
  169. bgzf_seek(fp->fp.bgzf, fp_offset, SEEK_SET);
  170. if (batch.num_alns == 0) {
  171. return false;
  172. }
  173. return true;
  174. }
  175. bool HTSBAMReader::read_aln_batch(std::vector<bam1_t*>& batch, const uint64 batch_size) {
  176. bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
  177. uint64 aln_id = 0;
  178. for(aln_id = 0; aln_id < batch_size; aln_id++) {
  179. if (sam_read1(fp, header, b) < 0) break;
  180. batch.push_back(b);
  181. }
  182. if (aln_id == 0) {
  183. return false;
  184. }
  185. return true;
  186. }
  187. /** ---- Write Functionality ---- **/
  188. HTSBAMWriter::HTSBAMWriter(const char *fname)
  189. {
  190. fp = sam_open(fname, "wb");
  191. if (fp == NULL) {
  192. throw nvbio::runtime_error("Could not open %s for writing", fname);
  193. }
  194. }
  195. HTSBAMWriter::~HTSBAMWriter()
  196. {
  197. if (fp) {
  198. sam_close(fp);
  199. fp = NULL;
  200. }
  201. }
  202. void HTSBAMWriter::write_hdr(bam_hdr_t* header)
  203. {
  204. sam_hdr_write(fp, header);
  205. }
  206. // assumes the batch contains all the fields
  207. void HTSBAMWriter::write_aln_batch(BAM_alignment_batch_SoA& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header)
  208. {
  209. bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
  210. b->data = (uint8_t*)malloc(ALNREC_SIZE_ALLOC);
  211. b->m_data = ALNREC_SIZE_ALLOC;
  212. for(uint64 idx = 0; idx < batch.num_alns; idx++) {
  213. uint64 i = shuffle_idx[idx];
  214. BAM_CRQ_index crq = batch.crq_index[i];
  215. BAM_NAUX_index naux = batch.aln_index[i];
  216. bam1_core_t *c = &b->core;
  217. c->tid = batch.refIDs[i];
  218. c->pos = batch.positions[i];
  219. c->bin = batch.bin[i];
  220. c->qual = batch.mapq[i];
  221. if(i < batch.num_alns-1) {
  222. c->l_qname = batch.aln_index[i+1].name_start-naux.name_start-1; // remove the trailing \0
  223. } else {
  224. c->l_qname = batch.names.size()-naux.name_start-1;
  225. }
  226. c->flag = batch.flags[i];
  227. c->n_cigar = crq.cigar_len;
  228. c->l_qseq = crq.read_len;
  229. c->mtid = batch.next_refIDs[i];
  230. c->mpos = batch.next_positions[i];
  231. c->isize = 0; //TODO
  232. b->l_data = c->l_qname*sizeof(uint8) + crq.cigar_len*sizeof(cigar_op) + crq.qual_len*sizeof(uint8)
  233. + (crq.read_len/2)*sizeof(uint8) + naux.aux_data_len*sizeof(uint8);
  234. if(b->l_data > b->m_data) {
  235. b->data = (uint8_t*)realloc(b->data, b->l_data);
  236. b->m_data = b->l_data;
  237. }
  238. //qname-cigar-seq-qual-aux
  239. memcpy(b->data, &batch.names[naux.name_start], c->l_qname);
  240. memcpy(b->data, &batch.cigars[crq.cigar_start], crq.cigar_len);
  241. memcpy(b->data, (uint64 *)batch.reads.addrof(crq.read_start), crq.read_len/2);
  242. memcpy(b->data, &batch.qualities[crq.qual_start], crq.qual_len);
  243. memcpy(b->data, &batch.aux_data[naux.aux_data_start], naux.aux_data_len);
  244. sam_write1(fp, header, b);
  245. }
  246. free(b->data);
  247. free(b);
  248. }
  249. void HTSBAMWriter::write_aln_batch(std::vector<bam1_t*>& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header) {
  250. for(uint64 i = 0; i < batch.size(); i++) {
  251. sam_write1(fp, header, batch[i]);
  252. }
  253. }