// bam_io.h — BAM format structures and htslib-based reader/writer declarations
/*
 * nvbio
 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the NVIDIA CORPORATION nor the
 *      names of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
  27. #pragma once
  28. #include <htslib/sam.h>
  29. #include <htslib/hts.h>
  30. #include <htslib/bgzf.h>
  31. #include "assembly_types.h"
  32. using namespace nvbio;
/**------------- BAM Format -------------**/
// BAM header
// In-memory copy of a BAM file header: magic bytes, the plain-text SAM
// header, and the reference-sequence dictionary (names + lengths).
struct BAM_header
{
    uint8 magic[4];                    // BAM magic string
    int32 l_text;                      // length of the header text
    std::string text;                  // the header text itself (SAM-format plain text)
    int32 n_ref;                       // number of reference sequences
    std::vector<std::string> sq_names; // reference sequence names (one per reference)
    H_VectorU32 sq_lengths;            // reference sequence lengths (parallel to sq_names)
};
  44. // BAM alignment record header
  45. struct BAM_alignment_header
  46. {
  47. int32 block_size; // length of the remainder of the alignment record
  48. int32 refID; // reference sequence ID, -1 <= refID < n_ref (-1 for a read without a mapping position)
  49. int32 pos; // 0-based leftmost coordinate
  50. uint32 bin_mq_nl; // bin << 16 | MAPQ << 8 | l_read_name
  51. uint32 flag_nc; // FLAG << 16 | n_cigar_op
  52. int32 l_seq; // length of the sequence
  53. int32 next_refID; // refID of the next segment (-1 <= next_refID < n_ref)
  54. int32 next_pos; // 0-based leftmost pos of the next segment
  55. int32 tlen; // template length
  56. uint32 bin(void) const { return bin_mq_nl >> 16; }
  57. uint32 mapq(void) const { return (bin_mq_nl & 0xff00) >> 8; }
  58. uint32 l_read_name(void) const { return bin_mq_nl & 0xff; }
  59. uint32 flags(void) const { return flag_nc >> 16; }
  60. uint32 num_cigar_ops(void) const { return flag_nc & 0xffff; }
  61. };
// BAM alignment record
// consists of the alignment header followed by a variable-length
// data block; the auxiliary data is of block_size - ftell() bytes
struct BAM_alignment_record
{
    BAM_alignment_header header; // alignment info (fixed-size record prefix)
    H_VectorU8 data;             // variable-length payload: qname + cigar + seq + qual + aux
};
// contiguous batch of raw alignment records
// Records are stored back-to-back in a single byte array; record i occupies
// bytes [offsets[i], offsets[i+1]) — hence offsets has num_recs + 1 entries.
struct BAM_alignment_batch_raw
{
    H_VectorU8 recs;     // raw byte array of alignment records
    H_VectorU64 offsets; // offsets into alignment records (num_recs + 1)
};
  76. typedef enum
  77. {
  78. BAM_NAMES = 1,
  79. BAM_CIGARS = 2,
  80. BAM_READS = 4,
  81. BAM_QUALITIES = 8,
  82. BAM_FLAGS = 16,
  83. BAM_POSITIONS = 32,
  84. BAM_REFIDS = 64,
  85. BAM_MAPQ = 128,
  86. BAM_AUX = 256,
  87. BAM_BIN = 512,
  88. BAM_ALL = 0xFFFF
  89. } BAM_field_masks;
  90. typedef enum
  91. {
  92. BAM_FLAGS_PAIRED = 1,
  93. BAM_FLAGS_PROPER_PAIR = 2,
  94. BAM_FLAGS_UNMAPPED = 4,
  95. BAM_FLAGS_MATE_UNMAPPED = 8,
  96. BAM_FLAGS_REVERSE = 16,
  97. BAM_FLAGS_MATE_REVERSE = 32,
  98. BAM_FLAGS_READ_1 = 64,
  99. BAM_FLAGS_READ_2 = 128,
  100. BAM_FLAGS_SECONDARY = 256,
  101. BAM_FLAGS_QC_FAILED = 512,
  102. BAM_FLAGS_DUPLICATE = 1024
  103. } BAM_alignment_flags;
// ----- SoA Functionality ------
// CRQ: cigars, reads, qualities
// Per-record index into the SoA cigar/read/quality streams: each record's
// data is the half-open slice [*_start, *_start + *_len) of the shared vector.
struct BAM_CRQ_index
{
    uint64 cigar_start, cigar_len; // slice of the cigars vector
    uint64 read_start, read_len;   // slice of the reads vector (lengths in bases)
    uint64 qual_start, qual_len;   // slice of the qualities vector

    // default: empty slices
    BAM_CRQ_index(): cigar_start(0), cigar_len(0), read_start(0), read_len(0), qual_start(0), qual_len(0) { }
    BAM_CRQ_index(uint64 cigar_start, uint64 cigar_len, uint64 read_start, uint64 read_len, uint64 qual_start, uint64 qual_len)
        : cigar_start(cigar_start), cigar_len(cigar_len), read_start(read_start), read_len(read_len),
          qual_start(qual_start), qual_len(qual_len) { }
};
// Per-record index into the SoA name/aux streams. Names carry no explicit
// length: a record's name length is recovered from the next record's
// name_start (see BAM_alignment_batch_SoA::shuffle).
struct BAM_NAUX_index
{
    uint64 aux_data_start; // offset of this record's aux data in aux_data
    uint32 aux_data_len;   // length of the aux data in bytes
    uint64 name_start;     // offset of this record's name in names

    // default: empty slices
    BAM_NAUX_index() : aux_data_start(0), aux_data_len(0), name_start(0) { }
    BAM_NAUX_index(uint64 aux_data_start, uint32 aux_data_len, uint64 name_start)
        : aux_data_start(aux_data_start), aux_data_len(aux_data_len), name_start(name_start) { }
};
// number of records to reserve capacity for per host batch
#define H_BATCH_SIZE_ALLOC 10000000U
// per-record allocation size (bytes) for raw alignment records
#define ALNREC_SIZE_ALLOC 512
typedef nvbio::vector<device_tag, BAM_CRQ_index> D_VectorCRQIndex; // device-side CRQ index vector
typedef nvbio::vector<host_tag, BAM_CRQ_index> H_VectorCRQIndex;   // host-side CRQ index vector
  129. // batch of alignment records as SoA
  130. // host-only now
  131. struct BAM_alignment_batch_SoA
  132. {
  133. BAM_alignment_batch_SoA(): field_mask(BAM_ALL), num_alns(0) { reserve(); }
  134. BAM_alignment_batch_SoA(uint32 fields): field_mask(fields), num_alns(0) { reserve(); }
  135. uint32 field_mask; // record fields stored
  136. uint64 num_alns;
  137. H_VectorCigarOp cigars;
  138. H_VectorDNA16 reads;
  139. H_VectorU16 bin;
  140. H_VectorU8 qualities;
  141. H_VectorU16 flags;
  142. H_VectorU32 positions;
  143. H_VectorU32 refIDs;
  144. H_VectorU8 mapq;
  145. H_VectorU8 aux_data;
  146. H_VectorU8 names;
  147. H_VectorU32 next_positions;
  148. H_VectorU32 next_refIDs;
  149. nvbio::vector<host_tag, BAM_CRQ_index> crq_index; // CRQ: cigars, reads, qualities
  150. nvbio::vector<host_tag, BAM_NAUX_index> aln_index; // names, aux
  151. void reserve()
  152. {
  153. if(field_mask & BAM_POSITIONS) positions.reserve(H_BATCH_SIZE_ALLOC);
  154. if(field_mask & BAM_REFIDS) refIDs.reserve(H_BATCH_SIZE_ALLOC);
  155. if(field_mask & BAM_BIN) bin.reserve(H_BATCH_SIZE_ALLOC);
  156. if(field_mask & BAM_MAPQ) mapq.reserve(H_BATCH_SIZE_ALLOC);
  157. if(field_mask & BAM_FLAGS) flags.reserve(H_BATCH_SIZE_ALLOC);
  158. if(field_mask & (BAM_CIGARS | BAM_QUALITIES | BAM_READS)) crq_index.reserve(H_BATCH_SIZE_ALLOC);
  159. if(field_mask & BAM_CIGARS) cigars.reserve(H_BATCH_SIZE_ALLOC * 32);
  160. if(field_mask & BAM_READS) reads.reserve(H_BATCH_SIZE_ALLOC * 125);
  161. if(field_mask & BAM_QUALITIES) qualities.reserve(H_BATCH_SIZE_ALLOC * 125);
  162. if(field_mask & (BAM_AUX | BAM_NAMES)) aln_index.reserve(H_BATCH_SIZE_ALLOC);
  163. if(field_mask & BAM_AUX) aux_data.reserve(H_BATCH_SIZE_ALLOC * 64);
  164. if(field_mask & BAM_NAMES) names.reserve(H_BATCH_SIZE_ALLOC * 256);
  165. if(field_mask == BAM_ALL) {
  166. next_positions.reserve(H_BATCH_SIZE_ALLOC);
  167. next_refIDs.reserve(H_BATCH_SIZE_ALLOC);
  168. }
  169. }
  170. void allocate(const BAM_alignment_batch_SoA& size_batch)
  171. {
  172. if(field_mask & BAM_POSITIONS) positions.resize(size_batch.positions.size());
  173. if(field_mask & BAM_REFIDS) refIDs.resize(size_batch.refIDs.size());
  174. if(field_mask & BAM_BIN) bin.resize(size_batch.bin.size());
  175. if(field_mask & BAM_MAPQ) mapq.resize(size_batch.mapq.size());
  176. if(field_mask & BAM_FLAGS) flags.resize(size_batch.flags.size());
  177. if(field_mask & (BAM_CIGARS | BAM_QUALITIES | BAM_READS)) crq_index.resize(size_batch.crq_index.size());
  178. if(field_mask & BAM_CIGARS) cigars.resize(size_batch.cigars.size());
  179. if(field_mask & BAM_READS) reads.resize(size_batch.reads.size());
  180. if(field_mask & BAM_QUALITIES) qualities.resize(size_batch.qualities.size());
  181. if(field_mask & (BAM_AUX | BAM_NAMES)) aln_index.resize(size_batch.aln_index.size());
  182. if(field_mask & BAM_AUX) aux_data.resize(size_batch.aux_data.size());
  183. if(field_mask & BAM_NAMES) names.resize(size_batch.names.size());
  184. if(field_mask == BAM_ALL) {
  185. next_positions.resize(size_batch.next_positions.size());
  186. next_refIDs.resize(size_batch.next_refIDs.size());
  187. }
  188. }
  189. void reset()
  190. {
  191. num_alns = 0;
  192. positions.clear();
  193. refIDs.clear();
  194. bin.clear();
  195. mapq.clear();
  196. flags.clear();
  197. crq_index.clear();
  198. cigars.clear();
  199. reads.clear();
  200. qualities.clear();
  201. aln_index.clear();
  202. aux_data.clear();
  203. names.clear();
  204. next_positions.clear();
  205. next_refIDs.clear();
  206. }
  207. void free()
  208. {
  209. // force the memory to be freed by swapping
  210. positions = H_VectorU32();
  211. refIDs = H_VectorU32();
  212. bin = H_VectorU16();
  213. mapq = H_VectorU8();
  214. flags = H_VectorU16();
  215. cigars = H_VectorCigarOp();
  216. reads = H_VectorDNA16();
  217. qualities = H_VectorU8();
  218. aux_data = H_VectorU8();
  219. names = H_VectorU8();
  220. crq_index = nvbio::vector<host_tag, BAM_CRQ_index>();
  221. aln_index = nvbio::vector<host_tag, BAM_NAUX_index>();
  222. next_positions = H_VectorU32();
  223. next_refIDs = H_VectorU32();
  224. }
  225. void shuffle(BAM_alignment_batch_SoA& batch_out, H_VectorU64 shuffle_idx) {
  226. batch_out.allocate(*this);
  227. if(field_mask & BAM_POSITIONS) {
  228. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), positions.begin(), batch_out.positions.begin());
  229. }
  230. if(field_mask & BAM_REFIDS) {
  231. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), refIDs.begin(), batch_out.refIDs.begin());
  232. }
  233. if(field_mask & BAM_BIN) {
  234. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), bin.begin(), batch_out.bin.begin());
  235. }
  236. if(field_mask & BAM_MAPQ) {
  237. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), mapq.begin(), batch_out.mapq.begin());
  238. }
  239. if(field_mask & BAM_FLAGS) {
  240. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), flags.begin(), batch_out.flags.begin());
  241. }
  242. if(field_mask & BAM_QUALITIES) {
  243. uint64 cigar_offset = 0;
  244. for(uint64 i = 0; i < num_alns; i++) {
  245. uint64 idx = shuffle_idx[i];
  246. thrust::copy_n(cigars.begin() + crq_index[idx].cigar_start, crq_index[idx].cigar_len,
  247. batch_out.cigars.begin() + cigar_offset);
  248. batch_out.crq_index[i].cigar_start = cigar_offset;
  249. batch_out.crq_index[i].cigar_len = crq_index[idx].cigar_len;
  250. cigar_offset += crq_index[idx].cigar_len;
  251. }
  252. }
  253. if(field_mask & BAM_READS) {
  254. uint64 read_offset = 0;
  255. for(uint64 i = 0; i < num_alns; i++) {
  256. uint64 idx = shuffle_idx[i];
  257. uint64* data_in = (uint64 *)reads.addrof(crq_index[idx].read_start);
  258. uint64* data_out = (uint64 *)batch_out.reads.addrof(read_offset);
  259. memcpy(data_out, data_in, crq_index[idx].read_len/2);
  260. batch_out.crq_index[i].read_start = read_offset;
  261. batch_out.crq_index[i].read_len = crq_index[idx].read_len;
  262. const uint32 padded_read_len_bp = ((crq_index[idx].read_len + 7) / 8) * 8;
  263. read_offset += padded_read_len_bp;
  264. }
  265. }
  266. if(field_mask & BAM_QUALITIES) {
  267. uint64 qual_offset = 0;
  268. for(uint64 i = 0; i < num_alns; i++) {
  269. uint64 idx = shuffle_idx[i];
  270. thrust::copy_n(qualities.begin() + crq_index[idx].qual_start, crq_index[idx].qual_len,
  271. batch_out.qualities.begin() + qual_offset);
  272. batch_out.crq_index[i].qual_start = qual_offset;
  273. batch_out.crq_index[i].qual_len = crq_index[idx].qual_len;
  274. qual_offset += crq_index[idx].qual_len;
  275. }
  276. }
  277. if(field_mask & BAM_AUX) {
  278. uint64 aux_offset = 0;
  279. for(uint64 i = 0; i < num_alns; i++) {
  280. uint64 idx = shuffle_idx[i];
  281. thrust::copy_n(aux_data.begin() + aln_index[idx].aux_data_start, aln_index[idx].aux_data_len, batch_out.aux_data.begin() + aux_offset);
  282. batch_out.aln_index[i].aux_data_start = aux_offset;
  283. batch_out.aln_index[i].aux_data_len = aln_index[idx].aux_data_len;
  284. aux_offset += aln_index[idx].aux_data_len;
  285. }
  286. }
  287. if(field_mask & BAM_NAMES){
  288. uint64 names_offset = 0;
  289. for(uint64 i = 0; i < num_alns; i++) {
  290. uint64 idx = shuffle_idx[i];
  291. uint32 len;
  292. if(idx < num_alns-1) {
  293. len = aln_index[idx+1].name_start-aln_index[idx].name_start;
  294. } else {
  295. len = names.size()-aln_index[idx].name_start;
  296. }
  297. thrust::copy_n(names.begin() + aln_index[idx].name_start, len, batch_out.names.begin() + names_offset);
  298. batch_out.aln_index[i].name_start = names_offset;
  299. names_offset += len;
  300. }
  301. }
  302. if(field_mask == BAM_ALL) {
  303. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), next_positions.begin(), batch_out.next_positions.begin());
  304. thrust::gather(shuffle_idx.begin(), shuffle_idx.end(), next_refIDs.begin(), batch_out.next_refIDs.begin());
  305. }
  306. }
  307. };
/**------------------- BAM Reader -------------**/
// htslib-based BAM file reader
// Opens a BAM file and reads alignment records in batches, either as raw
// htslib bam1_t records or parsed into the SoA batch layout.
struct HTSBAMReader
{
public:
    bam_hdr_t* header; // parsed BAM header (htslib)
private:
    samFile* fp;       // open htslib file handle
    long fp_offset;    // saved file offset — presumably used to resume batched reads; verify in the .cpp
public:
    HTSBAMReader(const char *fname);
    ~HTSBAMReader();
    // read up to batch_size records as raw htslib records
    bool read_aln_batch(std::vector<bam1_t*>& batch, const uint64 batch_size = 1000000);
    // read up to batch_size records parsed into SoA form
    bool read_aln_batch(BAM_alignment_batch_SoA& batch, const uint64 batch_size = 1000000);
    // read records from the interval [start, end) of the given contig
    bool read_aln_batch_intv(BAM_alignment_batch_SoA& batch, const uint32 contig = 1u, const uint64 start = 0u, const uint64 end = 1000000);
private:
    bool read_hdr(void);
    // parse one htslib record into the SoA batch (cigar_temp is a reusable scratch buffer)
    void parse_aln_rec(BAM_alignment_batch_SoA& batch, bam1_t* b, H_VectorU32& cigar_temp);
};
/**------------------- BAM Writer -------------**/
// htslib-based BAM file writer
// Writes a header and batches of alignment records (raw htslib records or
// SoA batches), emitting records in the order given by shuffle_idx.
struct HTSBAMWriter
{
private:
    samFile* fp; // open htslib file handle
public:
    HTSBAMWriter(const char *fname);
    ~HTSBAMWriter();
    void write_hdr(bam_hdr_t* header);
    // write a batch of raw htslib records in shuffle_idx order
    void write_aln_batch(std::vector<bam1_t*>& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header);
    // write an SoA batch in shuffle_idx order
    void write_aln_batch(BAM_alignment_batch_SoA& batch, H_VectorU64 shuffle_idx, bam_hdr_t* header);
};