BamAux.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. // ***************************************************************************
  2. // BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg
  3. // Marth Lab, Department of Biology, Boston College
  4. // All rights reserved.
  5. // ---------------------------------------------------------------------------
  6. // Last modified: 8 December 2009 (DB)
  7. // ---------------------------------------------------------------------------
  8. // Provides the basic constants, data structures, etc. for using BAM files
  9. // ***************************************************************************
  10. #ifndef BAMAUX_H
  11. #define BAMAUX_H
  12. #include "BGZF.h"
  13. // C inclues
  14. #include <cstdlib>
  15. #include <cstring>
  16. // C++ includes
  17. #include <exception>
  18. #include <map>
  19. #include <string>
  20. #include <utility>
  21. #include <vector>
  22. namespace BamTools {
  23. // BAM constants
  24. const int BAM_CORE_SIZE = 32;
  25. const int BAM_CMATCH = 0;
  26. const int BAM_CINS = 1;
  27. const int BAM_CDEL = 2;
  28. const int BAM_CREF_SKIP = 3;
  29. const int BAM_CSOFT_CLIP = 4;
  30. const int BAM_CHARD_CLIP = 5;
  31. const int BAM_CPAD = 6;
  32. const int BAM_CIGAR_SHIFT = 4;
  33. const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
  34. // BAM index constants
  35. const int MAX_BIN = 37450; // =(8^6-1)/7+1
  36. const int BAM_MIN_CHUNK_GAP = 32768;
  37. const int BAM_LIDX_SHIFT = 14;
  38. // Explicit variable sizes
  39. const int BT_SIZEOF_INT = 4;
  40. struct CigarOp;
  41. struct RefEdit;
  42. struct BamAlignment {
  43. // Queries against alignment flag
  44. public:
  45. // Returns true if this read is a PCR duplicate (determined by external app)
  46. bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
  47. // Returns true if this read failed quality control (determined by external app)
  48. bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
  49. // Returns true if alignment is first mate on read
  50. bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
  51. // Returns true if alignment is mapped
  52. bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
  53. // Returns true if alignment's mate is mapped
  54. bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
  55. // Returns true if alignment's mate mapped to reverse strand
  56. bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
  57. // Returns true if alignment part of paired-end read
  58. bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
  59. // Returns true if this position is primary alignment (determined by external app)
  60. bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
  61. // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)
  62. bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
  63. // Returns true if alignment mapped to reverse strand
  64. bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
  65. // Returns true if alignment is second mate on read
  66. bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
  67. public:
  68. // get "RG" tag data
  69. bool GetReadGroup(std::string& readGroup) const
  70. {
  71. const char* rg = GetTag("RG");
  72. if (rg==NULL || !rg)
  73. {
  74. return false;
  75. }
  76. // assign the read group
  77. const unsigned int readGroupLen = (unsigned int)std::strlen(rg);
  78. readGroup = std::string(rg, readGroupLen);
  79. //readGroup.resize(readGroupLen);
  80. //std::memcpy((char*)readGroup.data, rg, readGroupLen );
  81. return true;
  82. }
  83. // get "NM" tag data - contributed by Aaron Quinlan
  84. bool GetEditDistance(uint8_t& editDistance) const { return GetTag( "NM", editDistance ); }
  85. // get generic tag data - NVBIO
  86. template <typename T>
  87. bool GetTag(const char* tagName, T& tagVal) const
  88. {
  89. if ( TagData.empty() ) { return false; }
  90. // localize the tag data
  91. const char* pTagData = (const char*)TagData.c_str();
  92. const char* pTagStorageType = NULL;
  93. const unsigned int tagDataLen = (unsigned int)TagData.size() + 1;
  94. unsigned int numBytesParsed = 0;
  95. bool foundTag = false;
  96. while( numBytesParsed < tagDataLen )
  97. {
  98. // skip white spaces
  99. while ( (*pTagData == ' ' || *pTagData == '\n') && numBytesParsed < tagDataLen)
  100. {
  101. ++numBytesParsed;
  102. ++pTagData;
  103. }
  104. if (*pTagData == '\0' || numBytesParsed == tagDataLen)
  105. break;
  106. const char* pTagType = pTagData;
  107. pTagStorageType = pTagData + 2;
  108. pTagData += 3;
  109. numBytesParsed += 3;
  110. // check the current tag
  111. if (strncmp(pTagType, tagName, 2) == 0) {
  112. foundTag = true;
  113. break;
  114. }
  115. // get the storage class and find the next tag
  116. SkipToNextTag( pTagType, *pTagStorageType, pTagData, numBytesParsed );
  117. }
  118. // return if the edit distance tag was not present
  119. if ( !foundTag ) { return false; }
  120. // assign the tag value
  121. return GetTagValue( tagName, pTagData, *pTagStorageType, &tagVal );
  122. }
  123. // get generic tag data - NVBIO
  124. const char* GetTag(const char* tagName) const
  125. {
  126. if ( TagData.empty() ) { return NULL; }
  127. // localize the tag data
  128. const char* pTagData = (const char*)TagData.c_str();
  129. const unsigned int tagDataLen = (unsigned int)TagData.size() + 1;
  130. unsigned int numBytesParsed = 0;
  131. while( numBytesParsed < tagDataLen )
  132. {
  133. // skip white spaces
  134. while ( (*pTagData == ' ' || *pTagData == '\n') && numBytesParsed < tagDataLen)
  135. {
  136. ++numBytesParsed;
  137. ++pTagData;
  138. }
  139. if (*pTagData == '\0' || numBytesParsed == tagDataLen)
  140. break;
  141. const char* pTagType = pTagData;
  142. const char* pTagStorageType = pTagData + 2;
  143. pTagData += 3;
  144. numBytesParsed += 3;
  145. // check the current tag
  146. if (strncmp(pTagType, tagName, 2) == 0)
  147. return pTagData;
  148. // get the storage class and find the next tag
  149. SkipToNextTag( pTagType, *pTagStorageType, pTagData, numBytesParsed );
  150. }
  151. // return if the edit distance tag was not present
  152. return NULL;
  153. }
  154. private:
  155. static bool CheckType(const char storageType, int8_t) { return (storageType == 'A') || (storageType == 'c'); }
  156. static bool CheckType(const char storageType, uint8_t) { return (storageType == 'C'); }
  157. static bool CheckType(const char storageType, int16_t) { return (storageType == 's') || (storageType == 'c') || (storageType == 'C'); }
  158. static bool CheckType(const char storageType, uint16_t) { return (storageType == 'S') || (storageType == 'C'); }
  159. static bool CheckType(const char storageType, int32_t) { return (storageType == 'i') || (storageType == 'c') || (storageType == 'C') || (storageType == 's') || (storageType == 'S'); }
  160. static bool CheckType(const char storageType, uint32_t) { return (storageType == 'I') || (storageType == 'C') || (storageType == 'S'); }
  161. static bool CheckType(const char storageType, float) { return (storageType == 'f'); }
  162. template <typename T>
  163. static bool GetTagValue(const char* tagName, const char *pTagData, const char storageType, T* tagVal)
  164. {
  165. switch(storageType) {
  166. case 'A':
  167. if (CheckType(storageType,T()) == false)
  168. return false;
  169. *tagVal = T( *(const char*)(pTagData) );
  170. break;
  171. case 'c':
  172. if (CheckType(storageType,T()) == false)
  173. return false;
  174. *tagVal = T( *(const int8_t*)(pTagData) );
  175. break;
  176. case 'C':
  177. if (CheckType(storageType,T()) == false)
  178. return false;
  179. *tagVal = T( *(const uint8_t*)(pTagData) );
  180. break;
  181. case 's':
  182. if (CheckType(storageType,T()) == false)
  183. return false;
  184. *tagVal = T( *(const int16_t*)(pTagData) );
  185. break;
  186. case 'S':
  187. if (CheckType(storageType,T()) == false)
  188. return false;
  189. *tagVal = T( *(const uint16_t*)(pTagData) );
  190. break;
  191. case 'i':
  192. case 'f':
  193. case 'I':
  194. if (CheckType(storageType,T()) == false)
  195. return false;
  196. *tagVal = T( *(const T*)(pTagData) );
  197. break;
  198. case 'B':
  199. {
  200. const char type = *pTagData++;
  201. const uint8_t len0 = *pTagData++;
  202. const uint8_t len1 = *pTagData++;
  203. const uint8_t len2 = *pTagData++;
  204. const uint8_t len3 = *pTagData++;
  205. const uint32_t len = len0 | (len1 << 8) | (len2 << 16) | (len3 << 24);
  206. if (CheckType(type,T()) == false)
  207. return false;
  208. const T* tagData = (const T*)pTagData;
  209. switch (type)
  210. {
  211. case 'c':
  212. case 'C':
  213. for (uint32_t i = 0; i < len; ++i)
  214. *tagVal++ = *(const T*)(tagData++);
  215. break;
  216. case 's':
  217. case 'S':
  218. for (uint32_t i = 0; i < len; ++i)
  219. *tagVal++ = *(const T*)(tagData++);
  220. break;
  221. case 'i':
  222. case 'I':
  223. case 'f':
  224. for (uint32_t i = 0; i < len; ++i)
  225. *tagVal++ = *(const T*)(tagData++);
  226. break;
  227. }
  228. }
  229. case 'Z':
  230. if (CheckType(storageType,T()) == false)
  231. return false;
  232. while(*pTagData != '\0')
  233. *tagVal++ = *pTagData++;
  234. break;
  235. case 'H':
  236. // TODO!
  237. {
  238. char tagString[3];
  239. tagString[0] = tagName[0];
  240. tagString[1] = tagName[1];
  241. tagString[2] = '\0';
  242. fprintf(stderr,"WARNING: tag storage class 'H' is not yet supported, tag[%s:H]\n", tagString);
  243. return false;
  244. }
  245. //break;
  246. default:
  247. {
  248. //char tagString[3];
  249. //tagString[0] = tagName[0];
  250. //tagString[1] = tagName[1];
  251. //tagString[2] = '\0';
  252. //fprintf(stderr,"ERROR: Unknown tag storage class encountered, tag[%s:%c]\n", tagString, storageType);
  253. //exit(1);
  254. }
  255. }
  256. return true;
  257. }
  258. static void SkipToNextTag(const char* tagName, const char storageType, const char* &pTagData, unsigned int& numBytesParsed)
  259. {
  260. switch(storageType) {
  261. case 'A':
  262. case 'c':
  263. case 'C':
  264. ++numBytesParsed;
  265. ++pTagData;
  266. break;
  267. case 'B':
  268. {
  269. const char type = *pTagData++;
  270. const uint8_t len0 = *pTagData++;
  271. const uint8_t len1 = *pTagData++;
  272. const uint8_t len2 = *pTagData++;
  273. const uint8_t len3 = *pTagData++;
  274. const uint32_t len = len0 | (len1 << 8) | (len2 << 16) | (len3 << 24);
  275. switch (type)
  276. {
  277. case 'c':
  278. case 'C':
  279. pTagData += len;
  280. break;
  281. case 's':
  282. case 'S':
  283. pTagData += len * 2;
  284. break;
  285. case 'i':
  286. case 'I':
  287. case 'f':
  288. pTagData += len * 4;
  289. break;
  290. }
  291. break;
  292. }
  293. case 's':
  294. case 'S':
  295. numBytesParsed += 2;
  296. pTagData += 2;
  297. break;
  298. case 'i':
  299. case 'f':
  300. case 'I':
  301. numBytesParsed += 4;
  302. pTagData += 4;
  303. break;
  304. case 'Z':
  305. case 'H':
  306. while(*pTagData != '\0') {
  307. ++numBytesParsed;
  308. ++pTagData;
  309. }
  310. if(*pTagData == '\0')
  311. {
  312. ++numBytesParsed;
  313. ++pTagData;
  314. }
  315. break;
  316. default:
  317. {
  318. //char tagString[3];
  319. //tagString[0] = tagName[0];
  320. //tagString[1] = tagName[1];
  321. //tagString[2] = '\0';
  322. //fprintf(stderr,"ERROR: Unknown tag storage class encountered: [%s:%c]\n", tagString, storageType);
  323. //exit(1);
  324. }
  325. }
  326. }
  327. // Data members
  328. public:
  329. std::string Name; // Read name
  330. int32_t Length; // Query length
  331. std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
  332. std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
  333. std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
  334. std::string TagData; // Tag data (accessor methods will pull the requested information out)
  335. int32_t RefID; // ID number for reference sequence
  336. int32_t Position; // Position (0-based) where alignment starts
  337. uint16_t Bin; // Bin in BAM file where this alignment resides
  338. uint16_t MapQuality; // Mapping quality score
  339. uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods for available queries
  340. std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
  341. std::vector<RefEdit> DeltaData; // Delta operations from reference to the query sequence
  342. int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
  343. int32_t MatePosition; // Position (0-based) where alignment's mate starts
  344. int32_t InsertSize; // Mate-pair insert size
  345. // Alignment flag query constants
  346. enum { PAIRED = 1,
  347. PROPER_PAIR = 2,
  348. UNMAPPED = 4,
  349. MATE_UNMAPPED = 8,
  350. REVERSE = 16,
  351. MATE_REVERSE = 32,
  352. READ_1 = 64,
  353. READ_2 = 128,
  354. SECONDARY = 256,
  355. QC_FAILED = 512,
  356. DUPLICATE = 1024
  357. };
  358. };
  359. // ----------------------------------------------------------------
  360. // Auxiliary data structs & typedefs
  361. struct CigarOp {
  362. char Type; // Operation type (MIDNSHP)
  363. uint32_t Length; // Operation length (number of bases)
  364. };
  365. struct RefEdit
  366. {
  367. char Type; //Op type (MID etc.)
  368. uint32_t L; //parameter - length for ID, Sub Code for X
  369. uint32_t pos; //position in the query sequence
  370. };
  371. struct RefData {
  372. // data members
  373. std::string RefName; // Name of reference sequence
  374. uint32_t RefLength; // Length of reference sequence
  375. bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
  376. // constructor
  377. RefData(void)
  378. : RefLength(0)
  379. , RefHasAlignments(false)
  380. { }
  381. };
  382. typedef std::vector<RefData> RefVector;
  383. typedef std::vector<BamAlignment> BamAlignmentVector;
  384. // ----------------------------------------------------------------
  385. // Indexing structs & typedefs
  386. struct Chunk {
  387. // data members
  388. uint64_t Start;
  389. uint64_t Stop;
  390. // constructor
  391. Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
  392. : Start(start)
  393. , Stop(stop)
  394. { }
  395. };
  396. inline
  397. bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
  398. return lhs.Start < rhs.Start;
  399. }
  400. typedef std::vector<Chunk> ChunkVector;
  401. typedef std::map<uint32_t, ChunkVector> BamBinMap;
  402. typedef std::vector<uint64_t> LinearOffsetVector;
  403. struct ReferenceIndex {
  404. // data members
  405. BamBinMap Bins;
  406. LinearOffsetVector Offsets;
  407. // constructor
  408. ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
  409. const LinearOffsetVector& offsets = LinearOffsetVector())
  410. : Bins(binMap)
  411. , Offsets(offsets)
  412. { }
  413. };
  414. typedef std::vector<ReferenceIndex> BamIndex;
  415. } // namespace BamTools
  416. #endif // BAMAUX_H