123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485 |
- // ***************************************************************************
- // BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg
- // Marth Lab, Department of Biology, Boston College
- // All rights reserved.
- // ---------------------------------------------------------------------------
- // Last modified: 8 December 2009 (DB)
- // ---------------------------------------------------------------------------
- // Provides the basic constants, data structures, etc. for using BAM files
- // ***************************************************************************
- #ifndef BAMAUX_H
- #define BAMAUX_H
- #include "BGZF.h"
- // C inclues
- #include <cstdlib>
- #include <cstring>
- // C++ includes
- #include <exception>
- #include <map>
- #include <string>
- #include <utility>
- #include <vector>
- namespace BamTools {
- // BAM constants
- const int BAM_CORE_SIZE = 32;
- const int BAM_CMATCH = 0;
- const int BAM_CINS = 1;
- const int BAM_CDEL = 2;
- const int BAM_CREF_SKIP = 3;
- const int BAM_CSOFT_CLIP = 4;
- const int BAM_CHARD_CLIP = 5;
- const int BAM_CPAD = 6;
- const int BAM_CIGAR_SHIFT = 4;
- const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1);
- // BAM index constants
- const int MAX_BIN = 37450; // =(8^6-1)/7+1
- const int BAM_MIN_CHUNK_GAP = 32768;
- const int BAM_LIDX_SHIFT = 14;
- // Explicit variable sizes
- const int BT_SIZEOF_INT = 4;
- struct CigarOp;
- struct RefEdit;
- struct BamAlignment {
- // Queries against alignment flag
- public:
- // Returns true if this read is a PCR duplicate (determined by external app)
- bool IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); }
- // Returns true if this read failed quality control (determined by external app)
- bool IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); }
- // Returns true if alignment is first mate on read
- bool IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); }
- // Returns true if alignment is mapped
- bool IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); }
- // Returns true if alignment's mate is mapped
- bool IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); }
- // Returns true if alignment's mate mapped to reverse strand
- bool IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); }
- // Returns true if alignment part of paired-end read
- bool IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); }
- // Returns true if this position is primary alignment (determined by external app)
- bool IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); }
- // Returns true if alignment is part of read that satisfied paired-end resolution (determined by external app)
- bool IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); }
- // Returns true if alignment mapped to reverse strand
- bool IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); }
- // Returns true if alignment is second mate on read
- bool IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); }
- public:
- // get "RG" tag data
- bool GetReadGroup(std::string& readGroup) const
- {
- const char* rg = GetTag("RG");
- if (rg==NULL || !rg)
- {
- return false;
- }
- // assign the read group
- const unsigned int readGroupLen = (unsigned int)std::strlen(rg);
- readGroup = std::string(rg, readGroupLen);
- //readGroup.resize(readGroupLen);
- //std::memcpy((char*)readGroup.data, rg, readGroupLen );
- return true;
- }
- // get "NM" tag data - contributed by Aaron Quinlan
- bool GetEditDistance(uint8_t& editDistance) const { return GetTag( "NM", editDistance ); }
- // get generic tag data - NVBIO
- template <typename T>
- bool GetTag(const char* tagName, T& tagVal) const
- {
- if ( TagData.empty() ) { return false; }
- // localize the tag data
- const char* pTagData = (const char*)TagData.c_str();
- const char* pTagStorageType = NULL;
- const unsigned int tagDataLen = (unsigned int)TagData.size() + 1;
- unsigned int numBytesParsed = 0;
- bool foundTag = false;
- while( numBytesParsed < tagDataLen )
- {
- // skip white spaces
- while ( (*pTagData == ' ' || *pTagData == '\n') && numBytesParsed < tagDataLen)
- {
- ++numBytesParsed;
- ++pTagData;
- }
- if (*pTagData == '\0' || numBytesParsed == tagDataLen)
- break;
- const char* pTagType = pTagData;
- pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
- // check the current tag
- if (strncmp(pTagType, tagName, 2) == 0) {
- foundTag = true;
- break;
- }
- // get the storage class and find the next tag
- SkipToNextTag( pTagType, *pTagStorageType, pTagData, numBytesParsed );
- }
- // return if the edit distance tag was not present
- if ( !foundTag ) { return false; }
- // assign the tag value
- return GetTagValue( tagName, pTagData, *pTagStorageType, &tagVal );
- }
- // get generic tag data - NVBIO
- const char* GetTag(const char* tagName) const
- {
- if ( TagData.empty() ) { return NULL; }
- // localize the tag data
- const char* pTagData = (const char*)TagData.c_str();
- const unsigned int tagDataLen = (unsigned int)TagData.size() + 1;
- unsigned int numBytesParsed = 0;
- while( numBytesParsed < tagDataLen )
- {
- // skip white spaces
- while ( (*pTagData == ' ' || *pTagData == '\n') && numBytesParsed < tagDataLen)
- {
- ++numBytesParsed;
- ++pTagData;
- }
- if (*pTagData == '\0' || numBytesParsed == tagDataLen)
- break;
- const char* pTagType = pTagData;
- const char* pTagStorageType = pTagData + 2;
- pTagData += 3;
- numBytesParsed += 3;
- // check the current tag
- if (strncmp(pTagType, tagName, 2) == 0)
- return pTagData;
- // get the storage class and find the next tag
- SkipToNextTag( pTagType, *pTagStorageType, pTagData, numBytesParsed );
- }
- // return if the edit distance tag was not present
- return NULL;
- }
- private:
- static bool CheckType(const char storageType, int8_t) { return (storageType == 'A') || (storageType == 'c'); }
- static bool CheckType(const char storageType, uint8_t) { return (storageType == 'C'); }
- static bool CheckType(const char storageType, int16_t) { return (storageType == 's') || (storageType == 'c') || (storageType == 'C'); }
- static bool CheckType(const char storageType, uint16_t) { return (storageType == 'S') || (storageType == 'C'); }
- static bool CheckType(const char storageType, int32_t) { return (storageType == 'i') || (storageType == 'c') || (storageType == 'C') || (storageType == 's') || (storageType == 'S'); }
- static bool CheckType(const char storageType, uint32_t) { return (storageType == 'I') || (storageType == 'C') || (storageType == 'S'); }
- static bool CheckType(const char storageType, float) { return (storageType == 'f'); }
- template <typename T>
- static bool GetTagValue(const char* tagName, const char *pTagData, const char storageType, T* tagVal)
- {
- switch(storageType) {
- case 'A':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const char*)(pTagData) );
- break;
- case 'c':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const int8_t*)(pTagData) );
- break;
- case 'C':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const uint8_t*)(pTagData) );
- break;
- case 's':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const int16_t*)(pTagData) );
- break;
- case 'S':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const uint16_t*)(pTagData) );
- break;
- case 'i':
- case 'f':
- case 'I':
- if (CheckType(storageType,T()) == false)
- return false;
- *tagVal = T( *(const T*)(pTagData) );
- break;
- case 'B':
- {
- const char type = *pTagData++;
- const uint8_t len0 = *pTagData++;
- const uint8_t len1 = *pTagData++;
- const uint8_t len2 = *pTagData++;
- const uint8_t len3 = *pTagData++;
- const uint32_t len = len0 | (len1 << 8) | (len2 << 16) | (len3 << 24);
- if (CheckType(type,T()) == false)
- return false;
- const T* tagData = (const T*)pTagData;
- switch (type)
- {
- case 'c':
- case 'C':
- for (uint32_t i = 0; i < len; ++i)
- *tagVal++ = *(const T*)(tagData++);
- break;
- case 's':
- case 'S':
- for (uint32_t i = 0; i < len; ++i)
- *tagVal++ = *(const T*)(tagData++);
- break;
- case 'i':
- case 'I':
- case 'f':
- for (uint32_t i = 0; i < len; ++i)
- *tagVal++ = *(const T*)(tagData++);
- break;
- }
- }
- case 'Z':
- if (CheckType(storageType,T()) == false)
- return false;
- while(*pTagData != '\0')
- *tagVal++ = *pTagData++;
- break;
- case 'H':
- // TODO!
- {
- char tagString[3];
- tagString[0] = tagName[0];
- tagString[1] = tagName[1];
- tagString[2] = '\0';
- fprintf(stderr,"WARNING: tag storage class 'H' is not yet supported, tag[%s:H]\n", tagString);
- return false;
- }
- //break;
- default:
- {
- //char tagString[3];
- //tagString[0] = tagName[0];
- //tagString[1] = tagName[1];
- //tagString[2] = '\0';
- //fprintf(stderr,"ERROR: Unknown tag storage class encountered, tag[%s:%c]\n", tagString, storageType);
- //exit(1);
- }
- }
- return true;
- }
- static void SkipToNextTag(const char* tagName, const char storageType, const char* &pTagData, unsigned int& numBytesParsed)
- {
- switch(storageType) {
- case 'A':
- case 'c':
- case 'C':
- ++numBytesParsed;
- ++pTagData;
- break;
- case 'B':
- {
- const char type = *pTagData++;
- const uint8_t len0 = *pTagData++;
- const uint8_t len1 = *pTagData++;
- const uint8_t len2 = *pTagData++;
- const uint8_t len3 = *pTagData++;
- const uint32_t len = len0 | (len1 << 8) | (len2 << 16) | (len3 << 24);
- switch (type)
- {
- case 'c':
- case 'C':
- pTagData += len;
- break;
- case 's':
- case 'S':
- pTagData += len * 2;
- break;
- case 'i':
- case 'I':
- case 'f':
- pTagData += len * 4;
- break;
- }
- break;
- }
- case 's':
- case 'S':
- numBytesParsed += 2;
- pTagData += 2;
- break;
- case 'i':
- case 'f':
- case 'I':
- numBytesParsed += 4;
- pTagData += 4;
- break;
- case 'Z':
- case 'H':
- while(*pTagData != '\0') {
- ++numBytesParsed;
- ++pTagData;
- }
- if(*pTagData == '\0')
- {
- ++numBytesParsed;
- ++pTagData;
- }
- break;
- default:
- {
- //char tagString[3];
- //tagString[0] = tagName[0];
- //tagString[1] = tagName[1];
- //tagString[2] = '\0';
- //fprintf(stderr,"ERROR: Unknown tag storage class encountered: [%s:%c]\n", tagString, storageType);
- //exit(1);
- }
- }
- }
- // Data members
- public:
- std::string Name; // Read name
- int32_t Length; // Query length
- std::string QueryBases; // 'Original' sequence (as reported from sequencing machine)
- std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping)
- std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values)
- std::string TagData; // Tag data (accessor methods will pull the requested information out)
- int32_t RefID; // ID number for reference sequence
- int32_t Position; // Position (0-based) where alignment starts
- uint16_t Bin; // Bin in BAM file where this alignment resides
- uint16_t MapQuality; // Mapping quality score
- uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods for available queries
- std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
- std::vector<RefEdit> DeltaData; // Delta operations from reference to the query sequence
- int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned
- int32_t MatePosition; // Position (0-based) where alignment's mate starts
- int32_t InsertSize; // Mate-pair insert size
- // Alignment flag query constants
- enum { PAIRED = 1,
- PROPER_PAIR = 2,
- UNMAPPED = 4,
- MATE_UNMAPPED = 8,
- REVERSE = 16,
- MATE_REVERSE = 32,
- READ_1 = 64,
- READ_2 = 128,
- SECONDARY = 256,
- QC_FAILED = 512,
- DUPLICATE = 1024
- };
- };
- // ----------------------------------------------------------------
- // Auxiliary data structs & typedefs
- struct CigarOp {
- char Type; // Operation type (MIDNSHP)
- uint32_t Length; // Operation length (number of bases)
- };
- struct RefEdit
- {
- char Type; //Op type (MID etc.)
- uint32_t L; //parameter - length for ID, Sub Code for X
- uint32_t pos; //position in the query sequence
- };
- struct RefData {
- // data members
- std::string RefName; // Name of reference sequence
- uint32_t RefLength; // Length of reference sequence
- bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence
- // constructor
- RefData(void)
- : RefLength(0)
- , RefHasAlignments(false)
- { }
- };
- typedef std::vector<RefData> RefVector;
- typedef std::vector<BamAlignment> BamAlignmentVector;
- // ----------------------------------------------------------------
- // Indexing structs & typedefs
- struct Chunk {
- // data members
- uint64_t Start;
- uint64_t Stop;
- // constructor
- Chunk(const uint64_t& start = 0, const uint64_t& stop = 0)
- : Start(start)
- , Stop(stop)
- { }
- };
- inline
- bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
- return lhs.Start < rhs.Start;
- }
- typedef std::vector<Chunk> ChunkVector;
- typedef std::map<uint32_t, ChunkVector> BamBinMap;
- typedef std::vector<uint64_t> LinearOffsetVector;
- struct ReferenceIndex {
- // data members
- BamBinMap Bins;
- LinearOffsetVector Offsets;
- // constructor
- ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
- const LinearOffsetVector& offsets = LinearOffsetVector())
- : Bins(binMap)
- , Offsets(offsets)
- { }
- };
- typedef std::vector<ReferenceIndex> BamIndex;
- } // namespace BamTools
- #endif // BAMAUX_H
|