BamReader.cpp 35 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041
  1. // ***************************************************************************
  2. // BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg
  3. // Marth Lab, Department of Biology, Boston College
  4. // All rights reserved.
  5. // ---------------------------------------------------------------------------
  6. // Last modified: 8 December 2009 (DB)
  7. // ---------------------------------------------------------------------------
  8. // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
  9. // Institute.
  10. // ---------------------------------------------------------------------------
  11. // Provides the basic functionality for reading BAM files
  12. // ***************************************************************************
  13. // C++ includes
  14. #include <algorithm>
  15. #include <iterator>
  16. #include <string>
  17. #include <vector>
  18. // BamTools includes
  19. #include "BGZF.h"
  20. #include "BamReader.h"
  21. using namespace BamTools;
  22. using namespace std;
  23. struct BamReader::BamReaderPrivate {
  24. // -------------------------------
  25. // data members
  26. // -------------------------------
  27. // general data
  28. BgzfData mBGZF;
  29. string HeaderText;
  30. BamIndex Index;
  31. RefVector References;
  32. bool IsIndexLoaded;
  33. int64_t AlignmentsBeginOffset;
  34. string Filename;
  35. string IndexFilename;
  36. // user-specified region values
  37. bool IsRegionSpecified;
  38. int CurrentRefID;
  39. int CurrentLeft;
  40. // BAM character constants
  41. const char* DNA_LOOKUP;
  42. const char* CIGAR_LOOKUP;
  43. // -------------------------------
  44. // constructor & destructor
  45. // -------------------------------
  46. BamReaderPrivate(void);
  47. ~BamReaderPrivate(void);
  48. // -------------------------------
  49. // "public" interface
  50. // -------------------------------
  51. // flie operations
  52. void Close(void);
  53. bool Jump(int refID, int position = 0);
  54. void Open(const string& filename, const string& indexFilename = "");
  55. bool Rewind(void);
  56. // access alignment data
  57. bool GetNextAlignment(BamAlignment& bAlignment);
  58. // access auxiliary data
  59. const string GetHeaderText(void) const;
  60. const int GetReferenceCount(void) const;
  61. const RefVector GetReferenceData(void) const;
  62. const int GetReferenceID(const string& refName) const;
  63. // index operations
  64. bool CreateIndex(void);
  65. // -------------------------------
  66. // internal methods
  67. // -------------------------------
  68. // *** reading alignments and auxiliary data *** //
  69. // calculate bins that overlap region ( left to reference end for now )
  70. int BinsFromRegion(int refID, int left, uint16_t[MAX_BIN]);
  71. // calculates alignment end position based on starting position and provided CIGAR operations
  72. int CalculateAlignmentEnd(const int& position, const std::vector<CigarOp>& cigarData);
  73. // calculate file offset for first alignment chunk overlapping 'left'
  74. int64_t GetOffset(int refID, int left);
  75. // checks to see if alignment overlaps current region
  76. bool IsOverlap(BamAlignment& bAlignment);
  77. // retrieves header text from BAM file
  78. void LoadHeaderData(void);
  79. // retrieves BAM alignment under file pointer
  80. bool LoadNextAlignment(BamAlignment& bAlignment);
  81. // builds reference data structure from BAM file
  82. void LoadReferenceData(void);
  83. // *** index file handling *** //
  84. // calculates index for BAM file
  85. bool BuildIndex(void);
  86. // clear out inernal index data structure
  87. void ClearIndex(void);
  88. // saves BAM bin entry for index
  89. void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset);
  90. // saves linear offset entry for index
  91. void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset);
  92. // loads index from BAM index file
  93. bool LoadIndex(void);
  94. // simplifies index by merging 'chunks'
  95. void MergeChunks(void);
  96. // round-up 32-bit integer to next power-of-2
  97. void Roundup32(int& value);
  98. // saves index to BAM index file
  99. bool WriteIndex(void);
  100. };
  101. // -----------------------------------------------------
  102. // BamReader implementation (wrapper around BRPrivate)
  103. // -----------------------------------------------------
  104. // constructor
  105. BamReader::BamReader(void) {
  106. d = new BamReaderPrivate;
  107. }
  108. // destructor
  109. BamReader::~BamReader(void) {
  110. delete d;
  111. d = 0;
  112. }
  113. // file operations
  114. void BamReader::Close(void) { d->Close(); }
  115. bool BamReader::Jump(int refID, int position) { return d->Jump(refID, position); }
  116. void BamReader::Open(const string& filename, const string& indexFilename) { d->Open(filename, indexFilename); }
  117. bool BamReader::Rewind(void) { return d->Rewind(); }
  118. // access alignment data
  119. bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); }
  120. // access auxiliary data
  121. const string BamReader::GetHeaderText(void) const { return d->HeaderText; }
  122. const int BamReader::GetReferenceCount(void) const { return (int)d->References.size(); }
  123. const RefVector BamReader::GetReferenceData(void) const { return d->References; }
  124. const int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); }
  125. // index operations
  126. bool BamReader::CreateIndex(void) { return d->CreateIndex(); }
  127. // -----------------------------------------------------
  128. // BamReaderPrivate implementation
  129. // -----------------------------------------------------
  130. // constructor
  131. BamReader::BamReaderPrivate::BamReaderPrivate(void)
  132. : IsIndexLoaded(false)
  133. , AlignmentsBeginOffset(0)
  134. , IsRegionSpecified(false)
  135. , CurrentRefID(0)
  136. , CurrentLeft(0)
  137. , DNA_LOOKUP("=ACMGRSVTWYHKDBN")
  138. , CIGAR_LOOKUP("MIDNSHP")
  139. { }
  140. // destructor
  141. BamReader::BamReaderPrivate::~BamReaderPrivate(void) {
  142. Close();
  143. }
  144. // calculate bins that overlap region ( left to reference end for now )
  145. int BamReader::BamReaderPrivate::BinsFromRegion(int refID, int left, uint16_t list[MAX_BIN]) {
  146. // get region boundaries
  147. uint32_t begin = (unsigned int)left;
  148. uint32_t end = (unsigned int)References.at(refID).RefLength - 1;
  149. // initialize list, bin '0' always a valid bin
  150. int i = 0;
  151. list[i++] = 0;
  152. // get rest of bins that contain this region
  153. unsigned int k;
  154. for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { list[i++] = k; }
  155. for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { list[i++] = k; }
  156. for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { list[i++] = k; }
  157. for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { list[i++] = k; }
  158. for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { list[i++] = k; }
  159. // return number of bins stored
  160. return i;
  161. }
  162. // populates BAM index data structure from BAM file data
  163. bool BamReader::BamReaderPrivate::BuildIndex(void) {
  164. // check to be sure file is open
  165. if (!mBGZF.IsOpen) { return false; }
  166. // move file pointer to beginning of alignments
  167. Rewind();
  168. // get reference count, reserve index space
  169. int numReferences = (int)References.size();
  170. for ( int i = 0; i < numReferences; ++i ) {
  171. Index.push_back(ReferenceIndex());
  172. }
  173. // sets default constant for bin, ID, offset, coordinate variables
  174. const uint32_t defaultValue = 0xffffffffu;
  175. // bin data
  176. uint32_t saveBin(defaultValue);
  177. uint32_t lastBin(defaultValue);
  178. // reference ID data
  179. int32_t saveRefID(defaultValue);
  180. int32_t lastRefID(defaultValue);
  181. // offset data
  182. uint64_t saveOffset = mBGZF.Tell();
  183. uint64_t lastOffset = saveOffset;
  184. // coordinate data
  185. int32_t lastCoordinate = defaultValue;
  186. BamAlignment bAlignment;
  187. while( GetNextAlignment(bAlignment) ) {
  188. // change of chromosome, save ID, reset bin
  189. if ( lastRefID != bAlignment.RefID ) {
  190. lastRefID = bAlignment.RefID;
  191. lastBin = defaultValue;
  192. }
  193. // if lastCoordinate greater than BAM position - file not sorted properly
  194. else if ( lastCoordinate > bAlignment.Position ) {
  195. printf("BAM file not properly sorted:\n");
  196. printf("Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID);
  197. exit(1);
  198. }
  199. // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions)
  200. if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) {
  201. // save linear offset entry (matched to BAM entry refID)
  202. ReferenceIndex& refIndex = Index.at(bAlignment.RefID);
  203. LinearOffsetVector& offsets = refIndex.Offsets;
  204. InsertLinearOffset(offsets, bAlignment, lastOffset);
  205. }
  206. // if current BamAlignment bin != lastBin, "then possibly write the binning index"
  207. if ( bAlignment.Bin != lastBin ) {
  208. // if not first time through
  209. if ( saveBin != defaultValue ) {
  210. // save Bam bin entry
  211. ReferenceIndex& refIndex = Index.at(saveRefID);
  212. BamBinMap& binMap = refIndex.Bins;
  213. InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
  214. }
  215. // update saveOffset
  216. saveOffset = lastOffset;
  217. // update bin values
  218. saveBin = bAlignment.Bin;
  219. lastBin = bAlignment.Bin;
  220. // update saveRefID
  221. saveRefID = bAlignment.RefID;
  222. // if invalid RefID, break out (why?)
  223. if ( saveRefID < 0 ) { break; }
  224. }
  225. // make sure that current file pointer is beyond lastOffset
  226. if ( mBGZF.Tell() <= (int64_t)lastOffset ) {
  227. printf("Error in BGZF offsets.\n");
  228. exit(1);
  229. }
  230. // update lastOffset
  231. lastOffset = mBGZF.Tell();
  232. // update lastCoordinate
  233. lastCoordinate = bAlignment.Position;
  234. }
  235. // save any leftover BAM data (as long as refID is valid)
  236. if ( saveRefID >= 0 ) {
  237. // save Bam bin entry
  238. ReferenceIndex& refIndex = Index.at(saveRefID);
  239. BamBinMap& binMap = refIndex.Bins;
  240. InsertBinEntry(binMap, saveBin, saveOffset, lastOffset);
  241. }
  242. // simplify index by merging chunks
  243. MergeChunks();
  244. // iterate over references
  245. BamIndex::iterator indexIter = Index.begin();
  246. BamIndex::iterator indexEnd = Index.end();
  247. for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) {
  248. // get reference index data
  249. ReferenceIndex& refIndex = (*indexIter);
  250. BamBinMap& binMap = refIndex.Bins;
  251. LinearOffsetVector& offsets = refIndex.Offsets;
  252. // store whether reference has alignments or no
  253. References[i].RefHasAlignments = ( binMap.size() > 0 );
  254. // sort linear offsets
  255. sort(offsets.begin(), offsets.end());
  256. }
  257. // rewind file pointer to beginning of alignments, return success/fail
  258. return Rewind();
  259. }
  260. // calculates alignment end position based on starting position and provided CIGAR operations
  261. int BamReader::BamReaderPrivate::CalculateAlignmentEnd(const int& position, const vector<CigarOp>& cigarData) {
  262. // initialize alignment end to starting position
  263. int alignEnd = position;
  264. // iterate over cigar operations
  265. vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
  266. vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
  267. for ( ; cigarIter != cigarEnd; ++cigarIter) {
  268. char cigarType = (*cigarIter).Type;
  269. if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) {
  270. alignEnd += (*cigarIter).Length;
  271. }
  272. }
  273. return alignEnd;
  274. }
  275. // clear index data structure
  276. void BamReader::BamReaderPrivate::ClearIndex(void) {
  277. Index.clear(); // sufficient ??
  278. }
  279. // closes the BAM file
  280. void BamReader::BamReaderPrivate::Close(void) {
  281. mBGZF.Close();
  282. ClearIndex();
  283. HeaderText.clear();
  284. IsRegionSpecified = false;
  285. }
  286. // create BAM index from BAM file (keep structure in memory) and write to default index output file
  287. bool BamReader::BamReaderPrivate::CreateIndex(void) {
  288. // clear out index
  289. ClearIndex();
  290. // build (& save) index from BAM file
  291. bool ok = true;
  292. ok &= BuildIndex();
  293. ok &= WriteIndex();
  294. // return success/fail
  295. return ok;
  296. }
  297. // returns RefID for given RefName (returns References.size() if not found)
  298. const int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const {
  299. // retrieve names from reference data
  300. vector<string> refNames;
  301. RefVector::const_iterator refIter = References.begin();
  302. RefVector::const_iterator refEnd = References.end();
  303. for ( ; refIter != refEnd; ++refIter) {
  304. refNames.push_back( (*refIter).RefName );
  305. }
  306. // return 'index-of' refName ( if not found, returns refNames.size() )
  307. return (int)distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName));
  308. }
  309. // get next alignment (from specified region, if given)
  310. bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) {
  311. // if valid alignment available
  312. if ( LoadNextAlignment(bAlignment) ) {
  313. // if region not specified, return success
  314. if ( !IsRegionSpecified ) { return true; }
  315. // load next alignment until region overlap is found
  316. while ( !IsOverlap(bAlignment) ) {
  317. // if no valid alignment available (likely EOF) return failure
  318. if ( !LoadNextAlignment(bAlignment) ) { return false; }
  319. }
  320. // return success (alignment found that overlaps region)
  321. return true;
  322. }
  323. // no valid alignment
  324. else { return false; }
  325. }
  326. // calculate closest indexed file offset for region specified
  327. int64_t BamReader::BamReaderPrivate::GetOffset(int refID, int left) {
  328. // calculate which bins overlap this region
  329. uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2);
  330. int numBins = BinsFromRegion(refID, left, bins);
  331. // get bins for this reference
  332. const ReferenceIndex& refIndex = Index.at(refID);
  333. const BamBinMap& binMap = refIndex.Bins;
  334. // get minimum offset to consider
  335. const LinearOffsetVector& offsets = refIndex.Offsets;
  336. uint64_t minOffset = ( (unsigned int)(left>>BAM_LIDX_SHIFT) >= offsets.size() ) ? 0 : offsets.at(left>>BAM_LIDX_SHIFT);
  337. // store offsets to beginning of alignment 'chunks'
  338. std::vector<int64_t> chunkStarts;
  339. // store all alignment 'chunk' starts for bins in this region
  340. for (int i = 0; i < numBins; ++i ) {
  341. uint16_t binKey = bins[i];
  342. map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey);
  343. if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) {
  344. const ChunkVector& chunks = (*binIter).second;
  345. std::vector<Chunk>::const_iterator chunksIter = chunks.begin();
  346. std::vector<Chunk>::const_iterator chunksEnd = chunks.end();
  347. for ( ; chunksIter != chunksEnd; ++chunksIter) {
  348. const Chunk& chunk = (*chunksIter);
  349. if ( chunk.Stop > minOffset ) {
  350. chunkStarts.push_back( chunk.Start );
  351. }
  352. }
  353. }
  354. }
  355. // clean up memory
  356. free(bins);
  357. // if no alignments found, else return smallest offset for alignment starts
  358. if ( chunkStarts.size() == 0 ) { return -1; }
  359. else { return *min_element(chunkStarts.begin(), chunkStarts.end()); }
  360. }
  361. // saves BAM bin entry for index
  362. void BamReader::BamReaderPrivate::InsertBinEntry(BamBinMap& binMap,
  363. const uint32_t& saveBin,
  364. const uint64_t& saveOffset,
  365. const uint64_t& lastOffset)
  366. {
  367. // look up saveBin
  368. BamBinMap::iterator binIter = binMap.find(saveBin);
  369. // create new chunk
  370. Chunk newChunk(saveOffset, lastOffset);
  371. // if entry doesn't exist
  372. if ( binIter == binMap.end() ) {
  373. ChunkVector newChunks;
  374. newChunks.push_back(newChunk);
  375. binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks));
  376. }
  377. // otherwise
  378. else {
  379. ChunkVector& binChunks = (*binIter).second;
  380. binChunks.push_back( newChunk );
  381. }
  382. }
  383. // saves linear offset entry for index
  384. void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets,
  385. const BamAlignment& bAlignment,
  386. const uint64_t& lastOffset)
  387. {
  388. // get converted offsets
  389. int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT;
  390. int endOffset = ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) - 1) >> BAM_LIDX_SHIFT;
  391. // resize vector if necessary
  392. int oldSize = (int)offsets.size();
  393. int newSize = endOffset + 1;
  394. if ( oldSize < newSize ) {
  395. Roundup32(newSize);
  396. offsets.resize(newSize, 0);
  397. }
  398. // store offset
  399. for(int i = beginOffset + 1; i <= endOffset ; ++i) {
  400. if ( offsets[i] == 0) {
  401. offsets[i] = lastOffset;
  402. }
  403. }
  404. }
  405. // returns whether alignment overlaps currently specified region (refID, leftBound)
  406. bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) {
  407. // if on different reference sequence, quit
  408. if ( bAlignment.RefID != CurrentRefID ) { return false; }
  409. // read starts after left boundary
  410. if ( bAlignment.Position >= CurrentLeft) { return true; }
  411. // return whether alignment end overlaps left boundary
  412. return ( CalculateAlignmentEnd(bAlignment.Position, bAlignment.CigarData) >= CurrentLeft );
  413. }
  414. // jumps to specified region(refID, leftBound) in BAM file, returns success/fail
  415. bool BamReader::BamReaderPrivate::Jump(int refID, int position) {
  416. // if data exists for this reference and position is valid
  417. if ( References.at(refID).RefHasAlignments && ((unsigned int)position <= References.at(refID).RefLength) ) {
  418. // set current region
  419. CurrentRefID = refID;
  420. CurrentLeft = position;
  421. IsRegionSpecified = true;
  422. // calculate offset
  423. int64_t offset = GetOffset(CurrentRefID, CurrentLeft);
  424. // if in valid offset, return failure
  425. if ( offset == -1 ) { return false; }
  426. // otherwise return success of seek operation
  427. else { return mBGZF.Seek(offset); }
  428. }
  429. // invalid jump request parameters, return failure
  430. return false;
  431. }
  432. // load BAM header data
  433. void BamReader::BamReaderPrivate::LoadHeaderData(void) {
  434. // check to see if proper BAM header
  435. char buffer[4];
  436. if (mBGZF.Read(buffer, 4) != 4) {
  437. printf("Could not read header type\n");
  438. exit(1);
  439. }
  440. if (strncmp(buffer, "BAM\001", 4)) {
  441. printf("wrong header type!\n");
  442. exit(1);
  443. }
  444. // get BAM header text length
  445. mBGZF.Read(buffer, 4);
  446. const unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer);
  447. // get BAM header text
  448. char* headerText = (char*)calloc(headerTextLength + 1, 1);
  449. mBGZF.Read(headerText, headerTextLength);
  450. HeaderText = (string)((const char*)headerText);
  451. // clean up calloc-ed temp variable
  452. free(headerText);
  453. }
  454. // load existing index data from BAM index file (".bai"), return success/fail
  455. bool BamReader::BamReaderPrivate::LoadIndex(void) {
  456. // clear out index data
  457. ClearIndex();
  458. // skip if index file empty
  459. if ( IndexFilename.empty() ) { return false; }
  460. // open index file, abort on error
  461. FILE* indexStream = fopen(IndexFilename.c_str(), "rb");
  462. if(!indexStream) {
  463. printf("ERROR: Unable to open the BAM index file %s for reading.\n", IndexFilename.c_str() );
  464. return false;
  465. }
  466. // see if index is valid BAM index
  467. char magic[4];
  468. fread(magic, 1, 4, indexStream);
  469. if (strncmp(magic, "BAI\1", 4)) {
  470. printf("Problem with index file - invalid format.\n");
  471. fclose(indexStream);
  472. return false;
  473. }
  474. // get number of reference sequences
  475. uint32_t numRefSeqs;
  476. fread(&numRefSeqs, 4, 1, indexStream);
  477. // intialize space for BamIndex data structure
  478. Index.reserve(numRefSeqs);
  479. // iterate over reference sequences
  480. for (unsigned int i = 0; i < numRefSeqs; ++i) {
  481. // get number of bins for this reference sequence
  482. int32_t numBins;
  483. fread(&numBins, 4, 1, indexStream);
  484. if (numBins > 0) {
  485. RefData& refEntry = References[i];
  486. refEntry.RefHasAlignments = true;
  487. }
  488. // intialize BinVector
  489. BamBinMap binMap;
  490. // iterate over bins for that reference sequence
  491. for (int j = 0; j < numBins; ++j) {
  492. // get binID
  493. uint32_t binID;
  494. fread(&binID, 4, 1, indexStream);
  495. // get number of regionChunks in this bin
  496. uint32_t numChunks;
  497. fread(&numChunks, 4, 1, indexStream);
  498. // intialize ChunkVector
  499. ChunkVector regionChunks;
  500. regionChunks.reserve(numChunks);
  501. // iterate over regionChunks in this bin
  502. for (unsigned int k = 0; k < numChunks; ++k) {
  503. // get chunk boundaries (left, right)
  504. uint64_t left;
  505. uint64_t right;
  506. fread(&left, 8, 1, indexStream);
  507. fread(&right, 8, 1, indexStream);
  508. // save ChunkPair
  509. regionChunks.push_back( Chunk(left, right) );
  510. }
  511. // sort chunks for this bin
  512. sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan );
  513. // save binID, chunkVector for this bin
  514. binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) );
  515. }
  516. // load linear index for this reference sequence
  517. // get number of linear offsets
  518. int32_t numLinearOffsets;
  519. fread(&numLinearOffsets, 4, 1, indexStream);
  520. // intialize LinearOffsetVector
  521. LinearOffsetVector offsets;
  522. offsets.reserve(numLinearOffsets);
  523. // iterate over linear offsets for this reference sequeence
  524. uint64_t linearOffset;
  525. for (int j = 0; j < numLinearOffsets; ++j) {
  526. // read a linear offset & store
  527. fread(&linearOffset, 8, 1, indexStream);
  528. offsets.push_back(linearOffset);
  529. }
  530. // sort linear offsets
  531. sort( offsets.begin(), offsets.end() );
  532. // store index data for that reference sequence
  533. Index.push_back( ReferenceIndex(binMap, offsets) );
  534. }
  535. // close index file (.bai) and return
  536. fclose(indexStream);
  537. return true;
  538. }
  539. // populates BamAlignment with alignment data under file pointer, returns success/fail
  540. bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) {
  541. // read in the 'block length' value, make sure it's not zero
  542. char buffer[4];
  543. mBGZF.Read(buffer, 4);
  544. const unsigned int blockLength = BgzfData::UnpackUnsignedInt(buffer);
  545. if ( blockLength == 0 ) { return false; }
  546. // keep track of bytes read as method progresses
  547. int bytesRead = 4;
  548. // read in core alignment data, make sure the right size of data was read
  549. char x[BAM_CORE_SIZE];
  550. if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; }
  551. bytesRead += BAM_CORE_SIZE;
  552. // set BamAlignment 'core' data and character data lengths
  553. unsigned int tempValue;
  554. unsigned int queryNameLength;
  555. unsigned int numCigarOperations;
  556. unsigned int querySequenceLength;
  557. bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]);
  558. bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]);
  559. tempValue = BgzfData::UnpackUnsignedInt(&x[8]);
  560. bAlignment.Bin = tempValue >> 16;
  561. bAlignment.MapQuality = tempValue >> 8 & 0xff;
  562. queryNameLength = tempValue & 0xff;
  563. tempValue = BgzfData::UnpackUnsignedInt(&x[12]);
  564. bAlignment.AlignmentFlag = tempValue >> 16;
  565. numCigarOperations = tempValue & 0xffff;
  566. querySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]);
  567. bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]);
  568. bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]);
  569. bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]);
  570. // calculate lengths/offsets
  571. const unsigned int dataLength = blockLength - BAM_CORE_SIZE;
  572. const unsigned int cigarDataOffset = queryNameLength;
  573. const unsigned int seqDataOffset = cigarDataOffset + (numCigarOperations * 4);
  574. const unsigned int qualDataOffset = seqDataOffset + (querySequenceLength+1)/2;
  575. const unsigned int tagDataOffset = qualDataOffset + querySequenceLength;
  576. const unsigned int tagDataLen = dataLength - tagDataOffset;
  577. // set up destination buffers for character data
  578. char* allCharData = (char*)calloc(sizeof(char), dataLength);
  579. uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset);
  580. char* seqData = ((char*)allCharData) + seqDataOffset;
  581. char* qualData = ((char*)allCharData) + qualDataOffset;
  582. char* tagData = ((char*)allCharData) + tagDataOffset;
  583. // get character data - make sure proper data size was read
  584. if ( mBGZF.Read(allCharData, dataLength) != (signed int)dataLength) { return false; }
  585. else {
  586. bytesRead += dataLength;
  587. // clear out any previous string data
  588. bAlignment.Name.clear();
  589. bAlignment.QueryBases.clear();
  590. bAlignment.Qualities.clear();
  591. bAlignment.AlignedBases.clear();
  592. bAlignment.CigarData.clear();
  593. bAlignment.TagData.clear();
  594. // save name
  595. bAlignment.Name = (string)((const char*)(allCharData));
  596. // save query sequence
  597. for (unsigned int i = 0; i < querySequenceLength; ++i) {
  598. char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
  599. bAlignment.QueryBases.append( 1, singleBase );
  600. }
  601. // save sequence length
  602. bAlignment.Length = (int)bAlignment.QueryBases.length();
  603. // save qualities, convert from numeric QV to FASTQ character
  604. for (unsigned int i = 0; i < querySequenceLength; ++i) {
  605. char singleQuality = (char)(qualData[i]+33);
  606. bAlignment.Qualities.append( 1, singleQuality );
  607. }
  608. // save CIGAR-related data;
  609. int k = 0;
  610. for (unsigned int i = 0; i < numCigarOperations; ++i) {
  611. // build CigarOp struct
  612. CigarOp op;
  613. op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT);
  614. op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ];
  615. // save CigarOp
  616. bAlignment.CigarData.push_back(op);
  617. // build AlignedBases string
  618. switch (op.Type) {
  619. case ('M') :
  620. case ('I') : bAlignment.AlignedBases.append( bAlignment.QueryBases.substr(k, op.Length) ); // for 'M', 'I' - write bases
  621. case ('S') : k += op.Length; // for 'S' - skip over query bases
  622. break;
  623. case ('D') : bAlignment.AlignedBases.append( op.Length, '-' ); // for 'D' - write gap character
  624. break;
  625. case ('P') : bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character;
  626. break;
  627. case ('N') : bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in query sequence
  628. k += op.Length;
  629. break;
  630. case ('H') : break; // for 'H' - do nothing, move to next op
  631. default : printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
  632. exit(1);
  633. }
  634. }
  635. // read in the tag data
  636. bAlignment.TagData.resize(tagDataLen);
  637. memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLen);
  638. }
  639. free(allCharData);
  640. return true;
  641. }
  642. // loads reference data from BAM file
  643. void BamReader::BamReaderPrivate::LoadReferenceData(void) {
  644. // get number of reference sequences
  645. char buffer[4];
  646. mBGZF.Read(buffer, 4);
  647. const unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer);
  648. if (numberRefSeqs == 0) { return; }
  649. References.reserve((int)numberRefSeqs);
  650. // iterate over all references in header
  651. for (unsigned int i = 0; i != numberRefSeqs; ++i) {
  652. // get length of reference name
  653. mBGZF.Read(buffer, 4);
  654. const unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer);
  655. char* refName = (char*)calloc(refNameLength, 1);
  656. // get reference name and reference sequence length
  657. mBGZF.Read(refName, refNameLength);
  658. mBGZF.Read(buffer, 4);
  659. const int refLength = BgzfData::UnpackSignedInt(buffer);
  660. // store data for reference
  661. RefData aReference;
  662. aReference.RefName = (string)((const char*)refName);
  663. aReference.RefLength = refLength;
  664. References.push_back(aReference);
  665. // clean up calloc-ed temp variable
  666. free(refName);
  667. }
  668. }
  669. // merges 'alignment chunks' in BAM bin (used for index building)
  670. void BamReader::BamReaderPrivate::MergeChunks(void) {
  671. // iterate over reference enties
  672. BamIndex::iterator indexIter = Index.begin();
  673. BamIndex::iterator indexEnd = Index.end();
  674. for ( ; indexIter != indexEnd; ++indexIter ) {
  675. // get BAM bin map for this reference
  676. ReferenceIndex& refIndex = (*indexIter);
  677. BamBinMap& bamBinMap = refIndex.Bins;
  678. // iterate over BAM bins
  679. BamBinMap::iterator binIter = bamBinMap.begin();
  680. BamBinMap::iterator binEnd = bamBinMap.end();
  681. for ( ; binIter != binEnd; ++binIter ) {
  682. // get chunk vector for this bin
  683. ChunkVector& binChunks = (*binIter).second;
  684. if ( binChunks.size() == 0 ) { continue; }
  685. ChunkVector mergedChunks;
  686. mergedChunks.push_back( binChunks[0] );
  687. // iterate over chunks
  688. int i = 0;
  689. ChunkVector::iterator chunkIter = binChunks.begin();
  690. ChunkVector::iterator chunkEnd = binChunks.end();
  691. for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) {
  692. // get 'currentChunk' based on numeric index
  693. Chunk& currentChunk = mergedChunks[i];
  694. // get iteratorChunk based on vector iterator
  695. Chunk& iteratorChunk = (*chunkIter);
  696. // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted)
  697. if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) {
  698. // set currentChunk.Stop to iteratorChunk.Stop
  699. currentChunk.Stop = iteratorChunk.Stop;
  700. }
  701. // otherwise
  702. else {
  703. // set currentChunk + 1 to iteratorChunk
  704. mergedChunks.push_back(iteratorChunk);
  705. ++i;
  706. }
  707. }
  708. // saved merged chunk vector
  709. (*binIter).second = mergedChunks;
  710. }
  711. }
  712. }
  713. // opens BAM file (and index)
  714. void BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) {
  715. Filename = filename;
  716. IndexFilename = indexFilename;
  717. // open the BGZF file for reading, retrieve header text & reference data
  718. mBGZF.Open(filename, "rb");
  719. LoadHeaderData();
  720. LoadReferenceData();
  721. // store file offset of first alignment
  722. AlignmentsBeginOffset = mBGZF.Tell();
  723. // open index file & load index data (if exists)
  724. if ( !IndexFilename.empty() ) {
  725. LoadIndex();
  726. }
  727. }
  728. // returns BAM file pointer to beginning of alignment data
  729. bool BamReader::BamReaderPrivate::Rewind(void) {
  730. // find first reference that has alignments in the BAM file
  731. int refID = 0;
  732. int refCount = (int)References.size();
  733. for ( ; refID < refCount; ++refID ) {
  734. if ( References.at(refID).RefHasAlignments ) { break; }
  735. }
  736. // store default bounds for first alignment
  737. CurrentRefID = refID;
  738. CurrentLeft = 0;
  739. IsRegionSpecified = false;
  740. // return success/failure of seek
  741. return mBGZF.Seek(AlignmentsBeginOffset);
  742. }
  743. // rounds value up to next power-of-2 (used in index building)
  744. void BamReader::BamReaderPrivate::Roundup32(int& value) {
  745. --value;
  746. value |= value >> 1;
  747. value |= value >> 2;
  748. value |= value >> 4;
  749. value |= value >> 8;
  750. value |= value >> 16;
  751. ++value;
  752. }
  753. // saves index data to BAM index file (".bai"), returns success/fail
  754. bool BamReader::BamReaderPrivate::WriteIndex(void) {
  755. IndexFilename = Filename + ".bai";
  756. FILE* indexStream = fopen(IndexFilename.c_str(), "wb");
  757. if ( indexStream == 0 ) {
  758. printf("ERROR: Could not open file to save index\n");
  759. return false;
  760. }
  761. // write BAM index header
  762. fwrite("BAI\1", 1, 4, indexStream);
  763. // write number of reference sequences
  764. int32_t numReferenceSeqs = (int32_t)Index.size();
  765. fwrite(&numReferenceSeqs, 4, 1, indexStream);
  766. // iterate over reference sequences
  767. BamIndex::const_iterator indexIter = Index.begin();
  768. BamIndex::const_iterator indexEnd = Index.end();
  769. for ( ; indexIter != indexEnd; ++ indexIter ) {
  770. // get reference index data
  771. const ReferenceIndex& refIndex = (*indexIter);
  772. const BamBinMap& binMap = refIndex.Bins;
  773. const LinearOffsetVector& offsets = refIndex.Offsets;
  774. // write number of bins
  775. int32_t binCount = (int32_t)binMap.size();
  776. fwrite(&binCount, 4, 1, indexStream);
  777. // iterate over bins
  778. BamBinMap::const_iterator binIter = binMap.begin();
  779. BamBinMap::const_iterator binEnd = binMap.end();
  780. for ( ; binIter != binEnd; ++binIter ) {
  781. // get bin data (key and chunk vector)
  782. const uint32_t& binKey = (*binIter).first;
  783. const ChunkVector& binChunks = (*binIter).second;
  784. // save BAM bin key
  785. fwrite(&binKey, 4, 1, indexStream);
  786. // save chunk count
  787. int32_t chunkCount = (int32_t)binChunks.size();
  788. fwrite(&chunkCount, 4, 1, indexStream);
  789. // iterate over chunks
  790. ChunkVector::const_iterator chunkIter = binChunks.begin();
  791. ChunkVector::const_iterator chunkEnd = binChunks.end();
  792. for ( ; chunkIter != chunkEnd; ++chunkIter ) {
  793. // get current chunk data
  794. const Chunk& chunk = (*chunkIter);
  795. const uint64_t& start = chunk.Start;
  796. const uint64_t& stop = chunk.Stop;
  797. // save chunk offsets
  798. fwrite(&start, 8, 1, indexStream);
  799. fwrite(&stop, 8, 1, indexStream);
  800. }
  801. }
  802. // write linear offsets size
  803. int32_t offsetSize = (int32_t)offsets.size();
  804. fwrite(&offsetSize, 4, 1, indexStream);
  805. // iterate over linear offsets
  806. LinearOffsetVector::const_iterator offsetIter = offsets.begin();
  807. LinearOffsetVector::const_iterator offsetEnd = offsets.end();
  808. for ( ; offsetIter != offsetEnd; ++offsetIter ) {
  809. // write linear offset value
  810. const uint64_t& linearOffset = (*offsetIter);
  811. fwrite(&linearOffset, 8, 1, indexStream);
  812. }
  813. }
  814. // flush buffer, close file, and return success
  815. fflush(indexStream);
  816. fclose(indexStream);
  817. return true;
  818. }