BamWriter.cpp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // ***************************************************************************
  2. // BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett
  3. // Marth Lab, Department of Biology, Boston College
  4. // All rights reserved.
  5. // ---------------------------------------------------------------------------
  6. // Last modified: 8 December 2009 (DB)
  7. // ---------------------------------------------------------------------------
  8. // Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
  9. // Institute.
  10. // ---------------------------------------------------------------------------
  11. // Provides the basic functionality for producing BAM files
  12. // ***************************************************************************
  13. // BGZF includes
  14. #include "BGZF.h"
  15. #include "BamWriter.h"
  16. using namespace BamTools;
  17. using namespace std;
  18. struct BamWriter::BamWriterPrivate {
  19. // data members
  20. BgzfData mBGZF;
  21. string mpackedCigar;
  22. string mencodedQuery;
  23. string mbaseQualities;
  24. // constructor / destructor
  25. BamWriterPrivate(void) { }
  26. ~BamWriterPrivate(void) {
  27. mBGZF.Close();
  28. }
  29. // "public" interface
  30. void Close(void);
  31. void Open(const std::string& filename, const std::string& samHeader, const BamTools::RefVector& referenceSequences);
  32. void SaveAlignment(const BamTools::BamAlignment& al);
  33. // internal methods
  34. void CreatePackedCigar(const std::vector<CigarOp>& cigarOperations, std::string& packedCigar);
  35. void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);
  36. };
  37. // -----------------------------------------------------
  38. // BamWriter implementation
  39. // -----------------------------------------------------
  40. // constructor
  41. BamWriter::BamWriter(void) {
  42. d = new BamWriterPrivate;
  43. }
  44. // destructor
  45. BamWriter::~BamWriter(void) {
  46. delete d;
  47. d = 0;
  48. }
  49. // closes the alignment archive
  50. void BamWriter::Close(void) {
  51. d->Close();
  52. }
  53. // opens the alignment archive
  54. void BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
  55. d->Open(filename, samHeader, referenceSequences);
  56. }
  57. // saves the alignment to the alignment archive
  58. void BamWriter::SaveAlignment(const BamAlignment& al) {
  59. d->SaveAlignment(al);
  60. }
  61. // -----------------------------------------------------
  62. // BamWriterPrivate implementation
  63. // -----------------------------------------------------
  64. // closes the alignment archive
  65. void BamWriter::BamWriterPrivate::Close(void) {
  66. mBGZF.Close();
  67. }
  68. // creates a cigar string from the supplied alignment
  69. void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) {
  70. // initialize
  71. const unsigned int numCigarOperations = (unsigned int)cigarOperations.size();
  72. packedCigar.resize(numCigarOperations * BT_SIZEOF_INT);
  73. // pack the cigar data into the string
  74. unsigned int* pPackedCigar = (unsigned int*)packedCigar.data();
  75. unsigned int cigarOp;
  76. vector<CigarOp>::const_iterator coIter;
  77. for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); coIter++) {
  78. switch(coIter->Type) {
  79. case 'M':
  80. cigarOp = BAM_CMATCH;
  81. break;
  82. case 'I':
  83. cigarOp = BAM_CINS;
  84. break;
  85. case 'D':
  86. cigarOp = BAM_CDEL;
  87. break;
  88. case 'N':
  89. cigarOp = BAM_CREF_SKIP;
  90. break;
  91. case 'S':
  92. cigarOp = BAM_CSOFT_CLIP;
  93. break;
  94. case 'H':
  95. cigarOp = BAM_CHARD_CLIP;
  96. break;
  97. case 'P':
  98. cigarOp = BAM_CPAD;
  99. break;
  100. default:
  101. printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type);
  102. exit(1);
  103. }
  104. *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp;
  105. pPackedCigar++;
  106. }
  107. }
  108. // encodes the supplied query sequence into 4-bit notation
  109. void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) {
  110. // prepare the encoded query string
  111. const unsigned int queryLen = (unsigned int)query.size();
  112. const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
  113. encodedQuery.resize(encodedQueryLen);
  114. char* pEncodedQuery = (char*)encodedQuery.data();
  115. const char* pQuery = (const char*)query.data();
  116. unsigned char nucleotideCode;
  117. bool useHighWord = true;
  118. while(*pQuery) {
  119. switch(*pQuery) {
  120. case '=':
  121. nucleotideCode = 0;
  122. break;
  123. case 'A':
  124. nucleotideCode = 1;
  125. break;
  126. case 'C':
  127. nucleotideCode = 2;
  128. break;
  129. case 'G':
  130. nucleotideCode = 4;
  131. break;
  132. case 'T':
  133. nucleotideCode = 8;
  134. break;
  135. case 'N':
  136. nucleotideCode = 15;
  137. break;
  138. default:
  139. printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
  140. exit(1);
  141. }
  142. // pack the nucleotide code
  143. if(useHighWord) {
  144. *pEncodedQuery = nucleotideCode << 4;
  145. useHighWord = false;
  146. } else {
  147. *pEncodedQuery |= nucleotideCode;
  148. pEncodedQuery++;
  149. useHighWord = true;
  150. }
  151. // increment the query position
  152. pQuery++;
  153. }
  154. }
  155. // opens the alignment archive
  156. void BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences) {
  157. // open the BGZF file for writing
  158. mBGZF.Open(filename, "wb");
  159. // ================
  160. // write the header
  161. // ================
  162. // write the BAM signature
  163. const unsigned char SIGNATURE_LENGTH = 4;
  164. const char* BAM_SIGNATURE = "BAM\1";
  165. mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH);
  166. // write the SAM header text length
  167. const unsigned int samHeaderLen = (unsigned int)samHeader.size();
  168. mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT);
  169. // write the SAM header text
  170. if(samHeaderLen > 0) {
  171. mBGZF.Write(samHeader.data(), samHeaderLen);
  172. }
  173. // write the number of reference sequences
  174. const unsigned int numReferenceSequences = (unsigned int)referenceSequences.size();
  175. mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT);
  176. // =============================
  177. // write the sequence dictionary
  178. // =============================
  179. RefVector::const_iterator rsIter;
  180. for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) {
  181. // write the reference sequence name length
  182. const unsigned int referenceSequenceNameLen = (unsigned int)rsIter->RefName.size() + 1;
  183. mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT);
  184. // write the reference sequence name
  185. mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen);
  186. // write the reference sequence length
  187. mBGZF.Write((char*)&rsIter->RefLength, BT_SIZEOF_INT);
  188. }
  189. }
  190. // saves the alignment to the alignment archive
  191. void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) {
  192. // initialize
  193. const unsigned int nameLen = (unsigned int)al.Name.size() + 1;
  194. const unsigned int queryLen = (unsigned int)al.QueryBases.size();
  195. const unsigned int numCigarOperations = (unsigned int)al.CigarData.size();
  196. // create our packed cigar string
  197. string& packedCigar = mpackedCigar;
  198. packedCigar.erase( packedCigar.begin(), packedCigar.end() );
  199. CreatePackedCigar(al.CigarData, packedCigar);
  200. const unsigned int packedCigarLen = (unsigned int)packedCigar.size();
  201. // encode the query
  202. string& encodedQuery = mencodedQuery;
  203. encodedQuery.erase( encodedQuery.begin(), encodedQuery.end() );
  204. EncodeQuerySequence(al.QueryBases, encodedQuery);
  205. const unsigned int encodedQueryLen = (unsigned int)encodedQuery.size();
  206. // store the tag data length
  207. const unsigned int tagDataLength = (unsigned int)al.TagData.size() + 1;
  208. // assign the BAM core data
  209. unsigned int buffer[8];
  210. buffer[0] = al.RefID;
  211. buffer[1] = al.Position;
  212. buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | nameLen;
  213. buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
  214. buffer[4] = queryLen;
  215. buffer[5] = al.MateRefID;
  216. buffer[6] = al.MatePosition;
  217. buffer[7] = al.InsertSize;
  218. // write the block size
  219. const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + tagDataLength;
  220. const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
  221. mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT);
  222. // write the BAM core
  223. mBGZF.Write((char*)&buffer, BAM_CORE_SIZE);
  224. // write the query name
  225. mBGZF.Write(al.Name.c_str(), nameLen);
  226. // write the packed cigar
  227. mBGZF.Write(packedCigar.data(), packedCigarLen);
  228. // write the encoded query sequence
  229. mBGZF.Write(encodedQuery.data(), encodedQueryLen);
  230. // write the base qualities
  231. mbaseQualities = al.Qualities;
  232. char* pBaseQualities = (char*)mbaseQualities.data();
  233. for(unsigned int i = 0; i < queryLen; i++) { pBaseQualities[i] -= 33; }
  234. mBGZF.Write(pBaseQualities, queryLen);
  235. // write the read group tag
  236. mBGZF.Write(al.TagData.data(), tagDataLength);
  237. }