BGZF.h 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. // ***************************************************************************
  2. // BGZF.h (c) 2009 Derek Barnett, Michael Strömberg
  3. // Marth Lab, Department of Biology, Boston College
  4. // All rights reserved.
  5. // ---------------------------------------------------------------------------
  6. // Last modified: 8 December 2009 (DB)
  7. // ---------------------------------------------------------------------------
  8. // BGZF routines were adapted from the bgzf.c code developed at the Broad
  9. // Institute.
  10. // ---------------------------------------------------------------------------
  11. // Provides the basic functionality for reading & writing BGZF files
  12. // ***************************************************************************
  13. #ifndef BGZF_H
  14. #define BGZF_H
  15. #include <nvbio/basic/atomics.h>
  16. // 'C' includes
  17. #include <cstdio>
  18. #include <cstdlib>
  19. #include <cstring>
  20. // C++ includes
  21. #include <string>
  22. // zlib includes
  23. #include <zlib/zlib.h>
  // Platform-specific type definitions
  //
  // Visual C++ only ships <stdint.h> starting with Visual Studio 2010
  // (_MSC_VER 1600).  The hand-written typedefs are therefore restricted to
  // older compilers: on newer ones they would clash with the typedefs of the
  // real <stdint.h> as soon as any other header includes it.
  #if defined(_MSC_VER) && (_MSC_VER < 1600)
  // 'signed char' rather than plain 'char': the signedness of plain char is
  // implementation-defined (and switchable with /J), int8_t must be signed.
  typedef signed char        int8_t;
  typedef unsigned char      uint8_t;
  typedef short              int16_t;
  typedef unsigned short     uint16_t;
  typedef int                int32_t;
  typedef unsigned int       uint32_t;
  typedef long long          int64_t;
  typedef unsigned long long uint64_t;
  #else
  #include <stdint.h>
  #endif
  37. namespace BamTools {
  // gzip member-header constants
  const int GZIP_ID1   = 0x1f;   // 31
  const int GZIP_ID2   = 0x8b;   // 139
  const int CM_DEFLATE = 8;
  const int FLG_FEXTRA = 4;
  const int OS_UNKNOWN = 255;

  // BGZF extra-field constants
  const int BGZF_XLEN = 6;
  const int BGZF_ID1  = 66;      // ASCII 'B'
  const int BGZF_ID2  = 67;      // ASCII 'C'
  const int BGZF_LEN  = 2;

  // zlib tuning constants
  const int GZIP_WINDOW_BITS    = -15;
  const int Z_DEFAULT_MEM_LEVEL = 8;

  // BGZF block-size constants (sizes in bytes)
  const int BLOCK_HEADER_LENGTH = 18;
  const int BLOCK_FOOTER_LENGTH = 8;
  const int MAX_BLOCK_SIZE      = 64 * 1024;   // 65536
  const int DEFAULT_BLOCK_SIZE  = 64 * 1024;   // 65536
  55. struct BgzfThread;
  56. struct BgzfData {
  57. // data members
  58. unsigned int UncompressedBlockSize;
  59. unsigned int CompressedBlockSize;
  60. unsigned int BlockLength;
  61. unsigned int BlockOffset;
  62. uint64_t BlockAddress;
  63. bool IsOpen;
  64. bool IsWriteOnly;
  65. FILE* Stream;
  66. char* UncompressedBlock;
  67. char* CompressedBlock;
  68. nvbio::AtomicInt32 BackgroundThreads;
  69. nvbio::AtomicInt32 ActiveThreads;
  70. volatile unsigned int CurrentBlockSize;
  71. nvbio::AtomicInt32 WorkCounter;
  72. uint32_t ThreadCount;
  73. int volatile* BlockLengths;
  74. BgzfThread* ThreadPool;
  75. // constructor & destructor
  76. BgzfData(const uint32_t threads = uint32_t(-1));
  77. ~BgzfData(void);
  78. // closes BGZF file
  79. void Close(void);
  80. // opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing
  81. void Open(const std::string& filename, const char* mode);
  82. // reads BGZF data into a byte buffer
  83. int Read(char* data, const unsigned int dataLength);
  84. // reads BGZF block
  85. int ReadBlock(void);
  86. // seek to position in BAM file
  87. bool Seek(int64_t position);
  88. // get file position in BAM file
  89. int64_t Tell(void);
  90. // writes the supplied data into the BGZF buffer
  91. unsigned int Write(const char* data, const unsigned int dataLen);
  92. // checks BGZF block header
  93. static inline bool CheckBlockHeader(char* header);
  94. // packs an unsigned integer into the specified buffer
  95. static inline void PackUnsignedInt(char* buffer, unsigned int value);
  96. // packs an unsigned short into the specified buffer
  97. static inline void PackUnsignedShort(char* buffer, unsigned short value);
  98. // unpacks a buffer into a signed int
  99. static inline signed int UnpackSignedInt(char* buffer);
  100. // unpacks a buffer into a unsigned int
  101. static inline unsigned int UnpackUnsignedInt(char* buffer);
  102. // unpacks a buffer into a unsigned short
  103. static inline unsigned short UnpackUnsignedShort(char* buffer);
  104. // compresses the given block
  105. int DeflateBlock(int32_t id, const unsigned int blockSize);
  106. // compresses the current block
  107. int DeflateBlocks(void);
  108. // flushes the data in the BGZF block
  109. void FlushBlocks(void);
  110. // de-compresses the current block
  111. int InflateBlock(const int& blockLength);
  112. };
  113. // -------------------------------------------------------------
  114. inline
  115. bool BgzfData::CheckBlockHeader(char* header) {
  116. return (header[0] == GZIP_ID1 &&
  117. header[1] == (char)GZIP_ID2 &&
  118. header[2] == Z_DEFLATED &&
  119. (header[3] & FLG_FEXTRA) != 0 &&
  120. BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN &&
  121. header[12] == BGZF_ID1 &&
  122. header[13] == BGZF_ID2 &&
  123. BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN );
  124. }
  125. // packs an unsigned integer into the specified buffer
  126. inline
  127. void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) {
  128. buffer[0] = (char)value;
  129. buffer[1] = (char)(value >> 8);
  130. buffer[2] = (char)(value >> 16);
  131. buffer[3] = (char)(value >> 24);
  132. }
  133. // packs an unsigned short into the specified buffer
  134. inline
  135. void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) {
  136. buffer[0] = (char)value;
  137. buffer[1] = (char)(value >> 8);
  138. }
  139. // unpacks a buffer into a signed int
  140. inline
  141. signed int BgzfData::UnpackSignedInt(char* buffer) {
  142. union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un;
  143. un.value = 0;
  144. un.valueBuffer[0] = buffer[0];
  145. un.valueBuffer[1] = buffer[1];
  146. un.valueBuffer[2] = buffer[2];
  147. un.valueBuffer[3] = buffer[3];
  148. return un.value;
  149. }
  150. // unpacks a buffer into an unsigned int
  151. inline
  152. unsigned int BgzfData::UnpackUnsignedInt(char* buffer) {
  153. union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un;
  154. un.value = 0;
  155. un.valueBuffer[0] = buffer[0];
  156. un.valueBuffer[1] = buffer[1];
  157. un.valueBuffer[2] = buffer[2];
  158. un.valueBuffer[3] = buffer[3];
  159. return un.value;
  160. }
  161. // unpacks a buffer into an unsigned short
  162. inline
  163. unsigned short BgzfData::UnpackUnsignedShort(char* buffer) {
  164. union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)];} un;
  165. un.value = 0;
  166. un.valueBuffer[0] = buffer[0];
  167. un.valueBuffer[1] = buffer[1];
  168. return un.value;
  169. }
  170. } // namespace BamTools
  171. #endif // BGZF_H