// format.h
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <stdint.h>
  11. #include <string>
  12. #include "file/file_prefetch_buffer.h"
  13. #include "file/random_access_file_reader.h"
  14. #include "rocksdb/options.h"
  15. #include "rocksdb/slice.h"
  16. #include "rocksdb/status.h"
  17. #include "rocksdb/table.h"
  18. #include "memory/memory_allocator.h"
  19. #include "options/cf_options.h"
  20. #include "port/malloc.h"
  21. #include "port/port.h" // noexcept
  22. #include "table/persistent_cache_options.h"
  23. namespace ROCKSDB_NAMESPACE {
  24. class RandomAccessFile;
  25. struct ReadOptions;
  26. extern bool ShouldReportDetailedTime(Env* env, Statistics* stats);
  27. // the length of the magic number in bytes.
  28. const int kMagicNumberLengthByte = 8;
  29. // BlockHandle is a pointer to the extent of a file that stores a data
  30. // block or a meta block.
  31. class BlockHandle {
  32. public:
  33. BlockHandle();
  34. BlockHandle(uint64_t offset, uint64_t size);
  35. // The offset of the block in the file.
  36. uint64_t offset() const { return offset_; }
  37. void set_offset(uint64_t _offset) { offset_ = _offset; }
  38. // The size of the stored block
  39. uint64_t size() const { return size_; }
  40. void set_size(uint64_t _size) { size_ = _size; }
  41. void EncodeTo(std::string* dst) const;
  42. Status DecodeFrom(Slice* input);
  43. Status DecodeSizeFrom(uint64_t offset, Slice* input);
  44. // Return a string that contains the copy of handle.
  45. std::string ToString(bool hex = true) const;
  46. // if the block handle's offset and size are both "0", we will view it
  47. // as a null block handle that points to no where.
  48. bool IsNull() const { return offset_ == 0 && size_ == 0; }
  49. static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
  50. // Maximum encoding length of a BlockHandle
  51. enum { kMaxEncodedLength = 10 + 10 };
  52. private:
  53. uint64_t offset_;
  54. uint64_t size_;
  55. static const BlockHandle kNullBlockHandle;
  56. };
  57. // Value in block-based table file index.
  58. //
  59. // The index entry for block n is: y -> h, [x],
  60. // where: y is some key between the last key of block n (inclusive) and the
  61. // first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
  62. // x, if present, is the first key of block n (unshortened).
  63. // This struct represents the "h, [x]" part.
  64. struct IndexValue {
  65. BlockHandle handle;
  66. // Empty means unknown.
  67. Slice first_internal_key;
  68. IndexValue() = default;
  69. IndexValue(BlockHandle _handle, Slice _first_internal_key)
  70. : handle(_handle), first_internal_key(_first_internal_key) {}
  71. // have_first_key indicates whether the `first_internal_key` is used.
  72. // If previous_handle is not null, delta encoding is used;
  73. // in this case, the two handles must point to consecutive blocks:
  74. // handle.offset() ==
  75. // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
  76. void EncodeTo(std::string* dst, bool have_first_key,
  77. const BlockHandle* previous_handle) const;
  78. Status DecodeFrom(Slice* input, bool have_first_key,
  79. const BlockHandle* previous_handle);
  80. std::string ToString(bool hex, bool have_first_key) const;
  81. };
  82. inline uint32_t GetCompressFormatForVersion(CompressionType compression_type,
  83. uint32_t version) {
  84. #ifdef NDEBUG
  85. (void)compression_type;
  86. #endif
  87. // snappy is not versioned
  88. assert(compression_type != kSnappyCompression &&
  89. compression_type != kXpressCompression &&
  90. compression_type != kNoCompression);
  91. // As of version 2, we encode compressed block with
  92. // compress_format_version == 2. Before that, the version is 1.
  93. // DO NOT CHANGE THIS FUNCTION, it affects disk format
  94. return version >= 2 ? 2 : 1;
  95. }
  96. inline bool BlockBasedTableSupportedVersion(uint32_t version) {
  97. return version <= 5;
  98. }
  99. // Footer encapsulates the fixed information stored at the tail
  100. // end of every table file.
  101. class Footer {
  102. public:
  103. // Constructs a footer without specifying its table magic number.
  104. // In such case, the table magic number of such footer should be
  105. // initialized via @ReadFooterFromFile().
  106. // Use this when you plan to load Footer with DecodeFrom(). Never use this
  107. // when you plan to EncodeTo.
  108. Footer() : Footer(kInvalidTableMagicNumber, 0) {}
  109. // Use this constructor when you plan to write out the footer using
  110. // EncodeTo(). Never use this constructor with DecodeFrom().
  111. Footer(uint64_t table_magic_number, uint32_t version);
  112. // The version of the footer in this file
  113. uint32_t version() const { return version_; }
  114. // The checksum type used in this file
  115. ChecksumType checksum() const { return checksum_; }
  116. void set_checksum(const ChecksumType c) { checksum_ = c; }
  117. // The block handle for the metaindex block of the table
  118. const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
  119. void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
  120. // The block handle for the index block of the table
  121. const BlockHandle& index_handle() const { return index_handle_; }
  122. void set_index_handle(const BlockHandle& h) { index_handle_ = h; }
  123. uint64_t table_magic_number() const { return table_magic_number_; }
  124. void EncodeTo(std::string* dst) const;
  125. // Set the current footer based on the input slice.
  126. //
  127. // REQUIRES: table_magic_number_ is not set (i.e.,
  128. // HasInitializedTableMagicNumber() is true). The function will initialize the
  129. // magic number
  130. Status DecodeFrom(Slice* input);
  131. // Encoded length of a Footer. Note that the serialization of a Footer will
  132. // always occupy at least kMinEncodedLength bytes. If fields are changed
  133. // the version number should be incremented and kMaxEncodedLength should be
  134. // increased accordingly.
  135. enum {
  136. // Footer version 0 (legacy) will always occupy exactly this many bytes.
  137. // It consists of two block handles, padding, and a magic number.
  138. kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8,
  139. // Footer of versions 1 and higher will always occupy exactly this many
  140. // bytes. It consists of the checksum type, two block handles, padding,
  141. // a version number (bigger than 1), and a magic number
  142. kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8,
  143. kMinEncodedLength = kVersion0EncodedLength,
  144. kMaxEncodedLength = kNewVersionsEncodedLength,
  145. };
  146. static const uint64_t kInvalidTableMagicNumber = 0;
  147. // convert this object to a human readable form
  148. std::string ToString() const;
  149. private:
  150. // REQUIRES: magic number wasn't initialized.
  151. void set_table_magic_number(uint64_t magic_number) {
  152. assert(!HasInitializedTableMagicNumber());
  153. table_magic_number_ = magic_number;
  154. }
  155. // return true if @table_magic_number_ is set to a value different
  156. // from @kInvalidTableMagicNumber.
  157. bool HasInitializedTableMagicNumber() const {
  158. return (table_magic_number_ != kInvalidTableMagicNumber);
  159. }
  160. uint32_t version_;
  161. ChecksumType checksum_;
  162. BlockHandle metaindex_handle_;
  163. BlockHandle index_handle_;
  164. uint64_t table_magic_number_ = 0;
  165. };
  166. // Read the footer from file
  167. // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
  168. // corruption if table_magic number is not equal to enforce_table_magic_number
  169. Status ReadFooterFromFile(RandomAccessFileReader* file,
  170. FilePrefetchBuffer* prefetch_buffer,
  171. uint64_t file_size, Footer* footer,
  172. uint64_t enforce_table_magic_number = 0);
  173. // 1-byte type + 32-bit crc
  174. static const size_t kBlockTrailerSize = 5;
  175. // Make block size calculation for IO less error prone
  176. inline uint64_t block_size(const BlockHandle& handle) {
  177. return handle.size() + kBlockTrailerSize;
  178. }
  179. inline CompressionType get_block_compression_type(const char* block_data,
  180. size_t block_size) {
  181. return static_cast<CompressionType>(block_data[block_size]);
  182. }
  183. // Represents the contents of a block read from an SST file. Depending on how
  184. // it's created, it may or may not own the actual block bytes. As an example,
  185. // BlockContents objects representing data read from mmapped files only point
  186. // into the mmapped region.
  187. struct BlockContents {
  188. Slice data; // Actual contents of data
  189. CacheAllocationPtr allocation;
  190. #ifndef NDEBUG
  191. // Whether the block is a raw block, which contains compression type
  192. // byte. It is only used for assertion.
  193. bool is_raw_block = false;
  194. #endif // NDEBUG
  195. BlockContents() {}
  196. // Does not take ownership of the underlying data bytes.
  197. BlockContents(const Slice& _data) : data(_data) {}
  198. // Takes ownership of the underlying data bytes.
  199. BlockContents(CacheAllocationPtr&& _data, size_t _size)
  200. : data(_data.get(), _size), allocation(std::move(_data)) {}
  201. // Takes ownership of the underlying data bytes.
  202. BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
  203. : data(_data.get(), _size) {
  204. allocation.reset(_data.release());
  205. }
  206. // Returns whether the object has ownership of the underlying data bytes.
  207. bool own_bytes() const { return allocation.get() != nullptr; }
  208. // It's the caller's responsibility to make sure that this is
  209. // for raw block contents, which contains the compression
  210. // byte in the end.
  211. CompressionType get_compression_type() const {
  212. assert(is_raw_block);
  213. return get_block_compression_type(data.data(), data.size());
  214. }
  215. // The additional memory space taken by the block data.
  216. size_t usable_size() const {
  217. if (allocation.get() != nullptr) {
  218. auto allocator = allocation.get_deleter().allocator;
  219. if (allocator) {
  220. return allocator->UsableSize(allocation.get(), data.size());
  221. }
  222. #ifdef ROCKSDB_MALLOC_USABLE_SIZE
  223. return malloc_usable_size(allocation.get());
  224. #else
  225. return data.size();
  226. #endif // ROCKSDB_MALLOC_USABLE_SIZE
  227. } else {
  228. return 0; // no extra memory is occupied by the data
  229. }
  230. }
  231. size_t ApproximateMemoryUsage() const {
  232. return usable_size() + sizeof(*this);
  233. }
  234. BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT {
  235. *this = std::move(other);
  236. }
  237. BlockContents& operator=(BlockContents&& other) {
  238. data = std::move(other.data);
  239. allocation = std::move(other.allocation);
  240. #ifndef NDEBUG
  241. is_raw_block = other.is_raw_block;
  242. #endif // NDEBUG
  243. return *this;
  244. }
  245. };
  246. // Read the block identified by "handle" from "file". On failure
  247. // return non-OK. On success fill *result and return OK.
  248. extern Status ReadBlockContents(
  249. RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
  250. const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
  251. BlockContents* contents, const ImmutableCFOptions& ioptions,
  252. bool do_uncompress = true, const Slice& compression_dict = Slice(),
  253. const PersistentCacheOptions& cache_options = PersistentCacheOptions());
  254. // The 'data' points to the raw block contents read in from file.
  255. // This method allocates a new heap buffer and the raw block
  256. // contents are uncompresed into this buffer. This buffer is
  257. // returned via 'result' and it is upto the caller to
  258. // free this buffer.
  259. // For description of compress_format_version and possible values, see
  260. // util/compression.h
  261. extern Status UncompressBlockContents(const UncompressionInfo& info,
  262. const char* data, size_t n,
  263. BlockContents* contents,
  264. uint32_t compress_format_version,
  265. const ImmutableCFOptions& ioptions,
  266. MemoryAllocator* allocator = nullptr);
  267. // This is an extension to UncompressBlockContents that accepts
  268. // a specific compression type. This is used by un-wrapped blocks
  269. // with no compression header.
  270. extern Status UncompressBlockContentsForCompressionType(
  271. const UncompressionInfo& info, const char* data, size_t n,
  272. BlockContents* contents, uint32_t compress_format_version,
  273. const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr);
  274. // Implementation details follow. Clients should ignore,
  275. // TODO(andrewkr): we should prefer one way of representing a null/uninitialized
  276. // BlockHandle. Currently we use zeros for null and use negation-of-zeros for
  277. // uninitialized.
  278. inline BlockHandle::BlockHandle()
  279. : BlockHandle(~static_cast<uint64_t>(0), ~static_cast<uint64_t>(0)) {}
  280. inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
  281. : offset_(_offset), size_(_size) {}
  282. } // namespace ROCKSDB_NAMESPACE