format.h 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <array>
  11. #include <cstdint>
  12. #include <string>
  13. #include "file/file_prefetch_buffer.h"
  14. #include "file/random_access_file_reader.h"
  15. #include "memory/memory_allocator_impl.h"
  16. #include "options/cf_options.h"
  17. #include "port/malloc.h"
  18. #include "port/port.h" // noexcept
  19. #include "rocksdb/slice.h"
  20. #include "rocksdb/status.h"
  21. #include "rocksdb/table.h"
  22. #include "util/hash.h"
  23. namespace ROCKSDB_NAMESPACE {
  24. class RandomAccessFile;
  25. struct ReadOptions;
  26. bool ShouldReportDetailedTime(Env* env, Statistics* stats);
  27. // the length of the magic number in bytes.
  28. constexpr uint32_t kMagicNumberLengthByte = 8;
  29. extern const uint64_t kLegacyBlockBasedTableMagicNumber;
  30. extern const uint64_t kBlockBasedTableMagicNumber;
  31. extern const uint64_t kLegacyPlainTableMagicNumber;
  32. extern const uint64_t kPlainTableMagicNumber;
  33. extern const uint64_t kCuckooTableMagicNumber;
  34. // BlockHandle is a pointer to the extent of a file that stores a data
  35. // block or a meta block.
  36. class BlockHandle {
  37. public:
  38. // Creates a block handle with special values indicating "uninitialized,"
  39. // distinct from the "null" block handle.
  40. BlockHandle();
  41. BlockHandle(uint64_t offset, uint64_t size);
  42. // The offset of the block in the file.
  43. uint64_t offset() const { return offset_; }
  44. void set_offset(uint64_t _offset) { offset_ = _offset; }
  45. // The size of the stored block, this size does not include the block trailer.
  46. uint64_t size() const { return size_; }
  47. void set_size(uint64_t _size) { size_ = _size; }
  48. void EncodeTo(std::string* dst) const;
  49. char* EncodeTo(char* dst) const;
  50. Status DecodeFrom(Slice* input);
  51. Status DecodeSizeFrom(uint64_t offset, Slice* input);
  52. // Return a string that contains the copy of handle.
  53. std::string ToString(bool hex = true) const;
  54. // if the block handle's offset and size are both "0", we will view it
  55. // as a null block handle that points to no where.
  56. bool IsNull() const { return offset_ == 0 && size_ == 0; }
  57. static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
  58. // Maximum encoding length of a BlockHandle
  59. static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
  60. inline bool operator==(const BlockHandle& rhs) const {
  61. return offset_ == rhs.offset_ && size_ == rhs.size_;
  62. }
  63. inline bool operator!=(const BlockHandle& rhs) const {
  64. return !(*this == rhs);
  65. }
  66. private:
  67. uint64_t offset_;
  68. uint64_t size_;
  69. static const BlockHandle kNullBlockHandle;
  70. };
  71. struct EncodedBlockHandle {
  72. explicit EncodedBlockHandle(const BlockHandle& h) {
  73. auto end = h.EncodeTo(buffer.data());
  74. size = end - buffer.data();
  75. }
  76. Slice AsSlice() const { return Slice(buffer.data(), size); }
  77. std::array<char, BlockHandle::kMaxEncodedLength> buffer;
  78. size_t size;
  79. };
  80. // Value in block-based table file index.
  81. //
  82. // The index entry for block n is: y -> h, [x],
  83. // where: y is some key between the last key of block n (inclusive) and the
  84. // first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
  85. // x, if present, is the first key of block n (unshortened).
  86. // This struct represents the "h, [x]" part.
  87. struct IndexValue {
  88. BlockHandle handle;
  89. // Empty means unknown.
  90. Slice first_internal_key;
  91. IndexValue() = default;
  92. IndexValue(BlockHandle _handle, Slice _first_internal_key)
  93. : handle(_handle), first_internal_key(_first_internal_key) {}
  94. // have_first_key indicates whether the `first_internal_key` is used.
  95. // If previous_handle is not null, delta encoding is used;
  96. // in this case, the two handles must point to consecutive blocks:
  97. // handle.offset() ==
  98. // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
  99. void EncodeTo(std::string* dst, bool have_first_key,
  100. const BlockHandle* previous_handle) const;
  101. Status DecodeFrom(Slice* input, bool have_first_key,
  102. const BlockHandle* previous_handle);
  103. std::string ToString(bool hex, bool have_first_key) const;
  104. };
  105. // Given a file's base_context_checksum and an offset of a block within that
  106. // file, choose a 32-bit value that is as unique as possible. This value will
  107. // be added to the standard checksum to get a checksum "with context," or can
  108. // be subtracted to "remove" context. Returns zero (no modifier) if feature is
  109. // disabled with base_context_checksum == 0.
  110. inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
  111. uint64_t offset) {
  112. // To disable on base_context_checksum == 0, we could write
  113. // `if (base_context_checksum == 0) return 0;` but benchmarking shows
  114. // measurable performance penalty vs. this: compute the modifier
  115. // unconditionally and use an "all or nothing" bit mask to enable
  116. // or disable.
  117. uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0);
  118. // Desired properties:
  119. // (call this function f(b, o) where b = base and o = offset)
  120. // 1. Fast
  121. // 2. f(b1, o) == f(b2, o) iff b1 == b2
  122. // (Perfectly preserve base entropy)
  123. // 3. f(b, o1) == f(b, o2) only if o1 == o2 or |o1-o2| >= 4 billion
  124. // (Guaranteed uniqueness for nearby offsets)
  125. // 3. f(b, o + j * 2**32) == f(b, o + k * 2**32) only if j == k
  126. // (Upper bits matter, and *aligned* misplacement fails check)
  127. // 4. f(b1, o) == f(b2, o + x) then preferably not
  128. // f(b1, o + y) == f(b2, o + x + y)
  129. // (Avoid linearly correlated matches)
  130. // 5. f(b, o) == 0 depends on both b and o
  131. // (No predictable overlap with non-context checksums)
  132. uint32_t modifier =
  133. base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset));
  134. return modifier & all_or_nothing;
  135. }
  136. inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
  137. // As of format_version 2, we encode compressed block with
  138. // compress_format_version == 2. Before that, the version is 1.
  139. // DO NOT CHANGE THIS FUNCTION, it affects disk format
  140. // As of format_version 7 and opening up to custom compression, the
  141. // compression format version is essentially independent of the block-based
  142. // table format version, and encoded in the compression_name table property.
  143. // Thus, this function can go away once we remove support for reading
  144. // format_version=1.
  145. return format_version >= 2 ? 2 : 1;
  146. }
  147. constexpr uint32_t kLatestFormatVersion = 7;
  148. inline bool IsSupportedFormatVersion(uint32_t version) {
  149. return version <= kLatestFormatVersion;
  150. }
  151. // Same as having a unique id in footer.
  152. inline bool FormatVersionUsesContextChecksum(uint32_t version) {
  153. return version >= 6;
  154. }
  155. inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
  156. return version < 6;
  157. }
  158. inline bool FormatVersionUsesCompressionManagerName(uint32_t version) {
  159. return version >= 7;
  160. }
  161. // Footer encapsulates the fixed information stored at the tail end of every
  162. // SST file. In general, it should only include things that cannot go
  163. // elsewhere under the metaindex block. For example, checksum_type is
  164. // required for verifying metaindex block checksum (when applicable), but
  165. // index block handle can easily go in metaindex block. See also FooterBuilder
  166. // below.
  167. class Footer {
  168. public:
  169. // Create empty. Populate using DecodeFrom.
  170. Footer() {}
  171. void Reset() {
  172. table_magic_number_ = kNullTableMagicNumber;
  173. format_version_ = kInvalidFormatVersion;
  174. base_context_checksum_ = 0;
  175. metaindex_handle_ = BlockHandle::NullBlockHandle();
  176. index_handle_ = BlockHandle::NullBlockHandle();
  177. checksum_type_ = kInvalidChecksumType;
  178. block_trailer_size_ = 0;
  179. }
  180. // Deserialize a footer (populate fields) from `input` and check for various
  181. // corruptions. `input_offset` is the offset within the target file of
  182. // `input` buffer, which is needed for verifying format_version >= 6 footer.
  183. // If enforce_table_magic_number != 0, will return corruption if table magic
  184. // number is not equal to enforce_table_magic_number.
  185. Status DecodeFrom(Slice input, uint64_t input_offset,
  186. uint64_t enforce_table_magic_number = 0);
  187. // Table magic number identifies file as RocksDB SST file and which kind of
  188. // SST format is use.
  189. uint64_t table_magic_number() const { return table_magic_number_; }
  190. // A version (footer and more) within a kind of SST. (It would add more
  191. // unnecessary complexity to separate footer versions and
  192. // BBTO::format_version.)
  193. uint32_t format_version() const { return format_version_; }
  194. // See ChecksumModifierForContext()
  195. uint32_t base_context_checksum() const { return base_context_checksum_; }
  196. // Block handle for metaindex block.
  197. const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
  198. // Block handle for (top-level) index block.
  199. // TODO? remove from this struct and only read on decode for legacy cases
  200. const BlockHandle& index_handle() const { return index_handle_; }
  201. // Checksum type used in the file, including footer for format version >= 6.
  202. ChecksumType checksum_type() const {
  203. return static_cast<ChecksumType>(checksum_type_);
  204. }
  205. // Block trailer size used by file with this footer (e.g. 5 for block-based
  206. // table and 0 for plain table). This is inferred from magic number so
  207. // not in the serialized form.
  208. inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
  209. // Convert this object to a human readable form
  210. std::string ToString() const;
  211. // Encoded lengths of Footers. Bytes for serialized Footer will always be
  212. // >= kMinEncodedLength and <= kMaxEncodedLength.
  213. //
  214. // Footer version 0 (legacy) will always occupy exactly this many bytes.
  215. // It consists of two block handles, padding, and a magic number.
  216. static constexpr uint32_t kVersion0EncodedLength =
  217. 2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
  218. static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
  219. // Footer of versions 1 and higher will always occupy exactly this many
  220. // bytes. It originally consisted of the checksum type, two block handles,
  221. // padding (to maximum handle encoding size), a format version number, and a
  222. // magic number.
  223. static constexpr uint32_t kNewVersionsEncodedLength =
  224. 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
  225. static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
  226. static constexpr uint64_t kNullTableMagicNumber = 0;
  227. static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
  228. private:
  229. static constexpr int kInvalidChecksumType =
  230. (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
  231. uint64_t table_magic_number_ = kNullTableMagicNumber;
  232. uint32_t format_version_ = kInvalidFormatVersion;
  233. uint32_t base_context_checksum_ = 0;
  234. BlockHandle metaindex_handle_;
  235. BlockHandle index_handle_;
  236. int checksum_type_ = kInvalidChecksumType;
  237. uint8_t block_trailer_size_ = 0;
  238. };
  239. // Builder for Footer
  240. class FooterBuilder {
  241. public:
  242. // Run builder in inputs. This is a single step with lots of parameters for
  243. // efficiency (based on perf testing).
  244. // * table_magic_number identifies file as RocksDB SST file and which kind of
  245. // SST format is use.
  246. // * format_version is a version for the footer and can also apply to other
  247. // aspects of the SST file (see BlockBasedTableOptions::format_version).
  248. // NOTE: To save complexity in the caller, when format_version == 0 and
  249. // there is a corresponding legacy magic number to the one specified, the
  250. // legacy magic number will be written for forward compatibility.
  251. // * footer_offset is the file offset where the footer will be written
  252. // (for future use).
  253. // * checksum_type is for formats using block checksums.
  254. // * index_handle is optional for some SST kinds and (for caller convenience)
  255. // ignored when format_version >= 6. (Must be added to metaindex in that
  256. // case.)
  257. // * unique_id must be specified if format_vesion >= 6 and SST uses block
  258. // checksums with context. Otherwise, auto-generated if format_vesion >= 6.
  259. Status Build(uint64_t table_magic_number, uint32_t format_version,
  260. uint64_t footer_offset, ChecksumType checksum_type,
  261. const BlockHandle& metaindex_handle,
  262. const BlockHandle& index_handle = BlockHandle::NullBlockHandle(),
  263. uint32_t base_context_checksum = 0);
  264. // After Builder, get a Slice for the serialized Footer, backed by this
  265. // FooterBuilder.
  266. const Slice& GetSlice() const {
  267. assert(slice_.size());
  268. return slice_;
  269. }
  270. private:
  271. Slice slice_;
  272. std::array<char, Footer::kMaxEncodedLength> data_;
  273. };
  274. // Set to true to allow unit testing of writing unsupported block-based table
  275. // format versions (to test read side)
  276. bool& TEST_AllowUnsupportedFormatVersion();
  277. // Read the footer from file
  278. // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
  279. // corruption if table_magic number is not equal to enforce_table_magic_number
  280. Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
  281. FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
  282. uint64_t file_size, Footer* footer,
  283. uint64_t enforce_table_magic_number = 0,
  284. Statistics* stats = nullptr);
  285. // Computes a checksum using the given ChecksumType. Sometimes we need to
  286. // include one more input byte logically at the end but not part of the main
  287. // data buffer. If data_size >= 1, then
  288. // ComputeBuiltinChecksum(type, data, size)
  289. // ==
  290. // ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
  291. uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
  292. size_t size);
  293. uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
  294. size_t size, char last_byte);
  295. // Represents the contents of a block read from an SST file. Depending on how
  296. // it's created, it may or may not own the actual block bytes. As an example,
  297. // BlockContents objects representing data read from mmapped files only point
  298. // into the mmapped region. Depending on context, it might be a serialized
  299. // (potentially compressed) block, including a trailer beyond `size`, or an
  300. // uncompressed block.
  301. //
  302. // Please try to use this terminology when dealing with blocks:
  303. // * "Serialized block" - bytes that go into storage. For block-based table
  304. // (usually the case) this includes the block trailer. Here the `size` does
  305. // not include the trailer, but other places in code might include the trailer
  306. // in the size.
  307. // * "Maybe compressed block" - like a serialized block, but without the
  308. // trailer (or no promise of including a trailer). Must be accompanied by a
  309. // CompressionType in some other variable or field.
  310. // * "Uncompressed block" - "payload" bytes that are either stored with no
  311. // compression, used as input to compression function, or result of
  312. // decompression function.
  313. // * "Parsed block" - an in-memory form of a block in block cache, as it is
  314. // used by the table reader. Different C++ types are used depending on the
  315. // block type (see block_cache.h). Only trivially parsable block types
  316. // use BlockContents as the parsed form.
  317. //
  318. struct BlockContents {
  319. // Points to block payload (without trailer)
  320. Slice data;
  321. CacheAllocationPtr allocation;
  322. #ifndef NDEBUG
  323. // Whether there is a known trailer after what is pointed to by `data`.
  324. // See BlockBasedTable::GetCompressionType.
  325. bool has_trailer = false;
  326. #endif // NDEBUG
  327. BlockContents() {}
  328. // Does not take ownership of the underlying data bytes.
  329. BlockContents(const Slice& _data) : data(_data) {}
  330. // Takes ownership of the underlying data bytes.
  331. BlockContents(CacheAllocationPtr&& _data, size_t _size)
  332. : data(_data.get(), _size), allocation(std::move(_data)) {}
  333. // Takes ownership of the underlying data bytes.
  334. BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
  335. : data(_data.get(), _size) {
  336. allocation.reset(_data.release());
  337. }
  338. // Returns whether the object has ownership of the underlying data bytes.
  339. bool own_bytes() const { return allocation.get() != nullptr; }
  340. // The additional memory space taken by the block data.
  341. size_t usable_size() const {
  342. // FIXME: doesn't account for possible block trailer
  343. if (allocation.get() != nullptr) {
  344. auto allocator = allocation.get_deleter().allocator;
  345. if (allocator) {
  346. return allocator->UsableSize(allocation.get(), data.size());
  347. }
  348. #ifdef ROCKSDB_MALLOC_USABLE_SIZE
  349. return malloc_usable_size(allocation.get());
  350. #else
  351. return data.size();
  352. #endif // ROCKSDB_MALLOC_USABLE_SIZE
  353. } else {
  354. return 0; // no extra memory is occupied by the data
  355. }
  356. }
  357. size_t ApproximateMemoryUsage() const {
  358. return usable_size() + sizeof(*this);
  359. }
  360. BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
  361. BlockContents& operator=(BlockContents&& other) {
  362. data = std::move(other.data);
  363. allocation = std::move(other.allocation);
  364. #ifndef NDEBUG
  365. has_trailer = other.has_trailer;
  366. #endif // NDEBUG
  367. return *this;
  368. }
  369. };
  370. // The `data` points to serialized block contents read in from file, which
  371. // must be compressed and include a trailer beyond `size`. A new buffer is
  372. // allocated with the given allocator (or default) and the uncompressed
  373. // contents are returned in `out_contents`. Statistics updated.
  374. Status DecompressSerializedBlock(const char* data, size_t size,
  375. CompressionType type,
  376. Decompressor& decompressor,
  377. BlockContents* out_contents,
  378. const ImmutableOptions& ioptions,
  379. MemoryAllocator* allocator = nullptr);
  380. Status DecompressSerializedBlock(Decompressor::Args& args,
  381. Decompressor& decompressor,
  382. BlockContents* out_contents,
  383. const ImmutableOptions& ioptions,
  384. MemoryAllocator* allocator = nullptr);
  385. // This is a variant of DecompressSerializedBlock that does not expect a
  386. // block trailer beyond `size`. (CompressionType is passed in.)
  387. Status DecompressBlockData(
  388. const char* data, size_t size, CompressionType type,
  389. Decompressor& decompressor, BlockContents* out_contents,
  390. const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr,
  391. Decompressor::ManagedWorkingArea* working_area = nullptr);
  392. Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
  393. BlockContents* out_contents,
  394. const ImmutableOptions& ioptions,
  395. MemoryAllocator* allocator = nullptr);
  396. // Replace db_host_id contents with the real hostname if necessary
  397. Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
  398. // Implementation details follow. Clients should ignore,
  399. // TODO(andrewkr): we should prefer one way of representing a null/uninitialized
  400. // BlockHandle. Currently we use zeros for null and use negation-of-zeros for
  401. // uninitialized.
  402. inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
  403. inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
  404. : offset_(_offset), size_(_size) {}
  405. } // namespace ROCKSDB_NAMESPACE