plain_table_factory.h 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style license that can be
  4. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  5. #pragma once
  6. #include <stdint.h>
  7. #include <memory>
  8. #include <string>
  9. #include "rocksdb/table.h"
  10. namespace ROCKSDB_NAMESPACE {
  11. struct EnvOptions;
  12. class Status;
  13. class RandomAccessFile;
  14. class WritableFile;
  15. class Table;
  16. class TableBuilder;
  17. // PlainTableFactory is the entry point to the PlainTable format of
  18. // SST files. It returns instances of PlainTableBuilder as the builder
  19. // class and PlainTableReader as the reader class, where the format is
  20. // actually implemented.
  21. //
  22. // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
  23. // Data is not organized in blocks, which allows fast access. Because of
  24. // the following downsides
  25. // 1. Data compression is not supported.
  26. // 2. Data is not checksummed.
  27. // it is not recommended to use this format on other types of file systems.
  28. //
  29. // PlainTable requires fixed length key, configured as a constructor
  30. // parameter of the factory class. Output file format:
  31. // +-------------+-----------------+
  32. // | version     | user_key_length |
  33. // +------------++------------+-----------------+  <= key1 offset
  34. // | encoded key1             | value_size  |   |
  35. // +------------+-------------+-------------+   |
  36. // | value1                                     |
  37. // |                                            |
  38. // +--------------------------+-------------+---+  <= key2 offset
  39. // | encoded key2             | value_size  |   |
  40. // +------------+-------------+-------------+   |
  41. // | value2                                     |
  42. // |                                            |
  43. // | ......                                     |
  44. // +-----------------+--------------------------+
  45. //
  46. // When the key encoding type is kPlain, the key part is encoded as:
  47. // +------------+--------------------+
  48. // | [key_size] | internal key       |
  49. // +------------+--------------------+
  50. // for the user_key_len = kPlainTableVariableLength case,
  51. // and simply:
  52. // +----------------------+
  53. // | internal key         |
  54. // +----------------------+
  55. // for the user_key_len != kPlainTableVariableLength case.
  56. //
  57. // If the key encoding type is kPrefix, keys are encoded in this format.
  58. // There are three ways to encode a key:
  59. // (1) Full Key
  60. // +---------------+---------------+-------------------+
  61. // | Full Key Flag | Full Key Size | Full Internal Key |
  62. // +---------------+---------------+-------------------+
  63. // which simply encodes a full key
  64. //
  65. // (2) A key that shares its prefix with the previous key, where the previous
  66. // key is encoded in the format of (1).
  67. // +-------------+-------------+-------------+-------------+------------+
  68. // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
  69. // +-------------+-------------+-------------+-------------+------------+
  70. // where Key Suffix is the suffix part of the key, including the internal bytes.
  71. // The actual key is constructed by concatenating the prefix part of the
  72. // previous key with the suffix part given here, using the sizes given here.
  73. //
  74. // (3) A key that shares its prefix with the previous key, where the previous
  75. // key is encoded in the format of (2).
  76. // +-----------------+-----------------+------------------------+
  77. // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
  78. // +-----------------+-----------------+------------------------+
  79. // The key is constructed by concatenating the previous key's prefix (which is
  80. // also the prefix of the last key encoded in the format of (1)) and the
  81. // suffix given here.
  82. //
  83. // For example, the following keys (prefix and suffix are separated by
  84. // spaces):
  85. // 0000 0001
  86. // 0000 00021
  87. // 0000 0002
  88. // 00011 00
  89. // 0002 0001
  90. // Will be encoded like this:
  91. // FK 8 00000001
  92. // PF 4 SF 5 00021
  93. // SF 4 0002
  94. // FK 7 0001100
  95. // FK 8 00020001
  96. // (where FK means full key flag, PF means prefix flag and SF means suffix flag)
  97. //
  98. // All those "key flag + key size" shown above are in this format:
  99. // The 8 bits of the first byte:
  100. // +----+----+----+----+----+----+----+----+
  101. // |  Type   |            Size             |
  102. // +----+----+----+----+----+----+----+----+
  103. // Type indicates: full key, prefix, or suffix.
  104. // The last 6 bits are for size. If the size bits are not all 1, they hold
  105. // the size of the key. Otherwise, a varint32 is read after this byte, and
  106. // this varint value + 0x3F (the value of all 1s) is the key size.
  107. //
  108. // For example, full key with length 16 will be encoded as (binary):
  109. // 00 010000
  110. // (00 means full key)
  111. // and a prefix with 100 bytes will be encoded as:
  112. // 01 111111 00100101
  113. //     (63)     (37)
  114. // (01 means prefix)
  115. //
  116. // All the internal keys above (including kPlain and kPrefix) are encoded in
  117. // this format:
  118. // There are two types:
  119. // (1) normal internal key format
  120. // +----------- ...... -------------+----+---+---+---+---+---+---+---+
  121. // | user key                       |type| sequence ID               |
  122. // +----------- ..... --------------+----+---+---+---+---+---+---+---+
  123. // (2) Special case for keys whose sequence ID is 0 and whose type is value
  124. // +----------- ...... -------------+----+
  125. // | user key                       |0xFF|
  126. // +----------- ..... --------------+----+
  127. // This saves 7 bytes for the special case where sequence ID = 0.
  128. //
  129. //
  130. class PlainTableFactory : public TableFactory {  // TableFactory implementation for the PlainTable SST format
  131. public:
  132. ~PlainTableFactory() {}
  133. // NOTE(review): the settings described here are not direct constructor
  134. // parameters; presumably they are fields of PlainTableOptions -- confirm.
  135. // user_key_len is the length of the user key. If it is set to
  136. // kPlainTableVariableLength, keys are variable length; otherwise all keys
  137. // must have exactly this fixed length. bloom_bits_per_key is the number of
  138. // bits used for the bloom filter per key. hash_table_ratio is the desired
  139. // utilization of the hash table used for prefix hashing:
  140. // hash_table_ratio = number of prefixes / #buckets in the hash table.
  141. // hash_table_ratio = 0 means skip the hash table and rely only on binary
  142. // search. index_sparseness determines the index interval for keys inside
  143. // the same prefix; it is the maximum amount of linear search required
  144. // after hash and binary search. index_sparseness = 0 means index every key.
  145. // huge_page_tlb_size determines whether to allocate hash indexes from huge
  146. // page TLB and the page size if allocating from there. See comments of
  147. // Arena::AllocateAligned() for details.
  148. explicit PlainTableFactory(
  149. const PlainTableOptions& _table_options = PlainTableOptions());  // argument defaults to a default-constructed PlainTableOptions
  150. // Method to allow CheckedCast to work for this class
  151. static const char* kClassName() { return kPlainTableName(); }
  152. const char* Name() const override { return kPlainTableName(); }  // returns the same string as kClassName()
  153. using TableFactory::NewTableReader;  // keeps the base-class NewTableReader overloads visible next to the override below
  154. Status NewTableReader(const ReadOptions& ro,
  155. const TableReaderOptions& table_reader_options,
  156. std::unique_ptr<RandomAccessFileReader>&& file,
  157. uint64_t file_size, std::unique_ptr<TableReader>* table,
  158. bool prefetch_index_and_filter_in_cache) const override;  // defined out of line; presumably the new reader is returned through `table` -- confirm
  159. TableBuilder* NewTableBuilder(
  160. const TableBuilderOptions& table_builder_options,
  161. WritableFileWriter* file) const override;  // defined out of line; returns a raw TableBuilder pointer
  162. std::string GetPrintableOptions() const override;  // defined out of line
  163. static const char kValueTypeSeqId0 = char(~0);  // every bit of the byte set; NOTE(review): presumably the one-byte marker for the "sequence ID 0" key encoding -- confirm against the key-encoding code
  164. std::unique_ptr<TableFactory> Clone() const override {  // copy-constructs a new PlainTableFactory from *this, returned as a pointer to the base type
  165. return std::make_unique<PlainTableFactory>(*this);
  166. }
  167. private:
  168. PlainTableOptions table_options_;  // NOTE(review): presumably the options passed to the constructor -- confirm in the .cc file
  169. };
  170. } // namespace ROCKSDB_NAMESPACE