plain_table_factory.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style license that can be
  4. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  5. #pragma once
  6. #ifndef ROCKSDB_LITE
  7. #include <memory>
  8. #include <string>
  9. #include <stdint.h>
  10. #include "options/options_helper.h"
  11. #include "rocksdb/options.h"
  12. #include "rocksdb/table.h"
  13. namespace ROCKSDB_NAMESPACE {
  14. struct EnvOptions;
  15. class Status;
  16. class RandomAccessFile;
  17. class WritableFile;
  18. class Table;
  19. class TableBuilder;
  20. // PlainTableFactory is the entry point to the PlainTable format of
  21. // SST files. It returns instances of PlainTableBuilder as the builder
  22. // class and PlainTableReader as the reader class, where the format is
  23. // actually implemented.
  24. //
  25. // The PlainTable is designed for memory-mapped file systems, e.g. tmpfs.
  26. // Data is not organized in blocks, which allows fast access. Because of
  27. // the following downsides:
  28. // 1. Data compression is not supported.
  29. // 2. Data is not checksummed.
  30. // it is not recommended to use this format on other types of file systems.
  31. //
  32. // PlainTable requires fixed length key, configured as a constructor
  33. // parameter of the factory class. Output file format:
  34. // +-------------+-----------------+
  35. // | version     | user_key_length |
  36. // +------------++------------+-----------------+  <= key1 offset
  37. // | encoded key1             | value_size  |   |
  38. // +------------+-------------+-------------+   |
  39. // | value1                                     |
  40. // |                                            |
  41. // +--------------------------+-------------+---+  <= key2 offset
  42. // | encoded key2             | value_size  |   |
  43. // +------------+-------------+-------------+   |
  44. // | value2                                     |
  45. // |                                            |
  46. // |        ......                              |
  47. // +-----------------+--------------------------+
  48. //
  49. // When the key encoding type is kPlain, the key part is encoded as:
  50. // +------------+--------------------+
  51. // | [key_size] | internal key       |
  52. // +------------+--------------------+
  53. // for the user_key_len = kPlainTableVariableLength case,
  54. // and simply:
  55. // +----------------------+
  56. // | internal key         |
  57. // +----------------------+
  58. // for the user_key_len != kPlainTableVariableLength case.
  59. //
  60. // If the key encoding type is kPrefix, keys are encoded in this format.
  61. // There are three ways to encode a key:
  62. // (1) Full Key
  63. // +---------------+---------------+-------------------+
  64. // | Full Key Flag | Full Key Size | Full Internal Key |
  65. // +---------------+---------------+-------------------+
  66. // which simply encodes a full key
  67. //
  68. // (2) A key sharing the same prefix as the previous key, where the previous
  69. // key is encoded in the format of (1).
  70. // +-------------+-------------+-------------+-------------+------------+
  71. // | Prefix Flag | Prefix Size | Suffix Flag | Suffix Size | Key Suffix |
  72. // +-------------+-------------+-------------+-------------+------------+
  73. // where Key Suffix is the suffix part of the key, including the internal
  74. // bytes. The actual key will be constructed by concatenating the prefix part
  75. // of the previous key with the suffix part given here, using these sizes.
  76. //
  77. // (3) A key sharing the same prefix as the previous key, where the previous
  78. // key is encoded in the format of (2).
  79. // +-----------------+-----------------+------------------------+
  80. // | Key Suffix Flag | Key Suffix Size | Suffix of Internal Key |
  81. // +-----------------+-----------------+------------------------+
  82. // The key will be constructed by concatenating the previous key's prefix
  83. // (which is also a prefix of the last key encoded in the format of (1)) and
  84. // the key suffix given here.
  85. //
  86. // For example, the following keys (prefix and suffix are separated by
  87. // spaces):
  88. //   0000 0001
  89. //   0000 00021
  90. //   0000 0002
  91. //   00011 00
  92. //   0002 0001
  93. // will be encoded like this:
  94. //   FK 8 00000001
  95. //   PF 4 SF 5 00021
  96. //   SF 4 0002
  97. //   FK 7 0001100
  98. //   FK 8 00020001
  99. // (where FK means full key flag, PF means prefix flag and SF means suffix flag)
  100. //
  101. // All those "key flag + key size" shown above are in this format:
  102. // The 8 bits of the first byte:
  103. // +----+----+----+----+----+----+----+----+
  104. // |  Type   |            Size             |
  105. // +----+----+----+----+----+----+----+----+
  106. // Type indicates: full key, prefix, or suffix.
  107. // The last 6 bits are for size. If the size bits are not all 1, it means the
  108. // size of the key. Otherwise, varint32 is read after this byte. This varint
  109. // value + 0x3F (the value of all 1) will be the key size.
  110. //
  111. // For example, a full key with length 16 will be encoded as (binary):
  112. //     00 010000
  113. // (00 means full key)
  114. // and a prefix with 100 bytes will be encoded as:
  115. //     01 111111    00100101
  116. //         (63)       (37)
  117. // (01 means prefix)
  118. //
  119. // All the internal keys above (including kPlain and kPrefix) are encoded in
  120. // this format:
  121. // There are two types:
  122. // (1) normal internal key format
  123. // +----------- ...... -------------+----+---+---+---+---+---+---+---+
  124. // |       user key                 |type|      sequence ID          |
  125. // +----------- ..... --------------+----+---+---+---+---+---+---+---+
  126. // (2) Special case for keys whose sequence ID is 0 and whose type is value
  127. // +----------- ...... -------------+----+
  128. // |       user key                 |0x80|
  129. // +----------- ..... --------------+----+
  130. // To save 7 bytes for the special case where sequence ID = 0.
  131. //
  132. //
  133. class PlainTableFactory : public TableFactory {
  134. public:
  135. ~PlainTableFactory() {}
  136. // user_key_len is the length of the user key. If it is set to be
  137. // kPlainTableVariableLength, then it means variable length. Otherwise, all
  138. // the keys need to have the fix length of this value. bloom_bits_per_key is
  139. // number of bits used for bloom filer per key. hash_table_ratio is
  140. // the desired utilization of the hash table used for prefix hashing.
  141. // hash_table_ratio = number of prefixes / #buckets in the hash table
  142. // hash_table_ratio = 0 means skip hash table but only replying on binary
  143. // search.
  144. // index_sparseness determines index interval for keys
  145. // inside the same prefix. It will be the maximum number of linear search
  146. // required after hash and binary search.
  147. // index_sparseness = 0 means index for every key.
  148. // huge_page_tlb_size determines whether to allocate hash indexes from huge
  149. // page TLB and the page size if allocating from there. See comments of
  150. // Arena::AllocateAligned() for details.
  151. explicit PlainTableFactory(
  152. const PlainTableOptions& _table_options = PlainTableOptions())
  153. : table_options_(_table_options) {}
  154. const char* Name() const override { return "PlainTable"; }
  155. Status NewTableReader(const TableReaderOptions& table_reader_options,
  156. std::unique_ptr<RandomAccessFileReader>&& file,
  157. uint64_t file_size, std::unique_ptr<TableReader>* table,
  158. bool prefetch_index_and_filter_in_cache) const override;
  159. TableBuilder* NewTableBuilder(
  160. const TableBuilderOptions& table_builder_options,
  161. uint32_t column_family_id, WritableFileWriter* file) const override;
  162. std::string GetPrintableTableOptions() const override;
  163. const PlainTableOptions& table_options() const;
  164. static const char kValueTypeSeqId0 = char(~0);
  165. // Sanitizes the specified DB Options.
  166. Status SanitizeOptions(
  167. const DBOptions& /*db_opts*/,
  168. const ColumnFamilyOptions& /*cf_opts*/) const override {
  169. return Status::OK();
  170. }
  171. void* GetOptions() override { return &table_options_; }
  172. Status GetOptionString(std::string* /*opt_string*/,
  173. const std::string& /*delimiter*/) const override {
  174. return Status::OK();
  175. }
  176. private:
  177. PlainTableOptions table_options_;
  178. };
  179. static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
  180. {"user_key_len",
  181. {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T,
  182. OptionVerificationType::kNormal, false, 0}},
  183. {"bloom_bits_per_key",
  184. {offsetof(struct PlainTableOptions, bloom_bits_per_key), OptionType::kInt,
  185. OptionVerificationType::kNormal, false, 0}},
  186. {"hash_table_ratio",
  187. {offsetof(struct PlainTableOptions, hash_table_ratio), OptionType::kDouble,
  188. OptionVerificationType::kNormal, false, 0}},
  189. {"index_sparseness",
  190. {offsetof(struct PlainTableOptions, index_sparseness), OptionType::kSizeT,
  191. OptionVerificationType::kNormal, false, 0}},
  192. {"huge_page_tlb_size",
  193. {offsetof(struct PlainTableOptions, huge_page_tlb_size),
  194. OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}},
  195. {"encoding_type",
  196. {offsetof(struct PlainTableOptions, encoding_type),
  197. OptionType::kEncodingType, OptionVerificationType::kByName, false, 0}},
  198. {"full_scan_mode",
  199. {offsetof(struct PlainTableOptions, full_scan_mode), OptionType::kBoolean,
  200. OptionVerificationType::kNormal, false, 0}},
  201. {"store_index_in_file",
  202. {offsetof(struct PlainTableOptions, store_index_in_file),
  203. OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
  204. } // namespace ROCKSDB_NAMESPACE
  205. #endif // ROCKSDB_LITE