plain_table_key_coding.h 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #ifndef ROCKSDB_LITE
  7. #include <array>
  8. #include "db/dbformat.h"
  9. #include "rocksdb/slice.h"
  10. #include "table/plain/plain_table_reader.h"
  11. // The file contains three helper classes of PlainTable format,
  12. // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
  13. // These classes issue the lowest level of operations of PlainTable.
  14. // Actual data format of the key is documented in comments of class
  15. // PlainTableFactory.
  16. namespace ROCKSDB_NAMESPACE {
  17. class WritableFile;
  18. struct ParsedInternalKey;
  19. struct PlainTableReaderFileInfo;
  20. enum PlainTableEntryType : unsigned char;
  21. // Helper class for PlainTable format to write out a key to an output file
  22. // The class is used in PlainTableBuilder.
  23. class PlainTableKeyEncoder {
  24. public:
  25. explicit PlainTableKeyEncoder(EncodingType encoding_type,
  26. uint32_t user_key_len,
  27. const SliceTransform* prefix_extractor,
  28. size_t index_sparseness)
  29. : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
  30. fixed_user_key_len_(user_key_len),
  31. prefix_extractor_(prefix_extractor),
  32. index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
  33. key_count_for_prefix_(0) {}
  34. // key: the key to write out, in the format of internal key.
  35. // file: the output file to write out
  36. // offset: offset in the file. Needs to be updated after appending bytes
  37. // for the key
  38. // meta_bytes_buf: buffer for extra meta bytes
  39. // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
  40. // if meta_bytes_buf is updated.
  41. Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset,
  42. char* meta_bytes_buf, size_t* meta_bytes_buf_size);
  43. // Return actual encoding type to be picked
  44. EncodingType GetEncodingType() { return encoding_type_; }
  45. private:
  46. EncodingType encoding_type_;
  47. uint32_t fixed_user_key_len_;
  48. const SliceTransform* prefix_extractor_;
  49. const size_t index_sparseness_;
  50. size_t key_count_for_prefix_;
  51. IterKey pre_prefix_;
  52. };
  53. // The class does raw file reads for PlainTableReader.
  54. // It hides whether it is a mmap-read, or a non-mmap read.
  55. // The class is implemented in a way to favor the performance of mmap case.
  56. // The class is used by PlainTableReader.
  57. class PlainTableFileReader {
  58. public:
  59. explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
  60. : file_info_(_file_info), num_buf_(0) {}
  61. // In mmaped mode, the results point to mmaped area of the file, which
  62. // means it is always valid before closing the file.
  63. // In non-mmap mode, the results point to an internal buffer. If the caller
  64. // makes another read call, the results may not be valid. So callers should
  65. // make a copy when needed.
  66. // In order to save read calls to files, we keep two internal buffers:
  67. // the first read and the most recent read. This is efficient because it
  68. // columns these two common use cases:
  69. // (1) hash index only identify one location, we read the key to verify
  70. // the location, and read key and value if it is the right location.
  71. // (2) after hash index checking, we identify two locations (because of
  72. // hash bucket conflicts), we binary search the two location to see
  73. // which one is what we need and start to read from the location.
  74. // These two most common use cases will be covered by the two buffers
  75. // so that we don't need to re-read the same location.
  76. // Currently we keep a fixed size buffer. If a read doesn't exactly fit
  77. // the buffer, we replace the second buffer with the location user reads.
  78. //
  79. // If return false, status code is stored in status_.
  80. bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
  81. if (file_info_->is_mmap_mode) {
  82. assert(file_offset + len <= file_info_->data_end_offset);
  83. *out = Slice(file_info_->file_data.data() + file_offset, len);
  84. return true;
  85. } else {
  86. return ReadNonMmap(file_offset, len, out);
  87. }
  88. }
  89. // If return false, status code is stored in status_.
  90. bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
  91. // *bytes_read = 0 means eof. false means failure and status is saved
  92. // in status_. Not directly returning Status to save copying status
  93. // object to map previous performance of mmap mode.
  94. inline bool ReadVarint32(uint32_t offset, uint32_t* output,
  95. uint32_t* bytes_read);
  96. bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
  97. uint32_t* bytes_read);
  98. Status status() const { return status_; }
  99. const PlainTableReaderFileInfo* file_info() { return file_info_; }
  100. private:
  101. const PlainTableReaderFileInfo* file_info_;
  102. struct Buffer {
  103. Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
  104. std::unique_ptr<char[]> buf;
  105. uint32_t buf_start_offset;
  106. uint32_t buf_len;
  107. uint32_t buf_capacity;
  108. };
  109. // Keep buffers for two recent reads.
  110. std::array<std::unique_ptr<Buffer>, 2> buffers_;
  111. uint32_t num_buf_;
  112. Status status_;
  113. Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
  114. };
  115. // A helper class to decode keys from input buffer
  116. // The class is used by PlainTableBuilder.
  117. class PlainTableKeyDecoder {
  118. public:
  119. explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
  120. EncodingType encoding_type,
  121. uint32_t user_key_len,
  122. const SliceTransform* prefix_extractor)
  123. : file_reader_(file_info),
  124. encoding_type_(encoding_type),
  125. prefix_len_(0),
  126. fixed_user_key_len_(user_key_len),
  127. prefix_extractor_(prefix_extractor),
  128. in_prefix_(false) {}
  129. // Find the next key.
  130. // start: char array where the key starts.
  131. // limit: boundary of the char array
  132. // parsed_key: the output of the result key
  133. // internal_key: if not null, fill with the output of the result key in
  134. // un-parsed format
  135. // bytes_read: how many bytes read from start. Output
  136. // seekable: whether key can be read from this place. Used when building
  137. // indexes. Output.
  138. Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
  139. Slice* internal_key, Slice* value, uint32_t* bytes_read,
  140. bool* seekable = nullptr);
  141. Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
  142. Slice* internal_key, uint32_t* bytes_read,
  143. bool* seekable = nullptr);
  144. PlainTableFileReader file_reader_;
  145. EncodingType encoding_type_;
  146. uint32_t prefix_len_;
  147. uint32_t fixed_user_key_len_;
  148. Slice saved_user_key_;
  149. IterKey cur_key_;
  150. const SliceTransform* prefix_extractor_;
  151. bool in_prefix_;
  152. private:
  153. Status NextPlainEncodingKey(uint32_t start_offset,
  154. ParsedInternalKey* parsed_key,
  155. Slice* internal_key, uint32_t* bytes_read,
  156. bool* seekable = nullptr);
  157. Status NextPrefixEncodingKey(uint32_t start_offset,
  158. ParsedInternalKey* parsed_key,
  159. Slice* internal_key, uint32_t* bytes_read,
  160. bool* seekable = nullptr);
  161. Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
  162. ParsedInternalKey* parsed_key, uint32_t* bytes_read,
  163. bool* internal_key_valid, Slice* internal_key);
  164. inline Status DecodeSize(uint32_t start_offset,
  165. PlainTableEntryType* entry_type, uint32_t* key_size,
  166. uint32_t* bytes_read);
  167. };
  168. } // namespace ROCKSDB_NAMESPACE
  169. #endif // ROCKSDB_LITE