version_edit.h 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <algorithm>
  11. #include <set>
  12. #include <string>
  13. #include <utility>
  14. #include <vector>
  15. #include "db/dbformat.h"
  16. #include "memory/arena.h"
  17. #include "rocksdb/cache.h"
  18. #include "table/table_reader.h"
  19. #include "util/autovector.h"
  20. namespace ROCKSDB_NAMESPACE {
  // Forward declaration only; the class itself is not defined in this header.
  class VersionSet;

  // Mask that selects the file-number bits of a packed
  // "file number + path id" value (see FileDescriptor::GetNumber()).
  constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;

  // Sentinel values meaning "not set / not known" for the corresponding
  // FileMetaData fields.
  constexpr uint64_t kInvalidBlobFileNumber = 0;
  constexpr uint64_t kUnknownOldestAncesterTime = 0;
  constexpr uint64_t kUnknownFileCreationTime = 0;

  // Sentinel strings used as the default file checksum and checksum function
  // name. They are only declared here; the definitions live elsewhere.
  extern const std::string kUnknownFileChecksum;
  extern const std::string kUnknownFileChecksumFuncName;

  // Combines a file number and a path id into a single 64-bit value. Declared
  // here, defined elsewhere.
  uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
  29. // A copyable structure contains information needed to read data from an SST
  30. // file. It can contain a pointer to a table reader opened for the file, or
  31. // file number and size, which can be used to create a new table reader for it.
  32. // The behavior is undefined when a copied of the structure is used when the
  33. // file is not in any live version any more.
  34. struct FileDescriptor {
  35. // Table reader in table_reader_handle
  36. TableReader* table_reader;
  37. uint64_t packed_number_and_path_id;
  38. uint64_t file_size; // File size in bytes
  39. SequenceNumber smallest_seqno; // The smallest seqno in this file
  40. SequenceNumber largest_seqno; // The largest seqno in this file
  41. FileDescriptor() : FileDescriptor(0, 0, 0) {}
  42. FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
  43. : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
  44. FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
  45. SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
  46. : table_reader(nullptr),
  47. packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
  48. file_size(_file_size),
  49. smallest_seqno(_smallest_seqno),
  50. largest_seqno(_largest_seqno) {}
  51. FileDescriptor(const FileDescriptor& fd) { *this = fd; }
  52. FileDescriptor& operator=(const FileDescriptor& fd) {
  53. table_reader = fd.table_reader;
  54. packed_number_and_path_id = fd.packed_number_and_path_id;
  55. file_size = fd.file_size;
  56. smallest_seqno = fd.smallest_seqno;
  57. largest_seqno = fd.largest_seqno;
  58. return *this;
  59. }
  60. uint64_t GetNumber() const {
  61. return packed_number_and_path_id & kFileNumberMask;
  62. }
  63. uint32_t GetPathId() const {
  64. return static_cast<uint32_t>(
  65. packed_number_and_path_id / (kFileNumberMask + 1));
  66. }
  67. uint64_t GetFileSize() const { return file_size; }
  68. };
  // Sampled read statistics for a file. The counter is a std::atomic, which
  // has no copy operations of its own, so copying is spelled out as a load
  // from the source followed by a store into the destination.
  struct FileSampledStats {
    // Starts with a zero sample count.
    FileSampledStats() : num_reads_sampled(0) {}

    // Copies the current sample count of `other`.
    FileSampledStats(const FileSampledStats& other)
        : num_reads_sampled(other.num_reads_sampled.load()) {}

    // Overwrites this counter with the current sample count of `other`.
    FileSampledStats& operator=(const FileSampledStats& other) {
      const uint64_t sampled = other.num_reads_sampled.load();
      num_reads_sampled.store(sampled);
      return *this;
    }

    // number of user reads to this file.
    mutable std::atomic<uint64_t> num_reads_sampled;
  };
  79. struct FileMetaData {
  80. FileDescriptor fd;
  81. InternalKey smallest; // Smallest internal key served by table
  82. InternalKey largest; // Largest internal key served by table
  83. // Needs to be disposed when refs becomes 0.
  84. Cache::Handle* table_reader_handle = nullptr;
  85. FileSampledStats stats;
  86. // Stats for compensating deletion entries during compaction
  87. // File size compensated by deletion entry.
  88. // This is updated in Version::UpdateAccumulatedStats() first time when the
  89. // file is created or loaded. After it is updated (!= 0), it is immutable.
  90. uint64_t compensated_file_size = 0;
  91. // These values can mutate, but they can only be read or written from
  92. // single-threaded LogAndApply thread
  93. uint64_t num_entries = 0; // the number of entries.
  94. uint64_t num_deletions = 0; // the number of deletion entries.
  95. uint64_t raw_key_size = 0; // total uncompressed key size.
  96. uint64_t raw_value_size = 0; // total uncompressed value size.
  97. int refs = 0; // Reference count
  98. bool being_compacted = false; // Is this file undergoing compaction?
  99. bool init_stats_from_file = false; // true if the data-entry stats of this
  100. // file has initialized from file.
  101. bool marked_for_compaction = false; // True if client asked us nicely to
  102. // compact this file.
  103. // Used only in BlobDB. The file number of the oldest blob file this SST file
  104. // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
  105. uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
  106. // The file could be the compaction output from other SST files, which could
  107. // in turn be outputs for compact older SST files. We track the memtable
  108. // flush timestamp for the oldest SST file that eventaully contribute data
  109. // to this file. 0 means the information is not available.
  110. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
  111. // Unix time when the SST file is created.
  112. uint64_t file_creation_time = kUnknownFileCreationTime;
  113. // File checksum
  114. std::string file_checksum = kUnknownFileChecksum;
  115. // File checksum function name
  116. std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
  117. FileMetaData() = default;
  118. FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
  119. const InternalKey& smallest_key, const InternalKey& largest_key,
  120. const SequenceNumber& smallest_seq,
  121. const SequenceNumber& largest_seq, bool marked_for_compact,
  122. uint64_t oldest_blob_file, uint64_t _oldest_ancester_time,
  123. uint64_t _file_creation_time, const std::string& _file_checksum,
  124. const std::string& _file_checksum_func_name)
  125. : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
  126. smallest(smallest_key),
  127. largest(largest_key),
  128. marked_for_compaction(marked_for_compact),
  129. oldest_blob_file_number(oldest_blob_file),
  130. oldest_ancester_time(_oldest_ancester_time),
  131. file_creation_time(_file_creation_time),
  132. file_checksum(_file_checksum),
  133. file_checksum_func_name(_file_checksum_func_name) {
  134. TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
  135. }
  136. // REQUIRED: Keys must be given to the function in sorted order (it expects
  137. // the last key to be the largest).
  138. void UpdateBoundaries(const Slice& key, const Slice& value,
  139. SequenceNumber seqno, ValueType value_type);
  140. // Unlike UpdateBoundaries, ranges do not need to be presented in any
  141. // particular order.
  142. void UpdateBoundariesForRange(const InternalKey& start,
  143. const InternalKey& end, SequenceNumber seqno,
  144. const InternalKeyComparator& icmp) {
  145. if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
  146. smallest = start;
  147. }
  148. if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
  149. largest = end;
  150. }
  151. fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
  152. fd.largest_seqno = std::max(fd.largest_seqno, seqno);
  153. }
  154. // Try to get oldest ancester time from the class itself or table properties
  155. // if table reader is already pinned.
  156. // 0 means the information is not available.
  157. uint64_t TryGetOldestAncesterTime() {
  158. if (oldest_ancester_time != kUnknownOldestAncesterTime) {
  159. return oldest_ancester_time;
  160. } else if (fd.table_reader != nullptr &&
  161. fd.table_reader->GetTableProperties() != nullptr) {
  162. return fd.table_reader->GetTableProperties()->creation_time;
  163. }
  164. return kUnknownOldestAncesterTime;
  165. }
  166. uint64_t TryGetFileCreationTime() {
  167. if (file_creation_time != kUnknownFileCreationTime) {
  168. return file_creation_time;
  169. } else if (fd.table_reader != nullptr &&
  170. fd.table_reader->GetTableProperties() != nullptr) {
  171. return fd.table_reader->GetTableProperties()->file_creation_time;
  172. }
  173. return kUnknownFileCreationTime;
  174. }
  175. };
  176. // A compressed copy of file meta data that just contain minimum data needed
  177. // to serve read operations, while still keeping the pointer to full metadata
  178. // of the file in case it is needed.
  179. struct FdWithKeyRange {
  180. FileDescriptor fd;
  181. FileMetaData* file_metadata; // Point to all metadata
  182. Slice smallest_key; // slice that contain smallest key
  183. Slice largest_key; // slice that contain largest key
  184. FdWithKeyRange()
  185. : fd(),
  186. file_metadata(nullptr),
  187. smallest_key(),
  188. largest_key() {
  189. }
  190. FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
  191. FileMetaData* _file_metadata)
  192. : fd(_fd),
  193. file_metadata(_file_metadata),
  194. smallest_key(_smallest_key),
  195. largest_key(_largest_key) {}
  196. };
  197. // Data structure to store an array of FdWithKeyRange in one level
  198. // Actual data is guaranteed to be stored closely
  199. struct LevelFilesBrief {
  200. size_t num_files;
  201. FdWithKeyRange* files;
  202. LevelFilesBrief() {
  203. num_files = 0;
  204. files = nullptr;
  205. }
  206. };
  207. // The state of a DB at any given time is referred to as a Version.
  208. // Any modification to the Version is considered a Version Edit. A Version is
  209. // constructed by joining a sequence of Version Edits. Version Edits are written
  210. // to the MANIFEST file.
  211. class VersionEdit {
  212. public:
  213. void Clear();
  214. void SetDBId(const std::string& db_id) {
  215. has_db_id_ = true;
  216. db_id_ = db_id;
  217. }
  218. bool HasDbId() const { return has_db_id_; }
  219. const std::string& GetDbId() const { return db_id_; }
  220. void SetComparatorName(const Slice& name) {
  221. has_comparator_ = true;
  222. comparator_ = name.ToString();
  223. }
  224. bool HasComparatorName() const { return has_comparator_; }
  225. const std::string& GetComparatorName() const { return comparator_; }
  226. void SetLogNumber(uint64_t num) {
  227. has_log_number_ = true;
  228. log_number_ = num;
  229. }
  230. bool HasLogNumber() const { return has_log_number_; }
  231. uint64_t GetLogNumber() const { return log_number_; }
  232. void SetPrevLogNumber(uint64_t num) {
  233. has_prev_log_number_ = true;
  234. prev_log_number_ = num;
  235. }
  236. bool HasPrevLogNumber() const { return has_prev_log_number_; }
  237. uint64_t GetPrevLogNumber() const { return prev_log_number_; }
  238. void SetNextFile(uint64_t num) {
  239. has_next_file_number_ = true;
  240. next_file_number_ = num;
  241. }
  242. bool HasNextFile() const { return has_next_file_number_; }
  243. uint64_t GetNextFile() const { return next_file_number_; }
  244. void SetMaxColumnFamily(uint32_t max_column_family) {
  245. has_max_column_family_ = true;
  246. max_column_family_ = max_column_family;
  247. }
  248. bool HasMaxColumnFamily() const { return has_max_column_family_; }
  249. uint32_t GetMaxColumnFamily() const { return max_column_family_; }
  250. void SetMinLogNumberToKeep(uint64_t num) {
  251. has_min_log_number_to_keep_ = true;
  252. min_log_number_to_keep_ = num;
  253. }
  254. bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
  255. uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
  256. void SetLastSequence(SequenceNumber seq) {
  257. has_last_sequence_ = true;
  258. last_sequence_ = seq;
  259. }
  260. bool HasLastSequence() const { return has_last_sequence_; }
  261. SequenceNumber GetLastSequence() const { return last_sequence_; }
  262. // Delete the specified "file" from the specified "level".
  263. void DeleteFile(int level, uint64_t file) {
  264. deleted_files_.emplace(level, file);
  265. }
  266. // Retrieve the files deleted as well as their associated levels.
  267. using DeletedFiles = std::set<std::pair<int, uint64_t>>;
  268. const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
  269. // Add the specified file at the specified level.
  270. // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  271. // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
  272. // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
  273. // referred to by this file if any, kInvalidBlobFileNumber otherwise.
  274. void AddFile(int level, uint64_t file, uint32_t file_path_id,
  275. uint64_t file_size, const InternalKey& smallest,
  276. const InternalKey& largest, const SequenceNumber& smallest_seqno,
  277. const SequenceNumber& largest_seqno, bool marked_for_compaction,
  278. uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time,
  279. uint64_t file_creation_time, const std::string& file_checksum,
  280. const std::string& file_checksum_func_name) {
  281. assert(smallest_seqno <= largest_seqno);
  282. new_files_.emplace_back(
  283. level, FileMetaData(file, file_path_id, file_size, smallest, largest,
  284. smallest_seqno, largest_seqno,
  285. marked_for_compaction, oldest_blob_file_number,
  286. oldest_ancester_time, file_creation_time,
  287. file_checksum, file_checksum_func_name));
  288. }
  289. void AddFile(int level, const FileMetaData& f) {
  290. assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
  291. new_files_.emplace_back(level, f);
  292. }
  293. // Retrieve the files added as well as their associated levels.
  294. using NewFiles = std::vector<std::pair<int, FileMetaData>>;
  295. const NewFiles& GetNewFiles() const { return new_files_; }
  296. // Number of edits
  297. size_t NumEntries() const { return new_files_.size() + deleted_files_.size(); }
  298. void SetColumnFamily(uint32_t column_family_id) {
  299. column_family_ = column_family_id;
  300. }
  301. uint32_t GetColumnFamily() const { return column_family_; }
  302. // set column family ID by calling SetColumnFamily()
  303. void AddColumnFamily(const std::string& name) {
  304. assert(!is_column_family_drop_);
  305. assert(!is_column_family_add_);
  306. assert(NumEntries() == 0);
  307. is_column_family_add_ = true;
  308. column_family_name_ = name;
  309. }
  310. // set column family ID by calling SetColumnFamily()
  311. void DropColumnFamily() {
  312. assert(!is_column_family_drop_);
  313. assert(!is_column_family_add_);
  314. assert(NumEntries() == 0);
  315. is_column_family_drop_ = true;
  316. }
  317. bool IsColumnFamilyManipulation() const {
  318. return is_column_family_add_ || is_column_family_drop_;
  319. }
  320. void MarkAtomicGroup(uint32_t remaining_entries) {
  321. is_in_atomic_group_ = true;
  322. remaining_entries_ = remaining_entries;
  323. }
  324. bool IsInAtomicGroup() const { return is_in_atomic_group_; }
  325. uint32_t GetRemainingEntries() const { return remaining_entries_; }
  326. // return true on success.
  327. bool EncodeTo(std::string* dst) const;
  328. Status DecodeFrom(const Slice& src);
  329. std::string DebugString(bool hex_key = false) const;
  330. std::string DebugJSON(int edit_num, bool hex_key = false) const;
  331. private:
  332. friend class ReactiveVersionSet;
  333. friend class VersionSet;
  334. friend class Version;
  335. friend class AtomicGroupReadBuffer;
  336. bool GetLevel(Slice* input, int* level, const char** msg);
  337. const char* DecodeNewFile4From(Slice* input);
  338. int max_level_ = 0;
  339. std::string db_id_;
  340. std::string comparator_;
  341. uint64_t log_number_ = 0;
  342. uint64_t prev_log_number_ = 0;
  343. uint64_t next_file_number_ = 0;
  344. uint32_t max_column_family_ = 0;
  345. // The most recent WAL log number that is deleted
  346. uint64_t min_log_number_to_keep_ = 0;
  347. SequenceNumber last_sequence_ = 0;
  348. bool has_db_id_ = false;
  349. bool has_comparator_ = false;
  350. bool has_log_number_ = false;
  351. bool has_prev_log_number_ = false;
  352. bool has_next_file_number_ = false;
  353. bool has_max_column_family_ = false;
  354. bool has_min_log_number_to_keep_ = false;
  355. bool has_last_sequence_ = false;
  356. DeletedFiles deleted_files_;
  357. NewFiles new_files_;
  358. // Each version edit record should have column_family_ set
  359. // If it's not set, it is default (0)
  360. uint32_t column_family_ = 0;
  361. // a version edit can be either column_family add or
  362. // column_family drop. If it's column family add,
  363. // it also includes column family name.
  364. bool is_column_family_drop_ = false;
  365. bool is_column_family_add_ = false;
  366. std::string column_family_name_;
  367. bool is_in_atomic_group_ = false;
  368. uint32_t remaining_entries_ = 0;
  369. };
  370. } // namespace ROCKSDB_NAMESPACE