version_set.h 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // The representation of a DBImpl consists of a set of Versions. The
  11. // newest version is called "current". Older versions may be kept
  12. // around to provide a consistent view to live iterators.
  13. //
  14. // Each Version keeps track of a set of Table files per level. The
  15. // entire set of versions is maintained in a VersionSet.
  16. //
  17. // Version,VersionSet are thread-compatible, but require external
  18. // synchronization on all accesses.
  19. #pragma once
  20. #include <atomic>
  21. #include <deque>
  22. #include <limits>
  23. #include <map>
  24. #include <memory>
  25. #include <set>
  26. #include <string>
  27. #include <utility>
  28. #include <vector>
  29. #include "db/column_family.h"
  30. #include "db/compaction/compaction.h"
  31. #include "db/compaction/compaction_picker.h"
  32. #include "db/dbformat.h"
  33. #include "db/file_indexer.h"
  34. #include "db/log_reader.h"
  35. #include "db/range_del_aggregator.h"
  36. #include "db/read_callback.h"
  37. #include "db/table_cache.h"
  38. #include "db/version_builder.h"
  39. #include "db/version_edit.h"
  40. #include "db/write_controller.h"
  41. #include "monitoring/instrumented_mutex.h"
  42. #include "options/db_options.h"
  43. #include "port/port.h"
  44. #include "rocksdb/env.h"
  45. #include "rocksdb/file_checksum.h"
  46. #include "table/get_context.h"
  47. #include "table/multiget_context.h"
  48. #include "trace_replay/block_cache_tracer.h"
  49. namespace ROCKSDB_NAMESPACE {
  50. namespace log {
  51. class Writer;
  52. }
  53. class Compaction;
  54. class LogBuffer;
  55. class LookupKey;
  56. class MemTable;
  57. class Version;
  58. class VersionSet;
  59. class WriteBufferManager;
  60. class MergeContext;
  61. class ColumnFamilySet;
  62. class MergeIteratorBuilder;
  63. // A VersionEdit is always supposed to be valid and is used to point at
  64. // entries in the Manifest. Ideally it should not be used as a container to
  65. // carry around a few of its fields as function params, because that can lead
  66. // readers to think it is a valid entry from the Manifest. To avoid that
  67. // confusion, VersionEditParams is introduced simply to carry around multiple
  68. // VersionEdit params. It need not point to a valid record in the Manifest.
  69. using VersionEditParams = VersionEdit;
  70. // Return the smallest index i such that file_level.files[i]->largest >= key.
  71. // Return file_level.num_files if there is no such file.
  72. // REQUIRES: "file_level.files" contains a sorted list of
  73. // non-overlapping files.
  // NOTE(review): the ">=" above is presumably evaluated with `icmp`; the
  // definition is not in this header -- confirm against the .cc file.
  74. extern int FindFile(const InternalKeyComparator& icmp,
  75. const LevelFilesBrief& file_level, const Slice& key);
  76. // Returns true iff some file in "file_level" overlaps the user key range
  77. // [*smallest_user_key,*largest_user_key].
  78. // smallest_user_key==nullptr represents a key smaller than all keys in the DB.
  79. // largest_user_key==nullptr represents a key larger than all keys in the DB.
  80. // REQUIRES: If disjoint_sorted_files, file_level.files[]
  81. // contains disjoint ranges in sorted order.
  82. extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
  83. bool disjoint_sorted_files,
  84. const LevelFilesBrief& file_level,
  85. const Slice* smallest_user_key,
  86. const Slice* largest_user_key);
  87. // Generate a LevelFilesBrief (*file_level) from a vector<FileMetaData*> (files).
  88. // Would copy smallest_key and largest_key data to sequential memory.
  89. // arena: Arena used to allocate that memory.
  90. extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
  91. const std::vector<FileMetaData*>& files,
  92. Arena* arena);
  93. // Information of the storage associated with each Version, including number of
  94. // levels of LSM tree, files information at each level, files marked for
  95. // compaction, etc.
  // The class is not copyable (copy constructor and copy assignment are
  // deleted). Version and VersionSet are declared friends at the bottom.
  96. class VersionStorageInfo {
  97. public:
  98. VersionStorageInfo(const InternalKeyComparator* internal_comparator,
  99. const Comparator* user_comparator, int num_levels,
  100. CompactionStyle compaction_style,
  101. VersionStorageInfo* src_vstorage,
  102. bool _force_consistency_checks);
  103. // No copying allowed
  104. VersionStorageInfo(const VersionStorageInfo&) = delete;
  105. void operator=(const VersionStorageInfo&) = delete;
  106. ~VersionStorageInfo();
  // Reserve capacity for `size` entries in the file list of `level`.
  107. void Reserve(int level, size_t size) { files_[level].reserve(size); }
  108. void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
  109. void SetFinalized();
  110. // Update num_non_empty_levels_.
  111. void UpdateNumNonEmptyLevels();
  // Rebuild file_indexer_ from files_, covering num_non_empty_levels_ levels
  // and allocating from arena_.
  112. void GenerateFileIndexer() {
  113. file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
  114. }
  115. // Update the accumulated stats from a file-meta.
  116. void UpdateAccumulatedStats(FileMetaData* file_meta);
  117. // Decrease the current stat from a to-be-deleted file-meta
  118. void RemoveCurrentStats(FileMetaData* file_meta);
  119. void ComputeCompensatedSizes();
  120. // Updates internal structures that keep track of compaction scores
  121. // We use compaction scores to figure out which compaction to do next
  122. // REQUIRES: db_mutex held!!
  123. // TODO find a better way to pass compaction_options_fifo.
  124. void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options,
  125. const MutableCFOptions& mutable_cf_options);
  126. // Estimate estimated_compaction_needed_bytes_
  127. void EstimateCompactionBytesNeeded(
  128. const MutableCFOptions& mutable_cf_options);
  129. // This computes files_marked_for_compaction_ and is called by
  130. // ComputeCompactionScore()
  131. void ComputeFilesMarkedForCompaction();
  132. // This computes expired_ttl_files_ and is called by
  133. // ComputeCompactionScore()
  134. void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions,
  135. const uint64_t ttl);
  136. // This computes files_marked_for_periodic_compaction_ and is called by
  137. // ComputeCompactionScore()
  138. void ComputeFilesMarkedForPeriodicCompaction(
  139. const ImmutableCFOptions& ioptions,
  140. const uint64_t periodic_compaction_seconds);
  141. // This computes bottommost_files_marked_for_compaction_ and is called by
  142. // ComputeCompactionScore() or UpdateOldestSnapshot().
  143. //
  144. // Among bottommost files (assumes they've already been computed), marks the
  145. // ones that have keys that would be eliminated if recompacted, according to
  146. // the seqnum of the oldest existing snapshot. Must be called every time
  147. // oldest snapshot changes as that is when bottom-level files can become
  148. // eligible for compaction.
  149. //
  150. // REQUIRES: DB mutex held
  151. void ComputeBottommostFilesMarkedForCompaction();
  152. // Generate level_files_brief_ from files_
  153. void GenerateLevelFilesBrief();
  154. // Sort all files for this version based on their file size and
  155. // record results in files_by_compaction_pri_. The largest files are listed
  156. // first.
  157. void UpdateFilesByCompactionPri(CompactionPri compaction_pri);
  158. void GenerateLevel0NonOverlapping();
  // Returns level0_non_overlapping_ (true means L0 files have non-overlapping
  // key ranges, per the member's own comment).
  159. bool level0_non_overlapping() const {
  160. return level0_non_overlapping_;
  161. }
  162. // Check whether each file in this version is bottommost (i.e., nothing in its
  163. // key-range could possibly exist in an older file/level).
  164. // REQUIRES: This version has not been saved
  165. void GenerateBottommostFiles();
  166. // Updates the oldest snapshot and related internal state, like the bottommost
  167. // files marked for compaction.
  168. // REQUIRES: DB mutex held
  169. void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
  170. int MaxInputLevel() const;
  171. int MaxOutputLevel(bool allow_ingest_behind) const;
  172. // Return level number that has idx'th highest score
  173. int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
  174. // Return idx'th highest score
  175. double CompactionScore(int idx) const { return compaction_score_[idx]; }
  176. void GetOverlappingInputs(
  177. int level, const InternalKey* begin, // nullptr means before all keys
  178. const InternalKey* end, // nullptr means after all keys
  179. std::vector<FileMetaData*>* inputs,
  180. int hint_index = -1, // index of overlap file
  181. int* file_index = nullptr, // return index of overlap file
  182. bool expand_range = true, // if set, returns files which overlap the
  183. // range and overlap each other. If false,
  184. // then just files intersecting the range
  185. InternalKey** next_smallest = nullptr) // if non-null, returns the
  186. const; // smallest key of next file not included
  187. void GetCleanInputsWithinInterval(
  188. int level, const InternalKey* begin, // nullptr means before all keys
  189. const InternalKey* end, // nullptr means after all keys
  190. std::vector<FileMetaData*>* inputs,
  191. int hint_index = -1, // index of overlap file
  192. int* file_index = nullptr) // return index of overlap file
  193. const;
  194. void GetOverlappingInputsRangeBinarySearch(
  195. int level, // level > 0
  196. const InternalKey* begin, // nullptr means before all keys
  197. const InternalKey* end, // nullptr means after all keys
  198. std::vector<FileMetaData*>* inputs,
  199. int hint_index, // index of overlap file
  200. int* file_index, // return index of overlap file
  201. bool within_interval = false, // if set, force the inputs within interval
  202. InternalKey** next_smallest = nullptr) // if non-null, returns the
  203. const; // smallest key of next file not included
  204. // Returns true iff some file in the specified level overlaps
  205. // some part of [*smallest_user_key,*largest_user_key].
  206. // smallest_user_key==nullptr represents a key smaller than all keys in the DB.
  207. // largest_user_key==nullptr represents a key larger than all keys in the DB.
  208. bool OverlapInLevel(int level, const Slice* smallest_user_key,
  209. const Slice* largest_user_key);
  210. // Returns true iff the first or last file in inputs contains
  211. // an overlapping user key to the file "just outside" of it (i.e.
  212. // just after the last file, or just before the first file)
  213. // REQUIRES: "*inputs" is a sorted list of non-overlapping files
  214. bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
  215. int level);
  216. int num_levels() const { return num_levels_; }
  217. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  218. int num_non_empty_levels() const {
  219. assert(finalized_);
  220. return num_non_empty_levels_;
  221. }
  222. // REQUIRES: This version has been finalized.
  223. // (CalculateBaseBytes() is called)
  224. // This may or may not return number of level files. It is to keep backward
  225. // compatible behavior in universal compaction.
  226. int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
  227. void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
  228. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  229. int NumLevelFiles(int level) const {
  230. assert(finalized_);
  231. return static_cast<int>(files_[level].size());
  232. }
  233. // Return the combined file size of all files at the specified level.
  234. uint64_t NumLevelBytes(int level) const;
  235. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  // NOTE(review): unlike the neighbouring accessors, this one does not
  // assert(finalized_).
  236. const std::vector<FileMetaData*>& LevelFiles(int level) const {
  237. return files_[level];
  238. }
  // Returns the brief metadata for `level`; asserts that `level` is within
  // level_files_brief_.
  239. const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
  240. assert(level < static_cast<int>(level_files_brief_.size()));
  241. return level_files_brief_[level];
  242. }
  243. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  244. const std::vector<int>& FilesByCompactionPri(int level) const {
  245. assert(finalized_);
  246. return files_by_compaction_pri_[level];
  247. }
  248. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  249. // REQUIRES: DB mutex held during access
  250. const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
  251. const {
  252. assert(finalized_);
  253. return files_marked_for_compaction_;
  254. }
  255. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  256. // REQUIRES: DB mutex held during access
  257. const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
  258. assert(finalized_);
  259. return expired_ttl_files_;
  260. }
  261. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  262. // REQUIRES: DB mutex held during access
  263. const autovector<std::pair<int, FileMetaData*>>&
  264. FilesMarkedForPeriodicCompaction() const {
  265. assert(finalized_);
  266. return files_marked_for_periodic_compaction_;
  267. }
  // Test hook: appends (level, f) to files_marked_for_periodic_compaction_.
  268. void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
  269. files_marked_for_periodic_compaction_.emplace_back(level, f);
  270. }
  271. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  272. // REQUIRES: DB mutex held during access
  273. const autovector<std::pair<int, FileMetaData*>>&
  274. BottommostFilesMarkedForCompaction() const {
  275. assert(finalized_);
  276. return bottommost_files_marked_for_compaction_;
  277. }
  278. int base_level() const { return base_level_; }
  279. double level_multiplier() const { return level_multiplier_; }
  280. // REQUIRES: lock is held
  281. // Set the index that is used to offset into files_by_compaction_pri_ to find
  282. // the next compaction candidate file.
  283. void SetNextCompactionIndex(int level, int index) {
  284. next_file_to_compact_by_size_[level] = index;
  285. }
  286. // REQUIRES: lock is held
  287. int NextCompactionIndex(int level) const {
  288. return next_file_to_compact_by_size_[level];
  289. }
  290. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  291. const FileIndexer& file_indexer() const {
  292. assert(finalized_);
  293. return file_indexer_;
  294. }
  295. // Only the first few entries of files_by_compaction_pri_ are sorted.
  296. // There is no need to sort all the files because it is likely
  297. // that on a running system, we need to look at only the first
  298. // few largest files because a new version is created every few
  299. // seconds/minutes (because of concurrent compactions).
  300. static const size_t kNumberFilesToSort = 50;
  301. // Return a human-readable short (single-line) summary of the number
  302. // of files per level. Uses *scratch as backing store.
  // (The comment above describes LevelSummary(), declared after the two
  // fixed-size scratch buffer types below.)
  303. struct LevelSummaryStorage {
  304. char buffer[1000];
  305. };
  306. struct FileSummaryStorage {
  307. char buffer[3000];
  308. };
  309. const char* LevelSummary(LevelSummaryStorage* scratch) const;
  310. // Return a human-readable short (single-line) summary of files
  311. // in a specified level. Uses *scratch as backing store.
  312. const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
  313. // Return the maximum overlapping data (in bytes) at next level for any
  314. // file at a level >= 1.
  315. int64_t MaxNextLevelOverlappingBytes();
  316. // Return a human readable string that describes this version's contents.
  317. std::string DebugString(bool hex = false) const;
  // Returns an average value size derived from the accumulated sample stats,
  // or 0 when no non-deletion entries have been accumulated.
  // The expression is evaluated left to right in uint64_t arithmetic, so the
  // per-entry raw value size is truncated by the first division before it is
  // scaled by accumulated_file_size_ / (raw key size + raw value size).
  318. uint64_t GetAverageValueSize() const {
  319. if (accumulated_num_non_deletions_ == 0) {
  320. return 0;
  321. }
  322. assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
  323. assert(accumulated_file_size_ > 0);
  324. return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
  325. accumulated_file_size_ /
  326. (accumulated_raw_key_size_ + accumulated_raw_value_size_);
  327. }
  328. uint64_t GetEstimatedActiveKeys() const;
  329. double GetEstimatedCompressionRatioAtLevel(int level) const;
  330. // re-initializes the index that is used to offset into
  331. // files_by_compaction_pri_
  332. // to find the next compaction candidate file.
  333. void ResetNextCompactionIndex(int level) {
  334. next_file_to_compact_by_size_[level] = 0;
  335. }
  // Returns the internal key comparator pointer stored in this object.
  336. const InternalKeyComparator* InternalComparator() {
  337. return internal_comparator_;
  338. }
  339. // Returns maximum total bytes of data on a given level.
  340. uint64_t MaxBytesForLevel(int level) const;
  341. // Must be called after any change to MutableCFOptions.
  342. void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
  343. const MutableCFOptions& options);
  344. // Returns an estimate of the amount of live data in bytes.
  345. uint64_t EstimateLiveDataSize() const;
  346. uint64_t estimated_compaction_needed_bytes() const {
  347. return estimated_compaction_needed_bytes_;
  348. }
  // Test hook: overwrites estimated_compaction_needed_bytes_ with `v`.
  349. void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
  350. estimated_compaction_needed_bytes_ = v;
  351. }
  352. bool force_consistency_checks() const { return force_consistency_checks_; }
  353. SequenceNumber bottommost_files_mark_threshold() const {
  354. return bottommost_files_mark_threshold_;
  355. }
  356. // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
  357. // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
  358. //
  359. // @param last_level Level after which we check for overlap
  360. // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
  361. // check for overlap; otherwise, must be -1
  362. bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
  363. const Slice& largest_user_key,
  364. int last_level, int last_l0_idx);
  365. private:
  366. const InternalKeyComparator* internal_comparator_;
  367. const Comparator* user_comparator_;
  368. int num_levels_; // Number of levels
  369. int num_non_empty_levels_; // Number of levels. Any level larger than it
  370. // is guaranteed to be empty.
  371. // Per-level max bytes
  372. std::vector<uint64_t> level_max_bytes_;
  373. // A short brief metadata of files per level
  374. autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
  375. FileIndexer file_indexer_;
  376. Arena arena_; // Used to allocate space for file_levels_
  // NOTE(review): no member named file_levels_ exists in this class;
  // presumably level_files_brief_ is meant -- confirm. arena_ is also passed
  // to file_indexer_.UpdateIndex() in GenerateFileIndexer().
  377. CompactionStyle compaction_style_;
  378. // List of files per level, files in each level are arranged
  379. // in increasing order of keys
  380. std::vector<FileMetaData*>* files_;
  381. // Level that L0 data should be compacted to. All levels < base_level_ should
  382. // be empty. -1 if it is not level-compaction so it's not applicable.
  383. int base_level_;
  384. double level_multiplier_;
  385. // A list for the same set of files that are stored in files_,
  386. // but files in each level are now sorted based on file
  387. // size. The file with the largest size is at the front.
  388. // This vector stores the index of the file from files_.
  389. std::vector<std::vector<int>> files_by_compaction_pri_;
  390. // If true, means that files in L0 have keys with non overlapping ranges
  391. bool level0_non_overlapping_;
  392. // An index into files_by_compaction_pri_ that specifies the first
  393. // file that is not yet compacted
  394. std::vector<int> next_file_to_compact_by_size_;
  395. // Only the first few entries of files_by_compaction_pri_ are sorted.
  396. // There is no need to sort all the files because it is likely
  397. // that on a running system, we need to look at only the first
  398. // few largest files because a new version is created every few
  399. // seconds/minutes (because of concurrent compactions).
  // NOTE(review): same value and same comment as the public
  // kNumberFilesToSort above; the two constants duplicate each other.
  400. static const size_t number_of_files_to_sort_ = 50;
  401. // This vector contains list of files marked for compaction and also not
  402. // currently being compacted. It is protected by DB mutex. It is calculated in
  403. // ComputeCompactionScore()
  404. autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
  405. autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
  406. autovector<std::pair<int, FileMetaData*>>
  407. files_marked_for_periodic_compaction_;
  408. // These files are considered bottommost because none of their keys can exist
  409. // at lower levels. They are not necessarily all in the same level. The marked
  410. // ones are eligible for compaction because they contain duplicate key
  411. // versions that are no longer protected by snapshot. These variables are
  412. // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
  413. // `ComputeBottommostFilesMarkedForCompaction()`.
  414. autovector<std::pair<int, FileMetaData*>> bottommost_files_;
  415. autovector<std::pair<int, FileMetaData*>>
  416. bottommost_files_marked_for_compaction_;
  417. // Threshold for needing to mark another bottommost file. Maintain it so we
  418. // can quickly check when releasing a snapshot whether more bottommost files
  419. // became eligible for compaction. It's defined as the min of the max nonzero
  420. // seqnums of unmarked bottommost files.
  421. SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
  422. // Monotonically increases as we release old snapshots. Zero indicates no
  423. // snapshots have been released yet. When no snapshots remain we set it to the
  424. // current seqnum, which needs to be protected as a snapshot can still be
  425. // created that references it.
  426. SequenceNumber oldest_snapshot_seqnum_ = 0;
  427. // Level that should be compacted next and its compaction score.
  428. // Score < 1 means compaction is not strictly needed. These fields
  429. // are initialized by Finalize().
  // NOTE(review): no Finalize() is declared in this class; presumably this
  // refers to ComputeCompactionScore() -- confirm.
  430. // The most critical level to be compacted is listed first
  431. // These are used to pick the best compaction level
  432. std::vector<double> compaction_score_;
  433. std::vector<int> compaction_level_;
  434. int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop
  435. // for number of L0 files.
  436. // the following are the sampled temporary stats.
  437. // the current accumulated size of sampled files.
  438. uint64_t accumulated_file_size_;
  439. // the current accumulated size of all raw keys based on the sampled files.
  440. uint64_t accumulated_raw_key_size_;
  441. // the current accumulated size of all raw values based on the sampled files.
  442. uint64_t accumulated_raw_value_size_;
  443. // total number of non-deletion entries
  444. uint64_t accumulated_num_non_deletions_;
  445. // total number of deletion entries
  446. uint64_t accumulated_num_deletions_;
  447. // current number of non_deletion entries
  448. uint64_t current_num_non_deletions_;
  449. // current number of deletion entries
  450. uint64_t current_num_deletions_;
  451. // current number of file samples
  452. uint64_t current_num_samples_;
  453. // Estimated bytes needed to be compacted until all levels' size is down to
  454. // target sizes.
  455. uint64_t estimated_compaction_needed_bytes_;
  // Checked by assert() in the accessors above that require a saved version.
  456. bool finalized_;
  457. // If set to true, we will run consistency checks even if RocksDB
  458. // is compiled in release mode
  459. bool force_consistency_checks_;
  460. friend class Version;
  461. friend class VersionSet;
  462. };
  // Shorthand for MultiGetContext::Range, the range type taken by
  // Version::MultiGet() below.
  463. using MultiGetRange = MultiGetContext::Range;
  464. // A column family's version consists of the SST files owned by the column
  465. // family at a certain point in time.
  466. class Version {
  467. public:
  468. // Append to *iters a sequence of iterators that will
  469. // yield the contents of this Version when merged together.
  470. // REQUIRES: This version has been saved (see VersionSet::SaveTo)
  471. void AddIterators(const ReadOptions&, const FileOptions& soptions,
  472. MergeIteratorBuilder* merger_iter_builder,
  473. RangeDelAggregator* range_del_agg);
  474. void AddIteratorsForLevel(const ReadOptions&, const FileOptions& soptions,
  475. MergeIteratorBuilder* merger_iter_builder,
  476. int level, RangeDelAggregator* range_del_agg);
  477. Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
  478. const Slice& smallest_user_key,
  479. const Slice& largest_user_key,
  480. int level, bool* overlap);
  481. // Lookup the value for key or get all merge operands for key.
  482. // If do_merge = true (default) then lookup value for key.
  483. // Behavior if do_merge = true:
  484. // If found, store it in *value and
  485. // return OK. Else return a non-OK status.
  486. // Uses *operands to store merge_operator operations to apply later.
  487. //
  488. // If the ReadOptions.read_tier is set to do a read-only fetch, then
  489. // *value_found will be set to false if it cannot be determined whether
  490. // this value exists without doing IO.
  491. //
  492. // If the key is Deleted, *status will be set to NotFound and
  493. // *key_exists will be set to true.
  494. // If no key was found, *status will be set to NotFound and
  495. // *key_exists will be set to false.
  496. // If seq is non-null, *seq will be set to the sequence number found
  497. // for the key if a key was found.
  498. // Behavior if do_merge = false
  499. // If the key has any merge operands then store them in
  500. // merge_context.operands_list and don't merge the operands
  501. // REQUIRES: lock is not held
  502. void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
  503. Status* status, MergeContext* merge_context,
  504. SequenceNumber* max_covering_tombstone_seq,
  505. bool* value_found = nullptr, bool* key_exists = nullptr,
  506. SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
  507. bool* is_blob = nullptr, bool do_merge = true);
  508. void MultiGet(const ReadOptions&, MultiGetRange* range,
  509. ReadCallback* callback = nullptr, bool* is_blob = nullptr);
  510. // Loads some stats information from files. Call without mutex held. It needs
  511. // to be called before applying the version to the version set.
  512. void PrepareApply(const MutableCFOptions& mutable_cf_options,
  513. bool update_stats);
  514. // Reference count management (so Versions do not disappear out from
  515. // under live iterators)
  516. void Ref();
  517. // Decrease reference count. Delete the object if no reference left
  518. // and return true. Otherwise, return false.
  519. bool Unref();
  520. // Add all files listed in the current version to *live.
  521. void AddLiveFiles(std::vector<FileDescriptor>* live);
  522. // Return a human readable string that describes this version's contents.
  523. std::string DebugString(bool hex = false, bool print_stats = false) const;
  524. // Returns the version number of this version
  525. uint64_t GetVersionNumber() const { return version_number_; }
  526. // REQUIRES: lock is held
  527. // On success, "tp" will contains the table properties of the file
  528. // specified in "file_meta". If the file name of "file_meta" is
  529. // known ahead, passing it by a non-null "fname" can save a
  530. // file-name conversion.
  531. Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
  532. const FileMetaData* file_meta,
  533. const std::string* fname = nullptr) const;
  534. // REQUIRES: lock is held
  535. // On success, *props will be populated with all SSTables' table properties.
  536. // The keys of `props` are the sst file name, the values of `props` are the
  537. // tables' properties, represented as std::shared_ptr.
  538. Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
  539. Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
  540. Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
  541. TablePropertiesCollection* props) const;
  542. // Print summary of range delete tombstones in SST files into out_str,
  543. // with maximum max_entries_to_print entries printed out.
  544. Status TablesRangeTombstoneSummary(int max_entries_to_print,
  545. std::string* out_str);
  546. // REQUIRES: lock is held
  // On success, "tp" will contain the aggregated table property among
  548. // the table properties of all sst files in this version.
  549. Status GetAggregatedTableProperties(
  550. std::shared_ptr<const TableProperties>* tp, int level = -1);
  551. uint64_t GetEstimatedActiveKeys() {
  552. return storage_info_.GetEstimatedActiveKeys();
  553. }
  554. size_t GetMemoryUsageByTableReaders();
  555. ColumnFamilyData* cfd() const { return cfd_; }
  556. // Return the next Version in the linked list. Used for debug only
  557. Version* TEST_Next() const {
  558. return next_;
  559. }
  560. int TEST_refs() const { return refs_; }
  561. VersionStorageInfo* storage_info() { return &storage_info_; }
  562. VersionSet* version_set() { return vset_; }
  563. void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
  564. uint64_t GetSstFilesSize();
  565. // Retrieves the file_creation_time of the oldest file in the DB.
  566. // Prerequisite for this API is max_open_files = -1
  567. void GetCreationTimeOfOldestFile(uint64_t* creation_time);
  568. const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
  569. private:
  570. Env* env_;
  571. FileSystem* fs_;
  572. friend class ReactiveVersionSet;
  573. friend class VersionSet;
  574. const InternalKeyComparator* internal_comparator() const {
  575. return storage_info_.internal_comparator_;
  576. }
  577. const Comparator* user_comparator() const {
  578. return storage_info_.user_comparator_;
  579. }
  580. bool PrefixMayMatch(const ReadOptions& read_options,
  581. InternalIterator* level_iter,
  582. const Slice& internal_prefix) const;
  583. // Returns true if the filter blocks in the specified level will not be
  584. // checked during read operations. In certain cases (trivial move or preload),
  585. // the filter block may already be cached, but we still do not access it such
  586. // that it eventually expires from the cache.
  587. bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
  588. // The helper function of UpdateAccumulatedStats, which may fill the missing
  589. // fields of file_meta from its associated TableProperties.
  590. // Returns true if it does initialize FileMetaData.
  591. bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
  592. // Update the accumulated stats associated with the current version.
  593. // This accumulated stats will be used in compaction.
  594. void UpdateAccumulatedStats(bool update_stats);
  595. // Sort all files for this version based on their file size and
  596. // record results in files_by_compaction_pri_. The largest files are listed
  597. // first.
  598. void UpdateFilesByCompactionPri();
  599. ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
  600. Logger* info_log_;
  601. Statistics* db_statistics_;
  602. TableCache* table_cache_;
  603. const MergeOperator* merge_operator_;
  604. VersionStorageInfo storage_info_;
  605. VersionSet* vset_; // VersionSet to which this Version belongs
  606. Version* next_; // Next version in linked list
  607. Version* prev_; // Previous version in linked list
  608. int refs_; // Number of live refs to this version
  609. const FileOptions file_options_;
  610. const MutableCFOptions mutable_cf_options_;
  611. // A version number that uniquely represents this version. This is
  612. // used for debugging and logging purposes only.
  613. uint64_t version_number_;
  614. Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
  615. MutableCFOptions mutable_cf_options, uint64_t version_number = 0);
  616. ~Version();
  617. // No copying allowed
  618. Version(const Version&) = delete;
  619. void operator=(const Version&) = delete;
  620. };
  621. struct ObsoleteFileInfo {
  622. FileMetaData* metadata;
  623. std::string path;
  624. ObsoleteFileInfo() noexcept : metadata(nullptr) {}
  625. ObsoleteFileInfo(FileMetaData* f, const std::string& file_path)
  626. : metadata(f), path(file_path) {}
  627. ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
  628. ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
  629. ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept :
  630. ObsoleteFileInfo() {
  631. *this = std::move(rhs);
  632. }
  633. ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
  634. path = std::move(rhs.path);
  635. metadata = rhs.metadata;
  636. rhs.metadata = nullptr;
  637. return *this;
  638. }
  639. void DeleteMetadata() {
  640. delete metadata;
  641. metadata = nullptr;
  642. }
  643. };
  644. class BaseReferencedVersionBuilder;
  645. class AtomicGroupReadBuffer {
  646. public:
  647. Status AddEdit(VersionEdit* edit);
  648. void Clear();
  649. bool IsFull() const;
  650. bool IsEmpty() const;
  651. uint64_t TEST_read_edits_in_atomic_group() const {
  652. return read_edits_in_atomic_group_;
  653. }
  654. std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
  655. private:
  656. uint64_t read_edits_in_atomic_group_ = 0;
  657. std::vector<VersionEdit> replay_buffer_;
  658. };
  659. // VersionSet is the collection of versions of all the column families of the
  660. // database. Each database owns one VersionSet. A VersionSet has access to all
  661. // column families via ColumnFamilySet, i.e. set of the column families.
  662. class VersionSet {
  663. public:
  664. VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
  665. const FileOptions& file_options, Cache* table_cache,
  666. WriteBufferManager* write_buffer_manager,
  667. WriteController* write_controller,
  668. BlockCacheTracer* const block_cache_tracer);
  669. // No copying allowed
  670. VersionSet(const VersionSet&) = delete;
  671. void operator=(const VersionSet&) = delete;
  672. virtual ~VersionSet();
  673. // Apply *edit to the current version to form a new descriptor that
  674. // is both saved to persistent state and installed as the new
  675. // current version. Will release *mu while actually writing to the file.
  676. // column_family_options has to be set if edit is column family add
  677. // REQUIRES: *mu is held on entry.
  678. // REQUIRES: no other thread concurrently calls LogAndApply()
  679. Status LogAndApply(
  680. ColumnFamilyData* column_family_data,
  681. const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
  682. InstrumentedMutex* mu, Directory* db_directory = nullptr,
  683. bool new_descriptor_log = false,
  684. const ColumnFamilyOptions* column_family_options = nullptr) {
  685. autovector<ColumnFamilyData*> cfds;
  686. cfds.emplace_back(column_family_data);
  687. autovector<const MutableCFOptions*> mutable_cf_options_list;
  688. mutable_cf_options_list.emplace_back(&mutable_cf_options);
  689. autovector<autovector<VersionEdit*>> edit_lists;
  690. autovector<VersionEdit*> edit_list;
  691. edit_list.emplace_back(edit);
  692. edit_lists.emplace_back(edit_list);
  693. return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
  694. db_directory, new_descriptor_log, column_family_options);
  695. }
  696. // The batch version. If edit_list.size() > 1, caller must ensure that
  697. // no edit in the list column family add or drop
  698. Status LogAndApply(
  699. ColumnFamilyData* column_family_data,
  700. const MutableCFOptions& mutable_cf_options,
  701. const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
  702. Directory* db_directory = nullptr, bool new_descriptor_log = false,
  703. const ColumnFamilyOptions* column_family_options = nullptr) {
  704. autovector<ColumnFamilyData*> cfds;
  705. cfds.emplace_back(column_family_data);
  706. autovector<const MutableCFOptions*> mutable_cf_options_list;
  707. mutable_cf_options_list.emplace_back(&mutable_cf_options);
  708. autovector<autovector<VersionEdit*>> edit_lists;
  709. edit_lists.emplace_back(edit_list);
  710. return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
  711. db_directory, new_descriptor_log, column_family_options);
  712. }
  713. // The across-multi-cf batch version. If edit_lists contain more than
  714. // 1 version edits, caller must ensure that no edit in the []list is column
  715. // family manipulation.
  716. virtual Status LogAndApply(
  717. const autovector<ColumnFamilyData*>& cfds,
  718. const autovector<const MutableCFOptions*>& mutable_cf_options_list,
  719. const autovector<autovector<VersionEdit*>>& edit_lists,
  720. InstrumentedMutex* mu, Directory* db_directory = nullptr,
  721. bool new_descriptor_log = false,
  722. const ColumnFamilyOptions* new_cf_options = nullptr);
  723. static Status GetCurrentManifestPath(const std::string& dbname,
  724. FileSystem* fs,
  725. std::string* manifest_filename,
  726. uint64_t* manifest_file_number);
  727. // Recover the last saved descriptor from persistent storage.
  728. // If read_only == true, Recover() will not complain if some column families
  729. // are not opened
  730. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
  731. bool read_only = false, std::string* db_id = nullptr);
  732. // Reads a manifest file and returns a list of column families in
  733. // column_families.
  734. static Status ListColumnFamilies(std::vector<std::string>* column_families,
  735. const std::string& dbname, FileSystem* fs);
  736. #ifndef ROCKSDB_LITE
  737. // Try to reduce the number of levels. This call is valid when
  738. // only one level from the new max level to the old
  739. // max level containing files.
  740. // The call is static, since number of levels is immutable during
  741. // the lifetime of a RocksDB instance. It reduces number of levels
  742. // in a DB by applying changes to manifest.
  743. // For example, a db currently has 7 levels [0-6], and a call to
  744. // to reduce to 5 [0-4] can only be executed when only one level
  745. // among [4-6] contains files.
  746. static Status ReduceNumberOfLevels(const std::string& dbname,
  747. const Options* options,
  748. const FileOptions& file_options,
  749. int new_levels);
  750. // Get the checksum information of all live files
  751. Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
  752. // printf contents (for debugging)
  753. Status DumpManifest(Options& options, std::string& manifestFileName,
  754. bool verbose, bool hex = false, bool json = false);
  755. #endif // ROCKSDB_LITE
  756. // Return the current manifest file number
  757. uint64_t manifest_file_number() const { return manifest_file_number_; }
  758. uint64_t options_file_number() const { return options_file_number_; }
  759. uint64_t pending_manifest_file_number() const {
  760. return pending_manifest_file_number_;
  761. }
  762. uint64_t current_next_file_number() const { return next_file_number_.load(); }
  763. uint64_t min_log_number_to_keep_2pc() const {
  764. return min_log_number_to_keep_2pc_.load();
  765. }
  766. // Allocate and return a new file number
  767. uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
  768. // Fetch And Add n new file number
  769. uint64_t FetchAddFileNumber(uint64_t n) {
  770. return next_file_number_.fetch_add(n);
  771. }
  772. // Return the last sequence number.
  773. uint64_t LastSequence() const {
  774. return last_sequence_.load(std::memory_order_acquire);
  775. }
  776. // Note: memory_order_acquire must be sufficient.
  777. uint64_t LastAllocatedSequence() const {
  778. return last_allocated_sequence_.load(std::memory_order_seq_cst);
  779. }
  780. // Note: memory_order_acquire must be sufficient.
  781. uint64_t LastPublishedSequence() const {
  782. return last_published_sequence_.load(std::memory_order_seq_cst);
  783. }
  784. // Set the last sequence number to s.
  785. void SetLastSequence(uint64_t s) {
  786. assert(s >= last_sequence_);
  787. // Last visible sequence must always be less than last written seq
  788. assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
  789. last_sequence_.store(s, std::memory_order_release);
  790. }
  791. // Note: memory_order_release must be sufficient
  792. void SetLastPublishedSequence(uint64_t s) {
  793. assert(s >= last_published_sequence_);
  794. last_published_sequence_.store(s, std::memory_order_seq_cst);
  795. }
  796. // Note: memory_order_release must be sufficient
  797. void SetLastAllocatedSequence(uint64_t s) {
  798. assert(s >= last_allocated_sequence_);
  799. last_allocated_sequence_.store(s, std::memory_order_seq_cst);
  800. }
  801. // Note: memory_order_release must be sufficient
  802. uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
  803. return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
  804. }
  805. // Mark the specified file number as used.
  806. // REQUIRED: this is only called during single-threaded recovery or repair.
  807. void MarkFileNumberUsed(uint64_t number);
  808. // Mark the specified log number as deleted
  809. // REQUIRED: this is only called during single-threaded recovery or repair, or
  810. // from ::LogAndApply where the global mutex is held.
  811. void MarkMinLogNumberToKeep2PC(uint64_t number);
  812. // Return the log file number for the log file that is currently
  813. // being compacted, or zero if there is no such log file.
  814. uint64_t prev_log_number() const { return prev_log_number_; }
  815. // Returns the minimum log number which still has data not flushed to any SST
  816. // file.
  817. // In non-2PC mode, all the log numbers smaller than this number can be safely
  818. // deleted.
  819. uint64_t MinLogNumberWithUnflushedData() const {
  820. return PreComputeMinLogNumberWithUnflushedData(nullptr);
  821. }
  822. // Returns the minimum log number which still has data not flushed to any SST
  823. // file, except data from `cfd_to_skip`.
  824. uint64_t PreComputeMinLogNumberWithUnflushedData(
  825. const ColumnFamilyData* cfd_to_skip) const {
  826. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
  827. for (auto cfd : *column_family_set_) {
  828. if (cfd == cfd_to_skip) {
  829. continue;
  830. }
  831. // It's safe to ignore dropped column families here:
  832. // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
  833. if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
  834. min_log_num = cfd->GetLogNumber();
  835. }
  836. }
  837. return min_log_num;
  838. }
  839. // Create an iterator that reads over the compaction inputs for "*c".
  840. // The caller should delete the iterator when no longer needed.
  841. InternalIterator* MakeInputIterator(
  842. const Compaction* c, RangeDelAggregator* range_del_agg,
  843. const FileOptions& file_options_compactions);
  844. // Add all files listed in any live version to *live.
  845. void AddLiveFiles(std::vector<FileDescriptor>* live_list);
  846. // Return the approximate size of data to be scanned for range [start, end)
  847. // in levels [start_level, end_level). If end_level == -1 it will search
  848. // through all non-empty levels
  849. uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
  850. const Slice& start, const Slice& end,
  851. int start_level, int end_level,
  852. TableReaderCaller caller);
  853. // Return the size of the current manifest file
  854. uint64_t manifest_file_size() const { return manifest_file_size_; }
  855. // verify that the files that we started with for a compaction
  856. // still exist in the current version and in the same original level.
  857. // This ensures that a concurrent compaction did not erroneously
  858. // pick the same files to compact.
  859. bool VerifyCompactionFileConsistency(Compaction* c);
  860. Status GetMetadataForFile(uint64_t number, int* filelevel,
  861. FileMetaData** metadata, ColumnFamilyData** cfd);
  862. // This function doesn't support leveldb SST filenames
  863. void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
  864. void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
  865. std::vector<std::string>* manifest_filenames,
  866. uint64_t min_pending_output);
  867. ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
  868. const FileOptions& file_options() { return file_options_; }
  869. void ChangeFileOptions(const MutableDBOptions& new_options) {
  870. file_options_.writable_file_max_buffer_size =
  871. new_options.writable_file_max_buffer_size;
  872. }
  873. const ImmutableDBOptions* db_options() const { return db_options_; }
  874. static uint64_t GetNumLiveVersions(Version* dummy_versions);
  875. static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
  876. protected:
  877. struct ManifestWriter;
  878. friend class Version;
  879. friend class DBImpl;
  880. friend class DBImplReadOnly;
  881. struct LogReporter : public log::Reader::Reporter {
  882. Status* status;
  883. virtual void Corruption(size_t /*bytes*/, const Status& s) override {
  884. if (this->status->ok()) *this->status = s;
  885. }
  886. };
  887. // Returns approximated offset of a key in a file for a given version.
  888. uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
  889. const Slice& key, TableReaderCaller caller);
  890. // Returns approximated data size between start and end keys in a file
  891. // for a given version.
  892. uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
  893. const Slice& start, const Slice& end,
  894. TableReaderCaller caller);
  895. struct MutableCFState {
  896. uint64_t log_number;
  897. };
  898. // Save current contents to *log
  899. Status WriteCurrentStateToManifest(
  900. const std::unordered_map<uint32_t, MutableCFState>& curr_state,
  901. log::Writer* log);
  902. void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
  903. ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
  904. VersionEdit* edit);
  905. Status ReadAndRecover(
  906. log::Reader* reader, AtomicGroupReadBuffer* read_buffer,
  907. const std::unordered_map<std::string, ColumnFamilyOptions>&
  908. name_to_options,
  909. std::unordered_map<int, std::string>& column_families_not_found,
  910. std::unordered_map<
  911. uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
  912. VersionEditParams* version_edit, std::string* db_id = nullptr);
  913. // REQUIRES db mutex
  914. Status ApplyOneVersionEditToBuilder(
  915. VersionEdit& edit,
  916. const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_opts,
  917. std::unordered_map<int, std::string>& column_families_not_found,
  918. std::unordered_map<
  919. uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>& builders,
  920. VersionEditParams* version_edit);
  921. Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
  922. const VersionEdit& from_edit,
  923. VersionEditParams* version_edit_params);
  924. std::unique_ptr<ColumnFamilySet> column_family_set_;
  925. Env* const env_;
  926. FileSystem* const fs_;
  927. const std::string dbname_;
  928. std::string db_id_;
  929. const ImmutableDBOptions* const db_options_;
  930. std::atomic<uint64_t> next_file_number_;
  931. // Any log number equal or lower than this should be ignored during recovery,
  932. // and is qualified for being deleted in 2PC mode. In non-2PC mode, this
  933. // number is ignored.
  934. std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
  935. uint64_t manifest_file_number_;
  936. uint64_t options_file_number_;
  937. uint64_t pending_manifest_file_number_;
  938. // The last seq visible to reads. It normally indicates the last sequence in
  939. // the memtable but when using two write queues it could also indicate the
  940. // last sequence in the WAL visible to reads.
  941. std::atomic<uint64_t> last_sequence_;
  942. // The last seq that is already allocated. It is applicable only when we have
  943. // two write queues. In that case seq might or might not have appreated in
  944. // memtable but it is expected to appear in the WAL.
  945. // We have last_sequence <= last_allocated_sequence_
  946. std::atomic<uint64_t> last_allocated_sequence_;
  947. // The last allocated sequence that is also published to the readers. This is
  948. // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
  949. // last_sequence_ also indicates the last published seq.
  950. // We have last_sequence <= last_published_sequence_ <=
  951. // last_allocated_sequence_
  952. std::atomic<uint64_t> last_published_sequence_;
  953. uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
  954. // Opened lazily
  955. std::unique_ptr<log::Writer> descriptor_log_;
  956. // generates a increasing version number for every new version
  957. uint64_t current_version_number_;
  958. // Queue of writers to the manifest file
  959. std::deque<ManifestWriter*> manifest_writers_;
  960. // Current size of manifest file
  961. uint64_t manifest_file_size_;
  962. std::vector<ObsoleteFileInfo> obsolete_files_;
  963. std::vector<std::string> obsolete_manifests_;
  964. // env options for all reads and writes except compactions
  965. FileOptions file_options_;
  966. BlockCacheTracer* const block_cache_tracer_;
  967. private:
  968. // REQUIRES db mutex at beginning. may release and re-acquire db mutex
  969. Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
  970. InstrumentedMutex* mu, Directory* db_directory,
  971. bool new_descriptor_log,
  972. const ColumnFamilyOptions* new_cf_options);
  973. void LogAndApplyCFHelper(VersionEdit* edit);
  974. Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
  975. VersionEdit* edit, InstrumentedMutex* mu);
  976. };
  977. // ReactiveVersionSet represents a collection of versions of the column
  978. // families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
  979. // need to replay the MANIFEST (description log in older terms) in order to
  980. // reconstruct and install versions.
  981. class ReactiveVersionSet : public VersionSet {
  982. public:
  983. ReactiveVersionSet(const std::string& dbname,
  984. const ImmutableDBOptions* _db_options,
  985. const FileOptions& _file_options, Cache* table_cache,
  986. WriteBufferManager* write_buffer_manager,
  987. WriteController* write_controller);
  988. ~ReactiveVersionSet() override;
  989. Status ReadAndApply(
  990. InstrumentedMutex* mu,
  991. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
  992. std::unordered_set<ColumnFamilyData*>* cfds_changed);
  993. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
  994. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
  995. std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
  996. std::unique_ptr<Status>* manifest_reader_status);
  997. uint64_t TEST_read_edits_in_atomic_group() const {
  998. return read_buffer_.TEST_read_edits_in_atomic_group();
  999. }
  1000. std::vector<VersionEdit>& replay_buffer() {
  1001. return read_buffer_.replay_buffer();
  1002. }
  1003. protected:
  1004. using VersionSet::ApplyOneVersionEditToBuilder;
  1005. // REQUIRES db mutex
  1006. Status ApplyOneVersionEditToBuilder(
  1007. VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
  1008. VersionEdit* version_edit);
  1009. Status MaybeSwitchManifest(
  1010. log::Reader::Reporter* reporter,
  1011. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
  1012. private:
  1013. std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
  1014. active_version_builders_;
  1015. AtomicGroupReadBuffer read_buffer_;
  1016. // Number of version edits to skip by ReadAndApply at the beginning of a new
  1017. // MANIFEST created by primary.
  1018. int number_of_edits_to_skip_;
  1019. using VersionSet::LogAndApply;
  1020. using VersionSet::Recover;
  1021. Status LogAndApply(
  1022. const autovector<ColumnFamilyData*>& /*cfds*/,
  1023. const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
  1024. const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
  1025. InstrumentedMutex* /*mu*/, Directory* /*db_directory*/,
  1026. bool /*new_descriptor_log*/,
  1027. const ColumnFamilyOptions* /*new_cf_option*/) override {
  1028. return Status::NotSupported("not supported in reactive mode");
  1029. }
  1030. // No copy allowed
  1031. ReactiveVersionSet(const ReactiveVersionSet&);
  1032. ReactiveVersionSet& operator=(const ReactiveVersionSet&);
  1033. };
  1034. } // namespace ROCKSDB_NAMESPACE