version_set.h 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // The representation of a DBImpl consists of a set of Versions. The
  11. // newest version is called "current". Older versions may be kept
  12. // around to provide a consistent view to live iterators.
  13. //
  14. // Each Version keeps track of a set of table files per level, as well as a
  15. // set of blob files. The entire set of versions is maintained in a
  16. // VersionSet.
  17. //
  18. // Version and VersionSet are thread-compatible, but require external
  19. // synchronization on all accesses.
  20. #pragma once
  21. #include <atomic>
  22. #include <deque>
  23. #include <limits>
  24. #include <map>
  25. #include <memory>
  26. #include <optional>
  27. #include <set>
  28. #include <string>
  29. #include <unordered_set>
  30. #include <utility>
  31. #include <vector>
  32. #include "cache/cache_helpers.h"
  33. #include "db/blob/blob_file_meta.h"
  34. #include "db/blob/blob_index.h"
  35. #include "db/column_family.h"
  36. #include "db/compaction/compaction.h"
  37. #include "db/compaction/compaction_picker.h"
  38. #include "db/dbformat.h"
  39. #include "db/error_handler.h"
  40. #include "db/file_indexer.h"
  41. #include "db/log_reader.h"
  42. #include "db/range_del_aggregator.h"
  43. #include "db/read_callback.h"
  44. #include "db/table_cache.h"
  45. #include "db/version_builder.h"
  46. #include "db/version_edit.h"
  47. #include "db/write_controller.h"
  48. #include "env/file_system_tracer.h"
  49. #if USE_COROUTINES
  50. #include "folly/coro/BlockingWait.h"
  51. #include "folly/coro/Collect.h"
  52. #endif
  53. #include "monitoring/instrumented_mutex.h"
  54. #include "options/db_options.h"
  55. #include "options/offpeak_time_info.h"
  56. #include "port/port.h"
  57. #include "rocksdb/env.h"
  58. #include "rocksdb/file_checksum.h"
  59. #include "table/get_context.h"
  60. #include "table/multiget_context.h"
  61. #include "trace_replay/block_cache_tracer.h"
  62. #include "util/autovector.h"
  63. #include "util/coro_utils.h"
  64. #include "util/hash_containers.h"
  65. namespace ROCKSDB_NAMESPACE {
  66. namespace log {
  67. class Writer;
  68. }
  69. class BlobIndex;
  70. class Compaction;
  71. class LogBuffer;
  72. class LookupKey;
  73. class MemTable;
  74. class Version;
  75. class VersionSet;
  76. class WriteBufferManager;
  77. class MergeContext;
  78. class ColumnFamilySet;
  79. class MergeIteratorBuilder;
  80. class SystemClock;
  81. class ManifestTailer;
  82. class FilePickerMultiGet;
  83. // VersionEdit is always supposed to be valid and is used to point at
  84. // entries in Manifest. Ideally it should not be used as a container to
  85. // carry around a few of its fields as function params, because that can
  86. // lead readers to think it's a valid entry from Manifest. To avoid that
  87. // confusion, VersionEditParams is introduced simply to carry around multiple
  88. // VersionEdit params. It need not point to a valid record in Manifest.
  89. using VersionEditParams = VersionEdit;
  90. // Return the smallest index i such that file_level.files[i]->largest >= key.
  91. // Return file_level.num_files if there is no such file.
  92. // REQUIRES: "file_level.files" contains a sorted list of
  93. // non-overlapping files.
  94. int FindFile(const InternalKeyComparator& icmp,
  95. const LevelFilesBrief& file_level, const Slice& key);
  96. // Returns true iff some file in "files" overlaps the user key range
  97. // [*smallest,*largest].
  98. // smallest==nullptr represents a key smaller than all keys in the DB.
  99. // largest==nullptr represents a key larger than all keys in the DB.
  100. // REQUIRES: If disjoint_sorted_files, file_level.files[]
  101. // contains disjoint ranges in sorted order.
  102. bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
  103. bool disjoint_sorted_files,
  104. const LevelFilesBrief& file_level,
  105. const Slice* smallest_user_key,
  106. const Slice* largest_user_key);
  107. // Generate LevelFilesBrief from vector<FileMetaData*>.
  108. // Copies the smallest_key and largest_key data into sequential memory.
  109. // arena: Arena used to allocate the memory
  110. void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
  111. const std::vector<FileMetaData*>& files,
  112. Arena* arena);
  113. enum EpochNumberRequirement {
  114. kMightMissing,
  115. kMustPresent,
  116. };
  117. // Information of the storage associated with each Version, including number of
  118. // levels of LSM tree, files information at each level, files marked for
  119. // compaction, blob files, etc.
  120. class VersionStorageInfo {
  121. public:
  122. VersionStorageInfo(const InternalKeyComparator* internal_comparator,
  123. const Comparator* user_comparator, int num_levels,
  124. CompactionStyle compaction_style,
  125. VersionStorageInfo* src_vstorage,
  126. bool _force_consistency_checks,
  127. EpochNumberRequirement epoch_number_requirement,
  128. SystemClock* clock,
  129. uint32_t bottommost_file_compaction_delay,
  130. OffpeakTimeOption offpeak_time_option);
  131. // No copying allowed
  132. VersionStorageInfo(const VersionStorageInfo&) = delete;
  133. void operator=(const VersionStorageInfo&) = delete;
  134. ~VersionStorageInfo();
  135. void Reserve(int level, size_t size) { files_[level].reserve(size); }
  136. void AddFile(int level, FileMetaData* f);
  137. // Resize/Initialize the space for compact_cursor_
  138. void ResizeCompactCursors(int level) {
  139. compact_cursor_.resize(level, InternalKey());
  140. }
  141. const std::vector<InternalKey>& GetCompactCursors() const {
  142. return compact_cursor_;
  143. }
  144. // REQUIRES: ResizeCompactCursors has been called
  145. void AddCursorForOneLevel(int level,
  146. const InternalKey& smallest_uncompacted_key) {
  147. compact_cursor_[level] = smallest_uncompacted_key;
  148. }
  149. // REQUIRES: lock is held
  150. // Update the compact cursor and advance the file index using increment
  151. // so that it can point to the next cursor (increment means the number of
  152. // input files in this level of the last compaction)
  153. const InternalKey& GetNextCompactCursor(int level, size_t increment) {
  154. int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment;
  155. assert(cmp_idx <= (int)files_by_compaction_pri_[level].size());
  156. // TODO(zichen): may need to update next_file_to_compact_by_size_
  157. // for parallel compaction.
  158. InternalKey new_cursor;
  159. if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) {
  160. cmp_idx = 0;
  161. }
  162. // TODO(zichen): rethink if this strategy gives us some good guarantee
  163. return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest;
  164. }
  165. void ReserveBlob(size_t size) { blob_files_.reserve(size); }
  166. void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta);
  167. void PrepareForVersionAppend(const ImmutableOptions& immutable_options,
  168. const MutableCFOptions& mutable_cf_options);
  169. // REQUIRES: PrepareForVersionAppend has been called
  170. void SetFinalized();
  171. // Update the accumulated stats from a file-meta.
  172. void UpdateAccumulatedStats(FileMetaData* file_meta);
  173. // Decrease the current stat from a to-be-deleted file-meta
  174. void RemoveCurrentStats(FileMetaData* file_meta);
  175. // Updates internal structures that keep track of compaction scores
  176. // We use compaction scores to figure out which compaction to do next
  177. // REQUIRES: db_mutex held!!
  178. // TODO find a better way to pass compaction_options_fifo.
  179. void ComputeCompactionScore(const ImmutableOptions& immutable_options,
  180. const MutableCFOptions& mutable_cf_options);
  181. // Estimate estimated_compaction_needed_bytes_
  182. void EstimateCompactionBytesNeeded(
  183. const MutableCFOptions& mutable_cf_options);
  184. // This computes files_marked_for_compaction_ and is called by
  185. // ComputeCompactionScore()
  186. void ComputeFilesMarkedForCompaction(int last_level);
  187. // This computes expired_ttl_files_ and is called by
  188. // ComputeCompactionScore()
  189. void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions,
  190. const uint64_t ttl);
  191. // This computes files_marked_for_periodic_compaction_ and is called by
  192. // ComputeCompactionScore()
  193. void ComputeFilesMarkedForPeriodicCompaction(
  194. const ImmutableOptions& ioptions,
  195. const uint64_t periodic_compaction_seconds, int last_level);
  196. // This computes bottommost_files_marked_for_compaction_ and is called by
  197. // ComputeCompactionScore() or UpdateOldestSnapshot().
  198. //
  199. // Among bottommost files (assumes they've already been computed), marks the
  200. // ones that have keys that would be eliminated if recompacted, according to
  201. // the seqnum of the oldest existing snapshot. Must be called every time
  202. // oldest snapshot changes as that is when bottom-level files can become
  203. // eligible for compaction.
  204. //
  205. // REQUIRES: DB mutex held
  206. void ComputeBottommostFilesMarkedForCompaction(bool allow_ingest_behind);
  207. // This computes files_marked_for_forced_blob_gc_ and is called by
  208. // ComputeCompactionScore()
  209. //
  210. // REQUIRES: DB mutex held
  211. void ComputeFilesMarkedForForcedBlobGC(
  212. double blob_garbage_collection_age_cutoff,
  213. double blob_garbage_collection_force_threshold,
  214. bool enable_blob_garbage_collection);
  215. bool level0_non_overlapping() const { return level0_non_overlapping_; }
  216. // Updates the oldest snapshot and related internal state, like the bottommost
  217. // files marked for compaction.
  218. // REQUIRES: DB mutex held
  219. void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum,
  220. bool allow_ingest_behind);
  221. int MaxInputLevel() const;
  222. int MaxOutputLevel(bool allow_ingest_behind) const;
  223. // Return level number that has idx'th highest score
  224. int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
  225. // Return idx'th highest score
  226. double CompactionScore(int idx) const { return compaction_score_[idx]; }
  227. void GetOverlappingInputs(
  228. int level, const InternalKey* begin, // nullptr means before all keys
  229. const InternalKey* end, // nullptr means after all keys
  230. std::vector<FileMetaData*>* inputs,
  231. int hint_index = -1, // index of overlap file
  232. int* file_index = nullptr, // return index of overlap file
  233. bool expand_range = true, // if set, returns files which overlap the
  234. // range and overlap each other. If false,
  235. // then just files intersecting the range
  236. const FileMetaData* starting_l0_file =
  237. nullptr, // If not null, restricts L0 file selection to only include
  238. // files at or older than starting_l0_file.
  239. InternalKey** next_smallest =
  240. nullptr // if non-null, returns the
  241. // smallest key of next file not included
  242. ) const;
  243. void GetCleanInputsWithinInterval(
  244. int level, const InternalKey* begin, // nullptr means before all keys
  245. const InternalKey* end, // nullptr means after all keys
  246. std::vector<FileMetaData*>* inputs,
  247. int hint_index = -1, // index of overlap file
  248. int* file_index = nullptr) // return index of overlap file
  249. const;
  250. void GetOverlappingInputsRangeBinarySearch(
  251. int level, // level > 0
  252. const InternalKey* begin, // nullptr means before all keys
  253. const InternalKey* end, // nullptr means after all keys
  254. std::vector<FileMetaData*>* inputs,
  255. int hint_index, // index of overlap file
  256. int* file_index, // return index of overlap file
  257. bool within_interval = false, // if set, force the inputs within interval
  258. InternalKey** next_smallest =
  259. nullptr // if non-null, returns the
  260. // smallest key of next file not included
  261. ) const;
  262. // Returns true iff some file in the specified level overlaps
  263. // some part of [*smallest_user_key,*largest_user_key].
  264. // smallest_user_key==nullptr represents a key smaller than all keys in the DB.
  265. // largest_user_key==nullptr represents a key larger than all keys in the DB.
  266. bool OverlapInLevel(int level, const Slice* smallest_user_key,
  267. const Slice* largest_user_key);
  268. // Returns true iff the first or last file in inputs contains
  269. // an overlapping user key to the file "just outside" of it (i.e.
  270. // just after the last file, or just before the first file)
  271. // REQUIRES: "*inputs" is a sorted list of non-overlapping files
  272. bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
  273. int level);
  274. int num_levels() const { return num_levels_; }
  275. // REQUIRES: PrepareForVersionAppend has been called
  276. int num_non_empty_levels() const {
  277. assert(finalized_);
  278. return num_non_empty_levels_;
  279. }
  280. // REQUIRES: PrepareForVersionAppend has been called
  281. // This may or may not return number of level files. It is to keep backward
  282. // compatible behavior in universal compaction.
  283. int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
  284. void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
  285. // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  286. int NumLevelFiles(int level) const {
  287. assert(finalized_);
  288. return static_cast<int>(files_[level].size());
  289. }
  290. // Return the combined file size of all files at the specified level.
  291. uint64_t NumLevelBytes(int level) const;
  292. // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  293. const std::vector<FileMetaData*>& LevelFiles(int level) const {
  294. return files_[level];
  295. }
  296. bool HasMissingEpochNumber() const;
  297. uint64_t GetMaxEpochNumberOfFiles() const;
  298. EpochNumberRequirement GetEpochNumberRequirement() const {
  299. return epoch_number_requirement_;
  300. }
  301. void SetEpochNumberRequirement(
  302. EpochNumberRequirement epoch_number_requirement) {
  303. epoch_number_requirement_ = epoch_number_requirement;
  304. }
  305. // Ensure all files have epoch number set.
  306. // If there is a file missing epoch number, all files' epoch number will be
  307. // reset according to CF's epoch number. Otherwise, the CF will be updated
  308. // with the max epoch number of the files.
  309. //
  310. // @param restart_epoch This CF's epoch number will be reset to start from 0.
  311. // @param force Force resetting all files' epoch number.
  312. void RecoverEpochNumbers(ColumnFamilyData* cfd, bool restart_epoch = true,
  313. bool force = false);
  // Identifies a file by its level and its position within that level.
  // A default-constructed FileLocation has level -1 and is invalid.
  class FileLocation {
   public:
    FileLocation() = default;

    FileLocation(int level, size_t position)
        : level_(level), position_(position) {}

    // Returns a location for which IsValid() is false.
    static FileLocation Invalid() { return FileLocation(); }

    // A location is valid iff its level is non-negative.
    bool IsValid() const { return !(level_ < 0); }

    int GetLevel() const { return level_; }

    size_t GetPosition() const { return position_; }

    // Two locations are equal iff both level and position match.
    bool operator==(const FileLocation& rhs) const {
      if (level_ != rhs.level_) {
        return false;
      }
      return position_ == rhs.position_;
    }

    bool operator!=(const FileLocation& rhs) const {
      const bool same = (*this == rhs);
      return !same;
    }

   private:
    int level_ = -1;
    size_t position_ = 0;
  };
  331. // REQUIRES: PrepareForVersionAppend has been called
  332. FileLocation GetFileLocation(uint64_t file_number) const {
  333. const auto it = file_locations_.find(file_number);
  334. if (it == file_locations_.end()) {
  335. return FileLocation::Invalid();
  336. }
  337. assert(it->second.GetLevel() < num_levels_);
  338. assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
  339. assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
  340. assert(files_[it->second.GetLevel()][it->second.GetPosition()]
  341. ->fd.GetNumber() == file_number);
  342. return it->second;
  343. }
  344. // REQUIRES: PrepareForVersionAppend has been called
  345. FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
  346. auto location = GetFileLocation(file_number);
  347. if (!location.IsValid()) {
  348. return nullptr;
  349. }
  350. return files_[location.GetLevel()][location.GetPosition()];
  351. }
  352. // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  353. using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
  354. const BlobFiles& GetBlobFiles() const { return blob_files_; }
  355. // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  356. BlobFiles::const_iterator GetBlobFileMetaDataLB(
  357. uint64_t blob_file_number) const;
  358. // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  359. std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
  360. uint64_t blob_file_number) const {
  361. const auto it = GetBlobFileMetaDataLB(blob_file_number);
  362. assert(it == blob_files_.end() || *it);
  363. if (it != blob_files_.end() &&
  364. (*it)->GetBlobFileNumber() == blob_file_number) {
  365. return *it;
  366. }
  367. return std::shared_ptr<BlobFileMetaData>();
  368. }
  // Aggregate blob file statistics as returned by GetBlobStats(): total file
  // size, total garbage size, and space amplification. All fields default to
  // zero.
  // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
  struct BlobStats {
    uint64_t total_file_size{0};
    uint64_t total_garbage_size{0};
    double space_amp{0.0};
  };
  375. BlobStats GetBlobStats() const {
  376. uint64_t total_file_size = 0;
  377. uint64_t total_garbage_size = 0;
  378. for (const auto& meta : blob_files_) {
  379. assert(meta);
  380. total_file_size += meta->GetBlobFileSize();
  381. total_garbage_size += meta->GetGarbageBlobBytes();
  382. }
  383. double space_amp = 0.0;
  384. if (total_file_size > total_garbage_size) {
  385. space_amp = static_cast<double>(total_file_size) /
  386. (total_file_size - total_garbage_size);
  387. }
  388. return BlobStats{total_file_size, total_garbage_size, space_amp};
  389. }
  390. const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
  391. assert(level < static_cast<int>(level_files_brief_.size()));
  392. return level_files_brief_[level];
  393. }
  394. // REQUIRES: PrepareForVersionAppend has been called
  395. const std::vector<int>& FilesByCompactionPri(int level) const {
  396. assert(finalized_);
  397. return files_by_compaction_pri_[level];
  398. }
  399. // REQUIRES: ComputeCompactionScore has been called
  400. // REQUIRES: DB mutex held during access
  401. const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
  402. const {
  403. assert(finalized_);
  404. return files_marked_for_compaction_;
  405. }
  406. void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
  407. f->marked_for_compaction = true;
  408. files_marked_for_compaction_.emplace_back(level, f);
  409. }
  410. // REQUIRES: ComputeCompactionScore has been called
  411. // REQUIRES: DB mutex held during access
  412. // Used by Leveled Compaction only.
  413. const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
  414. assert(finalized_);
  415. return expired_ttl_files_;
  416. }
  417. // REQUIRES: ComputeCompactionScore has been called
  418. // REQUIRES: DB mutex held during access
  419. // Used by Leveled and Universal Compaction.
  420. const autovector<std::pair<int, FileMetaData*>>&
  421. FilesMarkedForPeriodicCompaction() const {
  422. assert(finalized_);
  423. return files_marked_for_periodic_compaction_;
  424. }
  425. void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
  426. files_marked_for_periodic_compaction_.emplace_back(level, f);
  427. }
  428. // REQUIRES: PrepareForVersionAppend has been called
  429. const autovector<std::pair<int, FileMetaData*>>& BottommostFiles() const {
  430. assert(finalized_);
  431. return bottommost_files_;
  432. }
  433. // REQUIRES: ComputeCompactionScore has been called
  434. // REQUIRES: DB mutex held during access
  435. const autovector<std::pair<int, FileMetaData*>>&
  436. BottommostFilesMarkedForCompaction() const {
  437. assert(finalized_);
  438. return bottommost_files_marked_for_compaction_;
  439. }
  440. // REQUIRES: ComputeCompactionScore has been called
  441. // REQUIRES: DB mutex held during access
  442. const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
  443. const {
  444. assert(finalized_);
  445. return files_marked_for_forced_blob_gc_;
  446. }
  447. int base_level() const { return base_level_; }
  448. double level_multiplier() const { return level_multiplier_; }
  449. // REQUIRES: lock is held
  450. // Set the index that is used to offset into files_by_compaction_pri_ to find
  451. // the next compaction candidate file.
  452. void SetNextCompactionIndex(int level, int index) {
  453. next_file_to_compact_by_size_[level] = index;
  454. }
  455. // REQUIRES: lock is held
  456. int NextCompactionIndex(int level) const {
  457. return next_file_to_compact_by_size_[level];
  458. }
  459. // REQUIRES: PrepareForVersionAppend has been called
  460. const FileIndexer& file_indexer() const {
  461. assert(finalized_);
  462. return file_indexer_;
  463. }
  464. // Only the first few entries of files_by_compaction_pri_ are sorted.
  465. // There is no need to sort all the files because it is likely
  466. // that on a running system, we need to look at only the first
  467. // few largest files because a new version is created every few
  468. // seconds/minutes (because of concurrent compactions).
  469. static const size_t kNumberFilesToSort = 50;
  470. // Return a human-readable short (single-line) summary of the number
  471. // of files per level. Uses *scratch as backing store.
  472. struct LevelSummaryStorage {
  473. char buffer[1000];
  474. };
  475. struct FileSummaryStorage {
  476. char buffer[3000];
  477. };
  478. const char* LevelSummary(LevelSummaryStorage* scratch) const;
  479. // Return a human-readable short (single-line) summary of files
  480. // in a specified level. Uses *scratch as backing store.
  481. const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
  482. // Return the maximum overlapping data (in bytes) at next level for any
  483. // file at a level >= 1.
  484. uint64_t MaxNextLevelOverlappingBytes();
  485. // Return a human readable string that describes this version's contents.
  486. std::string DebugString(bool hex = false) const;
  487. uint64_t GetAverageValueSize() const {
  488. if (accumulated_num_non_deletions_ == 0) {
  489. return 0;
  490. }
  491. assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
  492. assert(accumulated_file_size_ > 0);
  493. return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
  494. accumulated_file_size_ /
  495. (accumulated_raw_key_size_ + accumulated_raw_value_size_);
  496. }
  497. uint64_t GetEstimatedActiveKeys() const;
  498. double GetEstimatedCompressionRatioAtLevel(int level) const;
  499. // re-initializes the index that is used to offset into
  500. // files_by_compaction_pri_
  501. // to find the next compaction candidate file.
  502. void ResetNextCompactionIndex(int level) {
  503. next_file_to_compact_by_size_[level] = 0;
  504. }
  505. const InternalKeyComparator* InternalComparator() const {
  506. return internal_comparator_;
  507. }
  508. // Returns maximum total bytes of data on a given level.
  509. uint64_t MaxBytesForLevel(int level) const;
  510. // Returns an estimate of the amount of live data in bytes.
  511. uint64_t EstimateLiveDataSize() const;
  512. uint64_t estimated_compaction_needed_bytes() const {
  513. return estimated_compaction_needed_bytes_;
  514. }
  515. void TEST_set_estimated_compaction_needed_bytes(uint64_t v,
  516. InstrumentedMutex* mu) {
  517. InstrumentedMutexLock l(mu);
  518. estimated_compaction_needed_bytes_ = v;
  519. }
  520. bool force_consistency_checks() const { return force_consistency_checks_; }
  521. SequenceNumber bottommost_files_mark_threshold() const {
  522. return bottommost_files_mark_threshold_;
  523. }
  524. SequenceNumber standalone_range_tombstone_files_mark_threshold() const {
  525. return standalone_range_tombstone_files_mark_threshold_;
  526. }
  527. // Returns whether any key in [`smallest_key`, `largest_key`] could appear in
  528. // an older L0 file than `last_l0_idx` or in a greater level than `last_level`
  529. //
  530. // @param last_level Level after which we check for overlap
  531. // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
  532. // check for overlap; otherwise, must be -1
  533. bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
  534. const Slice& largest_user_key,
  535. int last_level, int last_l0_idx);
  536. Env::WriteLifeTimeHint CalculateSSTWriteHint(
  537. int level, CompactionStyleSet compaction_style_set) const;
  538. const Comparator* user_comparator() const { return user_comparator_; }
  539. private:
  540. void ComputeCompensatedSizes();
  541. void UpdateNumNonEmptyLevels();
  542. void CalculateBaseBytes(const ImmutableOptions& ioptions,
  543. const MutableCFOptions& options);
  544. void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
  545. const MutableCFOptions& mutable_cf_options);
  546. void GenerateFileIndexer() {
  547. file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
  548. }
  549. void GenerateLevelFilesBrief();
  550. void GenerateLevel0NonOverlapping();
  551. void GenerateBottommostFiles();
  552. void GenerateFileLocationIndex();
  553. const InternalKeyComparator* internal_comparator_;
  554. const Comparator* user_comparator_;
  555. int num_levels_; // Number of levels
  556. int num_non_empty_levels_; // Number of levels. Any level larger than it
  557. // is guaranteed to be empty.
  558. // Per-level max bytes
  559. std::vector<uint64_t> level_max_bytes_;
  560. // A short brief metadata of files per level
  561. autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
  562. FileIndexer file_indexer_;
  563. Arena arena_; // Used to allocate space for file_levels_
  564. CompactionStyle compaction_style_;
  565. // List of files per level, files in each level are arranged
  566. // in increasing order of keys
  567. // In L0, files are ordered in decreasing epoch number, meaning
  568. // more recent updates are ordered first.
  569. std::vector<FileMetaData*>* files_;
  570. // Map of all table files in version. Maps file number to (level, position on
  571. // level).
  572. using FileLocations = UnorderedMap<uint64_t, FileLocation>;
  573. FileLocations file_locations_;
  574. // Vector of blob files in version sorted by blob file number.
  575. BlobFiles blob_files_;
  576. // Level that L0 data should be compacted to. All levels < base_level_ should
  577. // be empty. -1 if it is not level-compaction so it's not applicable.
  578. int base_level_;
  579. // Applies to level compaction when
  580. // `level_compaction_dynamic_level_bytes=true`. All non-empty levels <=
  581. // lowest_unnecessary_level_ are not needed and will be drained automatically.
  582. // -1 if there is no unnecessary level.
  583. int lowest_unnecessary_level_;
  584. double level_multiplier_;
  585. // A list for the same set of files that are stored in files_,
  586. // but files in each level are now sorted based on file
  587. // size. The file with the largest size is at the front.
  588. // This vector stores the index of the file from files_.
  589. std::vector<std::vector<int>> files_by_compaction_pri_;
  590. // If true, means that files in L0 have keys with non overlapping ranges
  591. bool level0_non_overlapping_;
  592. // An index into files_by_compaction_pri_ that specifies the first
  593. // file that is not yet compacted
  594. std::vector<int> next_file_to_compact_by_size_;
  595. // This vector contains list of files marked for compaction and also not
  596. // currently being compacted. It is protected by DB mutex. It is calculated in
  597. // ComputeCompactionScore(). Used by Leveled and Universal Compaction.
  598. autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
  599. autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
  600. autovector<std::pair<int, FileMetaData*>>
  601. files_marked_for_periodic_compaction_;
  602. // These files are considered bottommost because none of their keys can exist
  603. // at lower levels. They are not necessarily all in the same level. The marked
  604. // ones are eligible for compaction because they contain duplicate key
  605. // versions that are no longer protected by snapshot. These variables are
  606. // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
  607. // `ComputeBottommostFilesMarkedForCompaction()`.
  608. autovector<std::pair<int, FileMetaData*>> bottommost_files_;
  609. autovector<std::pair<int, FileMetaData*>>
  610. bottommost_files_marked_for_compaction_;
  611. autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
  612. // Threshold for needing to mark another bottommost file. Maintain it so we
  613. // can quickly check when releasing a snapshot whether more bottommost files
  614. // became eligible for compaction. It's defined as the min of the max nonzero
  615. // seqnums of unmarked bottommost files.
  616. SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
  617. // The minimum sequence number among all the standalone range tombstone files
  618. // that are marked for compaction. A standalone range tombstone file is one
  619. // with just one range tombstone.
  620. SequenceNumber standalone_range_tombstone_files_mark_threshold_ =
  621. kMaxSequenceNumber;
  622. // Monotonically increases as we release old snapshots. Zero indicates no
  623. // snapshots have been released yet. When no snapshots remain we set it to the
  624. // current seqnum, which needs to be protected as a snapshot can still be
  625. // created that references it.
  626. SequenceNumber oldest_snapshot_seqnum_ = 0;
  627. // Level that should be compacted next and its compaction score.
  628. // Score < 1 means compaction is not strictly needed. These fields
  629. // are initialized by ComputeCompactionScore.
  630. // The most critical level to be compacted is listed first
  631. // These are used to pick the best compaction level
  632. std::vector<double> compaction_score_;
  633. std::vector<int> compaction_level_;
  634. int l0_delay_trigger_count_ = 0; // Count used to trigger slow down and stop
  635. // for number of L0 files.
  636. // Compact cursors for round-robin compactions in each level
  637. std::vector<InternalKey> compact_cursor_;
  638. // the following are the sampled temporary stats.
  639. // the current accumulated size of sampled files.
  640. uint64_t accumulated_file_size_;
  641. // the current accumulated size of all raw keys based on the sampled files.
  642. uint64_t accumulated_raw_key_size_;
  643. // the current accumulated size of all raw values based on the sampled files.
  644. uint64_t accumulated_raw_value_size_;
  645. // total number of non-deletion entries
  646. uint64_t accumulated_num_non_deletions_;
  647. // total number of deletion entries
  648. uint64_t accumulated_num_deletions_;
  649. // current number of non_deletion entries
  650. uint64_t current_num_non_deletions_;
  651. // current number of deletion entries
  652. uint64_t current_num_deletions_;
  653. // current number of file samples
  654. uint64_t current_num_samples_;
  655. // Estimated bytes needed to be compacted until all levels' size is down to
  656. // target sizes.
  657. uint64_t estimated_compaction_needed_bytes_;
  658. // Used for computing bottommost files marked for compaction and checking for
  659. // offpeak time.
  660. SystemClock* clock_;
  661. uint32_t bottommost_file_compaction_delay_;
  662. bool finalized_;
  663. // If set to true, we will run consistency checks even if RocksDB
  664. // is compiled in release mode
  665. bool force_consistency_checks_;
  666. EpochNumberRequirement epoch_number_requirement_;
  667. OffpeakTimeOption offpeak_time_option_;
  668. friend class Version;
  669. friend class VersionSet;
  670. };
  671. struct ObsoleteFileInfo {
  672. FileMetaData* metadata;
  673. std::string path;
  674. // If true, the FileMataData should be destroyed but the file should
  675. // not be deleted. This is because another FileMetaData still references
  676. // the file, usually because the file is trivial moved so two FileMetadata
  677. // is managing the file.
  678. bool only_delete_metadata = false;
  679. // To apply to this file
  680. uint32_t uncache_aggressiveness = 0;
  681. ObsoleteFileInfo() noexcept
  682. : metadata(nullptr), only_delete_metadata(false) {}
  683. ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
  684. uint32_t _uncache_aggressiveness,
  685. std::shared_ptr<CacheReservationManager>
  686. file_metadata_cache_res_mgr_arg = nullptr)
  687. : metadata(f),
  688. path(file_path),
  689. uncache_aggressiveness(_uncache_aggressiveness),
  690. file_metadata_cache_res_mgr(
  691. std::move(file_metadata_cache_res_mgr_arg)) {}
  692. ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
  693. ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
  694. ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
  695. *this = std::move(rhs);
  696. }
  697. ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
  698. metadata = rhs.metadata;
  699. rhs.metadata = nullptr;
  700. path = std::move(rhs.path);
  701. only_delete_metadata = rhs.only_delete_metadata;
  702. rhs.only_delete_metadata = false;
  703. uncache_aggressiveness = rhs.uncache_aggressiveness;
  704. rhs.uncache_aggressiveness = 0;
  705. file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
  706. rhs.file_metadata_cache_res_mgr = nullptr;
  707. return *this;
  708. }
  709. void DeleteMetadata() {
  710. if (file_metadata_cache_res_mgr) {
  711. Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
  712. metadata->ApproximateMemoryUsage(), false /* increase */);
  713. s.PermitUncheckedError();
  714. }
  715. delete metadata;
  716. metadata = nullptr;
  717. }
  718. private:
  719. std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
  720. };
  // Immutable pair of a blob file number and the path of that blob file.
  class ObsoleteBlobFileInfo {
   public:
    // `path` is taken by value and moved into the object.
    ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
        : blob_file_number_{blob_file_number}, path_{std::move(path)} {}

    // Blob file number given at construction.
    uint64_t GetBlobFileNumber() const {
      return blob_file_number_;
    }

    // Path given at construction; the reference stays valid for as long as
    // this object lives.
    const std::string& GetPath() const {
      return path_;
    }

   private:
    uint64_t blob_file_number_;
    std::string path_;
  };
  731. using MultiGetRange = MultiGetContext::Range;
  732. // A column family's version consists of the table and blob files owned by
  733. // the column family at a certain point in time.
  734. class Version {
  735. public:
  736. // Append to *iters a sequence of iterators that will
  737. // yield the contents of this Version when merged together.
  738. // @param read_options Must outlive any iterator built by
  739. // `merger_iter_builder`.
  740. void AddIterators(const ReadOptions& read_options,
  741. const FileOptions& soptions,
  742. MergeIteratorBuilder* merger_iter_builder,
  743. bool allow_unprepared_value);
  744. // @param read_options Must outlive any iterator built by
  745. // `merger_iter_builder`.
  746. void AddIteratorsForLevel(const ReadOptions& read_options,
  747. const FileOptions& soptions,
  748. MergeIteratorBuilder* merger_iter_builder,
  749. int level, bool allow_unprepared_value);
  750. Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
  751. const Slice& smallest_user_key,
  752. const Slice& largest_user_key, int level,
  753. bool* overlap);
  754. // Lookup the value for key or get all merge operands for key.
  755. // If do_merge = true (default) then lookup value for key.
  756. // Behavior if do_merge = true:
  757. // If found, store it in *value and
  758. // return OK. Else return a non-OK status.
  759. // Uses *operands to store merge_operator operations to apply later.
  760. //
  761. // If the ReadOptions.read_tier is set to do a read-only fetch, then
  762. // *value_found will be set to false if it cannot be determined whether
  763. // this value exists without doing IO.
  764. //
  765. // If the key is Deleted, *status will be set to NotFound and
  766. // *key_exists will be set to true.
  767. // If no key was found, *status will be set to NotFound and
  768. // *key_exists will be set to false.
  769. // If seq is non-null, *seq will be set to the sequence number found
  770. // for the key if a key was found.
  771. // Behavior if do_merge = false
  772. // If the key has any merge operands then store them in
  773. // merge_context.operands_list and don't merge the operands
  774. // REQUIRES: lock is not held
  775. // REQUIRES: pinned_iters_mgr != nullptr
  776. void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
  777. PinnableWideColumns* columns, std::string* timestamp, Status* status,
  778. MergeContext* merge_context,
  779. SequenceNumber* max_covering_tombstone_seq,
  780. PinnedIteratorsManager* pinned_iters_mgr,
  781. bool* value_found = nullptr, bool* key_exists = nullptr,
  782. SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
  783. bool* is_blob = nullptr, bool do_merge = true);
  784. void MultiGet(const ReadOptions&, MultiGetRange* range,
  785. ReadCallback* callback = nullptr);
  786. // Interprets blob_index_slice as a blob reference, and (assuming the
  787. // corresponding blob file is part of this Version) retrieves the blob and
  788. // saves it in *value.
  789. // REQUIRES: blob_index_slice stores an encoded blob reference
  790. Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
  791. const Slice& blob_index_slice,
  792. FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
  793. uint64_t* bytes_read) const;
  794. // Retrieves a blob using a blob reference and saves it in *value,
  795. // assuming the corresponding blob file is part of this Version.
  796. Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
  797. const BlobIndex& blob_index,
  798. FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
  799. uint64_t* bytes_read) const;
  800. struct BlobReadContext {
  801. BlobReadContext(const BlobIndex& blob_idx, const KeyContext* key_ctx)
  802. : blob_index(blob_idx), key_context(key_ctx) {}
  803. BlobIndex blob_index;
  804. const KeyContext* key_context;
  805. PinnableSlice result;
  806. };
  807. using BlobReadContexts = std::vector<BlobReadContext>;
  808. void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
  809. std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
  810. // Loads some stats information from files (if update_stats is set) and
  811. // populates derived data structures. Call without mutex held. It needs to be
  812. // called before appending the version to the version set.
  813. void PrepareAppend(const ReadOptions& read_options, bool update_stats);
  814. // Reference count management (so Versions do not disappear out from
  815. // under live iterators)
  816. void Ref();
  817. // Decrease reference count. Delete the object if no reference left
  818. // and return true. Otherwise, return false.
  819. bool Unref();
  820. // Add all files listed in the current version to *live_table_files and
  821. // *live_blob_files.
  822. void AddLiveFiles(std::vector<uint64_t>* live_table_files,
  823. std::vector<uint64_t>* live_blob_files) const;
  824. // Remove live files that are in the delete candidate lists.
  825. void RemoveLiveFiles(
  826. std::vector<ObsoleteFileInfo>& sst_delete_candidates,
  827. std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
  828. // Return a human readable string that describes this version's contents.
  829. std::string DebugString(bool hex = false, bool print_stats = false) const;
  830. // Returns the version number of this version
  831. uint64_t GetVersionNumber() const { return version_number_; }
  832. // REQUIRES: lock is held
  833. // On success, "tp" will contains the table properties of the file
  834. // specified in "file_meta". If the file name of "file_meta" is
  835. // known ahead, passing it by a non-null "fname" can save a
  836. // file-name conversion.
  837. Status GetTableProperties(const ReadOptions& read_options,
  838. std::shared_ptr<const TableProperties>* tp,
  839. const FileMetaData* file_meta,
  840. const std::string* fname = nullptr) const;
  841. // On success, *props will be populated with all SSTables' table properties.
  842. // The keys of `props` are the sst file name, the values of `props` are the
  843. // tables' properties, represented as std::shared_ptr.
  844. Status GetPropertiesOfAllTables(const ReadOptions& read_options,
  845. TablePropertiesCollection* props) const;
  846. Status GetPropertiesOfAllTables(const ReadOptions& read_options,
  847. TablePropertiesCollection* props,
  848. int level) const;
  849. Status GetPropertiesOfTablesInRange(const ReadOptions& read_options,
  850. const autovector<UserKeyRange>& ranges,
  851. TablePropertiesCollection* props) const;
  852. Status GetPropertiesOfTablesByLevel(
  853. const ReadOptions& read_options,
  854. std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
  855. const;
  856. // Print summary of range delete tombstones in SST files into out_str,
  857. // with maximum max_entries_to_print entries printed out.
  858. Status TablesRangeTombstoneSummary(int max_entries_to_print,
  859. std::string* out_str);
  860. // REQUIRES: lock is held
  861. // On success, "tp" will contains the aggregated table property among
  862. // the table properties of all sst files in this version.
  863. Status GetAggregatedTableProperties(
  864. const ReadOptions& read_options,
  865. std::shared_ptr<const TableProperties>* tp, int level = -1);
  866. uint64_t GetEstimatedActiveKeys() {
  867. return storage_info_.GetEstimatedActiveKeys();
  868. }
  869. size_t GetMemoryUsageByTableReaders(const ReadOptions& read_options);
  870. ColumnFamilyData* cfd() const { return cfd_; }
  871. // Return the next Version in the linked list.
  872. Version* Next() const { return next_; }
  873. int TEST_refs() const { return refs_; }
  874. VersionStorageInfo* storage_info() { return &storage_info_; }
  875. const VersionStorageInfo* storage_info() const { return &storage_info_; }
  876. VersionSet* version_set() { return vset_; }
  877. void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
  878. void GetSstFilesBoundaryKeys(Slice* smallest_user_key,
  879. Slice* largest_user_key);
  880. uint64_t GetSstFilesSize();
  881. // Retrieves the file_creation_time of the oldest file in the DB.
  882. // Prerequisite for this API is max_open_files = -1
  883. void GetCreationTimeOfOldestFile(uint64_t* creation_time);
  884. const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
  885. InternalIterator* TEST_GetLevelIterator(
  886. const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
  887. int level, bool allow_unprepared_value);
  888. private:
  889. Env* env_;
  890. SystemClock* clock_;
  891. friend class ReactiveVersionSet;
  892. friend class VersionSet;
  893. friend class VersionEditHandler;
  894. friend class VersionEditHandlerPointInTime;
  895. const InternalKeyComparator* internal_comparator() const {
  896. return storage_info_.internal_comparator_;
  897. }
  898. const Comparator* user_comparator() const {
  899. return storage_info_.user_comparator_;
  900. }
  901. // Returns true if the filter blocks in the specified level will not be
  902. // checked during read operations. In certain cases (trivial move or preload),
  903. // the filter block may already be cached, but we still do not access it such
  904. // that it eventually expires from the cache.
  905. bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
  906. // The helper function of UpdateAccumulatedStats, which may fill the missing
  907. // fields of file_meta from its associated TableProperties.
  908. // Returns true if it does initialize FileMetaData.
  909. bool MaybeInitializeFileMetaData(const ReadOptions& read_options,
  910. FileMetaData* file_meta);
  911. // Update the accumulated stats associated with the current version.
  912. // This accumulated stats will be used in compaction.
  913. void UpdateAccumulatedStats(const ReadOptions& read_options);
  914. DECLARE_SYNC_AND_ASYNC(
  915. /* ret_type */ Status, /* func_name */ MultiGetFromSST,
  916. const ReadOptions& read_options, MultiGetRange file_range,
  917. int hit_file_level, bool skip_filters, bool skip_range_deletions,
  918. FdWithKeyRange* f,
  919. std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
  920. TableCache::TypedHandle* table_handle, uint64_t& num_filter_read,
  921. uint64_t& num_index_read, uint64_t& num_sst_read);
  922. #ifdef USE_COROUTINES
  923. // MultiGet using async IO to read data blocks from SST files in parallel
  924. // within and across levels
  925. Status MultiGetAsync(
  926. const ReadOptions& options, MultiGetRange* range,
  927. std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
  928. // A helper function to lookup a batch of keys in a single level. It will
  929. // queue coroutine tasks to mget_tasks. It may also split the input batch
  930. // by creating a new batch with keys definitely not in this level and
  931. // enqueuing it to to_process.
  932. Status ProcessBatch(
  933. const ReadOptions& read_options, FilePickerMultiGet* batch,
  934. std::vector<folly::coro::Task<Status>>& mget_tasks,
  935. std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
  936. autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
  937. std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
  938. std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
  939. mget_stats);
  940. #endif
  941. ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
  942. Logger* info_log_;
  943. Statistics* db_statistics_;
  944. TableCache* table_cache_;
  945. BlobSource* blob_source_;
  946. const MergeOperator* merge_operator_;
  947. VersionStorageInfo storage_info_;
  948. VersionSet* vset_; // VersionSet to which this Version belongs
  949. Version* next_; // Next version in linked list
  950. Version* prev_; // Previous version in linked list
  951. int refs_; // Number of live refs to this version
  952. const FileOptions file_options_;
  953. const MutableCFOptions mutable_cf_options_;
  954. // Cached value to avoid recomputing it on every read.
  955. const size_t max_file_size_for_l0_meta_pin_;
  956. // A version number that uniquely represents this version. This is
  957. // used for debugging and logging purposes only.
  958. uint64_t version_number_;
  959. std::shared_ptr<IOTracer> io_tracer_;
  960. bool use_async_io_;
  961. Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
  962. const MutableCFOptions& mutable_cf_options,
  963. const std::shared_ptr<IOTracer>& io_tracer,
  964. uint64_t version_number = 0,
  965. EpochNumberRequirement epoch_number_requirement =
  966. EpochNumberRequirement::kMustPresent);
  967. ~Version();
  968. // No copying allowed
  969. Version(const Version&) = delete;
  970. void operator=(const Version&) = delete;
  971. };
  972. class BaseReferencedVersionBuilder;
  973. class AtomicGroupReadBuffer {
  974. public:
  975. AtomicGroupReadBuffer() = default;
  976. Status AddEdit(VersionEdit* edit);
  977. void Clear();
  978. bool IsFull() const;
  979. bool IsEmpty() const;
  980. uint64_t TEST_read_edits_in_atomic_group() const {
  981. return read_edits_in_atomic_group_;
  982. }
  983. std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
  984. private:
  985. uint64_t read_edits_in_atomic_group_ = 0;
  986. std::vector<VersionEdit> replay_buffer_;
  987. };
  988. // VersionSet is the collection of versions of all the column families of the
  989. // database. Each database owns one VersionSet. A VersionSet has access to all
  990. // column families via ColumnFamilySet, i.e. set of the column families.
  991. // `unchanging` means the LSM tree structure of the column families will not
  992. // change during the lifetime of this VersionSet (true for read-only instance,
  993. // but false for secondary instance or writable DB).
  994. class VersionSet {
  995. public:
  996. VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
  997. const FileOptions& file_options, Cache* table_cache,
  998. WriteBufferManager* write_buffer_manager,
  999. WriteController* write_controller,
  1000. BlockCacheTracer* const block_cache_tracer,
  1001. const std::shared_ptr<IOTracer>& io_tracer,
  1002. const std::string& db_id, const std::string& db_session_id,
  1003. const std::string& daily_offpeak_time_utc,
  1004. ErrorHandler* error_handler, bool unchanging);
  1005. // No copying allowed
  1006. VersionSet(const VersionSet&) = delete;
  1007. void operator=(const VersionSet&) = delete;
  1008. virtual ~VersionSet();
  1009. virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu);
  1010. Status LogAndApplyToDefaultColumnFamily(
  1011. const ReadOptions& read_options, const WriteOptions& write_options,
  1012. VersionEdit* edit, InstrumentedMutex* mu,
  1013. FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
  1014. const ColumnFamilyOptions* column_family_options = nullptr) {
  1015. ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
  1016. return LogAndApply(default_cf, read_options, write_options, edit, mu,
  1017. dir_contains_current_file, new_descriptor_log,
  1018. column_family_options);
  1019. }
  1020. // Apply *edit to the current version to form a new descriptor that
  1021. // is both saved to persistent state and installed as the new
  1022. // current version. Will release *mu while actually writing to the file.
  1023. // column_family_options has to be set if edit is column family add.
  1024. // REQUIRES: *mu is held on entry.
  1025. // REQUIRES: no other thread concurrently calls LogAndApply()
  1026. Status LogAndApply(
  1027. ColumnFamilyData* column_family_data, const ReadOptions& read_options,
  1028. const WriteOptions& write_options, VersionEdit* edit,
  1029. InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
  1030. bool new_descriptor_log = false,
  1031. const ColumnFamilyOptions* column_family_options = nullptr,
  1032. const std::function<void(const Status&)>& manifest_wcb = {},
  1033. const std::function<Status()>& pre_cb = {}) {
  1034. autovector<ColumnFamilyData*> cfds;
  1035. cfds.emplace_back(column_family_data);
  1036. autovector<autovector<VersionEdit*>> edit_lists;
  1037. autovector<VersionEdit*> edit_list;
  1038. edit_list.emplace_back(edit);
  1039. edit_lists.emplace_back(edit_list);
  1040. return LogAndApply(cfds, read_options, write_options, edit_lists, mu,
  1041. dir_contains_current_file, new_descriptor_log,
  1042. column_family_options, {manifest_wcb}, pre_cb);
  1043. }
  1044. // The batch version. If edit_list.size() > 1, caller must ensure that
  1045. // no edit in the list column family add or drop
  1046. Status LogAndApply(
  1047. ColumnFamilyData* column_family_data, const ReadOptions& read_options,
  1048. const WriteOptions& write_options,
  1049. const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
  1050. FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
  1051. const ColumnFamilyOptions* column_family_options = nullptr,
  1052. const std::function<void(const Status&)>& manifest_wcb = {},
  1053. const std::function<Status()>& pre_cb = {}) {
  1054. autovector<ColumnFamilyData*> cfds;
  1055. cfds.emplace_back(column_family_data);
  1056. autovector<autovector<VersionEdit*>> edit_lists;
  1057. edit_lists.emplace_back(edit_list);
  1058. return LogAndApply(cfds, read_options, write_options, edit_lists, mu,
  1059. dir_contains_current_file, new_descriptor_log,
  1060. column_family_options, {manifest_wcb}, pre_cb);
  1061. }
  1062. // The across-multi-cf batch version. If edit_lists contains more than
  1063. // one version edit, the caller must ensure that no edit in the lists is a
  1064. // column family manipulation.
  1065. virtual Status LogAndApply(
  1066. const autovector<ColumnFamilyData*>& cfds,
  1067. const ReadOptions& read_options, const WriteOptions& write_options,
  1068. const autovector<autovector<VersionEdit*>>& edit_lists,
  1069. InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
  1070. bool new_descriptor_log = false,
  1071. const ColumnFamilyOptions* new_cf_options = nullptr,
  1072. const std::vector<std::function<void(const Status&)>>& manifest_wcbs = {},
  1073. const std::function<Status()>& pre_cb = {});
  1074. void WakeUpWaitingManifestWriters();
  1075. // Recover the last saved descriptor (MANIFEST) from persistent storage.
  1076. // Unlike `unchanging` on the VersionSet, `read_only` here and in other
  1077. // functions below refers to the CF receiving no writes or modifications
  1078. // through this VersionSet, but could through external manifest updates
  1079. // etc. Thus, `read_only=true` for secondary instances as well as read-only
  1080. // instances.
  1081. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
  1082. bool read_only = false, std::string* db_id = nullptr,
  1083. bool no_error_if_files_missing = false, bool is_retry = false,
  1084. Status* log_status = nullptr);
  1085. // Do a best-efforts recovery (Options.best_efforts_recovery=true) from all
  1086. // available MANIFEST files. Similar to `Recover` with these differences:
  1087. // 1) not only the latest MANIFEST can be used, if it's not available or
  1088. // no successful recovery can be achieved with it, this function also tries
  1089. // to recover from previous MANIFEST files, in reverse chronological order
  1090. // until a successful recovery can be achieved.
  1091. // 2) this function doesn't just aim to recover to the latest version, if that
  1092. // is not available, the most recent point in time version will be saved in
  1093. // memory. Check doc for `VersionEditHandlerPointInTime` for more details.
  1094. Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
  1095. bool read_only,
  1096. const std::vector<std::string>& files_in_dbname,
  1097. std::string* db_id, bool* has_missing_table_file);
  1098. // Try to recover the version set to the most recent consistent state
  1099. // recorded in the specified manifest.
  1100. Status TryRecoverFromOneManifest(
  1101. const std::string& manifest_path,
  1102. const std::vector<ColumnFamilyDescriptor>& column_families,
  1103. bool read_only, std::string* db_id, bool* has_missing_table_file);
  1104. // Recover the next epoch number of each CFs and epoch number
  1105. // of their files (if missing)
  1106. void RecoverEpochNumbers();
  1107. // Reads a manifest file and returns a list of column families in
  1108. // column_families.
  1109. static Status ListColumnFamilies(std::vector<std::string>* column_families,
  1110. const std::string& dbname, FileSystem* fs);
  1111. static Status ListColumnFamiliesFromManifest(
  1112. const std::string& manifest_path, FileSystem* fs,
  1113. std::vector<std::string>* column_families);
  1114. // Try to reduce the number of levels. This call is valid when
  1115. // only one level, from the new max level to the old
  1116. // max level, contains files.
  1117. // The call is static, since number of levels is immutable during
  1118. // the lifetime of a RocksDB instance. It reduces number of levels
  1119. // in a DB by applying changes to manifest.
  1120. // For example, a db currently has 7 levels [0-6], and a call
  1121. // to reduce to 5 [0-4] can only be executed when only one level
  1122. // among [4-6] contains files.
  1123. static Status ReduceNumberOfLevels(const std::string& dbname,
  1124. const Options* options,
  1125. const FileOptions& file_options,
  1126. int new_levels);
  1127. // Get the checksum information of all live files
  1128. Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
  1129. // printf contents (for debugging)
  1130. Status DumpManifest(Options& options, std::string& manifestFileName,
  1131. bool verbose, bool hex = false, bool json = false,
  1132. const std::vector<ColumnFamilyDescriptor>& cf_descs = {});
  1133. const std::string& DbSessionId() const { return db_session_id_; }
  1134. // Return the current manifest file number
  1135. uint64_t manifest_file_number() const { return manifest_file_number_; }
  1136. uint64_t options_file_number() const { return options_file_number_; }
  1137. uint64_t pending_manifest_file_number() const {
  1138. return pending_manifest_file_number_;
  1139. }
  1140. uint64_t current_next_file_number() const { return next_file_number_.load(); }
  1141. uint64_t min_log_number_to_keep() const {
  1142. return min_log_number_to_keep_.load();
  1143. }
  1144. bool unchanging() const { return unchanging_; }
  1145. // Allocate and return a new file number
  1146. uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
  1147. // Fetch And Add n new file number
  1148. uint64_t FetchAddFileNumber(uint64_t n) {
  1149. return next_file_number_.fetch_add(n);
  1150. }
  1151. // Return the last sequence number.
  1152. uint64_t LastSequence() const {
  1153. return last_sequence_.load(std::memory_order_acquire);
  1154. }
  1155. // Note: memory_order_acquire must be sufficient.
  1156. uint64_t LastAllocatedSequence() const {
  1157. return last_allocated_sequence_.load(std::memory_order_seq_cst);
  1158. }
  1159. // Note: memory_order_acquire must be sufficient.
  1160. uint64_t LastPublishedSequence() const {
  1161. return last_published_sequence_.load(std::memory_order_seq_cst);
  1162. }
  1163. // Set the last sequence number to s.
  1164. void SetLastSequence(uint64_t s) {
  1165. assert(s >= last_sequence_);
  1166. // Last visible sequence must always be less than last written seq
  1167. assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
  1168. last_sequence_.store(s, std::memory_order_release);
  1169. }
  1170. // Note: memory_order_release must be sufficient
  1171. void SetLastPublishedSequence(uint64_t s) {
  1172. assert(s >= last_published_sequence_);
  1173. last_published_sequence_.store(s, std::memory_order_seq_cst);
  1174. }
  1175. // Note: memory_order_release must be sufficient
  1176. void SetLastAllocatedSequence(uint64_t s) {
  1177. assert(s >= last_allocated_sequence_);
  1178. last_allocated_sequence_.store(s, std::memory_order_seq_cst);
  1179. }
  1180. // Note: memory_order_release must be sufficient
  1181. uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
  1182. return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
  1183. }
  // Mark the file number `number` as used.
  // REQUIRED: only called during single-threaded recovery or repair.
  void MarkFileNumberUsed(uint64_t number);

  // Mark the log number `number` as deleted.
  // REQUIRED: only called during single-threaded recovery or repair, or from
  // ::LogAndApply where the global mutex is held.
  void MarkMinLogNumberToKeep(uint64_t number);
  1191. // Return the log file number for the log file that is currently
  1192. // being compacted, or zero if there is no such log file.
  1193. uint64_t prev_log_number() const { return prev_log_number_; }
  1194. // Returns the minimum log number which still has data not flushed to any SST
  1195. // file.
  1196. // In non-2PC mode, all the log numbers smaller than this number can be safely
  1197. // deleted, although we still use `min_log_number_to_keep_` to determine when
  1198. // to delete a WAL file.
  1199. uint64_t MinLogNumberWithUnflushedData() const {
  1200. return PreComputeMinLogNumberWithUnflushedData(nullptr);
  1201. }
  1202. // Returns the minimum log number which still has data not flushed to any SST
  1203. // file.
  1204. // Empty column families' log number is considered to be
  1205. // new_log_number_for_empty_cf.
  1206. uint64_t PreComputeMinLogNumberWithUnflushedData(
  1207. uint64_t new_log_number_for_empty_cf) const {
  1208. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
  1209. for (auto cfd : *column_family_set_) {
  1210. // It's safe to ignore dropped column families here:
  1211. // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
  1212. uint64_t num =
  1213. cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
  1214. if (min_log_num > num && !cfd->IsDropped()) {
  1215. min_log_num = num;
  1216. }
  1217. }
  1218. return min_log_num;
  1219. }
  1220. // Returns the minimum log number which still has data not flushed to any SST
  1221. // file, except data from `cfd_to_skip`.
  1222. uint64_t PreComputeMinLogNumberWithUnflushedData(
  1223. const ColumnFamilyData* cfd_to_skip) const {
  1224. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
  1225. for (auto cfd : *column_family_set_) {
  1226. if (cfd == cfd_to_skip) {
  1227. continue;
  1228. }
  1229. // It's safe to ignore dropped column families here:
  1230. // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
  1231. if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
  1232. min_log_num = cfd->GetLogNumber();
  1233. }
  1234. }
  1235. return min_log_num;
  1236. }
  1237. // Returns the minimum log number which still has data not flushed to any SST
  1238. // file, except data from `cfds_to_skip`.
  1239. uint64_t PreComputeMinLogNumberWithUnflushedData(
  1240. const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
  1241. uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
  1242. for (auto cfd : *column_family_set_) {
  1243. if (cfds_to_skip.count(cfd)) {
  1244. continue;
  1245. }
  1246. // It's safe to ignore dropped column families here:
  1247. // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
  1248. if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
  1249. min_log_num = cfd->GetLogNumber();
  1250. }
  1251. }
  1252. return min_log_num;
  1253. }
  1254. // Create an iterator that reads over the compaction inputs for "*c".
  1255. // The caller should delete the iterator when no longer needed.
  1256. // @param read_options Must outlive the returned iterator.
  1257. // @param start, end indicates compaction range
  1258. InternalIterator* MakeInputIterator(
  1259. const ReadOptions& read_options, const Compaction* c,
  1260. RangeDelAggregator* range_del_agg,
  1261. const FileOptions& file_options_compactions,
  1262. const std::optional<const Slice>& start,
  1263. const std::optional<const Slice>& end);
  1264. // Add all files listed in any live version to *live_table_files and
  1265. // *live_blob_files. Note that these lists may contain duplicates.
  1266. void AddLiveFiles(std::vector<uint64_t>* live_table_files,
  1267. std::vector<uint64_t>* live_blob_files) const;
  1268. // Remove live files that are in the delete candidate lists.
  1269. void RemoveLiveFiles(
  1270. std::vector<ObsoleteFileInfo>& sst_delete_candidates,
  1271. std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
  1272. // Return the approximate size of data to be scanned for range [start, end)
  1273. // in levels [start_level, end_level). If end_level == -1 it will search
  1274. // through all non-empty levels
  1275. uint64_t ApproximateSize(const SizeApproximationOptions& options,
  1276. const ReadOptions& read_options, Version* v,
  1277. const Slice& start, const Slice& end,
  1278. int start_level, int end_level,
  1279. TableReaderCaller caller);
  1280. // Return the size of the current manifest file
  1281. uint64_t manifest_file_size() const { return manifest_file_size_; }
  1282. Status GetMetadataForFile(uint64_t number, int* filelevel,
  1283. FileMetaData** metadata, ColumnFamilyData** cfd);
  1284. // This function doesn't support leveldb SST filenames
  1285. void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
  1286. void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
  1287. obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
  1288. }
  1289. void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
  1290. std::vector<ObsoleteBlobFileInfo>* blob_files,
  1291. std::vector<std::string>* manifest_filenames,
  1292. uint64_t min_pending_output);
  1293. // REQUIRES: DB mutex held
  1294. uint64_t GetObsoleteSstFilesSize() const;
  1295. ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
  1296. const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
  1297. const {
  1298. return column_family_set_->GetRunningColumnFamiliesTimestampSize();
  1299. }
  1300. const UnorderedMap<uint32_t, size_t>&
  1301. GetColumnFamiliesTimestampSizeForRecord() const {
  1302. return column_family_set_->GetColumnFamiliesTimestampSizeForRecord();
  1303. }
  1304. RefedColumnFamilySet GetRefedColumnFamilySet() {
  1305. return RefedColumnFamilySet(GetColumnFamilySet());
  1306. }
  1307. const FileOptions& file_options() { return file_options_; }
  1308. void ChangeFileOptions(const MutableDBOptions& new_options) {
  1309. file_options_.writable_file_max_buffer_size =
  1310. new_options.writable_file_max_buffer_size;
  1311. }
  1312. // TODO - Consider updating together when file options change in SetDBOptions
  1313. const OffpeakTimeOption& offpeak_time_option() {
  1314. return offpeak_time_option_;
  1315. }
  1316. void ChangeOffpeakTimeOption(const std::string& daily_offpeak_time_utc) {
  1317. offpeak_time_option_.SetFromOffpeakTimeString(daily_offpeak_time_utc);
  1318. }
  1319. const ImmutableDBOptions* db_options() const { return db_options_; }
  1320. static uint64_t GetNumLiveVersions(Version* dummy_versions);
  1321. static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
  1322. static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
  1323. // Get the IO Status returned by written Manifest.
  1324. const IOStatus& io_status() const { return io_status_; }
  1325. // The returned WalSet needs to be accessed with DB mutex held.
  1326. const WalSet& GetWalSet() const { return wals_; }
  1327. void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
  1328. assert(cfd);
  1329. Version* const version = new Version(
  1330. cfd, this, file_options_, cfd->GetLatestMutableCFOptions(), io_tracer_);
  1331. constexpr bool update_stats = false;
  1332. // TODO: plumb Env::IOActivity, Env::IOPriority
  1333. const ReadOptions read_options;
  1334. version->PrepareAppend(read_options, update_stats);
  1335. AppendVersion(cfd, version);
  1336. }
  1337. bool& TEST_unchanging() { return const_cast<bool&>(unchanging_); }
  1338. protected:
  1339. struct ManifestWriter;
  1340. friend class Version;
  1341. friend class VersionEditHandler;
  1342. friend class VersionEditHandlerPointInTime;
  1343. friend class DumpManifestHandler;
  1344. friend class DBImpl;
  1345. friend class DBImplReadOnly;
  1346. struct LogReporter : public log::Reader::Reporter {
  1347. Status* status;
  1348. void Corruption(size_t /*bytes*/, const Status& s,
  1349. uint64_t /*log_number*/ = kMaxSequenceNumber) override {
  1350. if (status->ok()) {
  1351. *status = s;
  1352. }
  1353. }
  1354. };
  1355. void Reset();
  1356. // Returns approximated offset of a key in a file for a given version.
  1357. uint64_t ApproximateOffsetOf(const ReadOptions& read_options, Version* v,
  1358. const FdWithKeyRange& f, const Slice& key,
  1359. TableReaderCaller caller);
  1360. // Returns approximated data size between start and end keys in a file
  1361. // for a given version.
  1362. uint64_t ApproximateSize(const ReadOptions& read_options, Version* v,
  1363. const FdWithKeyRange& f, const Slice& start,
  1364. const Slice& end, TableReaderCaller caller);
  // Per-column-family state consisting of a log number and a
  // full_history_ts_low string.
  struct MutableCFState {
    // Zero-initialized so a default-constructed state never carries an
    // indeterminate value.
    uint64_t log_number = 0;
    std::string full_history_ts_low;

    // Produces log_number == 0 and an empty full_history_ts_low.
    explicit MutableCFState() = default;

    // Stores `_log_number` and takes ownership of `ts_low` by moving it in.
    explicit MutableCFState(uint64_t _log_number, std::string ts_low)
        : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
  };
  1372. // Save current contents to *log
  1373. Status WriteCurrentStateToManifest(
  1374. const WriteOptions& write_options,
  1375. const std::unordered_map<uint32_t, MutableCFState>& curr_state,
  1376. const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
  1377. void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
  1378. ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
  1379. const ReadOptions& read_options,
  1380. const VersionEdit* edit, bool read_only);
  1381. Status VerifyFileMetadata(const ReadOptions& read_options,
  1382. ColumnFamilyData* cfd, const std::string& fpath,
  1383. int level, const FileMetaData& meta);
  1384. // Protected by DB mutex.
  1385. WalSet wals_;
  1386. std::unique_ptr<ColumnFamilySet> column_family_set_;
  1387. Cache* table_cache_;
  1388. Env* const env_;
  1389. FileSystemPtr const fs_;
  1390. SystemClock* const clock_;
  1391. const std::string dbname_;
  1392. std::string db_id_;
  1393. const ImmutableDBOptions* const db_options_;
  1394. std::atomic<uint64_t> next_file_number_;
  1395. // Any WAL number smaller than this should be ignored during recovery,
  1396. // and is qualified for being deleted.
  1397. std::atomic<uint64_t> min_log_number_to_keep_ = {0};
  1398. uint64_t manifest_file_number_;
  1399. uint64_t options_file_number_;
  1400. uint64_t options_file_size_;
  1401. uint64_t pending_manifest_file_number_;
  1402. // The last seq visible to reads. It normally indicates the last sequence in
  1403. // the memtable but when using two write queues it could also indicate the
  1404. // last sequence in the WAL visible to reads.
  1405. std::atomic<uint64_t> last_sequence_;
  1406. // The last sequence number of data committed to the descriptor (manifest
  1407. // file).
  1408. SequenceNumber descriptor_last_sequence_ = 0;
  1409. // The last seq that is already allocated. It is applicable only when we have
  1410. // two write queues. In that case seq might or might not have appreated in
  1411. // memtable but it is expected to appear in the WAL.
  1412. // We have last_sequence <= last_allocated_sequence_
  1413. std::atomic<uint64_t> last_allocated_sequence_;
  1414. // The last allocated sequence that is also published to the readers. This is
  1415. // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
  1416. // last_sequence_ also indicates the last published seq.
  1417. // We have last_sequence <= last_published_sequence_ <=
  1418. // last_allocated_sequence_
  1419. std::atomic<uint64_t> last_published_sequence_;
  1420. uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
  1421. // Opened lazily
  1422. std::unique_ptr<log::Writer> descriptor_log_;
  1423. // generates a increasing version number for every new version
  1424. uint64_t current_version_number_;
  1425. // Queue of writers to the manifest file
  1426. std::deque<ManifestWriter*> manifest_writers_;
  1427. // Current size of manifest file
  1428. uint64_t manifest_file_size_;
  1429. // Obsolete files, or during DB shutdown any files not referenced by what's
  1430. // left of the in-memory LSM state.
  1431. std::vector<ObsoleteFileInfo> obsolete_files_;
  1432. std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
  1433. std::vector<std::string> obsolete_manifests_;
  1434. // env options for all reads and writes except compactions
  1435. FileOptions file_options_;
  1436. BlockCacheTracer* const block_cache_tracer_;
  1437. // Store the IO status when Manifest is written
  1438. IOStatus io_status_;
  1439. std::shared_ptr<IOTracer> io_tracer_;
  1440. std::string db_session_id_;
  1441. // Off-peak time option used for compaction scoring
  1442. OffpeakTimeOption offpeak_time_option_;
  1443. // Pointer to the DB's ErrorHandler.
  1444. ErrorHandler* const error_handler_;
  1445. private:
  1446. // REQUIRES db mutex at beginning. may release and re-acquire db mutex
  1447. Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
  1448. InstrumentedMutex* mu,
  1449. FSDirectory* dir_contains_current_file,
  1450. bool new_descriptor_log,
  1451. const ColumnFamilyOptions* new_cf_options,
  1452. const ReadOptions& read_options,
  1453. const WriteOptions& write_options);
  1454. void LogAndApplyCFHelper(VersionEdit* edit,
  1455. SequenceNumber* max_last_sequence);
  1456. Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
  1457. VersionEdit* edit, SequenceNumber* max_last_sequence,
  1458. InstrumentedMutex* mu);
  1459. const bool unchanging_;
  1460. bool closed_;
  1461. };
  1462. // ReactiveVersionSet represents a collection of versions of the column
  1463. // families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
  1464. // need to replay the MANIFEST (description log in older terms) in order to
  1465. // reconstruct and install versions.
  1466. class ReactiveVersionSet : public VersionSet {
  1467. public:
  1468. ReactiveVersionSet(const std::string& dbname,
  1469. const ImmutableDBOptions* _db_options,
  1470. const FileOptions& _file_options, Cache* table_cache,
  1471. WriteBufferManager* write_buffer_manager,
  1472. WriteController* write_controller,
  1473. const std::shared_ptr<IOTracer>& io_tracer);
  1474. ~ReactiveVersionSet() override;
  1475. Status Close(FSDirectory* /*db_dir*/, InstrumentedMutex* /*mu*/) override {
  1476. return Status::OK();
  1477. }
  1478. Status ReadAndApply(
  1479. InstrumentedMutex* mu,
  1480. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
  1481. Status* manifest_read_status,
  1482. std::unordered_set<ColumnFamilyData*>* cfds_changed,
  1483. std::vector<std::string>* files_to_delete);
  1484. Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
  1485. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
  1486. std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
  1487. std::unique_ptr<Status>* manifest_reader_status);
  1488. #ifndef NDEBUG
  1489. uint64_t TEST_read_edits_in_atomic_group() const;
  1490. #endif //! NDEBUG
  1491. std::vector<VersionEdit>& replay_buffer();
  1492. protected:
  1493. // REQUIRES db mutex
  1494. Status ApplyOneVersionEditToBuilder(
  1495. VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
  1496. VersionEdit* version_edit);
  1497. Status MaybeSwitchManifest(
  1498. log::Reader::Reporter* reporter,
  1499. std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
  1500. private:
  1501. std::unique_ptr<ManifestTailer> manifest_tailer_;
  1502. // TODO: plumb Env::IOActivity, Env::IOPriority
  1503. const ReadOptions read_options_;
  1504. using VersionSet::LogAndApply;
  1505. using VersionSet::Recover;
  1506. Status LogAndApply(
  1507. const autovector<ColumnFamilyData*>& /*cfds*/,
  1508. const ReadOptions& /* read_options */,
  1509. const WriteOptions& /* write_options */,
  1510. const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
  1511. InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
  1512. bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/,
  1513. const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/,
  1514. const std::function<Status()>& /*pre_cb*/) override {
  1515. return Status::NotSupported("not supported in reactive mode");
  1516. }
  1517. // No copy allowed
  1518. ReactiveVersionSet(const ReactiveVersionSet&);
  1519. ReactiveVersionSet& operator=(const ReactiveVersionSet&);
  1520. };
  1521. } // namespace ROCKSDB_NAMESPACE