version_edit.h 38 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <algorithm>
  11. #include <optional>
  12. #include <set>
  13. #include <string>
  14. #include <utility>
  15. #include <vector>
  16. #include "db/blob/blob_file_addition.h"
  17. #include "db/blob/blob_file_garbage.h"
  18. #include "db/dbformat.h"
  19. #include "db/wal_edit.h"
  20. #include "memory/arena.h"
  21. #include "port/malloc.h"
  22. #include "rocksdb/advanced_cache.h"
  23. #include "rocksdb/advanced_options.h"
  24. #include "table/table_reader.h"
  25. #include "table/unique_id_impl.h"
  26. #include "util/autovector.h"
  27. namespace ROCKSDB_NAMESPACE {
  28. // Tag numbers for serialized VersionEdit. These numbers are written to
  29. // disk and should not be changed. The number should be forward compatible so
  30. // users can down-grade RocksDB safely. A future Tag is ignored by doing '&'
  31. // between Tag and kTagSafeIgnoreMask field.
  32. enum Tag : uint32_t {
  33. kComparator = 1,
  34. kLogNumber = 2,
  35. kNextFileNumber = 3,
  36. kLastSequence = 4,
  37. kCompactCursor = 5,
  38. kDeletedFile = 6,
  39. kNewFile = 7,
  40. // 8 was used for large value refs
  41. kPrevLogNumber = 9,
  42. kMinLogNumberToKeep = 10,
  43. // these are new formats divergent from open source leveldb
  44. kNewFile2 = 100,
  45. kNewFile3 = 102,
  46. kNewFile4 = 103, // 4th (the latest) format version of adding files
  47. kColumnFamily = 200, // specify column family for version edit
  48. kColumnFamilyAdd = 201,
  49. kColumnFamilyDrop = 202,
  50. kMaxColumnFamily = 203,
  51. kInAtomicGroup = 300,
  52. kBlobFileAddition = 400,
  53. kBlobFileGarbage,
  54. // Mask for an unidentified tag from the future which can be safely ignored.
  55. kTagSafeIgnoreMask = 1 << 13,
  56. // Forward compatible (aka ignorable) records
  57. kDbId,
  58. kBlobFileAddition_DEPRECATED,
  59. kBlobFileGarbage_DEPRECATED,
  60. kWalAddition,
  61. kWalDeletion,
  62. kFullHistoryTsLow,
  63. kWalAddition2,
  64. kWalDeletion2,
  65. kPersistUserDefinedTimestamps,
  66. kSubcompactionProgress,
  67. };
  68. enum SubcompactionProgressPerLevelCustomTag : uint32_t {
  69. kSubcompactionProgressPerLevelTerminate = 1, // End of fields marker
  70. kOutputFilesDelta = 2,
  71. kNumProcessedOutputRecords = 3,
  72. kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask = 1 << 16,
  73. };
  74. enum SubcompactionProgressCustomTag : uint32_t {
  75. kSubcompactionProgressTerminate = 1, // End of fields marker
  76. kNextInternalKeyToCompact = 2,
  77. kNumProcessedInputRecords = 3,
  78. kOutputLevelProgress = 4,
  79. kProximalOutputLevelProgress = 5,
  80. kSubcompactionProgressCustomTagSafeIgnoreMask = 1 << 16,
  81. };
  82. enum NewFileCustomTag : uint32_t {
  83. kTerminate = 1, // The end of customized fields
  84. kNeedCompaction = 2,
  85. // Since Manifest is not entirely forward-compatible, we currently encode
  86. // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
  87. // when manifest becomes forward-compatible.
  88. kMinLogNumberToKeepHack = 3,
  89. kOldestBlobFileNumber = 4,
  90. kOldestAncesterTime = 5,
  91. kFileCreationTime = 6,
  92. kFileChecksum = 7,
  93. kFileChecksumFuncName = 8,
  94. kTemperature = 9,
  95. kMinTimestamp = 10,
  96. kMaxTimestamp = 11,
  97. kUniqueId = 12,
  98. kEpochNumber = 13,
  99. kCompensatedRangeDeletionSize = 14,
  100. kTailSize = 15,
  101. kUserDefinedTimestampsPersisted = 16,
  102. // If this bit for the custom tag is set, opening DB should fail if
  103. // we don't know this field.
  104. kCustomTagNonSafeIgnoreMask = 1 << 6,
  105. // Forward incompatible (aka unignorable) fields
  106. kPathId,
  107. };
  108. class VersionSet;
  109. constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
  110. constexpr uint64_t kUnknownOldestAncesterTime = 0;
  111. constexpr uint64_t kUnknownNewestKeyTime = 0;
  112. constexpr uint64_t kUnknownFileCreationTime = 0;
  113. constexpr uint64_t kUnknownEpochNumber = 0;
  114. // If `Options::cf_allow_ingest_behind` is true, this epoch number
  115. // will be dedicated to files ingested behind.
  116. constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1;
  117. uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
  118. // A copyable structure contains information needed to read data from an SST
  119. // file. It can contain a pointer to a table reader opened for the file, or
  120. // file number and size, which can be used to create a new table reader for it.
  121. // The behavior is undefined when a copied of the structure is used when the
  122. // file is not in any live version any more.
  123. struct FileDescriptor {
  124. // Table reader in table_reader_handle
  125. TableReader* table_reader;
  126. uint64_t packed_number_and_path_id;
  127. uint64_t file_size; // File size in bytes
  128. SequenceNumber smallest_seqno; // The smallest seqno in this file
  129. SequenceNumber largest_seqno; // The largest seqno in this file
  130. FileDescriptor() : FileDescriptor(0, 0, 0) {}
  131. FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
  132. : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
  133. FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
  134. SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
  135. : table_reader(nullptr),
  136. packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
  137. file_size(_file_size),
  138. smallest_seqno(_smallest_seqno),
  139. largest_seqno(_largest_seqno) {}
  140. FileDescriptor(const FileDescriptor& fd) { *this = fd; }
  141. FileDescriptor& operator=(const FileDescriptor& fd) {
  142. table_reader = fd.table_reader;
  143. packed_number_and_path_id = fd.packed_number_and_path_id;
  144. file_size = fd.file_size;
  145. smallest_seqno = fd.smallest_seqno;
  146. largest_seqno = fd.largest_seqno;
  147. return *this;
  148. }
  149. uint64_t GetNumber() const {
  150. return packed_number_and_path_id & kFileNumberMask;
  151. }
  152. uint32_t GetPathId() const {
  153. return static_cast<uint32_t>(packed_number_and_path_id /
  154. (kFileNumberMask + 1));
  155. }
  156. uint64_t GetFileSize() const { return file_size; }
  157. };
  158. struct FileSampledStats {
  159. FileSampledStats() : num_reads_sampled(0) {}
  160. FileSampledStats(const FileSampledStats& other) { *this = other; }
  161. FileSampledStats& operator=(const FileSampledStats& other) {
  162. num_reads_sampled = other.num_reads_sampled.load();
  163. return *this;
  164. }
  165. // number of user reads to this file.
  166. mutable std::atomic<uint64_t> num_reads_sampled;
  167. };
  168. struct FileMetaData {
  169. FileDescriptor fd;
  170. InternalKey smallest; // Smallest internal key served by table
  171. InternalKey largest; // Largest internal key served by table
  172. // Needs to be disposed when refs becomes 0.
  173. Cache::Handle* table_reader_handle = nullptr;
  174. FileSampledStats stats;
  175. // Stats for compensating deletion entries during compaction
  176. // File size compensated by deletion entry.
  177. // This is used to compute a file's compaction priority, and is updated in
  178. // Version::ComputeCompensatedSizes() first time when the file is created or
  179. // loaded. After it is updated (!= 0), it is immutable.
  180. uint64_t compensated_file_size = 0;
  181. // These values can mutate, but they can only be read or written from
  182. // single-threaded LogAndApply thread
  183. uint64_t num_entries =
  184. 0; // The number of entries, including deletions and range deletions.
  185. // The number of deletion entries, including range deletions.
  186. uint64_t num_deletions = 0;
  187. uint64_t raw_key_size = 0; // total uncompressed key size.
  188. uint64_t raw_value_size = 0; // total uncompressed value size.
  189. uint64_t num_range_deletions = 0;
  190. // This is computed during Flush/Compaction, and is added to
  191. // `compensated_file_size`. Currently, this estimates the size of keys in the
  192. // next level covered by range tombstones in this file.
  193. uint64_t compensated_range_deletion_size = 0;
  194. int refs = 0; // Reference count
  195. bool being_compacted = false; // Is this file undergoing compaction?
  196. bool init_stats_from_file = false; // true if the data-entry stats of this
  197. // file has initialized from file.
  198. bool marked_for_compaction = false; // True if client asked us nicely to
  199. // compact this file.
  200. Temperature temperature = Temperature::kUnknown;
  201. // Used only in BlobDB. The file number of the oldest blob file this SST file
  202. // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
  203. uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
  204. // For flush output file, oldest ancestor time is the oldest key time in the
  205. // file. If the oldest key time is not available, flush time is used.
  206. //
  207. // For compaction output file, oldest ancestor time is the oldest
  208. // among all the oldest key time of its input files, since the file could be
  209. // the compaction output from other SST files, which could in turn be outputs
  210. // for compact older SST files. If that's not available, creation time of this
  211. // compaction output file is used.
  212. //
  213. // 0 means the information is not available.
  214. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
  215. // Unix time when the SST file is created.
  216. uint64_t file_creation_time = kUnknownFileCreationTime;
  217. // The order of a file being flushed or ingested/imported.
  218. // Compaction output file will be assigned with the minimum `epoch_number`
  219. // among input files'.
  220. // For L0, larger `epoch_number` indicates newer L0 file.
  221. uint64_t epoch_number = kUnknownEpochNumber;
  222. // File checksum
  223. std::string file_checksum = kUnknownFileChecksum;
  224. // File checksum function name
  225. std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
  226. // SST unique id
  227. UniqueId64x2 unique_id{};
  228. // Size of the "tail" part of a SST file
  229. // "Tail" refers to all blocks after data blocks till the end of the SST file
  230. uint64_t tail_size = 0;
  231. // Value of the `AdvancedColumnFamilyOptions.persist_user_defined_timestamps`
  232. // flag when the file is created. Default to true, only when this flag is
  233. // false, it's explicitly written to Manifest.
  234. bool user_defined_timestamps_persisted = true;
  235. FileMetaData() = default;
  236. FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
  237. const InternalKey& smallest_key, const InternalKey& largest_key,
  238. const SequenceNumber& smallest_seq,
  239. const SequenceNumber& largest_seq, bool marked_for_compact,
  240. Temperature _temperature, uint64_t oldest_blob_file,
  241. uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
  242. uint64_t _epoch_number, const std::string& _file_checksum,
  243. const std::string& _file_checksum_func_name,
  244. UniqueId64x2 _unique_id,
  245. const uint64_t _compensated_range_deletion_size,
  246. uint64_t _tail_size, bool _user_defined_timestamps_persisted)
  247. : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
  248. smallest(smallest_key),
  249. largest(largest_key),
  250. compensated_range_deletion_size(_compensated_range_deletion_size),
  251. marked_for_compaction(marked_for_compact),
  252. temperature(_temperature),
  253. oldest_blob_file_number(oldest_blob_file),
  254. oldest_ancester_time(_oldest_ancester_time),
  255. file_creation_time(_file_creation_time),
  256. epoch_number(_epoch_number),
  257. file_checksum(_file_checksum),
  258. file_checksum_func_name(_file_checksum_func_name),
  259. unique_id(std::move(_unique_id)),
  260. tail_size(_tail_size),
  261. user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {
  262. TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
  263. }
  264. // REQUIRED: Keys must be given to the function in sorted order (it expects
  265. // the last key to be the largest).
  266. Status UpdateBoundaries(const Slice& key, const Slice& value,
  267. SequenceNumber seqno, ValueType value_type);
  268. // Unlike UpdateBoundaries, ranges do not need to be presented in any
  269. // particular order.
  270. void UpdateBoundariesForRange(const InternalKey& start,
  271. const InternalKey& end, SequenceNumber seqno,
  272. const InternalKeyComparator& icmp) {
  273. if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
  274. smallest = start;
  275. }
  276. if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
  277. largest = end;
  278. }
  279. assert(icmp.Compare(smallest, largest) <= 0);
  280. fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
  281. fd.largest_seqno = std::max(fd.largest_seqno, seqno);
  282. }
  283. // Try to get oldest ancester time from the class itself or table properties
  284. // if table reader is already pinned.
  285. // 0 means the information is not available.
  286. uint64_t TryGetOldestAncesterTime() {
  287. if (oldest_ancester_time != kUnknownOldestAncesterTime) {
  288. return oldest_ancester_time;
  289. } else if (fd.table_reader != nullptr &&
  290. fd.table_reader->GetTableProperties() != nullptr) {
  291. return fd.table_reader->GetTableProperties()->creation_time;
  292. }
  293. return kUnknownOldestAncesterTime;
  294. }
  295. uint64_t TryGetFileCreationTime() {
  296. if (file_creation_time != kUnknownFileCreationTime) {
  297. return file_creation_time;
  298. } else if (fd.table_reader != nullptr &&
  299. fd.table_reader->GetTableProperties() != nullptr) {
  300. return fd.table_reader->GetTableProperties()->file_creation_time;
  301. }
  302. return kUnknownFileCreationTime;
  303. }
  304. // Tries to get the newest key time from the current file
  305. // Falls back on oldest ancestor time of previous (newer) file
  306. uint64_t TryGetNewestKeyTime(FileMetaData* prev_file = nullptr) {
  307. if (fd.table_reader != nullptr &&
  308. fd.table_reader->GetTableProperties() != nullptr) {
  309. uint64_t newest_key_time =
  310. fd.table_reader->GetTableProperties()->newest_key_time;
  311. if (newest_key_time != kUnknownNewestKeyTime) {
  312. return newest_key_time;
  313. }
  314. }
  315. if (prev_file != nullptr) {
  316. uint64_t prev_oldest_ancestor_time =
  317. prev_file->TryGetOldestAncesterTime();
  318. if (prev_oldest_ancestor_time != kUnknownOldestAncesterTime) {
  319. return prev_oldest_ancestor_time;
  320. }
  321. }
  322. return kUnknownNewestKeyTime;
  323. }
  324. // WARNING: manual update to this function is needed
  325. // whenever a new string property is added to FileMetaData
  326. // to reduce approximation error.
  327. //
  328. // TODO: eliminate the need of manually updating this function
  329. // for new string properties
  330. size_t ApproximateMemoryUsage() const {
  331. size_t usage = 0;
  332. #ifdef ROCKSDB_MALLOC_USABLE_SIZE
  333. usage += malloc_usable_size(const_cast<FileMetaData*>(this));
  334. #else
  335. usage += sizeof(*this);
  336. #endif // ROCKSDB_MALLOC_USABLE_SIZE
  337. usage += smallest.size() + largest.size() + file_checksum.size() +
  338. file_checksum_func_name.size();
  339. return usage;
  340. }
  341. // Returns whether this file is one with just one range tombstone. These type
  342. // of file should always be marked for compaction.
  343. bool FileIsStandAloneRangeTombstone() const {
  344. bool res = num_range_deletions == 1 && num_entries == num_range_deletions;
  345. assert(!res || fd.smallest_seqno == fd.largest_seqno);
  346. return res;
  347. }
  348. static uint64_t CalculateTailSize(uint64_t file_size,
  349. const TableProperties& props) {
  350. #ifndef NDEBUG
  351. bool skip = false;
  352. TEST_SYNC_POINT_CALLBACK("FileMetaData::CalculateTailSize", &skip);
  353. if (skip) {
  354. return 0;
  355. }
  356. #endif // NDEBUG
  357. uint64_t tail_size = 0;
  358. // Differentiate between a file with no data blocks (tail_start_offset = 0)
  359. // and a file with unknown tail_start_offset (also set to 0 due to
  360. // non-negative integer storage limitation)
  361. bool contain_no_data_blocks =
  362. props.num_entries == 0 ||
  363. (props.num_entries > 0 &&
  364. (props.num_entries == props.num_range_deletions));
  365. if (props.tail_start_offset > 0 || contain_no_data_blocks) {
  366. assert(props.tail_start_offset <= file_size);
  367. tail_size = file_size - props.tail_start_offset;
  368. }
  369. return tail_size;
  370. }
  371. };
  372. // A compressed copy of file meta data that just contain minimum data needed
  373. // to serve read operations, while still keeping the pointer to full metadata
  374. // of the file in case it is needed.
  375. struct FdWithKeyRange {
  376. FileDescriptor fd;
  377. FileMetaData* file_metadata; // Point to all metadata
  378. Slice smallest_key; // slice that contain smallest key
  379. Slice largest_key; // slice that contain largest key
  380. FdWithKeyRange()
  381. : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
  382. FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
  383. FileMetaData* _file_metadata)
  384. : fd(_fd),
  385. file_metadata(_file_metadata),
  386. smallest_key(_smallest_key),
  387. largest_key(_largest_key) {}
  388. };
  389. // Data structure to store an array of FdWithKeyRange in one level
  390. // Actual data is guaranteed to be stored closely
  391. struct LevelFilesBrief {
  392. size_t num_files;
  393. FdWithKeyRange* files;
  394. LevelFilesBrief() {
  395. num_files = 0;
  396. files = nullptr;
  397. }
  398. };
  399. struct SubcompactionProgressPerLevel {
  400. uint64_t GetNumProcessedOutputRecords() const {
  401. return num_processed_output_records_;
  402. }
  403. void SetNumProcessedOutputRecords(uint64_t num) {
  404. num_processed_output_records_ = num;
  405. }
  406. const autovector<FileMetaData>& GetOutputFiles() const {
  407. return output_files_;
  408. }
  409. void AddToOutputFiles(const FileMetaData& file) {
  410. output_files_.push_back(file);
  411. }
  412. size_t GetLastPersistedOutputFilesCount() const {
  413. return last_persisted_output_files_count_;
  414. }
  415. void UpdateLastPersistedOutputFilesCount() {
  416. last_persisted_output_files_count_ = output_files_.size();
  417. }
  418. void EncodeTo(std::string* dst) const;
  419. Status DecodeFrom(Slice* input);
  420. void Clear() {
  421. num_processed_output_records_ = 0;
  422. output_files_.clear();
  423. last_persisted_output_files_count_ = 0;
  424. }
  425. std::string ToString() const {
  426. std::ostringstream oss;
  427. oss << "SubcompactionProgressPerLevel{";
  428. oss << " num_processed_output_records=" << num_processed_output_records_;
  429. oss << ", output_files_count=" << output_files_.size();
  430. oss << ", last_persisted_output_files_count="
  431. << last_persisted_output_files_count_;
  432. oss << " }";
  433. return oss.str();
  434. }
  435. void TEST_ClearOutputFiles() { output_files_.clear(); }
  436. private:
  437. uint64_t num_processed_output_records_ = 0;
  438. autovector<FileMetaData> output_files_ = {};
  439. // Number of files already persisted to help calculate the new output files to
  440. // persist in the future. This is to prevent having to persist all the output
  441. // files metadata so far every time of a "snapshot" of a progress is persisted
  442. // which can lead to O(1+2+...+n) = O(n^2) file metadata being persisted. The
  443. // current approach of persisting only the delta should always persist
  444. // exactly the number (n) of output files in total.
  445. size_t last_persisted_output_files_count_ = 0;
  446. void EncodeOutputFiles(std::string* dst) const;
  447. Status DecodeOutputFiles(Slice* input,
  448. autovector<FileMetaData>& temp_storage);
  449. };
  450. struct SubcompactionProgress {
  451. std::string next_internal_key_to_compact;
  452. uint64_t num_processed_input_records = 0;
  453. SubcompactionProgressPerLevel output_level_progress;
  454. SubcompactionProgressPerLevel proximal_output_level_progress;
  455. SubcompactionProgress() = default;
  456. void Clear() {
  457. next_internal_key_to_compact.clear();
  458. num_processed_input_records = 0;
  459. output_level_progress.Clear();
  460. proximal_output_level_progress.Clear();
  461. }
  462. void EncodeTo(std::string* dst) const;
  463. Status DecodeFrom(Slice* input);
  464. std::string ToString() const {
  465. std::ostringstream oss;
  466. oss << "SubcompactionProgress{";
  467. oss << " next_internal_key_to_compact=";
  468. if (next_internal_key_to_compact.empty()) {
  469. oss << "";
  470. } else {
  471. ParsedInternalKey parsed_key;
  472. Slice key_slice(next_internal_key_to_compact);
  473. if (ParseInternalKey(key_slice, &parsed_key, false /* log_err_key */)
  474. .ok()) {
  475. oss << "user_key=\"" << parsed_key.user_key.ToString(false /* hex */)
  476. << "\" (hex:" << parsed_key.user_key.ToString(true /* hex */)
  477. << ")";
  478. oss << ", seq=";
  479. if (parsed_key.sequence == kMaxSequenceNumber) {
  480. oss << "kMaxSequenceNumber";
  481. } else {
  482. oss << parsed_key.sequence;
  483. }
  484. oss << ", type=" << static_cast<int>(parsed_key.type);
  485. } else {
  486. oss << "raw=" << key_slice.ToString(true /* hex */);
  487. }
  488. }
  489. oss << ", num_processed_input_records=" << num_processed_input_records;
  490. oss << ", output_level_progress=" << output_level_progress.ToString();
  491. oss << ", proximal_output_level_progress="
  492. << proximal_output_level_progress.ToString();
  493. oss << " }";
  494. return oss.str();
  495. }
  496. };
  497. class VersionEdit;
  498. // Builder class to reconstruct complete subcompaction progress object
  499. // from multiple decoded VersionEdits containing delta output files information
  500. // of the same subcompaction. See
  501. // `SubcompactionProgressPerLevel::last_persisted_output_files_count_`'s comment
  502. //
  503. // WARNING: This class currently assumes all input VersionEdits contain progress
  504. // information for the SAME subcompaction. It does not validate
  505. // progress data from different subcompactions so mixing progress from
  506. // multiple subcompactions can result in corrupted state silently. The caller is
  507. // responsible for ensuring all VersionEdits processed by a single instance
  508. // of this builder correspond to the same subcompaction.
  509. class SubcompactionProgressBuilder {
  510. public:
  511. SubcompactionProgressBuilder() = default;
  512. bool ProcessVersionEdit(const VersionEdit& edit);
  513. const SubcompactionProgress& GetAccumulatedSubcompactionProgress() const {
  514. return accumulated_subcompaction_progress_;
  515. }
  516. bool HasAccumulatedSubcompactionProgress() const {
  517. return has_subcompaction_progress_;
  518. }
  519. void Clear();
  520. private:
  521. void MergeDeltaProgress(const SubcompactionProgress& delta_progress);
  522. void MaybeMergeDeltaProgressPerLevel(
  523. SubcompactionProgressPerLevel& accumulated_level_progress,
  524. const SubcompactionProgressPerLevel& delta_level_progress);
  525. SubcompactionProgress accumulated_subcompaction_progress_;
  526. bool has_subcompaction_progress_ = false;
  527. };
  528. // Type alias for backward compatibility - vector of subcompaction progress
  529. using CompactionProgress = std::vector<SubcompactionProgress>;
  530. // The state of a DB at any given time is referred to as a Version.
  531. // Any modification to the Version is considered a Version Edit. A Version is
  532. // constructed by joining a sequence of Version Edits. Version Edits are written
  533. // to the MANIFEST file.
  534. class VersionEdit {
  535. public:
  536. // Retrieve the table files added as well as their associated levels.
  537. using NewFiles = std::vector<std::pair<int, FileMetaData>>;
  538. static void EncodeToNewFile4(const FileMetaData& f, int level, size_t ts_sz,
  539. bool has_min_log_number_to_keep,
  540. uint64_t min_log_number_to_keep,
  541. bool& min_log_num_written, std::string* dst);
  542. static const char* DecodeNewFile4From(Slice* input, int& max_level,
  543. uint64_t& min_log_number_to_keep,
  544. bool& has_min_log_number_to_keep,
  545. NewFiles& new_files, FileMetaData& f);
  546. void Clear();
  547. void SetDBId(const std::string& db_id) {
  548. has_db_id_ = true;
  549. db_id_ = db_id;
  550. }
  551. bool HasDbId() const { return has_db_id_; }
  552. const std::string& GetDbId() const { return db_id_; }
  553. void SetComparatorName(const Slice& name) {
  554. has_comparator_ = true;
  555. comparator_ = name.ToString();
  556. }
  557. bool HasComparatorName() const { return has_comparator_; }
  558. const std::string& GetComparatorName() const { return comparator_; }
  559. void SetPersistUserDefinedTimestamps(bool persist_user_defined_timestamps) {
  560. has_persist_user_defined_timestamps_ = true;
  561. persist_user_defined_timestamps_ = persist_user_defined_timestamps;
  562. }
  563. bool HasPersistUserDefinedTimestamps() const {
  564. return has_persist_user_defined_timestamps_;
  565. }
  566. bool GetPersistUserDefinedTimestamps() const {
  567. return persist_user_defined_timestamps_;
  568. }
  569. void SetLogNumber(uint64_t num) {
  570. has_log_number_ = true;
  571. log_number_ = num;
  572. }
  573. bool HasLogNumber() const { return has_log_number_; }
  574. uint64_t GetLogNumber() const { return log_number_; }
  575. void SetPrevLogNumber(uint64_t num) {
  576. has_prev_log_number_ = true;
  577. prev_log_number_ = num;
  578. }
  579. bool HasPrevLogNumber() const { return has_prev_log_number_; }
  580. uint64_t GetPrevLogNumber() const { return prev_log_number_; }
  581. void SetNextFile(uint64_t num) {
  582. has_next_file_number_ = true;
  583. next_file_number_ = num;
  584. }
  585. bool HasNextFile() const { return has_next_file_number_; }
  586. uint64_t GetNextFile() const { return next_file_number_; }
  587. void SetMaxColumnFamily(uint32_t max_column_family) {
  588. has_max_column_family_ = true;
  589. max_column_family_ = max_column_family;
  590. }
  591. bool HasMaxColumnFamily() const { return has_max_column_family_; }
  592. uint32_t GetMaxColumnFamily() const { return max_column_family_; }
  593. void SetMinLogNumberToKeep(uint64_t num) {
  594. has_min_log_number_to_keep_ = true;
  595. min_log_number_to_keep_ = num;
  596. }
  597. bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
  598. uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
  599. void SetLastSequence(SequenceNumber seq) {
  600. has_last_sequence_ = true;
  601. last_sequence_ = seq;
  602. }
  603. bool HasLastSequence() const { return has_last_sequence_; }
  604. SequenceNumber GetLastSequence() const { return last_sequence_; }
  605. // Delete the specified table file from the specified level.
  606. void DeleteFile(int level, uint64_t file) {
  607. deleted_files_.emplace(level, file);
  608. }
  609. // Retrieve the table files deleted as well as their associated levels.
  610. using DeletedFiles = std::set<std::pair<int, uint64_t>>;
  611. const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
  612. // Add the specified table file at the specified level.
  613. // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
  614. // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
  615. // referred to by this file if any, kInvalidBlobFileNumber otherwise.
  616. void AddFile(int level, uint64_t file, uint32_t file_path_id,
  617. uint64_t file_size, const InternalKey& smallest,
  618. const InternalKey& largest, const SequenceNumber& smallest_seqno,
  619. const SequenceNumber& largest_seqno, bool marked_for_compaction,
  620. Temperature temperature, uint64_t oldest_blob_file_number,
  621. uint64_t oldest_ancester_time, uint64_t file_creation_time,
  622. uint64_t epoch_number, const std::string& file_checksum,
  623. const std::string& file_checksum_func_name,
  624. const UniqueId64x2& unique_id,
  625. const uint64_t compensated_range_deletion_size,
  626. uint64_t tail_size, bool user_defined_timestamps_persisted) {
  627. assert(smallest_seqno <= largest_seqno);
  628. new_files_.emplace_back(
  629. level,
  630. FileMetaData(file, file_path_id, file_size, smallest, largest,
  631. smallest_seqno, largest_seqno, marked_for_compaction,
  632. temperature, oldest_blob_file_number, oldest_ancester_time,
  633. file_creation_time, epoch_number, file_checksum,
  634. file_checksum_func_name, unique_id,
  635. compensated_range_deletion_size, tail_size,
  636. user_defined_timestamps_persisted));
  637. files_to_quarantine_.push_back(file);
  638. if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
  639. SetLastSequence(largest_seqno);
  640. }
  641. }
  642. void AddFile(int level, const FileMetaData& f) {
  643. assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
  644. new_files_.emplace_back(level, f);
  645. files_to_quarantine_.push_back(f.fd.GetNumber());
  646. if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
  647. SetLastSequence(f.fd.largest_seqno);
  648. }
  649. }
  650. const NewFiles& GetNewFiles() const { return new_files_; }
  651. NewFiles& GetMutableNewFiles() { return new_files_; }
  652. // Retrieve all the compact cursors
  653. using CompactCursors = std::vector<std::pair<int, InternalKey>>;
  654. const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
  655. void AddCompactCursor(int level, const InternalKey& cursor) {
  656. compact_cursors_.push_back(std::make_pair(level, cursor));
  657. }
  658. void SetCompactCursors(
  659. const std::vector<InternalKey>& compact_cursors_by_level) {
  660. compact_cursors_.clear();
  661. compact_cursors_.reserve(compact_cursors_by_level.size());
  662. for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
  663. if (compact_cursors_by_level[i].Valid()) {
  664. compact_cursors_.push_back(
  665. std::make_pair(i, compact_cursors_by_level[i]));
  666. }
  667. }
  668. }
  669. // Add a new blob file.
  670. void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
  671. uint64_t total_blob_bytes, std::string checksum_method,
  672. std::string checksum_value) {
  673. blob_file_additions_.emplace_back(
  674. blob_file_number, total_blob_count, total_blob_bytes,
  675. std::move(checksum_method), std::move(checksum_value));
  676. files_to_quarantine_.push_back(blob_file_number);
  677. }
  678. void AddBlobFile(BlobFileAddition blob_file_addition) {
  679. blob_file_additions_.emplace_back(std::move(blob_file_addition));
  680. files_to_quarantine_.push_back(
  681. blob_file_additions_.back().GetBlobFileNumber());
  682. }
  683. // Retrieve all the blob files added.
  684. using BlobFileAdditions = std::vector<BlobFileAddition>;
  685. const BlobFileAdditions& GetBlobFileAdditions() const {
  686. return blob_file_additions_;
  687. }
  688. void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
  689. assert(blob_file_additions_.empty());
  690. blob_file_additions_ = std::move(blob_file_additions);
  691. std::for_each(
  692. blob_file_additions_.begin(), blob_file_additions_.end(),
  693. [&](const BlobFileAddition& blob_file) {
  694. files_to_quarantine_.push_back(blob_file.GetBlobFileNumber());
  695. });
  696. }
  697. // Add garbage for an existing blob file. Note: intentionally broken English
  698. // follows.
  699. void AddBlobFileGarbage(uint64_t blob_file_number,
  700. uint64_t garbage_blob_count,
  701. uint64_t garbage_blob_bytes) {
  702. blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
  703. garbage_blob_bytes);
  704. }
  705. void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
  706. blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
  707. }
  708. // Retrieve all the blob file garbage added.
  709. using BlobFileGarbages = std::vector<BlobFileGarbage>;
  710. const BlobFileGarbages& GetBlobFileGarbages() const {
  711. return blob_file_garbages_;
  712. }
  713. void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
  714. assert(blob_file_garbages_.empty());
  715. blob_file_garbages_ = std::move(blob_file_garbages);
  716. }
  717. // Add a WAL (either just created or closed).
  718. // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
  719. void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
  720. assert(NumEntries() == wal_additions_.size());
  721. wal_additions_.emplace_back(number, std::move(metadata));
  722. }
  723. // Retrieve all the added WALs.
  724. const WalAdditions& GetWalAdditions() const { return wal_additions_; }
  725. bool IsWalAddition() const { return !wal_additions_.empty(); }
  726. // Delete a WAL (either directly deleted or archived).
  727. // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
  728. void DeleteWalsBefore(WalNumber number) {
  729. assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
  730. wal_deletion_ = WalDeletion(number);
  731. }
  732. const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
  733. bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
  734. bool IsWalManipulation() const {
  735. size_t entries = NumEntries();
  736. return (entries > 0) && ((entries == wal_additions_.size()) ||
  737. (entries == !wal_deletion_.IsEmpty()));
  738. }
  739. // Number of edits
  740. size_t NumEntries() const {
  741. return new_files_.size() + deleted_files_.size() +
  742. blob_file_additions_.size() + blob_file_garbages_.size() +
  743. wal_additions_.size() + !wal_deletion_.IsEmpty();
  744. }
  745. void SetColumnFamily(uint32_t column_family_id) {
  746. column_family_ = column_family_id;
  747. }
  748. uint32_t GetColumnFamily() const { return column_family_; }
  749. const std::string& GetColumnFamilyName() const { return column_family_name_; }
  750. // set column family ID by calling SetColumnFamily()
  751. void AddColumnFamily(const std::string& name) {
  752. assert(!is_column_family_drop_);
  753. assert(!is_column_family_add_);
  754. assert(NumEntries() == 0);
  755. is_column_family_add_ = true;
  756. column_family_name_ = name;
  757. }
  758. // set column family ID by calling SetColumnFamily()
  759. void DropColumnFamily() {
  760. assert(!is_column_family_drop_);
  761. assert(!is_column_family_add_);
  762. assert(NumEntries() == 0);
  763. is_column_family_drop_ = true;
  764. }
  765. bool IsColumnFamilyManipulation() const {
  766. return is_column_family_add_ || is_column_family_drop_;
  767. }
  768. bool IsColumnFamilyAdd() const { return is_column_family_add_; }
  769. bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
  770. void MarkNoManifestWriteDummy() { is_no_manifest_write_dummy_ = true; }
  771. bool IsNoManifestWriteDummy() const { return is_no_manifest_write_dummy_; }
  772. void MarkAtomicGroup(uint32_t remaining_entries) {
  773. is_in_atomic_group_ = true;
  774. remaining_entries_ = remaining_entries;
  775. }
  776. bool IsInAtomicGroup() const { return is_in_atomic_group_; }
  777. void SetRemainingEntries(uint32_t remaining_entries) {
  778. remaining_entries_ = remaining_entries;
  779. }
  780. uint32_t GetRemainingEntries() const { return remaining_entries_; }
  781. bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); }
  782. const std::string& GetFullHistoryTsLow() const {
  783. assert(HasFullHistoryTsLow());
  784. return full_history_ts_low_;
  785. }
  786. void SetFullHistoryTsLow(std::string full_history_ts_low) {
  787. assert(!full_history_ts_low.empty());
  788. full_history_ts_low_ = std::move(full_history_ts_low);
  789. }
  790. void SetSubcompactionProgress(const SubcompactionProgress& progress) {
  791. has_subcompaction_progress_ = true;
  792. subcompaction_progress_ = progress;
  793. }
  794. bool HasSubcompactionProgress() const { return has_subcompaction_progress_; }
  795. const SubcompactionProgress& GetSubcompactionProgress() const {
  796. return subcompaction_progress_;
  797. }
  798. void ClearSubcompactionProgress() {
  799. has_subcompaction_progress_ = false;
  800. subcompaction_progress_.Clear();
  801. }
  802. // return true on success.
  803. // `ts_sz` is the size in bytes for the user-defined timestamp contained in
  804. // a user key. This argument is optional because it's only required for
  805. // encoding a `VersionEdit` with new SST files to add. It's used to handle the
  806. // file boundaries: `smallest`, `largest` when
  807. // `FileMetaData.user_defined_timestamps_persisted` is false. When reading
  808. // the Manifest file, a mirroring change needed to handle
  809. // file boundaries are not added to the `VersionEdit.DecodeFrom` function
  810. // because timestamp size is not available at `VersionEdit` decoding time,
  811. // it's instead added to `VersionEditHandler::OnNonCfOperation`.
  812. bool EncodeTo(std::string* dst,
  813. std::optional<size_t> ts_sz = std::nullopt) const;
  814. Status DecodeFrom(const Slice& src);
  815. const autovector<uint64_t>* GetFilesToQuarantineIfCommitFail() const {
  816. return &files_to_quarantine_;
  817. }
  818. std::string DebugString(bool hex_key = false) const;
  819. std::string DebugJSON(int edit_num, bool hex_key = false) const;
  820. private:
  821. // Decode level information from serialized VersionEdit data and and track the
  822. // maximum level seen.
  823. //
  824. // Parameters:
  825. // input: Pointer to serialized data slice
  826. // level: Output parameter for the decoded level value
  827. // max_level: get updated if the decoded level is higher than passed in
  828. // value
  829. //
  830. // Returns: true on successful decode, false on parse error
  831. static bool GetLevel(Slice* input, int* level, int& max_level);
  832. // Encode file boundaries `FileMetaData.smallest` and `FileMetaData.largest`.
  833. // User-defined timestamps in the user key will be stripped if they shouldn't
  834. // be persisted.
  835. static void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta,
  836. size_t ts_sz);
  837. int max_level_ = 0;
  838. std::string db_id_;
  839. std::string comparator_;
  840. uint64_t log_number_ = 0;
  841. uint64_t prev_log_number_ = 0;
  842. uint64_t next_file_number_ = 0;
  843. uint32_t max_column_family_ = 0;
  844. // The most recent WAL log number that is deleted
  845. uint64_t min_log_number_to_keep_ = 0;
  846. SequenceNumber last_sequence_ = 0;
  847. bool has_db_id_ = false;
  848. bool has_comparator_ = false;
  849. bool has_log_number_ = false;
  850. bool has_prev_log_number_ = false;
  851. bool has_next_file_number_ = false;
  852. bool has_max_column_family_ = false;
  853. bool has_min_log_number_to_keep_ = false;
  854. bool has_last_sequence_ = false;
  855. bool has_persist_user_defined_timestamps_ = false;
  856. // Compaction cursors for round-robin compaction policy
  857. CompactCursors compact_cursors_;
  858. DeletedFiles deleted_files_;
  859. NewFiles new_files_;
  860. BlobFileAdditions blob_file_additions_;
  861. BlobFileGarbages blob_file_garbages_;
  862. WalAdditions wal_additions_;
  863. WalDeletion wal_deletion_;
  864. // Each version edit record should have column_family_ set
  865. // If it's not set, it is default (0)
  866. uint32_t column_family_ = 0;
  867. // a version edit can be either column_family add or
  868. // column_family drop. If it's column family add,
  869. // it also includes column family name.
  870. bool is_column_family_drop_ = false;
  871. bool is_column_family_add_ = false;
  872. std::string column_family_name_;
  873. uint32_t remaining_entries_ = 0;
  874. bool is_in_atomic_group_ = false;
  875. bool is_no_manifest_write_dummy_ = false;
  876. std::string full_history_ts_low_;
  877. bool persist_user_defined_timestamps_ = true;
  878. bool has_subcompaction_progress_ = false;
  879. SubcompactionProgress subcompaction_progress_;
  880. // Newly created table files and blob files are eligible for deletion if they
  881. // are not registered as live files after the background jobs creating them
  882. // have finished. In case committing the VersionEdit containing such changes
  883. // to manifest encountered an error, we want to quarantine these files from
  884. // deletion to avoid prematurely deleting files that ended up getting recorded
  885. // in Manifest as live files.
  886. // Since table files and blob files share the same file number space, we just
  887. // record the file number here.
  888. autovector<uint64_t> files_to_quarantine_;
  889. };
  890. } // namespace ROCKSDB_NAMESPACE