compaction_picker_level.cc 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #include "db/compaction/compaction_picker_level.h"
  10. #include <string>
  11. #include <utility>
  12. #include <vector>
  13. #include "db/version_edit.h"
  14. #include "logging/log_buffer.h"
  15. #include "test_util/sync_point.h"
  16. namespace ROCKSDB_NAMESPACE {
  17. bool LevelCompactionPicker::NeedsCompaction(
  18. const VersionStorageInfo* vstorage) const {
  19. if (!vstorage->ExpiredTtlFiles().empty()) {
  20. return true;
  21. }
  22. if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
  23. return true;
  24. }
  25. if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
  26. return true;
  27. }
  28. if (!vstorage->FilesMarkedForCompaction().empty()) {
  29. return true;
  30. }
  31. if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
  32. return true;
  33. }
  34. for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
  35. if (vstorage->CompactionScore(i) >= 1) {
  36. return true;
  37. }
  38. }
  39. return false;
  40. }
  41. namespace {
  42. enum class CompactToNextLevel {
  43. kNo, // compact to the same level as the input file
  44. kYes, // compact to the next level except the last level to the same level
  45. kSkipLastLevel, // compact to the next level but skip the last level
  46. };
  47. // A class to build a leveled compaction step-by-step.
  48. class LevelCompactionBuilder {
  49. public:
  50. LevelCompactionBuilder(const std::string& cf_name,
  51. VersionStorageInfo* vstorage,
  52. CompactionPicker* compaction_picker,
  53. LogBuffer* log_buffer,
  54. const MutableCFOptions& mutable_cf_options,
  55. const ImmutableOptions& ioptions,
  56. const MutableDBOptions& mutable_db_options)
  57. : cf_name_(cf_name),
  58. vstorage_(vstorage),
  59. compaction_picker_(compaction_picker),
  60. log_buffer_(log_buffer),
  61. mutable_cf_options_(mutable_cf_options),
  62. ioptions_(ioptions),
  63. mutable_db_options_(mutable_db_options) {}
  64. // Pick and return a compaction.
  65. Compaction* PickCompaction();
  66. // Pick the initial files to compact to the next level. (or together
  67. // in Intra-L0 compactions)
  68. void SetupInitialFiles();
  69. // If the initial files are from L0 level, pick other L0
  70. // files if needed.
  71. bool SetupOtherL0FilesIfNeeded();
  72. // Compaction with round-robin compaction priority allows more files to be
  73. // picked to form a large compaction
  74. void SetupOtherFilesWithRoundRobinExpansion();
  75. // Based on initial files, setup other files need to be compacted
  76. // in this compaction, accordingly.
  77. bool SetupOtherInputsIfNeeded();
  78. Compaction* GetCompaction();
  79. // From `start_level_`, pick files to compact to `output_level_`.
  80. // Returns false if there is no file to compact.
  81. // If it returns true, inputs->files.size() will be exactly one for
  82. // all compaction priorities except round-robin. For round-robin,
  83. // multiple consecutive files may be put into inputs->files.
  84. // If level is 0 and there is already a compaction on that level, this
  85. // function will return false.
  86. bool PickFileToCompact();
  87. // Return true if a L0 trivial move is picked up.
  88. bool TryPickL0TrivialMove();
  89. // For L0->L0, picks the longest span of files that aren't currently
  90. // undergoing compaction for which work-per-deleted-file decreases. The span
  91. // always starts from the newest L0 file.
  92. //
  93. // Intra-L0 compaction is independent of all other files, so it can be
  94. // performed even when L0->base_level compactions are blocked.
  95. //
  96. // Returns true if `inputs` is populated with a span of files to be compacted;
  97. // otherwise, returns false.
  98. bool PickIntraL0Compaction();
  99. // When total L0 size is small compared to Lbase, try to pick intra-L0
  100. // compaction starting from the newest L0 file. This helps to prevent
  101. // L0->Lbase compaction with large write-amp.
  102. //
  103. // Returns true iff an intra-L0 compaction is picked.
  104. // `start_level_inputs_` and `output_level_` will be updated accordingly if
  105. // a compaction is picked.
  106. bool PickSizeBasedIntraL0Compaction();
  107. // Return true if TrivialMove is extended. `start_index` is the index of
  108. // the initial file picked, which should already be in `start_level_inputs_`.
  109. bool TryExtendNonL0TrivialMove(int start_index,
  110. bool only_expand_right = false);
  111. // Picks a file from level_files to compact.
  112. // level_files is a vector of (level, file metadata) in ascending order of
  113. // level. If compact_to_next_level is true, compact the file to the next
  114. // level, otherwise, compact to the same level as the input file.
  115. // If skip_last_level is true, skip the last level.
  116. void PickFileToCompact(
  117. const autovector<std::pair<int, FileMetaData*>>& level_files,
  118. CompactToNextLevel compact_to_next_level);
  119. const std::string& cf_name_;
  120. VersionStorageInfo* vstorage_;
  121. CompactionPicker* compaction_picker_;
  122. LogBuffer* log_buffer_;
  123. int start_level_ = -1;
  124. int output_level_ = -1;
  125. int parent_index_ = -1;
  126. int base_index_ = -1;
  127. double start_level_score_ = 0;
  128. bool is_l0_trivial_move_ = false;
  129. CompactionInputFiles start_level_inputs_;
  130. std::vector<CompactionInputFiles> compaction_inputs_;
  131. CompactionInputFiles output_level_inputs_;
  132. std::vector<FileMetaData*> grandparents_;
  133. CompactionReason compaction_reason_ = CompactionReason::kUnknown;
  134. const MutableCFOptions& mutable_cf_options_;
  135. const ImmutableOptions& ioptions_;
  136. const MutableDBOptions& mutable_db_options_;
  137. // Pick a path ID to place a newly generated file, with its level
  138. static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
  139. const MutableCFOptions& mutable_cf_options,
  140. int level);
  141. static const int kMinFilesForIntraL0Compaction = 4;
  142. };
  143. void LevelCompactionBuilder::PickFileToCompact(
  144. const autovector<std::pair<int, FileMetaData*>>& level_files,
  145. CompactToNextLevel compact_to_next_level) {
  146. for (auto& level_file : level_files) {
  147. // If it's being compacted it has nothing to do here.
  148. // If this assert() fails that means that some function marked some
  149. // files as being_compacted, but didn't call ComputeCompactionScore()
  150. assert(!level_file.second->being_compacted);
  151. start_level_ = level_file.first;
  152. if ((compact_to_next_level == CompactToNextLevel::kSkipLastLevel &&
  153. start_level_ == vstorage_->num_non_empty_levels() - 1) ||
  154. (start_level_ == 0 &&
  155. !compaction_picker_->level0_compactions_in_progress()->empty())) {
  156. continue;
  157. }
  158. // Compact to the next level only if the file is not in the last level and
  159. // compact_to_next_level is kYes or kSkipLastLevel.
  160. if (compact_to_next_level != CompactToNextLevel::kNo &&
  161. (start_level_ < vstorage_->num_non_empty_levels() - 1)) {
  162. output_level_ =
  163. (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
  164. } else {
  165. output_level_ = start_level_;
  166. }
  167. start_level_inputs_.files = {level_file.second};
  168. start_level_inputs_.level = start_level_;
  169. if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
  170. &start_level_inputs_)) {
  171. return;
  172. }
  173. }
  174. start_level_inputs_.files.clear();
  175. }
  176. void LevelCompactionBuilder::SetupInitialFiles() {
  177. // Find the compactions by size on all levels.
  178. bool skipped_l0_to_base = false;
  179. for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
  180. start_level_score_ = vstorage_->CompactionScore(i);
  181. start_level_ = vstorage_->CompactionScoreLevel(i);
  182. assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
  183. if (start_level_score_ >= 1) {
  184. if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
  185. // If L0->base_level compaction is pending, don't schedule further
  186. // compaction from base level. Otherwise L0->base_level compaction
  187. // may starve.
  188. continue;
  189. }
  190. output_level_ =
  191. (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
  192. bool picked_file_to_compact = PickFileToCompact();
  193. TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact",
  194. &picked_file_to_compact);
  195. if (picked_file_to_compact) {
  196. // found the compaction!
  197. if (start_level_ == 0) {
  198. // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
  199. compaction_reason_ = CompactionReason::kLevelL0FilesNum;
  200. } else {
  201. // L1+ score = `Level files size` / `MaxBytesForLevel`
  202. compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
  203. }
  204. break;
  205. } else {
  206. // didn't find the compaction, clear the inputs
  207. start_level_inputs_.clear();
  208. if (start_level_ == 0) {
  209. skipped_l0_to_base = true;
  210. // L0->base_level may be blocked due to ongoing L0->base_level
  211. // compactions. It may also be blocked by an ongoing compaction from
  212. // base_level downwards.
  213. //
  214. // In these cases, to reduce L0 file count and thus reduce likelihood
  215. // of write stalls, we can attempt compacting a span of files within
  216. // L0.
  217. if (PickIntraL0Compaction()) {
  218. output_level_ = 0;
  219. compaction_reason_ = CompactionReason::kLevelL0FilesNum;
  220. break;
  221. }
  222. }
  223. }
  224. } else {
  225. // Compaction scores are sorted in descending order, no further scores
  226. // will be >= 1.
  227. break;
  228. }
  229. }
  230. if (!start_level_inputs_.empty()) {
  231. return;
  232. }
  233. // if we didn't find a compaction, check if there are any files marked for
  234. // compaction
  235. parent_index_ = base_index_ = -1;
  236. compaction_picker_->PickFilesMarkedForCompaction(
  237. cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_,
  238. /*skip_marked_file*/ [](const FileMetaData* /* file */) {
  239. return false;
  240. });
  241. if (!start_level_inputs_.empty()) {
  242. compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
  243. return;
  244. }
  245. // Bottommost Files Compaction on deleting tombstones
  246. PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(),
  247. CompactToNextLevel::kNo);
  248. if (!start_level_inputs_.empty()) {
  249. compaction_reason_ = CompactionReason::kBottommostFiles;
  250. return;
  251. }
  252. // TTL Compaction
  253. if (ioptions_.compaction_pri == kRoundRobin &&
  254. !vstorage_->ExpiredTtlFiles().empty()) {
  255. auto expired_files = vstorage_->ExpiredTtlFiles();
  256. // the expired files list should already be sorted by level
  257. start_level_ = expired_files.front().first;
  258. #ifndef NDEBUG
  259. for (const auto& file : expired_files) {
  260. assert(start_level_ <= file.first);
  261. }
  262. #endif
  263. if (start_level_ > 0) {
  264. output_level_ = start_level_ + 1;
  265. if (PickFileToCompact()) {
  266. compaction_reason_ = CompactionReason::kRoundRobinTtl;
  267. return;
  268. }
  269. }
  270. }
  271. PickFileToCompact(vstorage_->ExpiredTtlFiles(),
  272. CompactToNextLevel::kSkipLastLevel);
  273. if (!start_level_inputs_.empty()) {
  274. compaction_reason_ = CompactionReason::kTtl;
  275. return;
  276. }
  277. // Periodic Compaction
  278. PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(),
  279. ioptions_.level_compaction_dynamic_level_bytes
  280. ? CompactToNextLevel::kYes
  281. : CompactToNextLevel::kNo);
  282. if (!start_level_inputs_.empty()) {
  283. compaction_reason_ = CompactionReason::kPeriodicCompaction;
  284. return;
  285. }
  286. // Forced blob garbage collection
  287. PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(),
  288. CompactToNextLevel::kNo);
  289. if (!start_level_inputs_.empty()) {
  290. compaction_reason_ = CompactionReason::kForcedBlobGC;
  291. return;
  292. }
  293. }
  294. bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
  295. if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
  296. return compaction_picker_->GetOverlappingL0Files(
  297. vstorage_, &start_level_inputs_, output_level_, &parent_index_);
  298. }
  299. return true;
  300. }
  301. void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
  302. // We only expand when the start level is not L0 under round robin
  303. assert(start_level_ >= 1);
  304. // For round-robin compaction priority, we have 3 constraints when picking
  305. // multiple files.
  306. // Constraint 1: We can only pick consecutive files
  307. // -> Constraint 1a: When a file is being compacted (or some input files
  308. // are being compacted after expanding, we cannot
  309. // choose it and have to stop choosing more files
  310. // -> Constraint 1b: When we reach the last file (with largest keys), we
  311. // cannot choose more files (the next file will be the
  312. // first one)
  313. // Constraint 2: We should ensure the total compaction bytes (including the
  314. // overlapped files from the next level) is no more than
  315. // mutable_cf_options_.max_compaction_bytes
  316. // Constraint 3: We try our best to pick as many files as possible so that
  317. // the post-compaction level size is less than
  318. // MaxBytesForLevel(start_level_)
  319. // Constraint 4: We do not expand if it is possible to apply a trivial move
  320. // Constraint 5 (TODO): Try to pick minimal files to split into the target
  321. // number of subcompactions
  322. TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
  323. // Only expand the inputs when we have selected a file in start_level_inputs_
  324. if (start_level_inputs_.size() == 0) {
  325. return;
  326. }
  327. uint64_t start_lvl_bytes_no_compacting = 0;
  328. uint64_t curr_bytes_to_compact = 0;
  329. uint64_t start_lvl_max_bytes_to_compact = 0;
  330. const std::vector<FileMetaData*>& level_files =
  331. vstorage_->LevelFiles(start_level_);
  332. // Constraint 3 (pre-calculate the ideal max bytes to compact)
  333. for (auto f : level_files) {
  334. if (!f->being_compacted) {
  335. start_lvl_bytes_no_compacting += f->fd.GetFileSize();
  336. }
  337. }
  338. if (start_lvl_bytes_no_compacting >
  339. vstorage_->MaxBytesForLevel(start_level_)) {
  340. start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
  341. vstorage_->MaxBytesForLevel(start_level_);
  342. }
  343. size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
  344. InternalKey smallest, largest;
  345. // Constraint 4 (No need to check again later)
  346. compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
  347. CompactionInputFiles output_level_inputs;
  348. output_level_inputs.level = output_level_;
  349. vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
  350. &output_level_inputs.files);
  351. if (output_level_inputs.empty()) {
  352. if (TryExtendNonL0TrivialMove((int)start_index,
  353. true /* only_expand_right */)) {
  354. return;
  355. }
  356. }
  357. // Constraint 3
  358. if (start_level_inputs_[0]->fd.GetFileSize() >=
  359. start_lvl_max_bytes_to_compact) {
  360. return;
  361. }
  362. CompactionInputFiles tmp_start_level_inputs;
  363. tmp_start_level_inputs = start_level_inputs_;
  364. // TODO (zichen): Future parallel round-robin may also need to update this
  365. // Constraint 1b (only expand till the end)
  366. for (size_t i = start_index + 1; i < level_files.size(); i++) {
  367. auto* f = level_files[i];
  368. if (f->being_compacted) {
  369. // Constraint 1a
  370. return;
  371. }
  372. tmp_start_level_inputs.files.push_back(f);
  373. if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
  374. &tmp_start_level_inputs) ||
  375. compaction_picker_->FilesRangeOverlapWithCompaction(
  376. {tmp_start_level_inputs}, output_level_,
  377. Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
  378. ioptions_, start_level_,
  379. output_level_))) {
  380. // Constraint 1a
  381. tmp_start_level_inputs.clear();
  382. return;
  383. }
  384. curr_bytes_to_compact = 0;
  385. for (auto start_lvl_f : tmp_start_level_inputs.files) {
  386. curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
  387. }
  388. // Check whether any output level files are locked
  389. compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
  390. vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
  391. &output_level_inputs.files);
  392. if (!output_level_inputs.empty() &&
  393. !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
  394. &output_level_inputs)) {
  395. // Constraint 1a
  396. tmp_start_level_inputs.clear();
  397. return;
  398. }
  399. uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
  400. for (auto output_lvl_f : output_level_inputs.files) {
  401. curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
  402. }
  403. if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
  404. // Constraint 2
  405. tmp_start_level_inputs.clear();
  406. return;
  407. }
  408. start_level_inputs_.files = tmp_start_level_inputs.files;
  409. // Constraint 3
  410. if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
  411. return;
  412. }
  413. }
  414. }
  415. bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
  416. // Setup input files from output level. For output to L0, we only compact
  417. // spans of files that do not interact with any pending compactions, so don't
  418. // need to consider other levels.
  419. if (output_level_ != 0) {
  420. output_level_inputs_.level = output_level_;
  421. bool round_robin_expanding =
  422. ioptions_.compaction_pri == kRoundRobin &&
  423. compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
  424. if (round_robin_expanding) {
  425. SetupOtherFilesWithRoundRobinExpansion();
  426. }
  427. if (!is_l0_trivial_move_ &&
  428. !compaction_picker_->SetupOtherInputs(
  429. cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
  430. &output_level_inputs_, &parent_index_, base_index_,
  431. round_robin_expanding)) {
  432. return false;
  433. }
  434. compaction_inputs_.push_back(start_level_inputs_);
  435. if (!output_level_inputs_.empty()) {
  436. compaction_inputs_.push_back(output_level_inputs_);
  437. }
  438. // In some edge cases we could pick a compaction that will be compacting
  439. // a key range that overlap with another running compaction, and both
  440. // of them have the same output level. This could happen if
  441. // (1) we are running a non-exclusive manual compaction
  442. // (2) AddFile ingest a new file into the LSM tree
  443. // We need to disallow this from happening.
  444. if (compaction_picker_->FilesRangeOverlapWithCompaction(
  445. compaction_inputs_, output_level_,
  446. Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
  447. ioptions_, start_level_,
  448. output_level_))) {
  449. // This compaction output could potentially conflict with the output
  450. // of a currently running compaction, we cannot run it.
  451. return false;
  452. }
  453. if (!is_l0_trivial_move_) {
  454. compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
  455. output_level_inputs_, &grandparents_);
  456. }
  457. } else {
  458. compaction_inputs_.push_back(start_level_inputs_);
  459. }
  460. return true;
  461. }
  462. Compaction* LevelCompactionBuilder::PickCompaction() {
  463. // Pick up the first file to start compaction. It may have been extended
  464. // to a clean cut.
  465. SetupInitialFiles();
  466. if (start_level_inputs_.empty()) {
  467. return nullptr;
  468. }
  469. assert(start_level_ >= 0 && output_level_ >= 0);
  470. // If it is a L0 -> base level compaction, we need to set up other L0
  471. // files if needed.
  472. if (!SetupOtherL0FilesIfNeeded()) {
  473. return nullptr;
  474. }
  475. // Pick files in the output level and expand more files in the start level
  476. // if needed.
  477. if (!SetupOtherInputsIfNeeded()) {
  478. return nullptr;
  479. }
  480. // Form a compaction object containing the files we picked.
  481. Compaction* c = GetCompaction();
  482. TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
  483. return c;
  484. }
  485. Compaction* LevelCompactionBuilder::GetCompaction() {
  486. // TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
  487. // empty output level. So L0 files is picked in PickFileToCompact() by
  488. // compaction score. We may still be able to do trivial move when this file
  489. // does not overlap with other L0s. This happens when
  490. // compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
  491. // pull in more L0s.
  492. assert(!compaction_inputs_.empty());
  493. bool l0_files_might_overlap =
  494. start_level_ == 0 && !is_l0_trivial_move_ &&
  495. (compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
  496. auto c = new Compaction(
  497. vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
  498. std::move(compaction_inputs_), output_level_,
  499. MaxFileSizeForLevel(mutable_cf_options_, output_level_,
  500. ioptions_.compaction_style, vstorage_->base_level(),
  501. ioptions_.level_compaction_dynamic_level_bytes),
  502. mutable_cf_options_.max_compaction_bytes,
  503. GetPathId(ioptions_, mutable_cf_options_, output_level_),
  504. GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
  505. vstorage_->base_level()),
  506. GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
  507. Temperature::kUnknown,
  508. /* max_subcompactions */ 0, std::move(grandparents_),
  509. /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
  510. compaction_reason_,
  511. /* trim_ts */ "", start_level_score_, l0_files_might_overlap);
  512. // If it's level 0 compaction, make sure we don't execute any other level 0
  513. // compactions in parallel
  514. compaction_picker_->RegisterCompaction(c);
  515. // Creating a compaction influences the compaction score because the score
  516. // takes running compactions into account (by skipping files that are already
  517. // being compacted). Since we just changed compaction score, we recalculate it
  518. // here
  519. vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
  520. return c;
  521. }
  522. /*
  523. * Find the optimal path to place a file
  524. * Given a level, finds the path where levels up to it will fit in levels
  525. * up to and including this path
  526. */
  527. uint32_t LevelCompactionBuilder::GetPathId(
  528. const ImmutableCFOptions& ioptions,
  529. const MutableCFOptions& mutable_cf_options, int level) {
  530. uint32_t p = 0;
  531. assert(!ioptions.cf_paths.empty());
  532. // size remaining in the most recent path
  533. uint64_t current_path_size = ioptions.cf_paths[0].target_size;
  534. uint64_t level_size;
  535. int cur_level = 0;
  536. // max_bytes_for_level_base denotes L1 size.
  537. // We estimate L0 size to be the same as L1.
  538. level_size = mutable_cf_options.max_bytes_for_level_base;
  539. // Last path is the fallback
  540. while (p < ioptions.cf_paths.size() - 1) {
  541. if (level_size <= current_path_size) {
  542. if (cur_level == level) {
  543. // Does desired level fit in this path?
  544. return p;
  545. } else {
  546. current_path_size -= level_size;
  547. if (cur_level > 0) {
  548. if (ioptions.level_compaction_dynamic_level_bytes) {
  549. // Currently, level_compaction_dynamic_level_bytes is ignored when
  550. // multiple db paths are specified. https://github.com/facebook/
  551. // rocksdb/blob/main/db/column_family.cc.
  552. // Still, adding this check to avoid accidentally using
  553. // max_bytes_for_level_multiplier_additional
  554. level_size = static_cast<uint64_t>(
  555. level_size * mutable_cf_options.max_bytes_for_level_multiplier);
  556. } else {
  557. level_size = static_cast<uint64_t>(
  558. level_size * mutable_cf_options.max_bytes_for_level_multiplier *
  559. mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
  560. }
  561. }
  562. cur_level++;
  563. continue;
  564. }
  565. }
  566. p++;
  567. current_path_size = ioptions.cf_paths[p].target_size;
  568. }
  569. return p;
  570. }
  571. bool LevelCompactionBuilder::TryPickL0TrivialMove() {
  572. if (vstorage_->base_level() <= 0) {
  573. return false;
  574. }
  575. if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
  576. !vstorage_->LevelFiles(output_level_).empty() &&
  577. ioptions_.db_paths.size() <= 1) {
  578. // Try to pick trivial move from L0 to L1. We start from the oldest
  579. // file. We keep expanding to newer files if it would form a
  580. // trivial move.
  581. // For now we don't support it with
  582. // mutable_cf_options_.compression_per_level to prevent the logic
  583. // of determining whether L0 can be trivial moved to the next level.
  584. // We skip the case where output level is empty, since in this case, at
  585. // least the oldest file would qualify for trivial move, and this would
  586. // be a surprising behavior with few benefits.
  587. // We search from the oldest file from the newest. In theory, there are
  588. // files in the middle can form trivial move too, but it is probably
  589. // uncommon and we ignore these cases for simplicity.
  590. const std::vector<FileMetaData*>& level_files =
  591. vstorage_->LevelFiles(start_level_);
  592. InternalKey my_smallest, my_largest;
  593. for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
  594. CompactionInputFiles output_level_inputs;
  595. output_level_inputs.level = output_level_;
  596. FileMetaData* file = *it;
  597. if (it == level_files.rbegin()) {
  598. my_smallest = file->smallest;
  599. my_largest = file->largest;
  600. } else {
  601. if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
  602. 0) {
  603. my_smallest = file->smallest;
  604. } else if (compaction_picker_->icmp()->Compare(file->smallest,
  605. my_largest) > 0) {
  606. my_largest = file->largest;
  607. } else {
  608. break;
  609. }
  610. }
  611. vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
  612. &output_level_inputs.files);
  613. if (output_level_inputs.empty()) {
  614. assert(!file->being_compacted);
  615. start_level_inputs_.files.push_back(file);
  616. } else {
  617. break;
  618. }
  619. }
  620. }
  621. if (!start_level_inputs_.empty()) {
  622. // Sort files by key range. Not sure it's 100% necessary but it's cleaner
  623. // to always keep files sorted by key the key ranges don't overlap.
  624. std::sort(start_level_inputs_.files.begin(),
  625. start_level_inputs_.files.end(),
  626. [icmp = compaction_picker_->icmp()](FileMetaData* f1,
  627. FileMetaData* f2) -> bool {
  628. return (icmp->Compare(f1->smallest, f2->smallest) < 0);
  629. });
  630. is_l0_trivial_move_ = true;
  631. return true;
  632. }
  633. return false;
  634. }
  635. bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
  636. bool only_expand_right) {
  637. if (start_level_inputs_.size() == 1 &&
  638. (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
  639. (mutable_cf_options_.compression_per_level.empty())) {
  640. // Only file of `index`, and it is likely a trivial move. Try to
  641. // expand if it is still a trivial move, but not beyond
  642. // max_compaction_bytes or 4 files, so that we don't create too
  643. // much compaction pressure for the next level.
  644. // Ignore if there are more than one DB path, as it would be hard
  645. // to predict whether it is a trivial move.
  646. const std::vector<FileMetaData*>& level_files =
  647. vstorage_->LevelFiles(start_level_);
  648. const size_t kMaxMultiTrivialMove = 4;
  649. FileMetaData* initial_file = start_level_inputs_.files[0];
  650. size_t total_size = initial_file->fd.GetFileSize();
  651. CompactionInputFiles output_level_inputs;
  652. output_level_inputs.level = output_level_;
  653. // Expand towards right
  654. for (int i = start_index + 1;
  655. i < static_cast<int>(level_files.size()) &&
  656. start_level_inputs_.size() < kMaxMultiTrivialMove;
  657. i++) {
  658. FileMetaData* next_file = level_files[i];
  659. if (next_file->being_compacted) {
  660. break;
  661. }
  662. vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
  663. &(next_file->largest),
  664. &output_level_inputs.files);
  665. if (!output_level_inputs.empty()) {
  666. break;
  667. }
  668. if (i < static_cast<int>(level_files.size()) - 1 &&
  669. compaction_picker_->icmp()
  670. ->user_comparator()
  671. ->CompareWithoutTimestamp(
  672. next_file->largest.user_key(),
  673. level_files[i + 1]->smallest.user_key()) == 0) {
  674. TEST_SYNC_POINT_CALLBACK(
  675. "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
  676. nullptr);
  677. // Not a clean up after adding the next file. Skip.
  678. break;
  679. }
  680. total_size += next_file->fd.GetFileSize();
  681. if (total_size > mutable_cf_options_.max_compaction_bytes) {
  682. break;
  683. }
  684. start_level_inputs_.files.push_back(next_file);
  685. }
  686. // Expand towards left
  687. if (!only_expand_right) {
  688. for (int i = start_index - 1;
  689. i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
  690. FileMetaData* next_file = level_files[i];
  691. if (next_file->being_compacted) {
  692. break;
  693. }
  694. vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
  695. &(initial_file->largest),
  696. &output_level_inputs.files);
  697. if (!output_level_inputs.empty()) {
  698. break;
  699. }
  700. if (i > 0 && compaction_picker_->icmp()
  701. ->user_comparator()
  702. ->CompareWithoutTimestamp(
  703. next_file->smallest.user_key(),
  704. level_files[i - 1]->largest.user_key()) == 0) {
  705. // Not a clean up after adding the next file. Skip.
  706. break;
  707. }
  708. total_size += next_file->fd.GetFileSize();
  709. if (total_size > mutable_cf_options_.max_compaction_bytes) {
  710. break;
  711. }
  712. // keep `files` sorted in increasing order by key range
  713. start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
  714. next_file);
  715. }
  716. }
  717. return start_level_inputs_.size() > 1;
  718. }
  719. return false;
  720. }
  721. bool LevelCompactionBuilder::PickFileToCompact() {
  722. // level 0 files are overlapping. So we cannot pick more
  723. // than one concurrent compactions at this level. This
  724. // could be made better by looking at key-ranges that are
  725. // being compacted at level 0.
  726. if (start_level_ == 0 &&
  727. !compaction_picker_->level0_compactions_in_progress()->empty()) {
  728. if (PickSizeBasedIntraL0Compaction()) {
  729. return true;
  730. }
  731. TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
  732. return false;
  733. }
  734. start_level_inputs_.clear();
  735. start_level_inputs_.level = start_level_;
  736. assert(start_level_ >= 0);
  737. if (TryPickL0TrivialMove()) {
  738. return true;
  739. }
  740. if (start_level_ == 0 && PickSizeBasedIntraL0Compaction()) {
  741. return true;
  742. }
  743. const std::vector<FileMetaData*>& level_files =
  744. vstorage_->LevelFiles(start_level_);
  745. // Pick the file with the highest score in this level that is not already
  746. // being compacted.
  747. const std::vector<int>& file_scores =
  748. vstorage_->FilesByCompactionPri(start_level_);
  749. unsigned int cmp_idx;
  750. for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
  751. cmp_idx < file_scores.size(); cmp_idx++) {
  752. int index = file_scores[cmp_idx];
  753. auto* f = level_files[index];
  754. // do not pick a file to compact if it is being compacted
  755. // from n-1 level.
  756. if (f->being_compacted) {
  757. if (ioptions_.compaction_pri == kRoundRobin) {
  758. // TODO(zichen): this file may be involved in one compaction from
  759. // an upper level, cannot advance the cursor for round-robin policy.
  760. // Currently, we do not pick any file to compact in this case. We
  761. // should fix this later to ensure a compaction is picked but the
  762. // cursor shall not be advanced.
  763. return false;
  764. }
  765. continue;
  766. }
  767. start_level_inputs_.files.push_back(f);
  768. if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
  769. &start_level_inputs_) ||
  770. compaction_picker_->FilesRangeOverlapWithCompaction(
  771. {start_level_inputs_}, output_level_,
  772. Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
  773. ioptions_, start_level_,
  774. output_level_))) {
  775. // A locked (pending compaction) input-level file was pulled in due to
  776. // user-key overlap.
  777. start_level_inputs_.clear();
  778. if (ioptions_.compaction_pri == kRoundRobin) {
  779. return false;
  780. }
  781. continue;
  782. }
  783. // Now that input level is fully expanded, we check whether any output
  784. // files are locked due to pending compaction.
  785. //
  786. // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
  787. // level files are locked, not just the extra ones pulled in for user-key
  788. // overlap.
  789. InternalKey smallest, largest;
  790. compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
  791. CompactionInputFiles output_level_inputs;
  792. output_level_inputs.level = output_level_;
  793. vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
  794. &output_level_inputs.files);
  795. if (output_level_inputs.empty()) {
  796. if (start_level_ > 0 &&
  797. TryExtendNonL0TrivialMove(index,
  798. ioptions_.compaction_pri ==
  799. kRoundRobin /* only_expand_right */)) {
  800. break;
  801. }
  802. } else {
  803. if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
  804. &output_level_inputs)) {
  805. start_level_inputs_.clear();
  806. if (ioptions_.compaction_pri == kRoundRobin) {
  807. return false;
  808. }
  809. continue;
  810. }
  811. }
  812. base_index_ = index;
  813. break;
  814. }
  815. // store where to start the iteration in the next call to PickCompaction
  816. if (ioptions_.compaction_pri != kRoundRobin) {
  817. vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
  818. }
  819. return start_level_inputs_.size() > 0;
  820. }
  821. bool LevelCompactionBuilder::PickIntraL0Compaction() {
  822. start_level_inputs_.clear();
  823. const std::vector<FileMetaData*>& level_files =
  824. vstorage_->LevelFiles(0 /* level */);
  825. if (level_files.size() <
  826. static_cast<size_t>(
  827. mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
  828. level_files[0]->being_compacted) {
  829. // If L0 isn't accumulating much files beyond the regular trigger, don't
  830. // resort to L0->L0 compaction yet.
  831. return false;
  832. }
  833. return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
  834. std::numeric_limits<uint64_t>::max(),
  835. mutable_cf_options_.max_compaction_bytes,
  836. &start_level_inputs_);
  837. }
  838. bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
  839. assert(start_level_ == 0);
  840. int base_level = vstorage_->base_level();
  841. if (base_level <= 0) {
  842. return false;
  843. }
  844. const std::vector<FileMetaData*>& l0_files =
  845. vstorage_->LevelFiles(/*level=*/0);
  846. size_t min_num_file =
  847. std::max(2, mutable_cf_options_.level0_file_num_compaction_trigger);
  848. if (l0_files.size() < min_num_file) {
  849. return false;
  850. }
  851. uint64_t l0_size = 0;
  852. for (const auto& file : l0_files) {
  853. assert(file->compensated_file_size >= file->fd.GetFileSize());
  854. // Compact down L0s with more deletions.
  855. l0_size += file->compensated_file_size;
  856. }
  857. // Avoid L0->Lbase compactions that are inefficient for write-amp.
  858. const double kMultiplier =
  859. std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
  860. const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
  861. assert(min_lbase_size >= l0_size);
  862. const std::vector<FileMetaData*>& lbase_files =
  863. vstorage_->LevelFiles(/*level=*/base_level);
  864. uint64_t lbase_size = 0;
  865. for (const auto& file : lbase_files) {
  866. lbase_size += file->fd.GetFileSize();
  867. if (lbase_size > min_lbase_size) {
  868. break;
  869. }
  870. }
  871. if (lbase_size <= min_lbase_size) {
  872. return false;
  873. }
  874. start_level_inputs_.clear();
  875. start_level_inputs_.level = 0;
  876. for (const auto& file : l0_files) {
  877. if (file->being_compacted) {
  878. break;
  879. }
  880. start_level_inputs_.files.push_back(file);
  881. }
  882. if (start_level_inputs_.files.size() < min_num_file) {
  883. start_level_inputs_.clear();
  884. return false;
  885. }
  886. output_level_ = 0;
  887. return true /* picked an intra-L0 compaction */;
  888. }
  889. } // namespace
  890. Compaction* LevelCompactionPicker::PickCompaction(
  891. const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
  892. const MutableDBOptions& mutable_db_options,
  893. const std::vector<SequenceNumber>& /*existing_snapshots */,
  894. const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
  895. LogBuffer* log_buffer, bool /* require_max_output_level*/) {
  896. LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
  897. mutable_cf_options, ioptions_,
  898. mutable_db_options);
  899. return builder.PickCompaction();
  900. }
  901. } // namespace ROCKSDB_NAMESPACE