compaction_picker_universal.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/compaction/compaction_picker_universal.h"
#ifndef ROCKSDB_LITE

#include <cinttypes>
#include <limits>
#include <queue>
#include <string>
#include <utility>

#include "db/column_family.h"
#include "file/filename.h"
#include "logging/log_buffer.h"
#include "monitoring/statistics.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {
namespace {
// A helper class that forms universal compactions. The class is used by
// UniversalCompactionPicker::PickCompaction().
// Typical usage: construct the builder, then obtain the compaction object
// by calling PickCompaction().
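//
// A minimal usage sketch (illustrative only; scheduling and error handling
// are omitted, and `ioptions`, `vstorage`, etc. stand for objects the
// caller already holds):
//
//   UniversalCompactionBuilder builder(ioptions, icmp, cf_name,
//                                      mutable_cf_options, vstorage,
//                                      picker, log_buffer);
//   Compaction* c = builder.PickCompaction();  // the caller owns `c`
//   if (c != nullptr) {
//     // run the compaction, then delete it
//   }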
class UniversalCompactionBuilder {
 public:
  UniversalCompactionBuilder(const ImmutableCFOptions& ioptions,
                             const InternalKeyComparator* icmp,
                             const std::string& cf_name,
                             const MutableCFOptions& mutable_cf_options,
                             VersionStorageInfo* vstorage,
                             UniversalCompactionPicker* picker,
                             LogBuffer* log_buffer)
      : ioptions_(ioptions),
        icmp_(icmp),
        cf_name_(cf_name),
        mutable_cf_options_(mutable_cf_options),
        vstorage_(vstorage),
        picker_(picker),
        log_buffer_(log_buffer) {}

  // Form and return the compaction object. The caller owns the returned
  // object.
  Compaction* PickCompaction();
 private:
  struct SortedRun {
    SortedRun(int _level, FileMetaData* _file, uint64_t _size,
              uint64_t _compensated_file_size, bool _being_compacted)
        : level(_level),
          file(_file),
          size(_size),
          compensated_file_size(_compensated_file_size),
          being_compacted(_being_compacted) {
      assert(compensated_file_size > 0);
      assert(level != 0 || file != nullptr);
    }

    void Dump(char* out_buf, size_t out_buf_size,
              bool print_path = false) const;

    // sorted_run_count is added into the string to print
    void DumpSizeInfo(char* out_buf, size_t out_buf_size,
                      size_t sorted_run_count) const;

    int level;
    // `file` will be null for level > 0. For level = 0, the sorted run
    // consists of just this one file.
    FileMetaData* file;
    // For level > 0, `size` and `compensated_file_size` are the sums of the
    // corresponding sizes of all files in the level. `being_compacted`
    // should be the same for all files in a non-zero level, so any file's
    // value can be used here.
    uint64_t size;
    uint64_t compensated_file_size;
    bool being_compacted;
  };
  // Pick universal compaction to limit read amplification
  Compaction* PickCompactionToReduceSortedRuns(
      unsigned int ratio, unsigned int max_number_of_files_to_compact);

  // Pick universal compaction to limit space amplification.
  Compaction* PickCompactionToReduceSizeAmp();

  Compaction* PickDeleteTriggeredCompaction();

  // Form a compaction from the sorted run indicated by start_index to the
  // oldest sorted run.
  // The caller is responsible for making sure that those files are not in
  // compaction.
  Compaction* PickCompactionToOldest(size_t start_index,
                                     CompactionReason compaction_reason);

  // Try to pick a periodic compaction. The caller should only call it
  // if there is at least one file marked for periodic compaction.
  // null will be returned if no such compaction can be formed
  // because some files are being compacted.
  Compaction* PickPeriodicCompaction();

  // Used in universal compaction when the allow_trivial_move
  // option is set. Checks whether there are any overlapping files
  // in the input. Returns true if the input files are
  // non-overlapping.
  bool IsInputFilesNonOverlapping(Compaction* c);

  const ImmutableCFOptions& ioptions_;
  const InternalKeyComparator* icmp_;
  double score_;
  std::vector<SortedRun> sorted_runs_;
  const std::string& cf_name_;
  const MutableCFOptions& mutable_cf_options_;
  VersionStorageInfo* vstorage_;
  UniversalCompactionPicker* picker_;
  LogBuffer* log_buffer_;

  static std::vector<SortedRun> CalculateSortedRuns(
      const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions,
      const MutableCFOptions& mutable_cf_options);

  // Pick a path ID to place a newly generated file, with its estimated file
  // size.
  static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
                            const MutableCFOptions& mutable_cf_options,
                            uint64_t file_size);
};
// Used in universal compaction when trivial move is enabled.
// This structure is the element type of the min-heap constructed below;
// it holds the file metadata, the level of the file, and the index of
// the file within that level.
struct InputFileInfo {
  InputFileInfo() : f(nullptr), level(0), index(0) {}

  FileMetaData* f;
  size_t level;
  size_t index;
};

// Used in universal compaction when trivial move is enabled.
// This comparator is used to build a min-heap ordered by the smallest
// key of each file.
struct SmallestKeyHeapComparator {
  explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }

  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
    return (ucmp_->Compare(i1.f->smallest.user_key(),
                           i2.f->smallest.user_key()) > 0);
  }

 private:
  const Comparator* ucmp_;
};

typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
                            SmallestKeyHeapComparator>
    SmallestKeyHeap;
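
// Note: std::priority_queue is a max-heap with respect to its comparator,
// so SmallestKeyHeapComparator returns "greater than" to invert the order
// and make top() yield the file with the smallest `smallest` user key.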
// This function creates the heap that is used to determine whether the
// input files are overlapping during universal compaction, when
// allow_trivial_move is set.
SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
  SmallestKeyHeap smallest_key_priority_q =
      SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));

  InputFileInfo input_file;

  for (size_t l = 0; l < c->num_input_levels(); l++) {
    if (c->num_input_files(l) != 0) {
      if (l == 0 && c->start_level() == 0) {
        for (size_t i = 0; i < c->num_input_files(0); i++) {
          input_file.f = c->input(0, i);
          input_file.level = 0;
          input_file.index = i;
          smallest_key_priority_q.push(std::move(input_file));
        }
      } else {
        input_file.f = c->input(l, 0);
        input_file.level = l;
        input_file.index = 0;
        smallest_key_priority_q.push(std::move(input_file));
      }
    }
  }
  return smallest_key_priority_q;
}
#ifndef NDEBUG
// smallest_seqno and largest_seqno are set iff `files` is not empty.
void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
                             SequenceNumber* smallest_seqno,
                             SequenceNumber* largest_seqno) {
  bool is_first = true;
  for (FileMetaData* f : files) {
    assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
    if (is_first) {
      is_first = false;
      *smallest_seqno = f->fd.smallest_seqno;
      *largest_seqno = f->fd.largest_seqno;
    } else {
      if (f->fd.smallest_seqno < *smallest_seqno) {
        *smallest_seqno = f->fd.smallest_seqno;
      }
      if (f->fd.largest_seqno > *largest_seqno) {
        *largest_seqno = f->fd.largest_seqno;
      }
    }
  }
}
#endif
}  // namespace
// Checks whether any of the input files overlap; returns true if they are
// all non-overlapping.
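//
// Illustrative walk-through (hypothetical key ranges): given input runs
//   level 1: [a..c] [f..h]      level 2: [d..e] [i..k]
// the heap yields files in smallest-key order: [a..c], [d..e], [f..h],
// [i..k]. Each popped file's smallest key is compared with the previously
// popped file's largest key, so this input is non-overlapping; replacing
// [d..e] with [b..e] would trip the first comparison and return false.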
bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
  auto comparator = icmp_->user_comparator();
  int first_iter = 1;

  InputFileInfo prev, curr, next;

  SmallestKeyHeap smallest_key_priority_q =
      create_level_heap(c, icmp_->user_comparator());

  while (!smallest_key_priority_q.empty()) {
    curr = smallest_key_priority_q.top();
    smallest_key_priority_q.pop();

    if (first_iter) {
      prev = curr;
      first_iter = 0;
    } else {
      if (comparator->Compare(prev.f->largest.user_key(),
                              curr.f->smallest.user_key()) >= 0) {
        // found overlapping files, return false
        return false;
      }
      assert(comparator->Compare(curr.f->largest.user_key(),
                                 prev.f->largest.user_key()) > 0);
      prev = curr;
    }

    next.f = nullptr;

    if (c->level(curr.level) != 0 &&
        curr.index < c->num_input_files(curr.level) - 1) {
      next.f = c->input(curr.level, curr.index + 1);
      next.level = curr.level;
      next.index = curr.index + 1;
    }

    if (next.f) {
      smallest_key_priority_q.push(std::move(next));
    }
  }
  return true;
}
bool UniversalCompactionPicker::NeedsCompaction(
    const VersionStorageInfo* vstorage) const {
  const int kLevel0 = 0;
  if (vstorage->CompactionScore(kLevel0) >= 1) {
    return true;
  }
  if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
    return true;
  }
  if (!vstorage->FilesMarkedForCompaction().empty()) {
    return true;
  }
  return false;
}

Compaction* UniversalCompactionPicker::PickCompaction(
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
    VersionStorageInfo* vstorage, LogBuffer* log_buffer,
    SequenceNumber /* earliest_memtable_seqno */) {
  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
                                     mutable_cf_options, vstorage, this,
                                     log_buffer);
  return builder.PickCompaction();
}
void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
                                                 size_t out_buf_size,
                                                 bool print_path) const {
  if (level == 0) {
    assert(file != nullptr);
    if (file->fd.GetPathId() == 0 || !print_path) {
      snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
    } else {
      snprintf(out_buf, out_buf_size,
               "file %" PRIu64
               "(path "
               "%" PRIu32 ")",
               file->fd.GetNumber(), file->fd.GetPathId());
    }
  } else {
    snprintf(out_buf, out_buf_size, "level %d", level);
  }
}

void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
    char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
  if (level == 0) {
    assert(file != nullptr);
    snprintf(out_buf, out_buf_size,
             "file %" PRIu64 "[%" ROCKSDB_PRIszt
             "] "
             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
             file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
             file->compensated_file_size);
  } else {
    snprintf(out_buf, out_buf_size,
             "level %d[%" ROCKSDB_PRIszt
             "] "
             "with size %" PRIu64 " (compensated size %" PRIu64 ")",
             level, sorted_run_count, size, compensated_file_size);
  }
}
std::vector<UniversalCompactionBuilder::SortedRun>
UniversalCompactionBuilder::CalculateSortedRuns(
    const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/,
    const MutableCFOptions& mutable_cf_options) {
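  // Illustration: with two L0 files f1 and f2 and non-empty levels 3 and 4,
  // the result holds four sorted runs: {f1}, {f2}, level 3, level 4. Each
  // L0 file is its own sorted run, while each non-empty level beyond 0
  // forms a single run.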
  std::vector<UniversalCompactionBuilder::SortedRun> ret;
  for (FileMetaData* f : vstorage.LevelFiles(0)) {
    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
                     f->being_compacted);
  }
  for (int level = 1; level < vstorage.num_levels(); level++) {
    uint64_t total_compensated_size = 0U;
    uint64_t total_size = 0U;
    bool being_compacted = false;
    bool is_first = true;
    for (FileMetaData* f : vstorage.LevelFiles(level)) {
      total_compensated_size += f->compensated_file_size;
      total_size += f->fd.GetFileSize();
      if (mutable_cf_options.compaction_options_universal.allow_trivial_move ==
          true) {
        if (f->being_compacted) {
          being_compacted = f->being_compacted;
        }
      } else {
        // Compaction always includes all files for a non-zero level, so for a
        // non-zero level, all the files should share the same being_compacted
        // value.
        // This assumption is only valid when
        // mutable_cf_options.compaction_options_universal.allow_trivial_move
        // is false
        assert(is_first || f->being_compacted == being_compacted);
      }
      if (is_first) {
        being_compacted = f->being_compacted;
        is_first = false;
      }
    }
    if (total_compensated_size > 0) {
      ret.emplace_back(level, nullptr, total_size, total_compensated_size,
                       being_compacted);
    }
  }
  return ret;
}
// Universal style of compaction. Pick files that are contiguous in
// time-range to compact.
Compaction* UniversalCompactionBuilder::PickCompaction() {
  const int kLevel0 = 0;
  score_ = vstorage_->CompactionScore(kLevel0);
  sorted_runs_ =
      CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_);

  if (sorted_runs_.size() == 0 ||
      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
       vstorage_->FilesMarkedForCompaction().empty() &&
       sorted_runs_.size() < (unsigned int)mutable_cf_options_
                                 .level0_file_num_compaction_trigger)) {
    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
                     cf_name_.c_str());
    TEST_SYNC_POINT_CALLBACK(
        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
    return nullptr;
  }
  VersionStorageInfo::LevelSummaryStorage tmp;
  ROCKS_LOG_BUFFER_MAX_SZ(
      log_buffer_, 3072,
      "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
      cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));

  Compaction* c = nullptr;
  // Periodic compaction has higher priority than other types of compaction
  // because it's a hard requirement.
  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
    // Always need to do a full compaction for periodic compaction.
    c = PickPeriodicCompaction();
  }

  // Check for size amplification.
  if (c == nullptr &&
      sorted_runs_.size() >=
          static_cast<size_t>(
              mutable_cf_options_.level0_file_num_compaction_trigger)) {
    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
      ROCKS_LOG_BUFFER(log_buffer_,
                       "[%s] Universal: compacting for size amp\n",
                       cf_name_.c_str());
    } else {
      // Size amplification is within limits. Try reducing read
      // amplification while maintaining file size ratios.
      unsigned int ratio =
          mutable_cf_options_.compaction_options_universal.size_ratio;

      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
        ROCKS_LOG_BUFFER(log_buffer_,
                         "[%s] Universal: compacting for size ratio\n",
                         cf_name_.c_str());
      } else {
        // Size amplification and file size ratios are within configured
        // limits. If max read amplification exceeds the configured limit,
        // then force a compaction without looking at file size ratios and
        // try to reduce the number of files to fewer than
        // level0_file_num_compaction_trigger.
        // This is guaranteed by NeedsCompaction()
        assert(sorted_runs_.size() >=
               static_cast<size_t>(
                   mutable_cf_options_.level0_file_num_compaction_trigger));
        // Get the total number of sorted runs that are not being compacted
        int num_sr_not_compacted = 0;
        for (size_t i = 0; i < sorted_runs_.size(); i++) {
          if (sorted_runs_[i].being_compacted == false) {
            num_sr_not_compacted++;
          }
        }

        // If the number of sorted runs that are not being compacted exceeds
        // the maximum allowed number of sorted runs, force a compaction.
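        // E.g. with 8 runs not being compacted and a trigger of 4, we
        // compact 8 - 4 + 1 = 5 runs: merging those 5 into one leaves
        // 8 - 5 + 1 = 4 runs, back at the trigger.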
        if (num_sr_not_compacted >
            mutable_cf_options_.level0_file_num_compaction_trigger) {
          unsigned int num_files =
              num_sr_not_compacted -
              mutable_cf_options_.level0_file_num_compaction_trigger + 1;
          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
              nullptr) {
            ROCKS_LOG_BUFFER(log_buffer_,
                             "[%s] Universal: compacting for file num -- %u\n",
                             cf_name_.c_str(), num_files);
          }
        }
      }
    }
  }

  if (c == nullptr) {
    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
      ROCKS_LOG_BUFFER(log_buffer_,
                       "[%s] Universal: delete triggered compaction\n",
                       cf_name_.c_str());
    }
  }

  if (c == nullptr) {
    TEST_SYNC_POINT_CALLBACK(
        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
    return nullptr;
  }

  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
          true &&
      c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
    c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
  }

// validate that all the chosen files of L0 are non-overlapping in time
#ifndef NDEBUG
  SequenceNumber prev_smallest_seqno = 0U;
  bool is_first = true;

  size_t level_index = 0U;
  if (c->start_level() == 0) {
    for (auto f : *c->inputs(0)) {
      assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
      if (is_first) {
        is_first = false;
      }
      prev_smallest_seqno = f->fd.smallest_seqno;
    }
    level_index = 1U;
  }
  for (; level_index < c->num_input_levels(); level_index++) {
    if (c->num_input_files(level_index) != 0) {
      SequenceNumber smallest_seqno = 0U;
      SequenceNumber largest_seqno = 0U;
      GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
                              &largest_seqno);
      if (is_first) {
        is_first = false;
      } else if (prev_smallest_seqno > 0) {
        // A level is considered as the bottommost level if there are
        // no files in higher levels or if files in higher levels do
        // not overlap with the files being compacted. Sequence numbers
        // of files in the bottommost level can be set to 0 to help
        // compression. As a result, the following assert may not hold
        // if prev_smallest_seqno is 0.
        assert(prev_smallest_seqno > largest_seqno);
      }
      prev_smallest_seqno = smallest_seqno;
    }
  }
#endif
  // update statistics
  RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION,
                    c->inputs(0)->size());

  picker_->RegisterCompaction(c);
  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);

  TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
                           c);
  return c;
}
uint32_t UniversalCompactionBuilder::GetPathId(
    const ImmutableCFOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
  // Two conditions need to be satisfied:
  // (1) the target path needs to be able to hold the file's size
  // (2) the total size left in this and previous paths must be no smaller
  // than the expected future file size before this new file is compacted,
  // which is estimated based on size_ratio.
  // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
  // we will make sure the target file, probably with size of 16, will be
  // placed in a path so that eventually when new files are generated and
  // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
  // before the path we chose.
  //
  // TODO(sdong): now the case of multiple column families is not
  // considered in this algorithm. So the target size can be violated in
  // that case. We need to improve it.
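  //
  // A worked example (hypothetical sizes, size_ratio = 0): suppose the
  // cf_paths target sizes are {10GB, 100GB} and the new file is 16GB, so
  // future_size = 16GB * (100 - 0) / 100 = 16GB. For p = 0, condition (1)
  // fails (10GB < 16GB), so 10GB is accumulated and the loop moves on; the
  // loop then ends and the last path, p = 1, is returned.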
  uint64_t accumulated_size = 0;
  uint64_t future_size =
      file_size *
      (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
  uint32_t p = 0;
  assert(!ioptions.cf_paths.empty());
  for (; p < ioptions.cf_paths.size() - 1; p++) {
    uint64_t target_size = ioptions.cf_paths[p].target_size;
    if (target_size > file_size &&
        accumulated_size + (target_size - file_size) > future_size) {
      return p;
    }
    accumulated_size += target_size;
  }
  return p;
}
//
// Consider compaction files based on their size differences with
// the next file in time order.
//
Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
    unsigned int ratio, unsigned int max_number_of_files_to_compact) {
  unsigned int min_merge_width =
      mutable_cf_options_.compaction_options_universal.min_merge_width;
  unsigned int max_merge_width =
      mutable_cf_options_.compaction_options_universal.max_merge_width;

  const SortedRun* sr = nullptr;
  bool done = false;
  size_t start_index = 0;
  unsigned int candidate_count = 0;

  unsigned int max_files_to_compact =
      std::min(max_merge_width, max_number_of_files_to_compact);
  min_merge_width = std::max(min_merge_width, 2U);

  // Caller checks the size before executing this function. This invariant is
  // important because otherwise we may have a possible integer underflow when
  // dealing with unsigned types.
  assert(sorted_runs_.size() > 0);

  // Keep adding succeeding runs to the candidate set as long as each one
  // is not (much) larger than the size accumulated so far.
  for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
    candidate_count = 0;

    // Skip files that are already being compacted
    for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
      sr = &sorted_runs_[loop];

      if (!sr->being_compacted) {
        candidate_count = 1;
        break;
      }
      char file_num_buf[kFormatFileNumberBufSize];
      sr->Dump(file_num_buf, sizeof(file_num_buf));
      ROCKS_LOG_BUFFER(log_buffer_,
                       "[%s] Universal: %s"
                       "[%d] being compacted, skipping",
                       cf_name_.c_str(), file_num_buf, loop);

      sr = nullptr;
    }

    // This file is not being compacted. Consider it as the
    // first candidate to be compacted.
    uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
    if (sr != nullptr) {
      char file_num_buf[kFormatFileNumberBufSize];
      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
      ROCKS_LOG_BUFFER(log_buffer_,
                       "[%s] Universal: Possible candidate %s[%d].",
                       cf_name_.c_str(), file_num_buf, loop);
    }

    // Check if the succeeding files need compaction.
    for (size_t i = loop + 1;
         candidate_count < max_files_to_compact && i < sorted_runs_.size();
         i++) {
      const SortedRun* succeeding_sr = &sorted_runs_[i];
      if (succeeding_sr->being_compacted) {
        break;
      }
      // Pick files if the total/last candidate file size (increased by the
      // specified ratio) is still larger than the next candidate file.
      // candidate_size is the total size of files picked so far with the
      // default kCompactionStopStyleTotalSize; with
      // kCompactionStopStyleSimilarSize, it's simply the size of the last
      // picked file.
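      // E.g. with the default stop style and ratio = 1 (i.e. 1%), runs of
      // sizes (1, 1, 2, 4, 8) chain together: 1*1.01 >= 1, then 2*1.01 >= 2,
      // 4*1.01 >= 4, and 8*1.01 >= 8, so all five runs are picked.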
      double sz = candidate_size * (100.0 + ratio) / 100.0;
      if (sz < static_cast<double>(succeeding_sr->size)) {
        break;
      }
      if (mutable_cf_options_.compaction_options_universal.stop_style ==
          kCompactionStopStyleSimilarSize) {
        // Similar-size stopping rule: also check the last picked file isn't
        // far larger than the next candidate file.
        sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
        if (sz < static_cast<double>(candidate_size)) {
          // If the small file we've encountered begins a run of similar-size
          // files, we'll pick them up on a future iteration of the outer
          // loop. If it's some lonely straggler, it'll eventually get picked
          // by the last-resort read amp strategy which disregards size ratios.
          break;
        }
        candidate_size = succeeding_sr->compensated_file_size;
      } else {  // default kCompactionStopStyleTotalSize
        candidate_size += succeeding_sr->compensated_file_size;
      }
      candidate_count++;
    }

    // Found a series of consecutive files that need compaction.
    if (candidate_count >= (unsigned int)min_merge_width) {
      start_index = loop;
      done = true;
      break;
    } else {
      for (size_t i = loop;
           i < loop + candidate_count && i < sorted_runs_.size(); i++) {
        const SortedRun* skipping_sr = &sorted_runs_[i];
        char file_num_buf[256];
        skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
        ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
                         cf_name_.c_str(), file_num_buf);
      }
    }
  }
  if (!done || candidate_count <= 1) {
    return nullptr;
  }
  size_t first_index_after = start_index + candidate_count;
  // Compression is enabled only if the output of this compaction falls in
  // the older portion of the data, i.e. the sorted runs compacted earlier
  // have already reached the configured compression size ratio.
  bool enable_compression = true;
  int ratio_to_compress =
      mutable_cf_options_.compaction_options_universal.compression_size_percent;
  if (ratio_to_compress >= 0) {
    uint64_t total_size = 0;
    for (auto& sorted_run : sorted_runs_) {
      total_size += sorted_run.compensated_file_size;
    }
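
    // Example (hypothetical sizes): with compression_size_percent = 50 and
    // total size 100, if the runs older than the picked ones sum to 80,
    // then 80 * 100 >= 100 * 50 and compression is disabled: the output
    // would land in the newest portion of the data, which stays
    // uncompressed.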
    uint64_t older_file_size = 0;
    for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
      older_file_size += sorted_runs_[i].size;
      if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
        enable_compression = false;
        break;
      }
    }
  }

  uint64_t estimated_total_size = 0;
  for (unsigned int i = 0; i < first_index_after; i++) {
    estimated_total_size += sorted_runs_[i].size;
  }
  uint32_t path_id =
      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
  int start_level = sorted_runs_[start_index].level;
  int output_level;
  if (first_index_after == sorted_runs_.size()) {
    output_level = vstorage_->num_levels() - 1;
  } else if (sorted_runs_[first_index_after].level == 0) {
    output_level = 0;
  } else {
    output_level = sorted_runs_[first_index_after].level - 1;
  }

  // last level is reserved for the files ingested behind
  if (ioptions_.allow_ingest_behind &&
      (output_level == vstorage_->num_levels() - 1)) {
    assert(output_level > 1);
    output_level--;
  }

  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
  for (size_t i = 0; i < inputs.size(); ++i) {
    inputs[i].level = start_level + static_cast<int>(i);
  }
  for (size_t i = start_index; i < first_index_after; i++) {
    auto& picking_sr = sorted_runs_[i];
    if (picking_sr.level == 0) {
      FileMetaData* picking_file = picking_sr.file;
      inputs[0].files.push_back(picking_file);
    } else {
      auto& files = inputs[picking_sr.level - start_level].files;
      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
        files.push_back(f);
      }
    }
    char file_num_buf[256];
    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
                     cf_name_.c_str(), file_num_buf);
  }

  CompactionReason compaction_reason;
  if (max_number_of_files_to_compact == UINT_MAX) {
    compaction_reason = CompactionReason::kUniversalSizeRatio;
  } else {
    compaction_reason = CompactionReason::kUniversalSortedRunNum;
  }
  return new Compaction(
      vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
      output_level,
      MaxFileSizeForLevel(mutable_cf_options_, output_level,
                          kCompactionStyleUniversal),
      LLONG_MAX, path_id,
      GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
                         start_level, 1, enable_compression),
      GetCompressionOptions(ioptions_, vstorage_, start_level,
                            enable_compression),
      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
      score_, false /* deletion_compaction */, compaction_reason);
}
// Look at overall size amplification. If size amplification
// exceeds the configured value, then do a compaction
// of the candidate files all the way up to the earliest
// base file (overrides configured values of file-size ratios,
// min_merge_width and max_merge_width).
//
Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
  // percentage flexibility while reducing size amplification
  uint64_t ratio = mutable_cf_options_.compaction_options_universal
                       .max_size_amplification_percent;

  unsigned int candidate_count = 0;
  uint64_t candidate_size = 0;
  size_t start_index = 0;
  const SortedRun* sr = nullptr;

  assert(!sorted_runs_.empty());
  if (sorted_runs_.back().being_compacted) {
    return nullptr;
  }

  // Skip files that are already being compacted
  for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) {
    sr = &sorted_runs_[loop];
    if (!sr->being_compacted) {
      start_index = loop;  // Consider this as the first candidate.
      break;
    }
    char file_num_buf[kFormatFileNumberBufSize];
    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
    ROCKS_LOG_BUFFER(log_buffer_,
                     "[%s] Universal: skipping %s[%d] compacted %s",
                     cf_name_.c_str(), file_num_buf, loop,
                     " cannot be a candidate to reduce size amp.\n");
    sr = nullptr;
  }

  if (sr == nullptr) {
    return nullptr;  // no candidate files
  }
  {
    char file_num_buf[kFormatFileNumberBufSize];
    sr->Dump(file_num_buf, sizeof(file_num_buf), true);
    ROCKS_LOG_BUFFER(
        log_buffer_,
        "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
        cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
  }

  // keep adding up all the remaining files
  for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) {
    sr = &sorted_runs_[loop];
    if (sr->being_compacted) {
      char file_num_buf[kFormatFileNumberBufSize];
      sr->Dump(file_num_buf, sizeof(file_num_buf), true);
      ROCKS_LOG_BUFFER(
          log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
          cf_name_.c_str(), file_num_buf, start_index,
          " is already being compacted. No size amp reduction possible.\n");
      return nullptr;
    }
    candidate_size += sr->compensated_file_size;
    candidate_count++;
  }
  if (candidate_count == 0) {
    return nullptr;
  }

  // size of earliest file
  uint64_t earliest_file_size = sorted_runs_.back().size;

  // size amplification = percentage of additional size
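  // E.g. with max_size_amplification_percent = 200, an earliest run of
  // size 100 and newer candidates totalling 150: 150 * 100 < 200 * 100,
  // so the estimated amplification (150%) is within the 200% limit and
  // the compaction is skipped.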
  if (candidate_size * 100 < ratio * earliest_file_size) {
    ROCKS_LOG_BUFFER(
        log_buffer_,
        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
        " earliest-file-size %" PRIu64,
        cf_name_.c_str(), candidate_size, earliest_file_size);
    return nullptr;
  } else {
    ROCKS_LOG_BUFFER(
        log_buffer_,
        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
        " earliest-file-size %" PRIu64,
        cf_name_.c_str(), candidate_size, earliest_file_size);
  }
  return PickCompactionToOldest(start_index,
                                CompactionReason::kUniversalSizeAmplification);
}
// Pick files marked for compaction. Typically, files are marked by
// CompactOnDeleteCollector due to the presence of tombstones.
Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
  CompactionInputFiles start_level_inputs;
  int output_level;
  std::vector<CompactionInputFiles> inputs;

  if (vstorage_->num_levels() == 1) {
    // This is single-level universal. Since we're basically trying to reclaim
    // space by processing files marked for compaction due to high tombstone
    // density, let's do the same thing as compaction to reduce size amp which
    // has the same goals.
    bool compact = false;

    start_level_inputs.level = 0;
    start_level_inputs.files.clear();
    output_level = 0;
    for (FileMetaData* f : vstorage_->LevelFiles(0)) {
      if (f->marked_for_compaction) {
        compact = true;
      }
      if (compact) {
        start_level_inputs.files.push_back(f);
      }
    }
    if (start_level_inputs.size() <= 1) {
      // If only the last file in L0 is marked for compaction, ignore it
      return nullptr;
    }
    inputs.push_back(start_level_inputs);
  } else {
    int start_level;

    // For multi-level universal, the strategy is to make this look more like
    // leveled. We pick one of the files marked for compaction and compact it
    // with overlapping files in the adjacent level.
    picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
                                          &output_level, &start_level_inputs);
    if (start_level_inputs.empty()) {
      return nullptr;
    }

    // Pick the first non-empty level after the start_level
    for (output_level = start_level + 1;
         output_level < vstorage_->num_levels(); output_level++) {
      if (vstorage_->NumLevelFiles(output_level) != 0) {
        break;
      }
    }

    // If all higher levels are empty, pick the highest level as output level
    if (output_level == vstorage_->num_levels()) {
      if (start_level == 0) {
        output_level = vstorage_->num_levels() - 1;
      } else {
        // If start level is non-zero and all higher levels are empty, this
        // compaction will translate into a trivial move. Since the idea is
        // to reclaim space and trivial move doesn't help with that, we
        // skip compaction in this case and return nullptr
        return nullptr;
      }
    }
    if (ioptions_.allow_ingest_behind &&
        output_level == vstorage_->num_levels() - 1) {
      assert(output_level > 1);
      output_level--;
    }

    if (output_level != 0) {
      if (start_level == 0) {
        if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
                                            output_level, nullptr)) {
          return nullptr;
        }
      }

      CompactionInputFiles output_level_inputs;
      int parent_index = -1;

      output_level_inputs.level = output_level;
      if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
                                     &start_level_inputs, &output_level_inputs,
                                     &parent_index, -1)) {
        return nullptr;
      }
      inputs.push_back(start_level_inputs);
      if (!output_level_inputs.empty()) {
        inputs.push_back(output_level_inputs);
      }
      if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) {
        return nullptr;
      }
    } else {
      inputs.push_back(start_level_inputs);
    }
  }

  uint64_t estimated_total_size = 0;
  // Use size of the output level as estimated file size
  for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
    estimated_total_size += f->fd.GetFileSize();
  }
  uint32_t path_id =
      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
  return new Compaction(
      vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
      output_level,
      MaxFileSizeForLevel(mutable_cf_options_, output_level,
                          kCompactionStyleUniversal),
      /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id,
      GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
                         output_level, 1),
      GetCompressionOptions(ioptions_, vstorage_, output_level),
      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true,
      score_, false /* deletion_compaction */,
      CompactionReason::kFilesMarkedForCompaction);
}
Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
    size_t start_index, CompactionReason compaction_reason) {
  assert(start_index < sorted_runs_.size());

  // Estimate total file size
  uint64_t estimated_total_size = 0;
  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
    estimated_total_size += sorted_runs_[loop].size;
  }
  uint32_t path_id =
      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
  int start_level = sorted_runs_[start_index].level;

  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
  for (size_t i = 0; i < inputs.size(); ++i) {
    inputs[i].level = start_level + static_cast<int>(i);
  }
  for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) {
    auto& picking_sr = sorted_runs_[loop];
    if (picking_sr.level == 0) {
      FileMetaData* f = picking_sr.file;
      inputs[0].files.push_back(f);
    } else {
      auto& files = inputs[picking_sr.level - start_level].files;
      for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
        files.push_back(f);
      }
    }
    std::string comp_reason_print_string;
    if (compaction_reason == CompactionReason::kPeriodicCompaction) {
      comp_reason_print_string = "periodic compaction";
    } else if (compaction_reason ==
               CompactionReason::kUniversalSizeAmplification) {
      comp_reason_print_string = "size amp";
    } else {
      assert(false);
    }

    char file_num_buf[256];
    picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
    ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
                     cf_name_.c_str(), comp_reason_print_string.c_str(),
                     file_num_buf);
  }

  // output files at the bottommost level, unless it's reserved
  int output_level = vstorage_->num_levels() - 1;
  // last level is reserved for the files ingested behind
  if (ioptions_.allow_ingest_behind) {
    assert(output_level > 1);
    output_level--;
  }

  // We never check size for
  // compaction_options_universal.compression_size_percent,
  // because we always compact all the files, so always compress.
  return new Compaction(
      vstorage_, ioptions_, mutable_cf_options_, std::move(inputs),
      output_level,
      MaxFileSizeForLevel(mutable_cf_options_, output_level,
                          kCompactionStyleUniversal),
      LLONG_MAX, path_id,
      GetCompressionType(ioptions_, vstorage_, mutable_cf_options_,
                         start_level, 1, true /* enable_compression */),
      GetCompressionOptions(ioptions_, vstorage_, start_level,
                            true /* enable_compression */),
      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
      score_, false /* deletion_compaction */, compaction_reason);
}
Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
                   cf_name_.c_str());

  // In universal compaction, sorted runs containing older data are almost
  // always generated earlier too. To simplify the problem, we just try to
  // trigger a full compaction. We start from the oldest sorted run and
  // include all sorted runs, until we hit a sorted run already being
  // compacted.
  // Since usually the largest (which is usually the oldest) sorted run is
  // included anyway, doing a full compaction won't increase write
  // amplification much.

  // Get some information from marked files to check whether a file is
  // included in the compaction.
  size_t start_index = sorted_runs_.size();
  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
    start_index--;
  }
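  // E.g. with runs (newest..oldest) R0, R1*, R2, R3, where R1* is being
  // compacted: the backward scan above stops at R1*, so start_index lands
  // on R2 and the compaction would cover R2..R3, the oldest runs.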
  if (start_index == sorted_runs_.size()) {
    return nullptr;
  }

  // There is a rare corner case where we can't pick up all the files
  // because some files are being compacted, and we end up picking files
  // none of which needs periodic compaction. To keep the logic simple, we
  // execute the compaction anyway, unless the pick consists of nothing but
  // the last sorted run (either the last level or the last L0 file); in
  // that case, verify below that the run really is marked.
  if (start_index == sorted_runs_.size() - 1) {
    bool included_file_marked = false;
    int start_level = sorted_runs_[start_index].level;
    FileMetaData* start_file = sorted_runs_[start_index].file;
    for (const std::pair<int, FileMetaData*>& level_file_pair :
         vstorage_->FilesMarkedForPeriodicCompaction()) {
      if (start_level != 0) {
        // Last sorted run is a level
        if (start_level == level_file_pair.first) {
          included_file_marked = true;
          break;
        }
      } else {
        // Last sorted run is an L0 file.
        if (start_file == level_file_pair.second) {
          included_file_marked = true;
          break;
        }
      }
    }
    if (!included_file_marked) {
      ROCKS_LOG_BUFFER(log_buffer_,
                       "[%s] Universal: Cannot form a compaction covering file "
                       "marked for periodic compaction",
                       cf_name_.c_str());
      return nullptr;
    }
  }

  Compaction* c = PickCompactionToOldest(start_index,
                                         CompactionReason::kPeriodicCompaction);

  TEST_SYNC_POINT_CALLBACK(
      "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);

  return c;
}
}  // namespace ROCKSDB_NAMESPACE

#endif  // !ROCKSDB_LITE