memtable_list.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. #pragma once
  7. #include <deque>
  8. #include <limits>
  9. #include <list>
  10. #include <set>
  11. #include <string>
  12. #include <vector>
  13. #include "db/dbformat.h"
  14. #include "db/logs_with_prep_tracker.h"
  15. #include "db/memtable.h"
  16. #include "db/range_del_aggregator.h"
  17. #include "file/filename.h"
  18. #include "logging/log_buffer.h"
  19. #include "monitoring/instrumented_mutex.h"
  20. #include "rocksdb/db.h"
  21. #include "rocksdb/iterator.h"
  22. #include "rocksdb/options.h"
  23. #include "rocksdb/types.h"
  24. #include "util/autovector.h"
  25. namespace ROCKSDB_NAMESPACE {
  26. class ColumnFamilyData;
  27. class InternalKeyComparator;
  28. class InstrumentedMutex;
  29. class MergeIteratorBuilder;
  30. class MemTableList;
  31. struct FlushJobInfo;
  32. // keeps a list of immutable memtables in a vector. the list is immutable
  33. // if refcount is bigger than one. It is used as a state for Get() and
  34. // Iterator code paths
  35. //
  36. // This class is not thread-safe. External synchronization is required
  37. // (such as holding the db mutex or being on the write thread).
  38. class MemTableListVersion {
  39. public:
  40. explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
  41. MemTableListVersion* old = nullptr);
  42. explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
  43. int max_write_buffer_number_to_maintain,
  44. int64_t max_write_buffer_size_to_maintain);
  45. void Ref();
  46. void Unref(autovector<MemTable*>* to_delete = nullptr);
  47. // Search all the memtables starting from the most recent one.
  48. // Return the most recent value found, if any.
  49. //
  50. // If any operation was found for this key, its most recent sequence number
  51. // will be stored in *seq on success (regardless of whether true/false is
  52. // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
  53. bool Get(const LookupKey& key, std::string* value, Status* s,
  54. MergeContext* merge_context,
  55. SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
  56. const ReadOptions& read_opts, ReadCallback* callback = nullptr,
  57. bool* is_blob_index = nullptr);
  58. bool Get(const LookupKey& key, std::string* value, Status* s,
  59. MergeContext* merge_context,
  60. SequenceNumber* max_covering_tombstone_seq,
  61. const ReadOptions& read_opts, ReadCallback* callback = nullptr,
  62. bool* is_blob_index = nullptr) {
  63. SequenceNumber seq;
  64. return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq,
  65. read_opts, callback, is_blob_index);
  66. }
  67. void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  68. ReadCallback* callback, bool* is_blob);
  69. // Returns all the merge operands corresponding to the key by searching all
  70. // memtables starting from the most recent one.
  71. bool GetMergeOperands(const LookupKey& key, Status* s,
  72. MergeContext* merge_context,
  73. SequenceNumber* max_covering_tombstone_seq,
  74. const ReadOptions& read_opts);
  75. // Similar to Get(), but searches the Memtable history of memtables that
  76. // have already been flushed. Should only be used from in-memory only
  77. // queries (such as Transaction validation) as the history may contain
  78. // writes that are also present in the SST files.
  79. bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
  80. MergeContext* merge_context,
  81. SequenceNumber* max_covering_tombstone_seq,
  82. SequenceNumber* seq, const ReadOptions& read_opts,
  83. bool* is_blob_index = nullptr);
  84. bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
  85. MergeContext* merge_context,
  86. SequenceNumber* max_covering_tombstone_seq,
  87. const ReadOptions& read_opts,
  88. bool* is_blob_index = nullptr) {
  89. SequenceNumber seq;
  90. return GetFromHistory(key, value, s, merge_context,
  91. max_covering_tombstone_seq, &seq, read_opts,
  92. is_blob_index);
  93. }
  94. Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
  95. RangeDelAggregator* range_del_agg);
  96. void AddIterators(const ReadOptions& options,
  97. std::vector<InternalIterator*>* iterator_list,
  98. Arena* arena);
  99. void AddIterators(const ReadOptions& options,
  100. MergeIteratorBuilder* merge_iter_builder);
  101. uint64_t GetTotalNumEntries() const;
  102. uint64_t GetTotalNumDeletes() const;
  103. MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
  104. const Slice& end_ikey);
  105. // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
  106. // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
  107. // If include_history=true, will also search Memtables in MemTableList
  108. // History.
  109. SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
  110. private:
  111. friend class MemTableList;
  112. friend Status InstallMemtableAtomicFlushResults(
  113. const autovector<MemTableList*>* imm_lists,
  114. const autovector<ColumnFamilyData*>& cfds,
  115. const autovector<const MutableCFOptions*>& mutable_cf_options_list,
  116. const autovector<const autovector<MemTable*>*>& mems_list,
  117. VersionSet* vset, InstrumentedMutex* mu,
  118. const autovector<FileMetaData*>& file_meta,
  119. autovector<MemTable*>* to_delete, Directory* db_directory,
  120. LogBuffer* log_buffer);
  121. // REQUIRE: m is an immutable memtable
  122. void Add(MemTable* m, autovector<MemTable*>* to_delete);
  123. // REQUIRE: m is an immutable memtable
  124. void Remove(MemTable* m, autovector<MemTable*>* to_delete);
  125. void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
  126. bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
  127. std::string* value, Status* s, MergeContext* merge_context,
  128. SequenceNumber* max_covering_tombstone_seq,
  129. SequenceNumber* seq, const ReadOptions& read_opts,
  130. ReadCallback* callback = nullptr,
  131. bool* is_blob_index = nullptr);
  132. void AddMemTable(MemTable* m);
  133. void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
  134. // Calculate the total amount of memory used by memlist_ and memlist_history_
  135. // excluding the last MemTable in memlist_history_. The reason for excluding
  136. // the last MemTable is to see if dropping the last MemTable will keep total
  137. // memory usage above or equal to max_write_buffer_size_to_maintain_
  138. size_t ApproximateMemoryUsageExcludingLast() const;
  139. // Whether this version contains flushed memtables that are only kept around
  140. // for transaction conflict checking.
  141. bool HasHistory() const { return !memlist_history_.empty(); }
  142. bool MemtableLimitExceeded(size_t usage);
  143. // Immutable MemTables that have not yet been flushed.
  144. std::list<MemTable*> memlist_;
  145. // MemTables that have already been flushed
  146. // (used during Transaction validation)
  147. std::list<MemTable*> memlist_history_;
  148. // Maximum number of MemTables to keep in memory (including both flushed
  149. const int max_write_buffer_number_to_maintain_;
  150. // Maximum size of MemTables to keep in memory (including both flushed
  151. // and not-yet-flushed tables).
  152. const int64_t max_write_buffer_size_to_maintain_;
  153. int refs_ = 0;
  154. size_t* parent_memtable_list_memory_usage_;
  155. };
  156. // This class stores references to all the immutable memtables.
  157. // The memtables are flushed to L0 as soon as possible and in
  158. // any order. If there are more than one immutable memtable, their
  159. // flushes can occur concurrently. However, they are 'committed'
  160. // to the manifest in FIFO order to maintain correctness and
  161. // recoverability from a crash.
  162. //
  163. //
  164. // Other than imm_flush_needed and imm_trim_needed, this class is not
  165. // thread-safe and requires external synchronization (such as holding the db
  166. // mutex or being on the write thread.)
  167. class MemTableList {
  168. public:
  169. // A list of memtables.
  170. explicit MemTableList(int min_write_buffer_number_to_merge,
  171. int max_write_buffer_number_to_maintain,
  172. int64_t max_write_buffer_size_to_maintain)
  173. : imm_flush_needed(false),
  174. imm_trim_needed(false),
  175. min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
  176. current_(new MemTableListVersion(&current_memory_usage_,
  177. max_write_buffer_number_to_maintain,
  178. max_write_buffer_size_to_maintain)),
  179. num_flush_not_started_(0),
  180. commit_in_progress_(false),
  181. flush_requested_(false),
  182. current_memory_usage_(0),
  183. current_memory_usage_excluding_last_(0),
  184. current_has_history_(false) {
  185. current_->Ref();
  186. }
  187. // Should not delete MemTableList without making sure MemTableList::current()
  188. // is Unref()'d.
  189. ~MemTableList() {}
  190. MemTableListVersion* current() const { return current_; }
  191. // so that background threads can detect non-nullptr pointer to
  192. // determine whether there is anything more to start flushing.
  193. std::atomic<bool> imm_flush_needed;
  194. std::atomic<bool> imm_trim_needed;
  195. // Returns the total number of memtables in the list that haven't yet
  196. // been flushed and logged.
  197. int NumNotFlushed() const;
  198. // Returns total number of memtables in the list that have been
  199. // completely flushed and logged.
  200. int NumFlushed() const;
  201. // Returns true if there is at least one memtable on which flush has
  202. // not yet started.
  203. bool IsFlushPending() const;
  204. // Returns the earliest memtables that needs to be flushed. The returned
  205. // memtables are guaranteed to be in the ascending order of created time.
  206. void PickMemtablesToFlush(const uint64_t* max_memtable_id,
  207. autovector<MemTable*>* mems);
  208. // Reset status of the given memtable list back to pending state so that
  209. // they can get picked up again on the next round of flush.
  210. void RollbackMemtableFlush(const autovector<MemTable*>& mems,
  211. uint64_t file_number);
  212. // Try commit a successful flush in the manifest file. It might just return
  213. // Status::OK letting a concurrent flush to do the actual the recording.
  214. Status TryInstallMemtableFlushResults(
  215. ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
  216. const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
  217. VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
  218. autovector<MemTable*>* to_delete, Directory* db_directory,
  219. LogBuffer* log_buffer,
  220. std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info);
  221. // New memtables are inserted at the front of the list.
  222. // Takes ownership of the referenced held on *m by the caller of Add().
  223. void Add(MemTable* m, autovector<MemTable*>* to_delete);
  224. // Returns an estimate of the number of bytes of data in use.
  225. size_t ApproximateMemoryUsage();
  226. // Returns the cached current_memory_usage_excluding_last_ value.
  227. size_t ApproximateMemoryUsageExcludingLast() const;
  228. // Returns the cached current_has_history_ value.
  229. bool HasHistory() const;
  230. // Updates current_memory_usage_excluding_last_ and current_has_history_
  231. // from MemTableListVersion. Must be called whenever InstallNewVersion is
  232. // called.
  233. void UpdateCachedValuesFromMemTableListVersion();
  234. // `usage` is the current size of the mutable Memtable. When
  235. // max_write_buffer_size_to_maintain is used, total size of mutable and
  236. // immutable memtables is checked against it to decide whether to trim
  237. // memtable list.
  238. void TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
  239. // Returns an estimate of the number of bytes of data used by
  240. // the unflushed mem-tables.
  241. size_t ApproximateUnflushedMemTablesMemoryUsage();
  242. // Returns an estimate of the timestamp of the earliest key.
  243. uint64_t ApproximateOldestKeyTime() const;
  244. // Request a flush of all existing memtables to storage. This will
  245. // cause future calls to IsFlushPending() to return true if this list is
  246. // non-empty (regardless of the min_write_buffer_number_to_merge
  247. // parameter). This flush request will persist until the next time
  248. // PickMemtablesToFlush() is called.
  249. void FlushRequested() { flush_requested_ = true; }
  250. bool HasFlushRequested() { return flush_requested_; }
  251. // Returns true if a trim history should be scheduled and the caller should
  252. // be the one to schedule it
  253. bool MarkTrimHistoryNeeded() {
  254. auto expected = false;
  255. return imm_trim_needed.compare_exchange_strong(
  256. expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
  257. }
  258. void ResetTrimHistoryNeeded() {
  259. auto expected = true;
  260. imm_trim_needed.compare_exchange_strong(
  261. expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
  262. }
  263. // Copying allowed
  264. // MemTableList(const MemTableList&);
  265. // void operator=(const MemTableList&);
  266. size_t* current_memory_usage() { return &current_memory_usage_; }
  267. // Returns the min log containing the prep section after memtables listsed in
  268. // `memtables_to_flush` are flushed and their status is persisted in manifest.
  269. uint64_t PrecomputeMinLogContainingPrepSection(
  270. const autovector<MemTable*>& memtables_to_flush);
  271. uint64_t GetEarliestMemTableID() const {
  272. auto& memlist = current_->memlist_;
  273. if (memlist.empty()) {
  274. return std::numeric_limits<uint64_t>::max();
  275. }
  276. return memlist.back()->GetID();
  277. }
  278. uint64_t GetLatestMemTableID() const {
  279. auto& memlist = current_->memlist_;
  280. if (memlist.empty()) {
  281. return 0;
  282. }
  283. return memlist.front()->GetID();
  284. }
  285. void AssignAtomicFlushSeq(const SequenceNumber& seq) {
  286. const auto& memlist = current_->memlist_;
  287. // Scan the memtable list from new to old
  288. for (auto it = memlist.begin(); it != memlist.end(); ++it) {
  289. MemTable* mem = *it;
  290. if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
  291. mem->atomic_flush_seqno_ = seq;
  292. } else {
  293. // Earlier memtables must have been assigned a atomic flush seq, no
  294. // need to continue scan.
  295. break;
  296. }
  297. }
  298. }
  299. // Used only by DBImplSecondary during log replay.
  300. // Remove memtables whose data were written before the WAL with log_number
  301. // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
  302. // not freed, but put into a vector for future deref and reclamation.
  303. void RemoveOldMemTables(uint64_t log_number,
  304. autovector<MemTable*>* to_delete);
  305. private:
  306. friend Status InstallMemtableAtomicFlushResults(
  307. const autovector<MemTableList*>* imm_lists,
  308. const autovector<ColumnFamilyData*>& cfds,
  309. const autovector<const MutableCFOptions*>& mutable_cf_options_list,
  310. const autovector<const autovector<MemTable*>*>& mems_list,
  311. VersionSet* vset, InstrumentedMutex* mu,
  312. const autovector<FileMetaData*>& file_meta,
  313. autovector<MemTable*>* to_delete, Directory* db_directory,
  314. LogBuffer* log_buffer);
  315. // DB mutex held
  316. void InstallNewVersion();
  317. const int min_write_buffer_number_to_merge_;
  318. MemTableListVersion* current_;
  319. // the number of elements that still need flushing
  320. int num_flush_not_started_;
  321. // committing in progress
  322. bool commit_in_progress_;
  323. // Requested a flush of memtables to storage. It's possible to request that
  324. // a subset of memtables be flushed.
  325. bool flush_requested_;
  326. // The current memory usage.
  327. size_t current_memory_usage_;
  328. // Cached value of current_->ApproximateMemoryUsageExcludingLast().
  329. std::atomic<size_t> current_memory_usage_excluding_last_;
  330. // Cached value of current_->HasHistory().
  331. std::atomic<bool> current_has_history_;
  332. };
  333. // Installs memtable atomic flush results.
  334. // In most cases, imm_lists is nullptr, and the function simply uses the
  335. // immutable memtable lists associated with the cfds. There are unit tests that
  336. // installs flush results for external immutable memtable lists other than the
  337. // cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case,
  338. // imm_lists parameter is not nullptr.
  339. extern Status InstallMemtableAtomicFlushResults(
  340. const autovector<MemTableList*>* imm_lists,
  341. const autovector<ColumnFamilyData*>& cfds,
  342. const autovector<const MutableCFOptions*>& mutable_cf_options_list,
  343. const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
  344. InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
  345. autovector<MemTable*>* to_delete, Directory* db_directory,
  346. LogBuffer* log_buffer);
  347. } // namespace ROCKSDB_NAMESPACE