hash_linklist_rep.cc
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#ifndef ROCKSDB_LITE
#include "memtable/hash_linklist_rep.h"

#include <algorithm>
#include <atomic>

#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/skiplist.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {
namespace {

typedef const char* Key;
typedef SkipList<Key, const MemTableRep::KeyComparator&> MemtableSkipList;
typedef std::atomic<void*> Pointer;
// A data structure used as the header of a link list of a hash bucket.
struct BucketHeader {
  Pointer next;
  std::atomic<uint32_t> num_entries;

  explicit BucketHeader(void* n, uint32_t count)
      : next(n), num_entries(count) {}

  bool IsSkipListBucket() {
    return next.load(std::memory_order_relaxed) == this;
  }

  uint32_t GetNumEntries() const {
    return num_entries.load(std::memory_order_relaxed);
  }

  // REQUIRES: called from single-threaded Insert()
  void IncNumEntries() {
    // Only one thread can write at a time, so there is no need for an
    // atomic increment. Update with a relaxed load and store.
    num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed);
  }
};
// A data structure used as the header of a skip list of a hash bucket.
struct SkipListBucketHeader {
  BucketHeader Counting_header;
  MemtableSkipList skip_list;

  explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp,
                                Allocator* allocator, uint32_t count)
      : Counting_header(this,  // Pointing to itself to indicate header type.
                        count),
        skip_list(cmp, allocator) {}
};
struct Node {
  // Accessors/mutators for links. Wrapped in methods so we can
  // add the appropriate barriers as necessary.
  Node* Next() {
    // Use an 'acquire load' so that we observe a fully initialized
    // version of the returned Node.
    return next_.load(std::memory_order_acquire);
  }
  void SetNext(Node* x) {
    // Use a 'release store' so that anybody who reads through this
    // pointer observes a fully initialized version of the inserted node.
    next_.store(x, std::memory_order_release);
  }
  // No-barrier variants that can be safely used in a few locations.
  Node* NoBarrier_Next() {
    return next_.load(std::memory_order_relaxed);
  }
  void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }

  // Needed for the placement new in Allocate() below.
  Node() {}

 private:
  std::atomic<Node*> next_;

  // Prohibit copying: the variable-length key member below makes copies
  // unsafe.
  Node(const Node&) = delete;
  Node& operator=(const Node&) = delete;

 public:
  char key[1];
};
// Memory structure of the mem table:
// It is a hash table; each bucket points to one entry, a linked list, or a
// skip list. In order to track the total number of records in a bucket so we
// can determine whether to switch to a skip list, a header is added to the
// bucket to record its number of entries.
//
//
//          +-----> NULL    Case 1. Empty bucket
//          |
//          |
//          | +---> +-------+
//          | |     | Next  +--> NULL
//          | |     +-------+
//  +-----+ | |     |       |  Case 2. One entry in bucket.
//  |     +-+ |     | Data  |          The next pointer points to
//  +-----+   |     |       |          NULL. In all other cases the
//  |     |   |     |       |          next pointer is not NULL.
//  +-----+   |     +-------+
//  |     +---+
//  +-----+     +-> +-------+  +> +-------+  +-> +-------+
//  |     |     |   | Next  +--+  | Next  +--+   | Next  +-->NULL
//  +-----+     |   +-------+     +-------+      +-------+
//  |     +-----+   | Count |     |       |      |       |
//  +-----+         +-------+     | Data  |      | Data  |
//  |     |                       |       |      |       |
//  +-----+          Case 3.      |       |      |       |
//  |     |          A header     +-------+      +-------+
//  +-----+          points to
//  |     |          a linked list. Count indicates the total number
//  +-----+          of rows in this bucket.
//  |     |
//  +-----+    +-> +-------+ <--+
//  |     |    |   | Next  +----+
//  +-----+    |   +-------+        Case 4. A header points to a skip
//  |     +----+   | Count |                list and the next pointer points
//  +-----+        +-------+                to itself, to distinguish case 3
//  |     |        |       |                from case 4. Count is still kept
//  +-----+        | Skip  +-->             to indicate the total number of
//  |     |        | List  |   Data         entries in the bucket, for
//  |     |        |      +-->              debugging purposes.
//  +-----+        |       |
//  |     |        +-------+
//  +-----+
//
// We don't have a data race when changing cases because:
// (1) When changing from case 2->3, we create a new bucket header and put the
//     single node there first, without changing the original node, and do a
//     release store when changing the bucket pointer. A reader who sees a
//     stale value of the bucket pointer will read the node directly, while a
//     reader who sees the new value reads it correctly because of the release
//     store.
// (2) When changing from case 3->4, a new header is created with the skip
//     list pointing to the data, before doing a release store to change the
//     bucket pointer. The old header and nodes are never changed, so any
//     reader that sees any of those existing pointers is guaranteed to be
//     able to iterate to the end of the linked list.
// (3) The header's next pointer in case 3 might change, but it is never equal
//     to the header itself, so whether a reader sees a stale or a newer
//     value, it can correctly distinguish case 3 from case 4.
//
// The reason we use case 2 is to keep the format efficient when bucket
// utilization is relatively low. If we used case 3 for single-entry buckets,
// we would waste 12 bytes for every entry, which can be a significant
// decrease in memory utilization.
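// Reader-side classification of a bucket pointer (a pseudo-code sketch of
// the logic implemented by GetSkipListBucketHeader() and
// GetLinkListFirstNode() below; `p->next` stands for loading the first
// pointer-sized field of whatever p points to):
//
//   void* p = bucket.load(std::memory_order_acquire);
//   if (p == nullptr)          -> Case 1: empty bucket
//   else if (p->next == NULL)  -> Case 2: p is the single Node
//   else if (p->next == p)     -> Case 4: p is a SkipListBucketHeader
//   else                       -> Case 3: p is a BucketHeader + linked list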
class HashLinkListRep : public MemTableRep {
 public:
  HashLinkListRep(const MemTableRep::KeyComparator& compare,
                  Allocator* allocator, const SliceTransform* transform,
                  size_t bucket_size, uint32_t threshold_use_skiplist,
                  size_t huge_page_tlb_size, Logger* logger,
                  int bucket_entries_logging_threshold,
                  bool if_log_bucket_dist_when_flash);

  KeyHandle Allocate(const size_t len, char** buf) override;

  void Insert(KeyHandle handle) override;

  bool Contains(const char* key) const override;

  size_t ApproximateMemoryUsage() override;

  void Get(const LookupKey& k, void* callback_args,
           bool (*callback_func)(void* arg, const char* entry)) override;

  ~HashLinkListRep() override;

  MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override;

  MemTableRep::Iterator* GetDynamicPrefixIterator(
      Arena* arena = nullptr) override;
 private:
  friend class DynamicIterator;

  size_t bucket_size_;

  // Maps slices (which are transformed user keys) to buckets of keys sharing
  // the same transform.
  Pointer* buckets_;

  const uint32_t threshold_use_skiplist_;

  // The user-supplied transform whose domain is the user keys.
  const SliceTransform* transform_;

  const MemTableRep::KeyComparator& compare_;

  Logger* logger_;
  int bucket_entries_logging_threshold_;
  bool if_log_bucket_dist_when_flash_;

  bool LinkListContains(Node* head, const Slice& key) const;

  SkipListBucketHeader* GetSkipListBucketHeader(Pointer* first_next_pointer)
      const;

  Node* GetLinkListFirstNode(Pointer* first_next_pointer) const;

  Slice GetPrefix(const Slice& internal_key) const {
    return transform_->Transform(ExtractUserKey(internal_key));
  }

  size_t GetHash(const Slice& slice) const {
    return fastrange64(GetSliceNPHash64(slice), bucket_size_);
  }

  Pointer* GetBucket(size_t i) const {
    return static_cast<Pointer*>(buckets_[i].load(std::memory_order_acquire));
  }

  Pointer* GetBucket(const Slice& slice) const {
    return GetBucket(GetHash(slice));
  }

  bool Equal(const Slice& a, const Key& b) const {
    return (compare_(b, a) == 0);
  }

  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }

  bool KeyIsAfterNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) < 0);
  }

  bool KeyIsAfterNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) < 0);
  }

  bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, internal_key) <= 0);
  }

  bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const {
    // nullptr n is considered infinite
    return (n != nullptr) && (compare_(n->key, key) <= 0);
  }

  Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const;
  Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const;
  class FullListIterator : public MemTableRep::Iterator {
   public:
    explicit FullListIterator(MemtableSkipList* list, Allocator* allocator)
        : iter_(list), full_list_(list), allocator_(allocator) {}

    ~FullListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return iter_.Valid(); }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return iter_.key();
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      iter_.Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      assert(Valid());
      iter_.Prev();
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key, const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.Seek(encoded_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& internal_key,
                     const char* memtable_key) override {
      const char* encoded_key = (memtable_key != nullptr)
                                    ? memtable_key
                                    : EncodeKey(&tmp_, internal_key);
      iter_.SeekForPrev(encoded_key);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override { iter_.SeekToFirst(); }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override { iter_.SeekToLast(); }

   private:
    MemtableSkipList::Iterator iter_;
    // To destruct with the iterator.
    std::unique_ptr<MemtableSkipList> full_list_;
    std::unique_ptr<Allocator> allocator_;
    std::string tmp_;  // For passing to EncodeKey
  };
  class LinkListIterator : public MemTableRep::Iterator {
   public:
    explicit LinkListIterator(const HashLinkListRep* const hash_link_list_rep,
                              Node* head)
        : hash_link_list_rep_(hash_link_list_rep),
          head_(head),
          node_(nullptr) {}

    ~LinkListIterator() override {}

    // Returns true iff the iterator is positioned at a valid node.
    bool Valid() const override { return node_ != nullptr; }

    // Returns the key at the current position.
    // REQUIRES: Valid()
    const char* key() const override {
      assert(Valid());
      return node_->key;
    }

    // Advances to the next position.
    // REQUIRES: Valid()
    void Next() override {
      assert(Valid());
      node_ = node_->Next();
    }

    // Advances to the previous position.
    // REQUIRES: Valid()
    void Prev() override {
      // A prefix iterator does not support total order.
      // We simply set the iterator to the invalid state.
      Reset(nullptr);
    }

    // Advance to the first entry with a key >= target
    void Seek(const Slice& internal_key,
              const char* /*memtable_key*/) override {
      node_ =
          hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key);
    }

    // Retreat to the last entry with a key <= target
    void SeekForPrev(const Slice& /*internal_key*/,
                     const char* /*memtable_key*/) override {
      // Since we do not support Prev(), we do not support SeekForPrev()
      // either.
      Reset(nullptr);
    }

    // Position at the first entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToFirst() override {
      // A prefix iterator does not support total order.
      // We simply set the iterator to the invalid state.
      Reset(nullptr);
    }

    // Position at the last entry in collection.
    // Final state of iterator is Valid() iff collection is not empty.
    void SeekToLast() override {
      // A prefix iterator does not support total order.
      // We simply set the iterator to the invalid state.
      Reset(nullptr);
    }

   protected:
    void Reset(Node* head) {
      head_ = head;
      node_ = nullptr;
    }

   private:
    friend class HashLinkListRep;
    const HashLinkListRep* const hash_link_list_rep_;
    Node* head_;
    Node* node_;

    virtual void SeekToHead() {
      node_ = head_;
    }
  };
  class DynamicIterator : public HashLinkListRep::LinkListIterator {
   public:
    explicit DynamicIterator(HashLinkListRep& memtable_rep)
        : HashLinkListRep::LinkListIterator(&memtable_rep, nullptr),
          memtable_rep_(memtable_rep) {}

    // Advance to the first entry with a key >= target
    void Seek(const Slice& k, const char* memtable_key) override {
      auto transformed = memtable_rep_.GetPrefix(k);
      auto* bucket = memtable_rep_.GetBucket(transformed);

      SkipListBucketHeader* skip_list_header =
          memtable_rep_.GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // The bucket is organized as a skip list
        if (!skip_list_iter_) {
          skip_list_iter_.reset(
              new MemtableSkipList::Iterator(&skip_list_header->skip_list));
        } else {
          skip_list_iter_->SetList(&skip_list_header->skip_list);
        }
        if (memtable_key != nullptr) {
          skip_list_iter_->Seek(memtable_key);
        } else {
          IterKey encoded_key;
          encoded_key.EncodeLengthPrefixedKey(k);
          skip_list_iter_->Seek(encoded_key.GetUserKey().data());
        }
      } else {
        // The bucket is organized as a linked list
        skip_list_iter_.reset();
        Reset(memtable_rep_.GetLinkListFirstNode(bucket));
        HashLinkListRep::LinkListIterator::Seek(k, memtable_key);
      }
    }

    bool Valid() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->Valid();
      }
      return HashLinkListRep::LinkListIterator::Valid();
    }

    const char* key() const override {
      if (skip_list_iter_) {
        return skip_list_iter_->key();
      }
      return HashLinkListRep::LinkListIterator::key();
    }

    void Next() override {
      if (skip_list_iter_) {
        skip_list_iter_->Next();
      } else {
        HashLinkListRep::LinkListIterator::Next();
      }
    }

   private:
    // the underlying memtable
    const HashLinkListRep& memtable_rep_;
    std::unique_ptr<MemtableSkipList::Iterator> skip_list_iter_;
  };
  class EmptyIterator : public MemTableRep::Iterator {
    // This is used when there wasn't a bucket. It is cheaper than
    // instantiating an empty bucket over which to iterate.
   public:
    EmptyIterator() {}
    bool Valid() const override { return false; }
    const char* key() const override {
      assert(false);
      return nullptr;
    }
    void Next() override {}
    void Prev() override {}
    void Seek(const Slice& /*user_key*/,
              const char* /*memtable_key*/) override {}
    void SeekForPrev(const Slice& /*user_key*/,
                     const char* /*memtable_key*/) override {}
    void SeekToFirst() override {}
    void SeekToLast() override {}
  };
};
HashLinkListRep::HashLinkListRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, size_t bucket_size,
    uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash)
    : MemTableRep(allocator),
      bucket_size_(bucket_size),
      // A threshold for switching to a skip list doesn't make sense if it is
      // less than 3, so we force a minimum of 3 to simplify the
      // implementation.
      threshold_use_skiplist_(std::max(threshold_use_skiplist, 3U)),
      transform_(transform),
      compare_(compare),
      logger_(logger),
      bucket_entries_logging_threshold_(bucket_entries_logging_threshold),
      if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) {
  char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size,
                                          huge_page_tlb_size, logger);

  buckets_ = new (mem) Pointer[bucket_size];

  for (size_t i = 0; i < bucket_size_; ++i) {
    buckets_[i].store(nullptr, std::memory_order_relaxed);
  }
}
HashLinkListRep::~HashLinkListRep() {
}

KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) {
  char* mem = allocator_->AllocateAligned(sizeof(Node) + len);
  Node* x = new (mem) Node();
  *buf = x->key;
  return static_cast<void*>(x);
}
SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader(
    Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return nullptr;
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (header->IsSkipListBucket()) {
    assert(header->GetNumEntries() > threshold_use_skiplist_);
    auto* skip_list_bucket_header =
        reinterpret_cast<SkipListBucketHeader*>(header);
    assert(skip_list_bucket_header->Counting_header.next.load(
               std::memory_order_relaxed) == header);
    return skip_list_bucket_header;
  }
  assert(header->GetNumEntries() <= threshold_use_skiplist_);
  return nullptr;
}

Node* HashLinkListRep::GetLinkListFirstNode(
    Pointer* first_next_pointer) const {
  if (first_next_pointer == nullptr) {
    return nullptr;
  }
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Single entry bucket
    return reinterpret_cast<Node*>(first_next_pointer);
  }
  // Counting header
  BucketHeader* header = reinterpret_cast<BucketHeader*>(first_next_pointer);
  if (!header->IsSkipListBucket()) {
    assert(header->GetNumEntries() <= threshold_use_skiplist_);
    return reinterpret_cast<Node*>(
        header->next.load(std::memory_order_acquire));
  }
  assert(header->GetNumEntries() > threshold_use_skiplist_);
  return nullptr;
}
void HashLinkListRep::Insert(KeyHandle handle) {
  Node* x = static_cast<Node*>(handle);
  assert(!Contains(x->key));
  Slice internal_key = GetLengthPrefixedSlice(x->key);
  auto transformed = GetPrefix(internal_key);
  auto& bucket = buckets_[GetHash(transformed)];
  Pointer* first_next_pointer =
      static_cast<Pointer*>(bucket.load(std::memory_order_relaxed));

  if (first_next_pointer == nullptr) {
    // Case 1. empty bucket
    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in the bucket.
    x->NoBarrier_SetNext(nullptr);
    bucket.store(x, std::memory_order_release);
    return;
  }

  BucketHeader* header = nullptr;
  if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) {
    // Case 2. only one entry in the bucket
    // Need to convert to a counting bucket, turning it into case 3.
    Node* first = reinterpret_cast<Node*>(first_next_pointer);
    // Need to add a bucket header.
    // We have to first convert it to a bucket with a header before inserting
    // the new node. Otherwise, we might need to change the next pointer of
    // first. In that case, a reader might see a non-NULL next pointer and
    // wrongly treat the node as a bucket header.
    auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader));
    header = new (mem) BucketHeader(first, 1);
    bucket.store(header, std::memory_order_release);
  } else {
    header = reinterpret_cast<BucketHeader*>(first_next_pointer);
    if (header->IsSkipListBucket()) {
      // Case 4. Bucket is already a skip list
      assert(header->GetNumEntries() > threshold_use_skiplist_);
      auto* skip_list_bucket_header =
          reinterpret_cast<SkipListBucketHeader*>(header);
      // Only one thread can execute Insert() at a time. No need for an
      // atomic increment.
      skip_list_bucket_header->Counting_header.IncNumEntries();
      skip_list_bucket_header->skip_list.Insert(x->key);
      return;
    }
  }

  if (bucket_entries_logging_threshold_ > 0 &&
      header->GetNumEntries() ==
          static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
    Info(logger_,
         "HashLinkedList bucket %" ROCKSDB_PRIszt
         " has more than %d "
         "entries. Key to insert: %s",
         GetHash(transformed), header->GetNumEntries(),
         GetLengthPrefixedSlice(x->key).ToString(true).c_str());
  }

  if (header->GetNumEntries() == threshold_use_skiplist_) {
    // Case 3. The number of entries reaches the threshold, so we need to
    // convert the bucket to a skip list.
    LinkListIterator bucket_iter(
        this, reinterpret_cast<Node*>(
                  first_next_pointer->load(std::memory_order_relaxed)));
    auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader));
    SkipListBucketHeader* new_skip_list_header = new (mem)
        SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1);
    auto& skip_list = new_skip_list_header->skip_list;

    // Add all current entries to the skip list
    for (bucket_iter.SeekToHead(); bucket_iter.Valid(); bucket_iter.Next()) {
      skip_list.Insert(bucket_iter.key());
    }

    // insert the new entry
    skip_list.Insert(x->key);

    // Set the bucket
    bucket.store(new_skip_list_header, std::memory_order_release);
  } else {
    // Case 5. Insert into the sorted linked list without changing the
    // header.
    Node* first =
        reinterpret_cast<Node*>(header->next.load(std::memory_order_relaxed));
    assert(first != nullptr);
    // Advance the counter, except when the bucket is being converted to a
    // skip list. We must make sure the count never exceeds
    // threshold_use_skiplist_, so that readers never cast the header to the
    // wrong format.
    header->IncNumEntries();

    Node* cur = first;
    Node* prev = nullptr;
    while (true) {
      if (cur == nullptr) {
        break;
      }
      Node* next = cur->Next();
      // Make sure the list stays sorted.
      // If cur is the first node or next is nullptr, this is trivially
      // satisfied.
      assert((cur == first) || (next == nullptr) ||
             KeyIsAfterNode(next->key, cur));
      if (KeyIsAfterNode(internal_key, cur)) {
        // Keep searching in this list
        prev = cur;
        cur = next;
      } else {
        break;
      }
    }

    // Our data structure does not allow duplicate insertion
    assert(cur == nullptr || !Equal(x->key, cur->key));

    // NoBarrier_SetNext() suffices since we will add a barrier when
    // we publish a pointer to "x" in prev or in the header.
    x->NoBarrier_SetNext(cur);

    if (prev) {
      prev->SetNext(x);
    } else {
      header->next.store(static_cast<void*>(x), std::memory_order_release);
    }
  }
}
bool HashLinkListRep::Contains(const char* key) const {
  Slice internal_key = GetLengthPrefixedSlice(key);
  auto transformed = GetPrefix(internal_key);
  auto bucket = GetBucket(transformed);
  if (bucket == nullptr) {
    return false;
  }

  SkipListBucketHeader* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    return skip_list_header->skip_list.Contains(key);
  } else {
    return LinkListContains(GetLinkListFirstNode(bucket), internal_key);
  }
}

size_t HashLinkListRep::ApproximateMemoryUsage() {
  // Memory is always allocated from the allocator.
  return 0;
}

void HashLinkListRep::Get(const LookupKey& k, void* callback_args,
                          bool (*callback_func)(void* arg,
                                                const char* entry)) {
  auto transformed = transform_->Transform(k.user_key());
  auto bucket = GetBucket(transformed);

  auto* skip_list_header = GetSkipListBucketHeader(bucket);
  if (skip_list_header != nullptr) {
    // Is a skip list
    MemtableSkipList::Iterator iter(&skip_list_header->skip_list);
    for (iter.Seek(k.memtable_key().data());
         iter.Valid() && callback_func(callback_args, iter.key());
         iter.Next()) {
    }
  } else {
    auto* link_list_head = GetLinkListFirstNode(bucket);
    if (link_list_head != nullptr) {
      LinkListIterator iter(this, link_list_head);
      for (iter.Seek(k.internal_key(), nullptr);
           iter.Valid() && callback_func(callback_args, iter.key());
           iter.Next()) {
      }
    }
  }
}
MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) {
  // allocate a new arena of similar size to the one currently in use
  Arena* new_arena = new Arena(allocator_->BlockSize());
  auto list = new MemtableSkipList(compare_, new_arena);
  HistogramImpl keys_per_bucket_hist;

  for (size_t i = 0; i < bucket_size_; ++i) {
    int count = 0;
    auto* bucket = GetBucket(i);
    if (bucket != nullptr) {
      auto* skip_list_header = GetSkipListBucketHeader(bucket);
      if (skip_list_header != nullptr) {
        // Is a skip list
        MemtableSkipList::Iterator itr(&skip_list_header->skip_list);
        for (itr.SeekToFirst(); itr.Valid(); itr.Next()) {
          list->Insert(itr.key());
          count++;
        }
      } else {
        auto* link_list_head = GetLinkListFirstNode(bucket);
        if (link_list_head != nullptr) {
          LinkListIterator itr(this, link_list_head);
          for (itr.SeekToHead(); itr.Valid(); itr.Next()) {
            list->Insert(itr.key());
            count++;
          }
        }
      }
    }
    if (if_log_bucket_dist_when_flash_) {
      keys_per_bucket_hist.Add(count);
    }
  }
  if (if_log_bucket_dist_when_flash_ && logger_ != nullptr) {
    Info(logger_, "hashLinkedList Entry distribution among buckets: %s",
         keys_per_bucket_hist.ToString().c_str());
  }

  if (alloc_arena == nullptr) {
    return new FullListIterator(list, new_arena);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(FullListIterator));
    return new (mem) FullListIterator(list, new_arena);
  }
}

MemTableRep::Iterator* HashLinkListRep::GetDynamicPrefixIterator(
    Arena* alloc_arena) {
  if (alloc_arena == nullptr) {
    return new DynamicIterator(*this);
  } else {
    auto mem = alloc_arena->AllocateAligned(sizeof(DynamicIterator));
    return new (mem) DynamicIterator(*this);
  }
}
bool HashLinkListRep::LinkListContains(Node* head,
                                       const Slice& user_key) const {
  Node* x = FindGreaterOrEqualInBucket(head, user_key);
  return (x != nullptr && Equal(user_key, x->key));
}

Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head,
                                                  const Slice& key) const {
  Node* x = head;
  while (true) {
    if (x == nullptr) {
      return x;
    }
    Node* next = x->Next();
    // Make sure the list stays sorted.
    // If x is the head or next is nullptr, this is trivially satisfied.
    assert((x == head) || (next == nullptr) || KeyIsAfterNode(next->key, x));
    if (KeyIsAfterNode(key, x)) {
      // Keep searching in this list
      x = next;
    } else {
      break;
    }
  }
  return x;
}

}  // anon namespace
MemTableRep* HashLinkListRepFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* logger) {
  return new HashLinkListRep(compare, allocator, transform, bucket_count_,
                             threshold_use_skiplist_, huge_page_tlb_size_,
                             logger, bucket_entries_logging_threshold_,
                             if_log_bucket_dist_when_flash_);
}
MemTableRepFactory* NewHashLinkListRepFactory(
    size_t bucket_count, size_t huge_page_tlb_size,
    int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash,
    uint32_t threshold_use_skiplist) {
  return new HashLinkListRepFactory(
      bucket_count, threshold_use_skiplist, huge_page_tlb_size,
      bucket_entries_logging_threshold, if_log_bucket_dist_when_flash);
}
}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE