| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528 |
- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
- // This source code is licensed under both the GPLv2 (found in the
- // COPYING file in the root directory) and Apache 2.0 License
- // (found in the LICENSE.Apache file in the root directory).
- #ifndef ROCKSDB_LITE
- #include "table/cuckoo/cuckoo_table_builder.h"
- #include <assert.h>
- #include <algorithm>
- #include <limits>
- #include <string>
- #include <vector>
- #include "db/dbformat.h"
- #include "file/writable_file_writer.h"
- #include "rocksdb/env.h"
- #include "rocksdb/table.h"
- #include "table/block_based/block_builder.h"
- #include "table/cuckoo/cuckoo_table_factory.h"
- #include "table/format.h"
- #include "table/meta_blocks.h"
- #include "util/autovector.h"
- #include "util/random.h"
- #include "util/string_util.h"
- namespace ROCKSDB_NAMESPACE {
- const std::string CuckooTablePropertyNames::kEmptyKey =
- "rocksdb.cuckoo.bucket.empty.key";
- const std::string CuckooTablePropertyNames::kNumHashFunc =
- "rocksdb.cuckoo.hash.num";
- const std::string CuckooTablePropertyNames::kHashTableSize =
- "rocksdb.cuckoo.hash.size";
- const std::string CuckooTablePropertyNames::kValueLength =
- "rocksdb.cuckoo.value.length";
- const std::string CuckooTablePropertyNames::kIsLastLevel =
- "rocksdb.cuckoo.file.islastlevel";
- const std::string CuckooTablePropertyNames::kCuckooBlockSize =
- "rocksdb.cuckoo.hash.cuckooblocksize";
- const std::string CuckooTablePropertyNames::kIdentityAsFirstHash =
- "rocksdb.cuckoo.hash.identityfirst";
- const std::string CuckooTablePropertyNames::kUseModuleHash =
- "rocksdb.cuckoo.hash.usemodule";
- const std::string CuckooTablePropertyNames::kUserKeyLength =
- "rocksdb.cuckoo.hash.userkeylength";
- // Obtained by running echo rocksdb.table.cuckoo | sha1sum
- extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
- CuckooTableBuilder::CuckooTableBuilder(
- WritableFileWriter* file, double max_hash_table_ratio,
- uint32_t max_num_hash_table, uint32_t max_search_depth,
- const Comparator* user_comparator, uint32_t cuckoo_block_size,
- bool use_module_hash, bool identity_as_first_hash,
- uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t),
- uint32_t column_family_id, const std::string& column_family_name)
- : num_hash_func_(2),
- file_(file),
- max_hash_table_ratio_(max_hash_table_ratio),
- max_num_hash_func_(max_num_hash_table),
- max_search_depth_(max_search_depth),
- cuckoo_block_size_(std::max(1U, cuckoo_block_size)),
- hash_table_size_(use_module_hash ? 0 : 2),
- is_last_level_file_(false),
- has_seen_first_key_(false),
- has_seen_first_value_(false),
- key_size_(0),
- value_size_(0),
- num_entries_(0),
- num_values_(0),
- ucomp_(user_comparator),
- use_module_hash_(use_module_hash),
- identity_as_first_hash_(identity_as_first_hash),
- get_slice_hash_(get_slice_hash),
- closed_(false) {
- // Data is in a huge block.
- properties_.num_data_blocks = 1;
- properties_.index_size = 0;
- properties_.filter_size = 0;
- properties_.column_family_id = column_family_id;
- properties_.column_family_name = column_family_name;
- }
- void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
- if (num_entries_ >= kMaxVectorIdx - 1) {
- status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
- return;
- }
- ParsedInternalKey ikey;
- if (!ParseInternalKey(key, &ikey)) {
- status_ = Status::Corruption("Unable to parse key into inernal key.");
- return;
- }
- if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) {
- status_ = Status::NotSupported("Unsupported key type " +
- ToString(ikey.type));
- return;
- }
- // Determine if we can ignore the sequence number and value type from
- // internal keys by looking at sequence number from first key. We assume
- // that if first key has a zero sequence number, then all the remaining
- // keys will have zero seq. no.
- if (!has_seen_first_key_) {
- is_last_level_file_ = ikey.sequence == 0;
- has_seen_first_key_ = true;
- smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
- largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
- key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
- }
- if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) {
- status_ = Status::NotSupported("all keys have to be the same size");
- return;
- }
- if (ikey.type == kTypeValue) {
- if (!has_seen_first_value_) {
- has_seen_first_value_ = true;
- value_size_ = value.size();
- }
- if (value_size_ != value.size()) {
- status_ = Status::NotSupported("all values have to be the same size");
- return;
- }
- if (is_last_level_file_) {
- kvs_.append(ikey.user_key.data(), ikey.user_key.size());
- } else {
- kvs_.append(key.data(), key.size());
- }
- kvs_.append(value.data(), value.size());
- ++num_values_;
- } else {
- if (is_last_level_file_) {
- deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size());
- } else {
- deleted_keys_.append(key.data(), key.size());
- }
- }
- ++num_entries_;
- // In order to fill the empty buckets in the hash table, we identify a
- // key which is not used so far (unused_user_key). We determine this by
- // maintaining smallest and largest keys inserted so far in bytewise order
- // and use them to find a key outside this range in Finish() operation.
- // Note that this strategy is independent of user comparator used here.
- if (ikey.user_key.compare(smallest_user_key_) < 0) {
- smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
- } else if (ikey.user_key.compare(largest_user_key_) > 0) {
- largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
- }
- if (!use_module_hash_) {
- if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
- hash_table_size_ *= 2;
- }
- }
- }
- bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const {
- assert(closed_);
- return idx >= num_values_;
- }
- Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
- assert(closed_);
- if (IsDeletedKey(idx)) {
- return Slice(&deleted_keys_[static_cast<size_t>((idx - num_values_) * key_size_)], static_cast<size_t>(key_size_));
- }
- return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_))], static_cast<size_t>(key_size_));
- }
- Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
- assert(closed_);
- return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
- }
- Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
- assert(closed_);
- if (IsDeletedKey(idx)) {
- static std::string empty_value(static_cast<unsigned int>(value_size_), 'a');
- return Slice(empty_value);
- }
- return Slice(&kvs_[static_cast<size_t>(idx * (key_size_ + value_size_) + key_size_)], static_cast<size_t>(value_size_));
- }
- Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
- buckets->resize(static_cast<size_t>(hash_table_size_ + cuckoo_block_size_ - 1));
- uint32_t make_space_for_key_call_id = 0;
- for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
- uint64_t bucket_id = 0;
- bool bucket_found = false;
- autovector<uint64_t> hash_vals;
- Slice user_key = GetUserKey(vector_idx);
- for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
- ++hash_cnt) {
- uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
- hash_table_size_, identity_as_first_hash_, get_slice_hash_);
- // If there is a collision, check next cuckoo_block_size_ locations for
- // empty locations. While checking, if we reach end of the hash table,
- // stop searching and proceed for next hash function.
- for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
- ++block_idx, ++hash_val) {
- if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
- bucket_id = hash_val;
- bucket_found = true;
- break;
- } else {
- if (ucomp_->Compare(user_key,
- GetUserKey((*buckets)[static_cast<size_t>(hash_val)].vector_idx)) == 0) {
- return Status::NotSupported("Same key is being inserted again.");
- }
- hash_vals.push_back(hash_val);
- }
- }
- }
- while (!bucket_found && !MakeSpaceForKey(hash_vals,
- ++make_space_for_key_call_id, buckets, &bucket_id)) {
- // Rehash by increashing number of hash tables.
- if (num_hash_func_ >= max_num_hash_func_) {
- return Status::NotSupported("Too many collisions. Unable to hash.");
- }
- // We don't really need to rehash the entire table because old hashes are
- // still valid and we only increased the number of hash functions.
- uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
- hash_table_size_, identity_as_first_hash_, get_slice_hash_);
- ++num_hash_func_;
- for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
- ++block_idx, ++hash_val) {
- if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
- bucket_found = true;
- bucket_id = hash_val;
- break;
- } else {
- hash_vals.push_back(hash_val);
- }
- }
- }
- (*buckets)[static_cast<size_t>(bucket_id)].vector_idx = vector_idx;
- }
- return Status::OK();
- }
- Status CuckooTableBuilder::Finish() {
- assert(!closed_);
- closed_ = true;
- std::vector<CuckooBucket> buckets;
- Status s;
- std::string unused_bucket;
- if (num_entries_ > 0) {
- // Calculate the real hash size if module hash is enabled.
- if (use_module_hash_) {
- hash_table_size_ =
- static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
- }
- s = MakeHashTable(&buckets);
- if (!s.ok()) {
- return s;
- }
- // Determine unused_user_key to fill empty buckets.
- std::string unused_user_key = smallest_user_key_;
- int curr_pos = static_cast<int>(unused_user_key.size()) - 1;
- while (curr_pos >= 0) {
- --unused_user_key[curr_pos];
- if (Slice(unused_user_key).compare(smallest_user_key_) < 0) {
- break;
- }
- --curr_pos;
- }
- if (curr_pos < 0) {
- // Try using the largest key to identify an unused key.
- unused_user_key = largest_user_key_;
- curr_pos = static_cast<int>(unused_user_key.size()) - 1;
- while (curr_pos >= 0) {
- ++unused_user_key[curr_pos];
- if (Slice(unused_user_key).compare(largest_user_key_) > 0) {
- break;
- }
- --curr_pos;
- }
- }
- if (curr_pos < 0) {
- return Status::Corruption("Unable to find unused key");
- }
- if (is_last_level_file_) {
- unused_bucket = unused_user_key;
- } else {
- ParsedInternalKey ikey(unused_user_key, 0, kTypeValue);
- AppendInternalKey(&unused_bucket, ikey);
- }
- }
- properties_.num_entries = num_entries_;
- properties_.num_deletions = num_entries_ - num_values_;
- properties_.fixed_key_len = key_size_;
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kValueLength].assign(
- reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
- uint64_t bucket_size = key_size_ + value_size_;
- unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
- // Write the table.
- uint32_t num_added = 0;
- for (auto& bucket : buckets) {
- if (bucket.vector_idx == kMaxVectorIdx) {
- s = file_->Append(Slice(unused_bucket));
- } else {
- ++num_added;
- s = file_->Append(GetKey(bucket.vector_idx));
- if (s.ok()) {
- if (value_size_ > 0) {
- s = file_->Append(GetValue(bucket.vector_idx));
- }
- }
- }
- if (!s.ok()) {
- return s;
- }
- }
- assert(num_added == NumEntries());
- properties_.raw_key_size = num_added * properties_.fixed_key_len;
- properties_.raw_value_size = num_added * value_size_;
- uint64_t offset = buckets.size() * bucket_size;
- properties_.data_size = offset;
- unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kNumHashFunc].assign(
- reinterpret_cast<char*>(&num_hash_func_), sizeof(num_hash_func_));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kHashTableSize].assign(
- reinterpret_cast<const char*>(&hash_table_size_),
- sizeof(hash_table_size_));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kIsLastLevel].assign(
- reinterpret_cast<const char*>(&is_last_level_file_),
- sizeof(is_last_level_file_));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kCuckooBlockSize].assign(
- reinterpret_cast<const char*>(&cuckoo_block_size_),
- sizeof(cuckoo_block_size_));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
- reinterpret_cast<const char*>(&identity_as_first_hash_),
- sizeof(identity_as_first_hash_));
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kUseModuleHash].assign(
- reinterpret_cast<const char*>(&use_module_hash_),
- sizeof(use_module_hash_));
- uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
- properties_.user_collected_properties[
- CuckooTablePropertyNames::kUserKeyLength].assign(
- reinterpret_cast<const char*>(&user_key_len),
- sizeof(user_key_len));
- // Write meta blocks.
- MetaIndexBuilder meta_index_builder;
- PropertyBlockBuilder property_block_builder;
- property_block_builder.AddTableProperty(properties_);
- property_block_builder.Add(properties_.user_collected_properties);
- Slice property_block = property_block_builder.Finish();
- BlockHandle property_block_handle;
- property_block_handle.set_offset(offset);
- property_block_handle.set_size(property_block.size());
- s = file_->Append(property_block);
- offset += property_block.size();
- if (!s.ok()) {
- return s;
- }
- meta_index_builder.Add(kPropertiesBlock, property_block_handle);
- Slice meta_index_block = meta_index_builder.Finish();
- BlockHandle meta_index_block_handle;
- meta_index_block_handle.set_offset(offset);
- meta_index_block_handle.set_size(meta_index_block.size());
- s = file_->Append(meta_index_block);
- if (!s.ok()) {
- return s;
- }
- Footer footer(kCuckooTableMagicNumber, 1);
- footer.set_metaindex_handle(meta_index_block_handle);
- footer.set_index_handle(BlockHandle::NullBlockHandle());
- std::string footer_encoding;
- footer.EncodeTo(&footer_encoding);
- s = file_->Append(footer_encoding);
- if (file_ != nullptr) {
- file_checksum_ = file_->GetFileChecksum();
- }
- return s;
- }
- void CuckooTableBuilder::Abandon() {
- assert(!closed_);
- closed_ = true;
- }
- uint64_t CuckooTableBuilder::NumEntries() const {
- return num_entries_;
- }
- uint64_t CuckooTableBuilder::FileSize() const {
- if (closed_) {
- return file_->GetFileSize();
- } else if (num_entries_ == 0) {
- return 0;
- }
- if (use_module_hash_) {
- return static_cast<uint64_t>((key_size_ + value_size_) *
- num_entries_ / max_hash_table_ratio_);
- } else {
- // Account for buckets being a power of two.
- // As elements are added, file size remains constant for a while and
- // doubles its size. Since compaction algorithm stops adding elements
- // only after it exceeds the file limit, we account for the extra element
- // being added here.
- uint64_t expected_hash_table_size = hash_table_size_;
- if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
- expected_hash_table_size *= 2;
- }
- return (key_size_ + value_size_) * expected_hash_table_size - 1;
- }
- }
- // This method is invoked when there is no place to insert the target key.
- // It searches for a set of elements that can be moved to accommodate target
- // key. The search is a BFS graph traversal with first level (hash_vals)
- // being all the buckets target key could go to.
- // Then, from each node (curr_node), we find all the buckets that curr_node
- // could go to. They form the children of curr_node in the tree.
- // We continue the traversal until we find an empty bucket, in which case, we
- // move all elements along the path from first level to this empty bucket, to
- // make space for target key which is inserted at first level (*bucket_id).
- // If tree depth exceedes max depth, we return false indicating failure.
- bool CuckooTableBuilder::MakeSpaceForKey(
- const autovector<uint64_t>& hash_vals,
- const uint32_t make_space_for_key_call_id,
- std::vector<CuckooBucket>* buckets, uint64_t* bucket_id) {
- struct CuckooNode {
- uint64_t bucket_id;
- uint32_t depth;
- uint32_t parent_pos;
- CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos)
- : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {}
- };
- // This is BFS search tree that is stored simply as a vector.
- // Each node stores the index of parent node in the vector.
- std::vector<CuckooNode> tree;
- // We want to identify already visited buckets in the current method call so
- // that we don't add same buckets again for exploration in the tree.
- // We do this by maintaining a count of current method call in
- // make_space_for_key_call_id, which acts as a unique id for this invocation
- // of the method. We store this number into the nodes that we explore in
- // current method call.
- // It is unlikely for the increment operation to overflow because the maximum
- // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
- for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
- uint64_t bid = hash_vals[hash_cnt];
- (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = make_space_for_key_call_id;
- tree.push_back(CuckooNode(bid, 0, 0));
- }
- bool null_found = false;
- uint32_t curr_pos = 0;
- while (!null_found && curr_pos < tree.size()) {
- CuckooNode& curr_node = tree[curr_pos];
- uint32_t curr_depth = curr_node.depth;
- if (curr_depth >= max_search_depth_) {
- break;
- }
- CuckooBucket& curr_bucket = (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
- for (uint32_t hash_cnt = 0;
- hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
- uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
- hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
- get_slice_hash_);
- // Iterate inside Cuckoo Block.
- for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
- ++block_idx, ++child_bucket_id) {
- if ((*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id ==
- make_space_for_key_call_id) {
- continue;
- }
- (*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id =
- make_space_for_key_call_id;
- tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1,
- curr_pos));
- if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == kMaxVectorIdx) {
- null_found = true;
- break;
- }
- }
- }
- ++curr_pos;
- }
- if (null_found) {
- // There is an empty node in tree.back(). Now, traverse the path from this
- // empty node to top of the tree and at every node in the path, replace
- // child with the parent. Stop when first level is reached in the tree
- // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return
- // this location in first level for target key to be inserted.
- uint32_t bucket_to_replace_pos = static_cast<uint32_t>(tree.size()) - 1;
- while (bucket_to_replace_pos >= num_hash_func_) {
- CuckooNode& curr_node = tree[bucket_to_replace_pos];
- (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
- (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
- bucket_to_replace_pos = curr_node.parent_pos;
- }
- *bucket_id = tree[bucket_to_replace_pos].bucket_id;
- }
- return null_found;
- }
- const char* CuckooTableBuilder::GetFileChecksumFuncName() const {
- if (file_ != nullptr) {
- return file_->GetFileChecksumFuncName();
- } else {
- return kUnknownFileChecksumFuncName.c_str();
- }
- }
- } // namespace ROCKSDB_NAMESPACE
- #endif // ROCKSDB_LITE
|