| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671 | //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.//  This source code is licensed under both the GPLv2 (found in the//  COPYING file in the root directory) and Apache 2.0 License//  (found in the LICENSE.Apache file in the root directory).//// Copyright (c) 2011 The LevelDB Authors. All rights reserved.// Use of this source code is governed by a BSD-style license that can be// found in the LICENSE file. See the AUTHORS file for names of contributors.#pragma once#include <stdio.h>#include <memory>#include <string>#include <utility>#include "db/lookup_key.h"#include "db/merge_context.h"#include "logging/logging.h"#include "monitoring/perf_context_imp.h"#include "rocksdb/comparator.h"#include "rocksdb/db.h"#include "rocksdb/filter_policy.h"#include "rocksdb/slice.h"#include "rocksdb/slice_transform.h"#include "rocksdb/table.h"#include "rocksdb/types.h"#include "util/coding.h"#include "util/user_comparator_wrapper.h"namespace ROCKSDB_NAMESPACE {// The file declares data structures and functions that deal with internal// keys.// Each internal key contains a user key, a sequence number (SequenceNumber)// and a type (ValueType), and they are usually encoded together.// There are some related helper classes here.class InternalKey;// Value types encoded as the last component of internal keys.// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk// data structures.// The highest bit of the value type needs to be reserved to SST tables// for them to do more flexible encoding.enum ValueType : unsigned char {  kTypeDeletion = 0x0,  kTypeValue = 0x1,  kTypeMerge = 0x2,  kTypeLogData = 0x3,               // WAL only.  kTypeColumnFamilyDeletion = 0x4,  // WAL only.  kTypeColumnFamilyValue = 0x5,     // WAL only.  kTypeColumnFamilyMerge = 0x6,     // WAL only.  kTypeSingleDeletion = 0x7,  kTypeColumnFamilySingleDeletion = 0x8,  // WAL only.  kTypeBeginPrepareXID = 0x9,             // WAL only.  kTypeEndPrepareXID = 0xA,               // WAL only.  kTypeCommitXID = 0xB,                   // WAL only.  kTypeRollbackXID = 0xC,                 // WAL only.  kTypeNoop = 0xD,                        // WAL only.  kTypeColumnFamilyRangeDeletion = 0xE,   // WAL only.  kTypeRangeDeletion = 0xF,               // meta block  kTypeColumnFamilyBlobIndex = 0x10,      // Blob DB only  kTypeBlobIndex = 0x11,                  // Blob DB only  // When the prepared record is also persisted in db, we use a different  // record. This is to ensure that the WAL that is generated by a WritePolicy  // is not mistakenly read by another, which would result into data  // inconsistency.  kTypeBeginPersistedPrepareXID = 0x12,  // WAL only.  // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL  // generated by WriteUnprepared write policy is not mistakenly read by  // another.  kTypeBeginUnprepareXID = 0x13,  // WAL only.  kMaxValue = 0x7F                // Not used for storing records.};// Defined in dbformat.ccextern const ValueType kValueTypeForSeek;extern const ValueType kValueTypeForSeekForPrev;// Checks whether a type is an inline value type// (i.e. a type used in memtable skiplist and sst file datablock).inline bool IsValueType(ValueType t) {  return t <= kTypeMerge || t == kTypeSingleDeletion || t == kTypeBlobIndex;}// Checks whether a type is from user operation// kTypeRangeDeletion is in meta block so this API is separated from aboveinline bool IsExtendedValueType(ValueType t) {  return IsValueType(t) || t == kTypeRangeDeletion;}// We leave eight bits empty at the bottom so a type and sequence#// can be packed together into 64-bits.static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64;// The data structure that represents an internal key in the way that user_key,// sequence number and type are stored in separated forms.struct ParsedInternalKey {  Slice user_key;  SequenceNumber sequence;  ValueType type;  ParsedInternalKey()      : sequence(kMaxSequenceNumber)  // Make code analyzer happy  {}  // Intentionally left uninitialized (for speed)  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)      : user_key(u), sequence(seq), type(t) {}  std::string DebugString(bool hex = false) const;  void clear() {    user_key.clear();    sequence = 0;    type = kTypeDeletion;  }};// Return the length of the encoding of "key".inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {  return key.user_key.size() + 8;}// Pack a sequence number and a ValueType into a uint64_textern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);// Given the result of PackSequenceAndType, store the sequence number in *seq// and the ValueType in *t.extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t);EntryType GetEntryType(ValueType value_type);// Append the serialization of "key" to *result.extern void AppendInternalKey(std::string* result,                              const ParsedInternalKey& key);// Serialized internal key consists of user key followed by footer.// This function appends the footer to *result, assuming that *result already// contains the user key at the end.extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,                                    ValueType t);// Attempt to parse an internal key from "internal_key".  On success,// stores the parsed data in "*result", and returns true.//// On error, returns false, leaves "*result" in an undefined state.extern bool ParseInternalKey(const Slice& internal_key,                             ParsedInternalKey* result);// Returns the user key portion of an internal key.inline Slice ExtractUserKey(const Slice& internal_key) {  assert(internal_key.size() >= 8);  return Slice(internal_key.data(), internal_key.size() - 8);}inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,                                             size_t ts_sz) {  assert(internal_key.size() >= 8 + ts_sz);  return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);}inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {  assert(user_key.size() >= ts_sz);  return Slice(user_key.data(), user_key.size() - ts_sz);}inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {  assert(internal_key.size() >= 8);  const size_t n = internal_key.size();  return DecodeFixed64(internal_key.data() + n - 8);}inline ValueType ExtractValueType(const Slice& internal_key) {  uint64_t num = ExtractInternalKeyFooter(internal_key);  unsigned char c = num & 0xff;  return static_cast<ValueType>(c);}// A comparator for internal keys that uses a specified comparator for// the user key portion and breaks ties by decreasing sequence number.class InternalKeyComparator#ifdef NDEBUG    final#endif    : public Comparator { private:  UserComparatorWrapper user_comparator_;  std::string name_; public:  explicit InternalKeyComparator(const Comparator* c)      : user_comparator_(c),        name_("rocksdb.InternalKeyComparator:" +              std::string(user_comparator_.Name())) {}  virtual ~InternalKeyComparator() {}  virtual const char* Name() const override;  virtual int Compare(const Slice& a, const Slice& b) const override;  // Same as Compare except that it excludes the value type from comparison  virtual int CompareKeySeq(const Slice& a, const Slice& b) const;  virtual void FindShortestSeparator(std::string* start,                                     const Slice& limit) const override;  virtual void FindShortSuccessor(std::string* key) const override;  const Comparator* user_comparator() const {    return user_comparator_.user_comparator();  }  int Compare(const InternalKey& a, const InternalKey& b) const;  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;  virtual const Comparator* GetRootComparator() const override {    return user_comparator_.GetRootComparator();  }};// The class represent the internal key in encoded form.class InternalKey { private:  std::string rep_; public:  InternalKey() {}  // Leave rep_ as empty to indicate it is invalid  InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));  }  // sets the internal key to be bigger or equal to all internal keys with this  // user key  void SetMaxPossibleForUserKey(const Slice& _user_key) {    AppendInternalKey(        &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));  }  // sets the internal key to be smaller or equal to all internal keys with this  // user key  void SetMinPossibleForUserKey(const Slice& _user_key) {    AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,                                               kValueTypeForSeek));  }  bool Valid() const {    ParsedInternalKey parsed;    return ParseInternalKey(Slice(rep_), &parsed);  }  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }  Slice Encode() const {    assert(!rep_.empty());    return rep_;  }  Slice user_key() const { return ExtractUserKey(rep_); }  size_t size() { return rep_.size(); }  void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {    SetFrom(ParsedInternalKey(_user_key, s, t));  }  void SetFrom(const ParsedInternalKey& p) {    rep_.clear();    AppendInternalKey(&rep_, p);  }  void Clear() { rep_.clear(); }  // The underlying representation.  // Intended only to be used together with ConvertFromUserKey().  std::string* rep() { return &rep_; }  // Assuming that *rep() contains a user key, this method makes internal key  // out of it in-place. This saves a memcpy compared to Set()/SetFrom().  void ConvertFromUserKey(SequenceNumber s, ValueType t) {    AppendInternalKeyFooter(&rep_, s, t);  }  std::string DebugString(bool hex = false) const;};inline int InternalKeyComparator::Compare(const InternalKey& a,                                          const InternalKey& b) const {  return Compare(a.Encode(), b.Encode());}inline bool ParseInternalKey(const Slice& internal_key,                             ParsedInternalKey* result) {  const size_t n = internal_key.size();  if (n < 8) return false;  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);  unsigned char c = num & 0xff;  result->sequence = num >> 8;  result->type = static_cast<ValueType>(c);  assert(result->type <= ValueType::kMaxValue);  result->user_key = Slice(internal_key.data(), n - 8);  return IsExtendedValueType(result->type);}// Update the sequence number in the internal key.// Guarantees not to invalidate ikey.data().inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {  size_t ikey_sz = ikey->size();  assert(ikey_sz >= 8);  uint64_t newval = (seq << 8) | t;  // Note: Since C++11, strings are guaranteed to be stored contiguously and  // string::operator[]() is guaranteed not to change ikey.data().  EncodeFixed64(&(*ikey)[ikey_sz - 8], newval);}// Get the sequence number from the internal keyinline uint64_t GetInternalKeySeqno(const Slice& internal_key) {  const size_t n = internal_key.size();  assert(n >= 8);  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);  return num >> 8;}// The class to store keys in an efficient way. It allows:// 1. Users can either copy the key into it, or have it point to an unowned//    address.// 2. For copied key, a short inline buffer is kept to reduce memory//    allocation for smaller keys.// 3. It tracks user key or internal key, and allow conversion between them.class IterKey { public:  IterKey()      : buf_(space_),        key_(buf_),        key_size_(0),        buf_size_(sizeof(space_)),        is_user_key_(true) {}  // No copying allowed  IterKey(const IterKey&) = delete;  void operator=(const IterKey&) = delete;  ~IterKey() { ResetBuffer(); }  // The bool will be picked up by the next calls to SetKey  void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }  // Returns the key in whichever format that was provided to KeyIter  Slice GetKey() const { return Slice(key_, key_size_); }  Slice GetInternalKey() const {    assert(!IsUserKey());    return Slice(key_, key_size_);  }  Slice GetUserKey() const {    if (IsUserKey()) {      return Slice(key_, key_size_);    } else {      assert(key_size_ >= 8);      return Slice(key_, key_size_ - 8);    }  }  size_t Size() const { return key_size_; }  void Clear() { key_size_ = 0; }  // Append "non_shared_data" to its back, from "shared_len"  // This function is used in Block::Iter::ParseNextKey  // shared_len: bytes in [0, shard_len-1] would be remained  // non_shared_data: data to be append, its length must be >= non_shared_len  void TrimAppend(const size_t shared_len, const char* non_shared_data,                  const size_t non_shared_len) {    assert(shared_len <= key_size_);    size_t total_size = shared_len + non_shared_len;    if (IsKeyPinned() /* key is not in buf_ */) {      // Copy the key from external memory to buf_ (copy shared_len bytes)      EnlargeBufferIfNeeded(total_size);      memcpy(buf_, key_, shared_len);    } else if (total_size > buf_size_) {      // Need to allocate space, delete previous space      char* p = new char[total_size];      memcpy(p, key_, shared_len);      if (buf_ != space_) {        delete[] buf_;      }      buf_ = p;      buf_size_ = total_size;    }    memcpy(buf_ + shared_len, non_shared_data, non_shared_len);    key_ = buf_;    key_size_ = total_size;  }  Slice SetKey(const Slice& key, bool copy = true) {    // is_user_key_ expected to be set already via SetIsUserKey    return SetKeyImpl(key, copy);  }  Slice SetUserKey(const Slice& key, bool copy = true) {    is_user_key_ = true;    return SetKeyImpl(key, copy);  }  Slice SetInternalKey(const Slice& key, bool copy = true) {    is_user_key_ = false;    return SetKeyImpl(key, copy);  }  // Copies the content of key, updates the reference to the user key in ikey  // and returns a Slice referencing the new copy.  Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {    size_t key_n = key.size();    assert(key_n >= 8);    SetInternalKey(key);    ikey->user_key = Slice(key_, key_n - 8);    return Slice(key_, key_n);  }  // Copy the key into IterKey own buf_  void OwnKey() {    assert(IsKeyPinned() == true);    Reserve(key_size_);    memcpy(buf_, key_, key_size_);    key_ = buf_;  }  // Update the sequence number in the internal key.  Guarantees not to  // invalidate slices to the key (and the user key).  void UpdateInternalKey(uint64_t seq, ValueType t) {    assert(!IsKeyPinned());    assert(key_size_ >= 8);    uint64_t newval = (seq << 8) | t;    EncodeFixed64(&buf_[key_size_ - 8], newval);  }  bool IsKeyPinned() const { return (key_ != buf_); }  void SetInternalKey(const Slice& key_prefix, const Slice& user_key,                      SequenceNumber s,                      ValueType value_type = kValueTypeForSeek) {    size_t psize = key_prefix.size();    size_t usize = user_key.size();    EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t));    if (psize > 0) {      memcpy(buf_, key_prefix.data(), psize);    }    memcpy(buf_ + psize, user_key.data(), usize);    EncodeFixed64(buf_ + usize + psize, PackSequenceAndType(s, value_type));    key_ = buf_;    key_size_ = psize + usize + sizeof(uint64_t);    is_user_key_ = false;  }  void SetInternalKey(const Slice& user_key, SequenceNumber s,                      ValueType value_type = kValueTypeForSeek) {    SetInternalKey(Slice(), user_key, s, value_type);  }  void Reserve(size_t size) {    EnlargeBufferIfNeeded(size);    key_size_ = size;  }  void SetInternalKey(const ParsedInternalKey& parsed_key) {    SetInternalKey(Slice(), parsed_key);  }  void SetInternalKey(const Slice& key_prefix,                      const ParsedInternalKey& parsed_key_suffix) {    SetInternalKey(key_prefix, parsed_key_suffix.user_key,                   parsed_key_suffix.sequence, parsed_key_suffix.type);  }  void EncodeLengthPrefixedKey(const Slice& key) {    auto size = key.size();    EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));    char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));    memcpy(ptr, key.data(), size);    key_ = buf_;    is_user_key_ = true;  }  bool IsUserKey() const { return is_user_key_; } private:  char* buf_;  const char* key_;  size_t key_size_;  size_t buf_size_;  char space_[32];  // Avoid allocation for short keys  bool is_user_key_;  Slice SetKeyImpl(const Slice& key, bool copy) {    size_t size = key.size();    if (copy) {      // Copy key to buf_      EnlargeBufferIfNeeded(size);      memcpy(buf_, key.data(), size);      key_ = buf_;    } else {      // Update key_ to point to external memory      key_ = key.data();    }    key_size_ = size;    return Slice(key_, key_size_);  }  void ResetBuffer() {    if (buf_ != space_) {      delete[] buf_;      buf_ = space_;    }    buf_size_ = sizeof(space_);    key_size_ = 0;  }  // Enlarge the buffer size if needed based on key_size.  // By default, static allocated buffer is used. Once there is a key  // larger than the static allocated buffer, another buffer is dynamically  // allocated, until a larger key buffer is requested. In that case, we  // reallocate buffer and delete the old one.  void EnlargeBufferIfNeeded(size_t key_size) {    // If size is smaller than buffer size, continue using current buffer,    // or the static allocated one, as default    if (key_size > buf_size_) {      EnlargeBuffer(key_size);    }  }  void EnlargeBuffer(size_t key_size);};// Convert from a SliceTranform of user keys, to a SliceTransform of// user keys.class InternalKeySliceTransform : public SliceTransform { public:  explicit InternalKeySliceTransform(const SliceTransform* transform)      : transform_(transform) {}  virtual const char* Name() const override { return transform_->Name(); }  virtual Slice Transform(const Slice& src) const override {    auto user_key = ExtractUserKey(src);    return transform_->Transform(user_key);  }  virtual bool InDomain(const Slice& src) const override {    auto user_key = ExtractUserKey(src);    return transform_->InDomain(user_key);  }  virtual bool InRange(const Slice& dst) const override {    auto user_key = ExtractUserKey(dst);    return transform_->InRange(user_key);  }  const SliceTransform* user_prefix_extractor() const { return transform_; } private:  // Like comparator, InternalKeySliceTransform will not take care of the  // deletion of transform_  const SliceTransform* const transform_;};// Read the key of a record from a write batch.// if this record represent the default column family then cf_record// must be passed as false, otherwise it must be passed as true.extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,                                       bool cf_record);// Read record from a write batch piece from input.// tag, column_family, key, value and blob are return values. Callers own the// Slice they point to.// Tag is defined as ValueType.// input will be advanced to after the record.extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,                                       uint32_t* column_family, Slice* key,                                       Slice* value, Slice* blob, Slice* xid);// When user call DeleteRange() to delete a range of keys,// we will store a serialized RangeTombstone in MemTable and SST.// the struct here is a easy-understood form// start/end_key_ is the start/end user key of the range to be deletedstruct RangeTombstone {  Slice start_key_;  Slice end_key_;  SequenceNumber seq_;  RangeTombstone() = default;  RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)      : start_key_(sk), end_key_(ek), seq_(sn) {}  RangeTombstone(ParsedInternalKey parsed_key, Slice value) {    start_key_ = parsed_key.user_key;    seq_ = parsed_key.sequence;    end_key_ = value;  }  // be careful to use Serialize(), allocates new memory  std::pair<InternalKey, Slice> Serialize() const {    auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);    Slice value = end_key_;    return std::make_pair(std::move(key), std::move(value));  }  // be careful to use SerializeKey(), allocates new memory  InternalKey SerializeKey() const {    return InternalKey(start_key_, seq_, kTypeRangeDeletion);  }  // The tombstone end-key is exclusive, so we generate an internal-key here  // which has a similar property. Using kMaxSequenceNumber guarantees that  // the returned internal-key will compare less than any other internal-key  // with the same user-key. This in turn guarantees that the serialized  // end-key for a tombstone such as [a-b] will compare less than the key "b".  //  // be careful to use SerializeEndKey(), allocates new memory  InternalKey SerializeEndKey() const {    return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);  }};inline int InternalKeyComparator::Compare(const Slice& akey,                                          const Slice& bkey) const {  // Order by:  //    increasing user key (according to user-supplied comparator)  //    decreasing sequence number  //    decreasing type (though sequence# should be enough to disambiguate)  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));  if (r == 0) {    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);    if (anum > bnum) {      r = -1;    } else if (anum < bnum) {      r = +1;    }  }  return r;}inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,                                                const Slice& bkey) const {  // Order by:  //    increasing user key (according to user-supplied comparator)  //    decreasing sequence number  int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));  if (r == 0) {    // Shift the number to exclude the last byte which contains the value type    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8;    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8;    if (anum > bnum) {      r = -1;    } else if (anum < bnum) {      r = +1;    }  }  return r;}// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.struct ParsedInternalKeyComparator {  explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)      : cmp(c) {}  bool operator()(const ParsedInternalKey& a,                  const ParsedInternalKey& b) const {    return cmp->Compare(a, b) < 0;  }  const InternalKeyComparator* cmp;};}  // namespace ROCKSDB_NAMESPACE
 |