dbformat.h 46 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <stdio.h>
  11. #include <array>
  12. #include <memory>
  13. #include <optional>
  14. #include <string>
  15. #include <utility>
  16. #include "rocksdb/comparator.h"
  17. #include "rocksdb/slice.h"
  18. #include "rocksdb/slice_transform.h"
  19. #include "rocksdb/types.h"
  20. #include "util/coding.h"
  21. #include "util/user_comparator_wrapper.h"
  22. namespace ROCKSDB_NAMESPACE {
  23. // The file declares data structures and functions that deal with internal
  24. // keys.
  25. // Each internal key contains a user key, a sequence number (SequenceNumber)
  26. // and a type (ValueType), and they are usually encoded together.
  27. // There are some related helper classes here.
  28. class InternalKey;
  29. // Value types encoded as the last component of internal keys.
  30. // DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
  31. // data structures.
  32. // The highest bit of the value type needs to be reserved to SST tables
  33. // for them to do more flexible encoding.
  34. enum ValueType : unsigned char {
  35. kTypeDeletion = 0x0,
  36. kTypeValue = 0x1,
  37. kTypeMerge = 0x2,
  38. kTypeLogData = 0x3, // WAL only.
  39. kTypeColumnFamilyDeletion = 0x4, // WAL only.
  40. kTypeColumnFamilyValue = 0x5, // WAL only.
  41. kTypeColumnFamilyMerge = 0x6, // WAL only.
  42. kTypeSingleDeletion = 0x7,
  43. kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
  44. kTypeBeginPrepareXID = 0x9, // WAL only.
  45. kTypeEndPrepareXID = 0xA, // WAL only.
  46. kTypeCommitXID = 0xB, // WAL only.
  47. kTypeRollbackXID = 0xC, // WAL only.
  48. kTypeNoop = 0xD, // WAL only.
  49. kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
  50. kTypeRangeDeletion = 0xF, // meta block
  51. kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
  52. kTypeBlobIndex = 0x11, // Blob DB only
  53. // When the prepared record is also persisted in db, we use a different
  54. // record. This is to ensure that the WAL that is generated by a WritePolicy
  55. // is not mistakenly read by another, which would result into data
  56. // inconsistency.
  57. kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
  58. // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
  59. // generated by WriteUnprepared write policy is not mistakenly read by
  60. // another.
  61. kTypeBeginUnprepareXID = 0x13, // WAL only.
  62. kTypeDeletionWithTimestamp = 0x14,
  63. kTypeCommitXIDAndTimestamp = 0x15, // WAL only
  64. kTypeWideColumnEntity = 0x16,
  65. kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only
  66. kTypeValuePreferredSeqno = 0x18, // Value with a unix write time
  67. kTypeColumnFamilyValuePreferredSeqno = 0x19, // WAL only
  68. kTypeMaxValid, // Should be after the last valid type, only used for
  69. // validation
  70. kMaxValue = 0x7F // Not used for storing records.
  71. };
  72. // Defined in dbformat.cc
  73. extern const ValueType kValueTypeForSeek;
  74. extern const ValueType kValueTypeForSeekForPrev;
  75. // A range of user keys used internally by RocksDB. Also see `Range` used by
  76. // public APIs.
  77. // TODO: merge with Range in pubic API, but this is generally inclusive limit
  78. // and it is maybe exclusive limit
  79. struct UserKeyRange {
  80. // In case of user_defined timestamp, if enabled, `start` and `limit` should
  81. // include user_defined timestamps.
  82. Slice start;
  83. Slice limit;
  84. UserKeyRange() = default;
  85. UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {}
  86. };
  87. // A range of user keys used internally by RocksDB. Also see `RangeOpt` used by
  88. // public APIs.
  89. struct UserKeyRangeOpt {
  90. // In case of user_defined timestamp, if enabled, `start` and `limit` should
  91. // point to key with timestamp part.
  92. // An optional range start, if missing, indicating a start before all keys.
  93. OptSlice start;
  94. // An optional range end, if missing, indicating an end after all keys.
  95. OptSlice limit;
  96. UserKeyRangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
  97. };
  98. // Checks whether a type is an inline value type
  99. // (i.e. a type used in memtable skiplist and sst file datablock).
  100. inline bool IsValueType(ValueType t) {
  101. return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
  102. kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t ||
  103. kTypeValuePreferredSeqno == t;
  104. }
  105. // Checks whether a type is from user operation
  106. // kTypeRangeDeletion is in meta block so this API is separated from above
  107. // kTypeMaxValid can be from keys generated by
  108. // TruncatedRangeDelIterator::start_key()
  109. inline bool IsExtendedValueType(ValueType t) {
  110. return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid;
  111. }
  112. // We leave eight bits empty at the bottom so a type and sequence#
  113. // can be packed together into 64-bits.
  114. static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
  115. static const SequenceNumber kDisableGlobalSequenceNumber =
  116. std::numeric_limits<uint64_t>::max();
  117. constexpr uint64_t kNumInternalBytes = 8;
  118. // Defined in dbformat.cc
  119. extern const std::string kDisableUserTimestamp;
  120. // The data structure that represents an internal key in the way that user_key,
  121. // sequence number and type are stored in separated forms.
  122. struct ParsedInternalKey {
  123. Slice user_key;
  124. SequenceNumber sequence;
  125. ValueType type;
  126. ParsedInternalKey()
  127. : sequence(kMaxSequenceNumber),
  128. type(kTypeDeletion) // Make code analyzer happy
  129. {} // Intentionally left uninitialized (for speed)
  130. // u contains timestamp if user timestamp feature is enabled.
  131. ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
  132. : user_key(u), sequence(seq), type(t) {}
  133. std::string DebugString(bool log_err_key, bool hex,
  134. const Comparator* ucmp = nullptr) const;
  135. void clear() {
  136. user_key.clear();
  137. sequence = 0;
  138. type = kTypeDeletion;
  139. }
  140. void SetTimestamp(const Slice& ts) {
  141. assert(ts.size() <= user_key.size());
  142. const char* addr = user_key.data() + user_key.size() - ts.size();
  143. memcpy(const_cast<char*>(addr), ts.data(), ts.size());
  144. }
  145. Slice GetTimestamp(size_t ts_sz) {
  146. assert(ts_sz <= user_key.size());
  147. const char* addr = user_key.data() + user_key.size() - ts_sz;
  148. return Slice(const_cast<char*>(addr), ts_sz);
  149. }
  150. };
  151. // Return the length of the encoding of "key".
  152. inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
  153. return key.user_key.size() + kNumInternalBytes;
  154. }
  155. // Pack a sequence number and a ValueType into a uint64_t
  156. inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
  157. assert(seq <= kMaxSequenceNumber);
  158. // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
  159. assert(IsExtendedValueType(t) || t == kTypeMaxValid);
  160. return (seq << 8) | t;
  161. }
  162. // Given the result of PackSequenceAndType, store the sequence number in *seq
  163. // and the ValueType in *t.
  164. inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
  165. ValueType* t) {
  166. *seq = packed >> 8;
  167. *t = static_cast<ValueType>(packed & 0xff);
  168. // Commented the following two assertions in order to test key-value checksum
  169. // on corrupted keys without crashing ("DbKvChecksumTest").
  170. // assert(*seq <= kMaxSequenceNumber);
  171. // assert(IsExtendedValueType(*t));
  172. }
  173. const uint64_t kRangeTombstoneSentinel =
  174. PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
  175. EntryType GetEntryType(ValueType value_type);
  176. // Append the serialization of "key" to *result.
  177. //
  178. // input [internal key]: <user_key | seqno + type>
  179. // output before: empty
  180. // output: <user_key | seqno + type>
  181. void AppendInternalKey(std::string* result, const ParsedInternalKey& key);
  182. // Append the serialization of "key" to *result, replacing the original
  183. // timestamp with argument ts.
  184. //
  185. // input [internal key]: <user_provided_key | original_ts | seqno + type>
  186. // output before: empty
  187. // output after: <user_provided_key | ts | seqno + type>
  188. void AppendInternalKeyWithDifferentTimestamp(std::string* result,
  189. const ParsedInternalKey& key,
  190. const Slice& ts);
  191. // Append the user key to *result, replacing the original timestamp with
  192. // argument ts.
  193. //
  194. // input [user key]: <user_provided_key | original_ts>
  195. // output before: empty
  196. // output after: <user_provided_key | ts>
  197. void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key,
  198. const Slice& ts);
  199. // Serialized internal key consists of user key followed by footer.
  200. // This function appends the footer to *result, assuming that *result already
  201. // contains the user key at the end.
  202. //
  203. // output before: <user_key>
  204. // output after: <user_key | seqno + type>
  205. void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
  206. ValueType t);
  207. // Append the key and a minimal timestamp to *result
  208. //
  209. // input [user key without ts]: <user_provided_key>
  210. // output before: empty
  211. // output after: <user_provided_key | min_ts>
  212. void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
  213. size_t ts_sz);
  214. // Append the key and a maximal timestamp to *result
  215. //
  216. // input [user key without ts]: <user_provided_key>
  217. // output before: empty
  218. // output after: <user_provided_key | max_ts>
  219. void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
  220. size_t ts_sz);
  221. // `key` is a user key with timestamp. Append the user key without timestamp
  222. // and the minimum timestamp to *result.
  223. //
  224. // input [user key]: <user_provided_key | original_ts>
  225. // output before: empty
  226. // output after: <user_provided_key | min_ts>
  227. void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key,
  228. size_t ts_sz);
  229. // `key` is a user key with timestamp. Append the user key without timestamp
  230. // and the maximal timestamp to *result.
  231. //
  232. // input [user key]: <user_provided_key | original_ts>
  233. // output before: empty
  234. // output after: <user_provided_key | max_ts>
  235. void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
  236. size_t ts_sz);
  237. // `key` is an internal key containing a user key without timestamp. Create a
  238. // new key in *result by padding a min timestamp of size `ts_sz` to the user key
  239. // and copying the remaining internal key bytes.
  240. //
  241. // input [internal key]: <user_provided_key | seqno + type>
  242. // output before: empty
  243. // output after: <user_provided_key | min_ts | seqno + type>
  244. void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
  245. size_t ts_sz);
  246. // `key` is an internal key containing a user key without timestamp. Create a
  247. // new key in *result by padding a max timestamp of size `ts_sz` to the user key
  248. // and copying the remaining internal key bytes.
  249. //
  250. // input [internal key]: <user_provided_key | seqno + type>
  251. // output before: empty
  252. // output after: <user_provided_key | max_ts | seqno + type>
  253. void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key,
  254. size_t ts_sz);
  255. // `key` is an internal key containing a user key with timestamp of size
  256. // `ts_sz`. Create a new internal key in *result by stripping the timestamp from
  257. // the user key and copying the remaining internal key bytes.
  258. //
  259. // input [internal key]: <user_provided_key | original_ts | seqno + type>
  260. // output before: empty
  261. // output after: <user_provided_key | seqno + type>
  262. void StripTimestampFromInternalKey(std::string* result, const Slice& key,
  263. size_t ts_sz);
  264. // `key` is an internal key containing a user key with timestamp of size
  265. // `ts_sz`. Create a new internal key in *result while replace the original
  266. // timestamp with min timestamp.
  267. //
  268. // input [internal key]: <user_provided_key | original_ts | seqno + type>
  269. // output before: empty
  270. // output after: <user_provided_key | min_ts | seqno + type>
  271. void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
  272. size_t ts_sz);
  273. // Attempt to parse an internal key from "internal_key". On success,
  274. // stores the parsed data in "*result", and returns true.
  275. //
  276. // On error, returns false, leaves "*result" in an undefined state.
  277. Status ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result,
  278. bool log_err_key);
  279. // Returns the user key portion of an internal key.
  280. //
  281. // input [internal key]: <user_key | seqno + type>
  282. // output: <user_key>
  283. inline Slice ExtractUserKey(const Slice& internal_key) {
  284. assert(internal_key.size() >= kNumInternalBytes);
  285. return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
  286. }
  287. // input [internal key]: <user_provided_key | ts | seqno + type>
  288. // output : <user_provided_key>
  289. inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
  290. size_t ts_sz) {
  291. assert(internal_key.size() >= kNumInternalBytes + ts_sz);
  292. return Slice(internal_key.data(),
  293. internal_key.size() - (kNumInternalBytes + ts_sz));
  294. }
  295. // input [user key]: <user_provided_key | ts>
  296. // output: <user_provided_key>
  297. inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
  298. assert(user_key.size() >= ts_sz);
  299. return Slice(user_key.data(), user_key.size() - ts_sz);
  300. }
  301. // input [user key]: <user_provided_key | ts>
  302. // output: <ts>
  303. inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
  304. assert(user_key.size() >= ts_sz);
  305. return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
  306. }
  307. // input [internal key]: <user_provided_key | ts | seqno + type>
  308. // output: <ts>
  309. inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
  310. const size_t key_size = internal_key.size();
  311. assert(key_size >= kNumInternalBytes + ts_sz);
  312. return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
  313. ts_sz);
  314. }
  315. // input [internal key]: <user_provided_key | ts | seqno + type>
  316. // output: <seqno + type>
  317. inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
  318. assert(internal_key.size() >= kNumInternalBytes);
  319. const size_t n = internal_key.size();
  320. return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
  321. }
  322. // input [internal key]: <user_provided_key | ts | seqno + type>
  323. // output: <type>
  324. inline ValueType ExtractValueType(const Slice& internal_key) {
  325. uint64_t num = ExtractInternalKeyFooter(internal_key);
  326. unsigned char c = num & 0xff;
  327. return static_cast<ValueType>(c);
  328. }
  329. // A comparator for internal keys that uses a specified comparator for
  330. // the user key portion and breaks ties by decreasing sequence number.
  331. class InternalKeyComparator
  332. #ifdef NDEBUG
  333. final
  334. #endif
  335. : public CompareInterface {
  336. private:
  337. UserComparatorWrapper user_comparator_;
  338. public:
  339. // `InternalKeyComparator`s constructed with the default constructor are not
  340. // usable and will segfault on any attempt to use them for comparisons.
  341. InternalKeyComparator() = default;
  342. // @param named If true, assign a name to this comparator based on the
  343. // underlying comparator's name. This involves an allocation and copy in
  344. // this constructor to precompute the result of `Name()`. To avoid this
  345. // overhead, set `named` to false. In that case, `Name()` will return a
  346. // generic name that is non-specific to the underlying comparator.
  347. explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
  348. virtual ~InternalKeyComparator() {}
  349. int Compare(const Slice& a, const Slice& b) const override;
  350. bool Equal(const Slice& a, const Slice& b) const {
  351. // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
  352. // comparing the user key too.
  353. return Compare(a, b) == 0;
  354. }
  355. // Same as Compare except that it excludes the value type from comparison
  356. int CompareKeySeq(const Slice& a, const Slice& b) const;
  357. int CompareKeySeq(const ParsedInternalKey& a, const Slice& b) const;
  358. const Comparator* user_comparator() const {
  359. return user_comparator_.user_comparator();
  360. }
  361. int Compare(const InternalKey& a, const InternalKey& b) const;
  362. int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
  363. int Compare(const Slice& a, const ParsedInternalKey& b) const;
  364. int Compare(const ParsedInternalKey& a, const Slice& b) const;
  365. // In this `Compare()` overload, the sequence numbers provided in
  366. // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
  367. // and `b`, respectively. To disable sequence number override(s), provide the
  368. // value `kDisableGlobalSequenceNumber`.
  369. int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
  370. SequenceNumber b_global_seqno) const;
  371. };
  372. // The class represent the internal key in encoded form.
  373. class InternalKey {
  374. private:
  375. std::string rep_;
  376. public:
  377. InternalKey() {} // Leave rep_ as empty to indicate it is invalid
  378. InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
  379. AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
  380. }
  381. InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) {
  382. AppendInternalKeyWithDifferentTimestamp(
  383. &rep_, ParsedInternalKey(_user_key, s, t), ts);
  384. }
  385. // sets the internal key to be bigger or equal to all internal keys with this
  386. // user key
  387. void SetMaxPossibleForUserKey(const Slice& _user_key) {
  388. AppendInternalKey(
  389. &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
  390. }
  391. // sets the internal key to be smaller or equal to all internal keys with this
  392. // user key
  393. void SetMinPossibleForUserKey(const Slice& _user_key) {
  394. AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
  395. kValueTypeForSeek));
  396. }
  397. bool Valid() const {
  398. ParsedInternalKey parsed;
  399. return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
  400. .ok()); // TODO
  401. }
  402. void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
  403. Slice Encode() const {
  404. assert(!rep_.empty());
  405. return rep_;
  406. }
  407. Slice user_key() const { return ExtractUserKey(rep_); }
  408. size_t size() const { return rep_.size(); }
  409. bool unset() const { return rep_.empty(); }
  410. void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
  411. SetFrom(ParsedInternalKey(_user_key, s, t));
  412. }
  413. void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
  414. const Slice& ts) {
  415. ParsedInternalKey pik(_user_key_with_ts, s, t);
  416. // Should not call pik.SetTimestamp() directly as it overwrites the buffer
  417. // containing _user_key.
  418. SetFrom(pik, ts);
  419. }
  420. void SetFrom(const ParsedInternalKey& p) {
  421. rep_.clear();
  422. AppendInternalKey(&rep_, p);
  423. }
  424. void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
  425. rep_.clear();
  426. AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
  427. }
  428. void Clear() { rep_.clear(); }
  429. // The underlying representation.
  430. // Intended only to be used together with ConvertFromUserKey().
  431. std::string* rep() { return &rep_; }
  432. const std::string* const_rep() const { return &rep_; }
  433. // Assuming that *rep() contains a user key, this method makes internal key
  434. // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
  435. void ConvertFromUserKey(SequenceNumber s, ValueType t) {
  436. AppendInternalKeyFooter(&rep_, s, t);
  437. }
  438. std::string DebugString(bool hex, const Comparator* ucmp = nullptr) const;
  439. };
  440. inline int InternalKeyComparator::Compare(const InternalKey& a,
  441. const InternalKey& b) const {
  442. return Compare(a.Encode(), b.Encode());
  443. }
  444. inline Status ParseInternalKey(const Slice& internal_key,
  445. ParsedInternalKey* result, bool log_err_key) {
  446. const size_t n = internal_key.size();
  447. if (n < kNumInternalBytes) {
  448. return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
  449. std::to_string(n) + ". ");
  450. }
  451. uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
  452. unsigned char c = num & 0xff;
  453. result->sequence = num >> 8;
  454. result->type = static_cast<ValueType>(c);
  455. assert(result->type <= ValueType::kMaxValue);
  456. result->user_key = Slice(internal_key.data(), n - kNumInternalBytes);
  457. if (IsExtendedValueType(result->type)) {
  458. return Status::OK();
  459. } else {
  460. return Status::Corruption("Corrupted Key",
  461. result->DebugString(log_err_key, true));
  462. }
  463. }
  464. // Update the sequence number in the internal key.
  465. // Guarantees not to invalidate ikey.data().
  466. inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
  467. size_t ikey_sz = ikey->size();
  468. assert(ikey_sz >= kNumInternalBytes);
  469. uint64_t newval = (seq << 8) | t;
  470. // Note: Since C++11, strings are guaranteed to be stored contiguously and
  471. // string::operator[]() is guaranteed not to change ikey.data().
  472. EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
  473. }
  474. // Get the sequence number from the internal key
  475. inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
  476. const size_t n = internal_key.size();
  477. assert(n >= kNumInternalBytes);
  478. uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
  479. return num >> 8;
  480. }
  481. // The class to store keys in an efficient way. It allows:
  482. // 1. Users can either copy the key into it, or have it point to an unowned
  483. // address.
  484. // 2. For copied key, a short inline buffer is kept to reduce memory
  485. // allocation for smaller keys.
  486. // 3. It tracks user key or internal key, and allow conversion between them.
  487. class IterKey {
  488. static constexpr size_t kInlineBufferSize = 39;
  489. // This is only used by user-defined timestamps in MemTable only feature,
  490. // which only supports uint64_t timestamps.
  491. static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
  492. public:
  493. IterKey()
  494. : buf_(space_),
  495. key_(buf_),
  496. key_size_(0),
  497. buf_size_(kInlineBufferSize),
  498. is_user_key_(true),
  499. secondary_buf_(space_for_secondary_buf_),
  500. secondary_buf_size_(kInlineBufferSize) {}
  501. // No copying allowed
  502. IterKey(const IterKey&) = delete;
  503. void operator=(const IterKey&) = delete;
  504. ~IterKey() {
  505. ResetBuffer();
  506. ResetSecondaryBuffer();
  507. }
  508. // The bool will be picked up by the next calls to SetKey
  509. void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
  510. // Returns the key in whichever format that was provided to KeyIter
  511. // If user-defined timestamp is enabled, then timestamp is included in the
  512. // return result.
  513. Slice GetKey() const { return Slice(key_, key_size_); }
  514. Slice GetInternalKey() const {
  515. assert(!IsUserKey());
  516. return Slice(key_, key_size_);
  517. }
  518. // If user-defined timestamp is enabled, then timestamp is included in the
  519. // return result of GetUserKey();
  520. Slice GetUserKey() const {
  521. if (IsUserKey()) {
  522. return Slice(key_, key_size_);
  523. } else {
  524. assert(key_size_ >= kNumInternalBytes);
  525. return Slice(key_, key_size_ - kNumInternalBytes);
  526. }
  527. }
  528. size_t Size() const { return key_size_; }
  529. void Clear() { key_size_ = 0; }
  530. // Append "non_shared_data" to its back, from "shared_len"
  531. // This function is used in Block::Iter::ParseNextKey
  532. // shared_len: bytes in [0, shard_len-1] would be remained
  533. // non_shared_data: data to be append, its length must be >= non_shared_len
  534. void TrimAppend(const size_t shared_len, const char* non_shared_data,
  535. const size_t non_shared_len) {
  536. assert(shared_len <= key_size_);
  537. size_t total_size = shared_len + non_shared_len;
  538. if (IsKeyPinned() /* key is not in buf_ */) {
  539. // Copy the key from external memory to buf_ (copy shared_len bytes)
  540. EnlargeBufferIfNeeded(total_size);
  541. memcpy(buf_, key_, shared_len);
  542. } else if (total_size > buf_size_) {
  543. // Need to allocate space, delete previous space
  544. char* p = new char[total_size];
  545. memcpy(p, key_, shared_len);
  546. if (buf_ != space_) {
  547. delete[] buf_;
  548. }
  549. buf_ = p;
  550. buf_size_ = total_size;
  551. }
  552. memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
  553. key_ = buf_;
  554. key_size_ = total_size;
  555. }
  556. // A version of `TrimAppend` assuming the last bytes of length `ts_sz` in the
  557. // user key part of `key_` is not counted towards shared bytes. And the
  558. // decoded key needed a min timestamp of length `ts_sz` pad to the user key.
  559. void TrimAppendWithTimestamp(const size_t shared_len,
  560. const char* non_shared_data,
  561. const size_t non_shared_len,
  562. const size_t ts_sz) {
  563. // This function is only used by the UDT in memtable feature, which only
  564. // support built in comparators with uint64 timestamps.
  565. assert(ts_sz == sizeof(uint64_t));
  566. size_t next_key_slice_index = 0;
  567. if (IsUserKey()) {
  568. key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
  569. key_slices_[next_key_slice_index++] =
  570. Slice(non_shared_data, non_shared_len);
  571. key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
  572. } else {
  573. assert(shared_len + non_shared_len >= kNumInternalBytes);
  574. // Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len
  575. // In naming below `*_len` variables, keyword `user_key` refers to the
  576. // user key part of the existing key in `key_` as apposed to the new key.
  577. // Similary, `internal_bytes` refers to the footer part of the existing
  578. // key. These bytes potentially will move between user key part and the
  579. // footer part in the new key.
  580. const size_t user_key_len = key_size_ - kNumInternalBytes;
  581. const size_t sharable_user_key_len = user_key_len - ts_sz;
  582. const size_t shared_user_key_len =
  583. std::min(shared_len, sharable_user_key_len);
  584. const size_t shared_internal_bytes_len = shared_len - shared_user_key_len;
  585. // One Slice among the three Slices will get split into two Slices, plus
  586. // a timestamp slice.
  587. bool ts_added = false;
  588. // Add slice parts and find the right location to add the min timestamp.
  589. MaybeAddKeyPartsWithTimestamp(
  590. key_, shared_user_key_len,
  591. shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
  592. shared_len + non_shared_len - kNumInternalBytes, ts_sz,
  593. &next_key_slice_index, &ts_added);
  594. MaybeAddKeyPartsWithTimestamp(
  595. key_ + user_key_len, shared_internal_bytes_len,
  596. non_shared_len < kNumInternalBytes,
  597. shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
  598. &next_key_slice_index, &ts_added);
  599. MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
  600. non_shared_len >= kNumInternalBytes,
  601. non_shared_len - kNumInternalBytes, ts_sz,
  602. &next_key_slice_index, &ts_added);
  603. assert(ts_added);
  604. }
  605. SetKeyImpl(next_key_slice_index,
  606. /* total_bytes= */ shared_len + non_shared_len + ts_sz);
  607. }
  608. Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
  609. // This function is only used by the UDT in memtable feature, which only
  610. // support built in comparators with uint64 timestamps.
  611. assert(ts_sz == sizeof(uint64_t));
  612. size_t num_key_slices = 0;
  613. if (is_user_key_) {
  614. key_slices_[0] = key;
  615. key_slices_[1] = Slice(kTsMin, ts_sz);
  616. num_key_slices = 2;
  617. } else {
  618. assert(key.size() >= kNumInternalBytes);
  619. size_t user_key_size = key.size() - kNumInternalBytes;
  620. key_slices_[0] = Slice(key.data(), user_key_size);
  621. key_slices_[1] = Slice(kTsMin, ts_sz);
  622. key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
  623. num_key_slices = 3;
  624. }
  625. return SetKeyImpl(num_key_slices, key.size() + ts_sz);
  626. }
  627. Slice SetKey(const Slice& key, bool copy = true) {
  628. // is_user_key_ expected to be set already via SetIsUserKey
  629. return SetKeyImpl(key, copy);
  630. }
  631. // If user-defined timestamp is enabled, then `key` includes timestamp.
  632. // TODO(yanqin) this is also used to set prefix, which do not include
  633. // timestamp. Should be handled.
  634. Slice SetUserKey(const Slice& key, bool copy = true) {
  635. is_user_key_ = true;
  636. return SetKeyImpl(key, copy);
  637. }
  638. Slice SetInternalKey(const Slice& key, bool copy = true) {
  639. is_user_key_ = false;
  640. return SetKeyImpl(key, copy);
  641. }
  642. // Copies the content of key, updates the reference to the user key in ikey
  643. // and returns a Slice referencing the new copy.
  644. Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
  645. size_t key_n = key.size();
  646. assert(key_n >= kNumInternalBytes);
  647. SetInternalKey(key);
  648. ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
  649. return Slice(key_, key_n);
  650. }
  651. // Update the sequence number in the internal key. Guarantees not to
  652. // invalidate slices to the key (and the user key).
  653. void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
  654. assert(!IsKeyPinned());
  655. assert(key_size_ >= kNumInternalBytes);
  656. if (ts) {
  657. assert(key_size_ >= kNumInternalBytes + ts->size());
  658. memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
  659. ts->size());
  660. }
  661. uint64_t newval = (seq << 8) | t;
  662. if (key_ == buf_) {
  663. EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
  664. } else {
  665. assert(key_ == secondary_buf_);
  666. EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
  667. }
  668. }
  669. bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }
  670. // If `ts` is provided, user_key should not contain timestamp,
  671. // and `ts` is appended after user_key.
  672. // TODO: more efficient storage for timestamp.
  673. void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
  674. SequenceNumber s,
  675. ValueType value_type = kValueTypeForSeek,
  676. const Slice* ts = nullptr) {
  677. size_t psize = key_prefix.size();
  678. size_t usize = user_key.size();
  679. size_t ts_sz = (ts != nullptr ? ts->size() : 0);
  680. EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
  681. if (psize > 0) {
  682. memcpy(buf_, key_prefix.data(), psize);
  683. }
  684. memcpy(buf_ + psize, user_key.data(), usize);
  685. if (ts) {
  686. memcpy(buf_ + psize + usize, ts->data(), ts_sz);
  687. }
  688. EncodeFixed64(buf_ + usize + psize + ts_sz,
  689. PackSequenceAndType(s, value_type));
  690. key_ = buf_;
  691. key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
  692. is_user_key_ = false;
  693. }
  694. void SetInternalKey(const Slice& user_key, SequenceNumber s,
  695. ValueType value_type = kValueTypeForSeek,
  696. const Slice* ts = nullptr) {
  697. SetInternalKey(Slice(), user_key, s, value_type, ts);
  698. }
  699. void Reserve(size_t size) {
  700. EnlargeBufferIfNeeded(size);
  701. key_size_ = size;
  702. }
  703. void SetInternalKey(const ParsedInternalKey& parsed_key) {
  704. SetInternalKey(Slice(), parsed_key);
  705. }
  706. void SetInternalKey(const Slice& key_prefix,
  707. const ParsedInternalKey& parsed_key_suffix) {
  708. SetInternalKey(key_prefix, parsed_key_suffix.user_key,
  709. parsed_key_suffix.sequence, parsed_key_suffix.type);
  710. }
  711. void EncodeLengthPrefixedKey(const Slice& key) {
  712. auto size = key.size();
  713. EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
  714. char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
  715. memcpy(ptr, key.data(), size);
  716. key_ = buf_;
  717. is_user_key_ = true;
  718. }
  719. bool IsUserKey() const { return is_user_key_; }
  720. private:
  721. char* buf_;
  722. const char* key_;
  723. size_t key_size_;
  724. size_t buf_size_;
  725. char space_[kInlineBufferSize]; // Avoid allocation for short keys
  726. bool is_user_key_;
  727. // Below variables are only used by user-defined timestamps in MemTable only
  728. // feature for iterating keys in an index block or a data block.
  729. //
  730. // We will alternate between buf_ and secondary_buf_ to hold the key. key_
  731. // will be modified in accordance to point to the right one. This is to avoid
  732. // an extra copy when we need to copy some shared bytes from previous key
  733. // (delta encoding), and we need to pad a min timestamp at the right location.
  734. char space_for_secondary_buf_[kInlineBufferSize]; // Avoid allocation for
  735. // short keys
  736. char* secondary_buf_;
  737. size_t secondary_buf_size_;
  738. // Use to track the pieces that together make the whole key. We then copy
  739. // these pieces in order either into buf_ or secondary_buf_ depending on where
  740. // the previous key is held.
  741. std::array<Slice, 5> key_slices_;
  742. // End of variables used by user-defined timestamps in MemTable only feature.
  743. Slice SetKeyImpl(const Slice& key, bool copy) {
  744. size_t size = key.size();
  745. if (copy) {
  746. // Copy key to buf_
  747. EnlargeBufferIfNeeded(size);
  748. memcpy(buf_, key.data(), size);
  749. key_ = buf_;
  750. } else {
  751. // Update key_ to point to external memory
  752. key_ = key.data();
  753. }
  754. key_size_ = size;
  755. return Slice(key_, key_size_);
  756. }
  757. Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
  758. assert(num_key_slices <= 5);
  759. char* buf_start = nullptr;
  760. if (key_ == buf_) {
  761. // If the previous key is in buf_, we copy key_slices_ in order into
  762. // secondary_buf_.
  763. EnlargeSecondaryBufferIfNeeded(total_bytes);
  764. buf_start = secondary_buf_;
  765. key_ = secondary_buf_;
  766. } else {
  767. // Copy key_slices_ in order into buf_.
  768. EnlargeBufferIfNeeded(total_bytes);
  769. buf_start = buf_;
  770. key_ = buf_;
  771. }
  772. #ifndef NDEBUG
  773. size_t actual_total_bytes = 0;
  774. #endif // NDEBUG
  775. for (size_t i = 0; i < num_key_slices; i++) {
  776. size_t key_slice_size = key_slices_[i].size();
  777. memcpy(buf_start, key_slices_[i].data(), key_slice_size);
  778. buf_start += key_slice_size;
  779. #ifndef NDEBUG
  780. actual_total_bytes += key_slice_size;
  781. #endif // NDEBUG
  782. }
  783. #ifndef NDEBUG
  784. assert(actual_total_bytes == total_bytes);
  785. #endif // NDEBUG
  786. key_size_ = total_bytes;
  787. return Slice(key_, key_size_);
  788. }
  789. void ResetBuffer() {
  790. if (key_ == buf_) {
  791. key_size_ = 0;
  792. }
  793. if (buf_ != space_) {
  794. delete[] buf_;
  795. buf_ = space_;
  796. }
  797. buf_size_ = kInlineBufferSize;
  798. }
  799. void ResetSecondaryBuffer() {
  800. if (key_ == secondary_buf_) {
  801. key_size_ = 0;
  802. }
  803. if (secondary_buf_ != space_for_secondary_buf_) {
  804. delete[] secondary_buf_;
  805. secondary_buf_ = space_for_secondary_buf_;
  806. }
  807. secondary_buf_size_ = kInlineBufferSize;
  808. }
  809. // Enlarge the buffer size if needed based on key_size.
  810. // By default, inline buffer is used. Once there is a key
  811. // larger than the inline buffer, another buffer is dynamically
  812. // allocated, until a larger key buffer is requested. In that case, we
  813. // reallocate buffer and delete the old one.
  814. void EnlargeBufferIfNeeded(size_t key_size) {
  815. // If size is smaller than buffer size, continue using current buffer,
  816. // or the static allocated one, as default
  817. if (key_size > buf_size_) {
  818. EnlargeBuffer(key_size);
  819. }
  820. }
  821. void EnlargeSecondaryBufferIfNeeded(size_t key_size);
  822. void EnlargeBuffer(size_t key_size);
  823. void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
  824. const size_t slice_sz, bool add_timestamp,
  825. const size_t left_sz, const size_t ts_sz,
  826. size_t* next_key_slice_idx,
  827. bool* ts_added) {
  828. assert(next_key_slice_idx);
  829. if (add_timestamp && !*ts_added) {
  830. assert(slice_sz >= left_sz);
  831. key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz);
  832. key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz);
  833. key_slices_[(*next_key_slice_idx)++] =
  834. Slice(slice_data + left_sz, slice_sz - left_sz);
  835. *ts_added = true;
  836. } else {
  837. key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz);
  838. }
  839. assert(*next_key_slice_idx <= 5);
  840. }
  841. };
  842. // Convert from a SliceTransform of user keys, to a SliceTransform of
  843. // internal keys.
  844. class InternalKeySliceTransform : public SliceTransform {
  845. public:
  846. explicit InternalKeySliceTransform(const SliceTransform* transform)
  847. : transform_(transform) {}
  848. const char* Name() const override { return transform_->Name(); }
  849. Slice Transform(const Slice& src) const override {
  850. auto user_key = ExtractUserKey(src);
  851. return transform_->Transform(user_key);
  852. }
  853. bool InDomain(const Slice& src) const override {
  854. auto user_key = ExtractUserKey(src);
  855. return transform_->InDomain(user_key);
  856. }
  857. bool InRange(const Slice& dst) const override {
  858. auto user_key = ExtractUserKey(dst);
  859. return transform_->InRange(user_key);
  860. }
  861. const SliceTransform* user_prefix_extractor() const { return transform_; }
  862. private:
  863. // Like comparator, InternalKeySliceTransform will not take care of the
  864. // deletion of transform_
  865. const SliceTransform* const transform_;
  866. };
  867. // Read the key of a record from a write batch.
  868. // if this record represent the default column family then cf_record
  869. // must be passed as false, otherwise it must be passed as true.
  870. bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record);
  871. // Read record from a write batch piece from input.
  872. // tag, column_family, key, value and blob are return values. Callers own the
  873. // slice they point to.
  874. // Tag is defined as ValueType.
  875. // input will be advanced to after the record.
  876. // If user-defined timestamp is enabled for a column family, then the `key`
  877. // resulting from this call will include timestamp.
  878. Status ReadRecordFromWriteBatch(Slice* input, char* tag,
  879. uint32_t* column_family, Slice* key,
  880. Slice* value, Slice* blob, Slice* xid,
  881. uint64_t* write_unix_time);
  882. // When user call DeleteRange() to delete a range of keys,
  883. // we will store a serialized RangeTombstone in MemTable and SST.
  884. // the struct here is an easy-understood form
  885. // start/end_key_ is the start/end user key of the range to be deleted
  886. struct RangeTombstone {
  887. Slice start_key_;
  888. Slice end_key_;
  889. SequenceNumber seq_;
  890. // TODO: we should optimize the storage here when user-defined timestamp
  891. // is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
  892. Slice ts_;
  893. std::string pinned_start_key_;
  894. std::string pinned_end_key_;
  895. RangeTombstone() = default;
  896. RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
  897. : start_key_(sk), end_key_(ek), seq_(sn) {}
  898. // User-defined timestamp is enabled, `sk` and `ek` should be user key
  899. // with timestamp, `ts` will replace the timestamps in `sk` and
  900. // `ek`.
  901. RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) : seq_(sn) {
  902. const size_t ts_sz = ts.size();
  903. assert(ts_sz > 0);
  904. pinned_start_key_.reserve(sk.size());
  905. pinned_end_key_.reserve(ek.size());
  906. AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
  907. AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
  908. start_key_ = pinned_start_key_;
  909. end_key_ = pinned_end_key_;
  910. ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz);
  911. }
  912. RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
  913. start_key_ = parsed_key.user_key;
  914. seq_ = parsed_key.sequence;
  915. end_key_ = value;
  916. }
  917. // be careful to use Serialize(), allocates new memory
  918. std::pair<InternalKey, Slice> Serialize() const {
  919. auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
  920. return std::make_pair(std::move(key), end_key_);
  921. }
  922. // be careful to use SerializeKey(), allocates new memory
  923. InternalKey SerializeKey() const {
  924. return InternalKey(start_key_, seq_, kTypeRangeDeletion);
  925. }
  926. // The tombstone end-key is exclusive, so we generate an internal-key here
  927. // which has a similar property. Using kMaxSequenceNumber guarantees that
  928. // the returned internal-key will compare less than any other internal-key
  929. // with the same user-key. This in turn guarantees that the serialized
  930. // end-key for a tombstone such as [a-b] will compare less than the key "b".
  931. //
  932. // be careful to use SerializeEndKey(), allocates new memory
  933. InternalKey SerializeEndKey() const {
  934. if (!ts_.empty()) {
  935. static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
  936. if (ts_.size() <= strlen(kTsMax)) {
  937. return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
  938. Slice(kTsMax, ts_.size()));
  939. } else {
  940. return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
  941. std::string(ts_.size(), '\xff'));
  942. }
  943. }
  944. return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
  945. }
  946. };
  947. inline int InternalKeyComparator::Compare(const Slice& akey,
  948. const Slice& bkey) const {
  949. // Order by:
  950. // increasing user key (according to user-supplied comparator)
  951. // decreasing sequence number
  952. // decreasing type (though sequence# should be enough to disambiguate)
  953. int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
  954. if (r == 0) {
  955. const uint64_t anum =
  956. DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
  957. const uint64_t bnum =
  958. DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
  959. if (anum > bnum) {
  960. r = -1;
  961. } else if (anum < bnum) {
  962. r = +1;
  963. }
  964. }
  965. return r;
  966. }
  967. inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
  968. const Slice& bkey) const {
  969. // Order by:
  970. // increasing user key (according to user-supplied comparator)
  971. // decreasing sequence number
  972. int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
  973. if (r == 0) {
  974. // Shift the number to exclude the last byte which contains the value type
  975. const uint64_t anum =
  976. DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
  977. const uint64_t bnum =
  978. DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
  979. if (anum > bnum) {
  980. r = -1;
  981. } else if (anum < bnum) {
  982. r = +1;
  983. }
  984. }
  985. return r;
  986. }
  987. inline int InternalKeyComparator::CompareKeySeq(const ParsedInternalKey& a,
  988. const Slice& b) const {
  989. // Order by:
  990. // increasing user key (according to user-supplied comparator)
  991. // decreasing sequence number
  992. int r = user_comparator_.Compare(a.user_key, ExtractUserKey(b));
  993. if (r == 0) {
  994. // Shift the number to exclude the last byte which contains the value type
  995. const uint64_t anum = a.sequence;
  996. const uint64_t bnum =
  997. DecodeFixed64(b.data() + b.size() - kNumInternalBytes) >> 8;
  998. if (anum > bnum) {
  999. r = -1;
  1000. } else if (anum < bnum) {
  1001. r = +1;
  1002. }
  1003. }
  1004. return r;
  1005. }
  1006. inline int InternalKeyComparator::Compare(const Slice& a,
  1007. SequenceNumber a_global_seqno,
  1008. const Slice& b,
  1009. SequenceNumber b_global_seqno) const {
  1010. int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
  1011. if (r == 0) {
  1012. uint64_t a_footer, b_footer;
  1013. if (a_global_seqno == kDisableGlobalSequenceNumber) {
  1014. a_footer = ExtractInternalKeyFooter(a);
  1015. } else {
  1016. a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a));
  1017. }
  1018. if (b_global_seqno == kDisableGlobalSequenceNumber) {
  1019. b_footer = ExtractInternalKeyFooter(b);
  1020. } else {
  1021. b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b));
  1022. }
  1023. if (a_footer > b_footer) {
  1024. r = -1;
  1025. } else if (a_footer < b_footer) {
  1026. r = +1;
  1027. }
  1028. }
  1029. return r;
  1030. }
  1031. // Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
  1032. struct ParsedInternalKeyComparator {
  1033. explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
  1034. : cmp(c) {}
  1035. bool operator()(const ParsedInternalKey& a,
  1036. const ParsedInternalKey& b) const {
  1037. return cmp->Compare(a, b) < 0;
  1038. }
  1039. const InternalKeyComparator* cmp;
  1040. };
  1041. class PredecessorWALInfo {
  1042. public:
  1043. PredecessorWALInfo()
  1044. : log_number_(0),
  1045. size_bytes_(0),
  1046. last_seqno_recorded_(0),
  1047. initialized_(false) {}
  1048. explicit PredecessorWALInfo(uint64_t log_number, uint64_t size_bytes,
  1049. SequenceNumber last_seqno_recorded)
  1050. : log_number_(log_number),
  1051. size_bytes_(size_bytes),
  1052. last_seqno_recorded_(last_seqno_recorded),
  1053. initialized_(true) {}
  1054. uint64_t GetLogNumber() const {
  1055. assert(initialized_);
  1056. return log_number_;
  1057. }
  1058. uint64_t GetSizeBytes() const {
  1059. assert(initialized_);
  1060. return size_bytes_;
  1061. }
  1062. SequenceNumber GetLastSeqnoRecorded() const {
  1063. assert(initialized_);
  1064. return last_seqno_recorded_;
  1065. }
  1066. bool IsInitialized() const { return initialized_; }
  1067. inline void EncodeTo(std::string* dst) const {
  1068. assert(dst != nullptr);
  1069. assert(initialized_);
  1070. PutFixed64(dst, log_number_);
  1071. PutFixed64(dst, size_bytes_);
  1072. PutFixed64(dst, last_seqno_recorded_);
  1073. }
  1074. inline Status DecodeFrom(Slice* src) {
  1075. if (!GetFixed64(src, &log_number_)) {
  1076. return Status::Corruption("Error decoding log number");
  1077. }
  1078. if (!GetFixed64(src, &size_bytes_)) {
  1079. return Status::Corruption("Error decoding size bytes");
  1080. }
  1081. if (!GetFixed64(src, &last_seqno_recorded_)) {
  1082. return Status::Corruption("Error decoding last seqno recorded");
  1083. }
  1084. initialized_ = true;
  1085. return Status::OK();
  1086. }
  1087. private:
  1088. uint64_t log_number_;
  1089. uint64_t size_bytes_;
  1090. SequenceNumber last_seqno_recorded_;
  1091. bool initialized_;
  1092. };
  1093. } // namespace ROCKSDB_NAMESPACE