// get_context.cc (13 KB)
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "table/get_context.h"
  6. #include "db/merge_helper.h"
  7. #include "db/pinned_iterators_manager.h"
  8. #include "db/read_callback.h"
  9. #include "monitoring/file_read_sample.h"
  10. #include "monitoring/perf_context_imp.h"
  11. #include "monitoring/statistics.h"
  12. #include "rocksdb/env.h"
  13. #include "rocksdb/merge_operator.h"
  14. #include "rocksdb/statistics.h"
  15. namespace ROCKSDB_NAMESPACE {
  16. namespace {
  17. void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
  18. #ifndef ROCKSDB_LITE
  19. if (replay_log) {
  20. if (replay_log->empty()) {
  21. // Optimization: in the common case of only one operation in the
  22. // log, we allocate the exact amount of space needed.
  23. replay_log->reserve(1 + VarintLength(value.size()) + value.size());
  24. }
  25. replay_log->push_back(type);
  26. PutLengthPrefixedSlice(replay_log, value);
  27. }
  28. #else
  29. (void)replay_log;
  30. (void)type;
  31. (void)value;
  32. #endif // ROCKSDB_LITE
  33. }
  34. } // namespace
  35. GetContext::GetContext(
  36. const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger,
  37. Statistics* statistics, GetState init_state, const Slice& user_key,
  38. PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context,
  39. bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env,
  40. SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr,
  41. ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id)
  42. : ucmp_(ucmp),
  43. merge_operator_(merge_operator),
  44. logger_(logger),
  45. statistics_(statistics),
  46. state_(init_state),
  47. user_key_(user_key),
  48. pinnable_val_(pinnable_val),
  49. value_found_(value_found),
  50. merge_context_(merge_context),
  51. max_covering_tombstone_seq_(_max_covering_tombstone_seq),
  52. env_(env),
  53. seq_(seq),
  54. replay_log_(nullptr),
  55. pinned_iters_mgr_(_pinned_iters_mgr),
  56. callback_(callback),
  57. do_merge_(do_merge),
  58. is_blob_index_(is_blob_index),
  59. tracing_get_id_(tracing_get_id) {
  60. if (seq_) {
  61. *seq_ = kMaxSequenceNumber;
  62. }
  63. sample_ = should_sample_file_read();
  64. }
  65. // Called from TableCache::Get and Table::Get when file/block in which
  66. // key may exist are not there in TableCache/BlockCache respectively. In this
  67. // case we can't guarantee that key does not exist and are not permitted to do
  68. // IO to be certain.Set the status=kFound and value_found=false to let the
  69. // caller know that key may exist but is not there in memory
  70. void GetContext::MarkKeyMayExist() {
  71. state_ = kFound;
  72. if (value_found_ != nullptr) {
  73. *value_found_ = false;
  74. }
  75. }
  76. void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) {
  77. assert(state_ == kNotFound);
  78. appendToReplayLog(replay_log_, kTypeValue, value);
  79. state_ = kFound;
  80. if (LIKELY(pinnable_val_ != nullptr)) {
  81. pinnable_val_->PinSelf(value);
  82. }
  83. }
  84. void GetContext::ReportCounters() {
  85. if (get_context_stats_.num_cache_hit > 0) {
  86. RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit);
  87. }
  88. if (get_context_stats_.num_cache_index_hit > 0) {
  89. RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT,
  90. get_context_stats_.num_cache_index_hit);
  91. }
  92. if (get_context_stats_.num_cache_data_hit > 0) {
  93. RecordTick(statistics_, BLOCK_CACHE_DATA_HIT,
  94. get_context_stats_.num_cache_data_hit);
  95. }
  96. if (get_context_stats_.num_cache_filter_hit > 0) {
  97. RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT,
  98. get_context_stats_.num_cache_filter_hit);
  99. }
  100. if (get_context_stats_.num_cache_compression_dict_hit > 0) {
  101. RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT,
  102. get_context_stats_.num_cache_compression_dict_hit);
  103. }
  104. if (get_context_stats_.num_cache_index_miss > 0) {
  105. RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS,
  106. get_context_stats_.num_cache_index_miss);
  107. }
  108. if (get_context_stats_.num_cache_filter_miss > 0) {
  109. RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS,
  110. get_context_stats_.num_cache_filter_miss);
  111. }
  112. if (get_context_stats_.num_cache_data_miss > 0) {
  113. RecordTick(statistics_, BLOCK_CACHE_DATA_MISS,
  114. get_context_stats_.num_cache_data_miss);
  115. }
  116. if (get_context_stats_.num_cache_compression_dict_miss > 0) {
  117. RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS,
  118. get_context_stats_.num_cache_compression_dict_miss);
  119. }
  120. if (get_context_stats_.num_cache_bytes_read > 0) {
  121. RecordTick(statistics_, BLOCK_CACHE_BYTES_READ,
  122. get_context_stats_.num_cache_bytes_read);
  123. }
  124. if (get_context_stats_.num_cache_miss > 0) {
  125. RecordTick(statistics_, BLOCK_CACHE_MISS,
  126. get_context_stats_.num_cache_miss);
  127. }
  128. if (get_context_stats_.num_cache_add > 0) {
  129. RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add);
  130. }
  131. if (get_context_stats_.num_cache_bytes_write > 0) {
  132. RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE,
  133. get_context_stats_.num_cache_bytes_write);
  134. }
  135. if (get_context_stats_.num_cache_index_add > 0) {
  136. RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD,
  137. get_context_stats_.num_cache_index_add);
  138. }
  139. if (get_context_stats_.num_cache_index_bytes_insert > 0) {
  140. RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT,
  141. get_context_stats_.num_cache_index_bytes_insert);
  142. }
  143. if (get_context_stats_.num_cache_data_add > 0) {
  144. RecordTick(statistics_, BLOCK_CACHE_DATA_ADD,
  145. get_context_stats_.num_cache_data_add);
  146. }
  147. if (get_context_stats_.num_cache_data_bytes_insert > 0) {
  148. RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT,
  149. get_context_stats_.num_cache_data_bytes_insert);
  150. }
  151. if (get_context_stats_.num_cache_filter_add > 0) {
  152. RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD,
  153. get_context_stats_.num_cache_filter_add);
  154. }
  155. if (get_context_stats_.num_cache_filter_bytes_insert > 0) {
  156. RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT,
  157. get_context_stats_.num_cache_filter_bytes_insert);
  158. }
  159. if (get_context_stats_.num_cache_compression_dict_add > 0) {
  160. RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD,
  161. get_context_stats_.num_cache_compression_dict_add);
  162. }
  163. if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) {
  164. RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
  165. get_context_stats_.num_cache_compression_dict_bytes_insert);
  166. }
  167. }
  168. bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
  169. const Slice& value, bool* matched,
  170. Cleanable* value_pinner) {
  171. assert(matched);
  172. assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
  173. merge_context_ != nullptr);
  174. if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) {
  175. *matched = true;
  176. // If the value is not in the snapshot, skip it
  177. if (!CheckCallback(parsed_key.sequence)) {
  178. return true; // to continue to the next seq
  179. }
  180. appendToReplayLog(replay_log_, parsed_key.type, value);
  181. if (seq_ != nullptr) {
  182. // Set the sequence number if it is uninitialized
  183. if (*seq_ == kMaxSequenceNumber) {
  184. *seq_ = parsed_key.sequence;
  185. }
  186. }
  187. auto type = parsed_key.type;
  188. // Key matches. Process it
  189. if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) &&
  190. max_covering_tombstone_seq_ != nullptr &&
  191. *max_covering_tombstone_seq_ > parsed_key.sequence) {
  192. type = kTypeRangeDeletion;
  193. }
  194. switch (type) {
  195. case kTypeValue:
  196. case kTypeBlobIndex:
  197. assert(state_ == kNotFound || state_ == kMerge);
  198. if (type == kTypeBlobIndex && is_blob_index_ == nullptr) {
  199. // Blob value not supported. Stop.
  200. state_ = kBlobIndex;
  201. return false;
  202. }
  203. if (kNotFound == state_) {
  204. state_ = kFound;
  205. if (do_merge_) {
  206. if (LIKELY(pinnable_val_ != nullptr)) {
  207. if (LIKELY(value_pinner != nullptr)) {
  208. // If the backing resources for the value are provided, pin them
  209. pinnable_val_->PinSlice(value, value_pinner);
  210. } else {
  211. TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf",
  212. this);
  213. // Otherwise copy the value
  214. pinnable_val_->PinSelf(value);
  215. }
  216. }
  217. } else {
  218. // It means this function is called as part of DB GetMergeOperands
  219. // API and the current value should be part of
  220. // merge_context_->operand_list
  221. push_operand(value, value_pinner);
  222. }
  223. } else if (kMerge == state_) {
  224. assert(merge_operator_ != nullptr);
  225. state_ = kFound;
  226. if (do_merge_) {
  227. if (LIKELY(pinnable_val_ != nullptr)) {
  228. Status merge_status = MergeHelper::TimedFullMerge(
  229. merge_operator_, user_key_, &value,
  230. merge_context_->GetOperands(), pinnable_val_->GetSelf(),
  231. logger_, statistics_, env_);
  232. pinnable_val_->PinSelf();
  233. if (!merge_status.ok()) {
  234. state_ = kCorrupt;
  235. }
  236. }
  237. } else {
  238. // It means this function is called as part of DB GetMergeOperands
  239. // API and the current value should be part of
  240. // merge_context_->operand_list
  241. push_operand(value, value_pinner);
  242. }
  243. }
  244. if (is_blob_index_ != nullptr) {
  245. *is_blob_index_ = (type == kTypeBlobIndex);
  246. }
  247. return false;
  248. case kTypeDeletion:
  249. case kTypeSingleDeletion:
  250. case kTypeRangeDeletion:
  251. // TODO(noetzli): Verify correctness once merge of single-deletes
  252. // is supported
  253. assert(state_ == kNotFound || state_ == kMerge);
  254. if (kNotFound == state_) {
  255. state_ = kDeleted;
  256. } else if (kMerge == state_) {
  257. state_ = kFound;
  258. if (LIKELY(pinnable_val_ != nullptr)) {
  259. if (do_merge_) {
  260. Status merge_status = MergeHelper::TimedFullMerge(
  261. merge_operator_, user_key_, nullptr,
  262. merge_context_->GetOperands(), pinnable_val_->GetSelf(),
  263. logger_, statistics_, env_);
  264. pinnable_val_->PinSelf();
  265. if (!merge_status.ok()) {
  266. state_ = kCorrupt;
  267. }
  268. }
  269. // If do_merge_ = false then the current value shouldn't be part of
  270. // merge_context_->operand_list
  271. }
  272. }
  273. return false;
  274. case kTypeMerge:
  275. assert(state_ == kNotFound || state_ == kMerge);
  276. state_ = kMerge;
  277. // value_pinner is not set from plain_table_reader.cc for example.
  278. push_operand(value, value_pinner);
  279. if (do_merge_ && merge_operator_ != nullptr &&
  280. merge_operator_->ShouldMerge(
  281. merge_context_->GetOperandsDirectionBackward())) {
  282. state_ = kFound;
  283. if (LIKELY(pinnable_val_ != nullptr)) {
  284. // do_merge_ = true this is the case where this function is called
  285. // as part of DB Get API hence merge operators should be merged.
  286. if (do_merge_) {
  287. Status merge_status = MergeHelper::TimedFullMerge(
  288. merge_operator_, user_key_, nullptr,
  289. merge_context_->GetOperands(), pinnable_val_->GetSelf(),
  290. logger_, statistics_, env_);
  291. pinnable_val_->PinSelf();
  292. if (!merge_status.ok()) {
  293. state_ = kCorrupt;
  294. }
  295. }
  296. }
  297. return false;
  298. }
  299. return true;
  300. default:
  301. assert(false);
  302. break;
  303. }
  304. }
  305. // state_ could be Corrupt, merge or notfound
  306. return false;
  307. }
  308. void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) {
  309. if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() &&
  310. value_pinner != nullptr) {
  311. value_pinner->DelegateCleanupsTo(pinned_iters_mgr());
  312. merge_context_->PushOperand(value, true /*value_pinned*/);
  313. } else {
  314. merge_context_->PushOperand(value, false);
  315. }
  316. }
  317. void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
  318. GetContext* get_context, Cleanable* value_pinner) {
  319. #ifndef ROCKSDB_LITE
  320. Slice s = replay_log;
  321. while (s.size()) {
  322. auto type = static_cast<ValueType>(*s.data());
  323. s.remove_prefix(1);
  324. Slice value;
  325. bool ret = GetLengthPrefixedSlice(&s, &value);
  326. assert(ret);
  327. (void)ret;
  328. bool dont_care __attribute__((__unused__));
  329. // Since SequenceNumber is not stored and unknown, we will use
  330. // kMaxSequenceNumber.
  331. get_context->SaveValue(
  332. ParsedInternalKey(user_key, kMaxSequenceNumber, type), value,
  333. &dont_care, value_pinner);
  334. }
  335. #else // ROCKSDB_LITE
  336. (void)replay_log;
  337. (void)user_key;
  338. (void)get_context;
  339. (void)value_pinner;
  340. assert(false);
  341. #endif // ROCKSDB_LITE
  342. }
  343. } // namespace ROCKSDB_NAMESPACE