transaction_util.cc 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "utilities/transactions/transaction_util.h"
  6. #include <cinttypes>
  7. #include <string>
  8. #include <vector>
  9. #include "db/db_impl/db_impl.h"
  10. #include "rocksdb/status.h"
  11. #include "rocksdb/utilities/write_batch_with_index.h"
  12. #include "util/cast_util.h"
  13. #include "util/string_util.h"
  14. namespace ROCKSDB_NAMESPACE {
  15. Status TransactionUtil::CheckKeyForConflicts(
  16. DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
  17. SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
  18. ReadCallback* snap_checker, SequenceNumber min_uncommitted,
  19. bool enable_udt_validation) {
  20. Status result;
  21. auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
  22. auto cfd = cfh->cfd();
  23. SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
  24. if (sv == nullptr) {
  25. result = Status::InvalidArgument("Could not access column family " +
  26. cfh->GetName());
  27. }
  28. if (result.ok()) {
  29. SequenceNumber earliest_seq =
  30. db_impl->GetEarliestMemTableSequenceNumber(sv, true);
  31. result =
  32. CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts, cache_only,
  33. snap_checker, min_uncommitted, enable_udt_validation);
  34. db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
  35. }
  36. return result;
  37. }
  38. Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
  39. SequenceNumber earliest_seq,
  40. SequenceNumber snap_seq,
  41. const std::string& key,
  42. const std::string* const read_ts,
  43. bool cache_only, ReadCallback* snap_checker,
  44. SequenceNumber min_uncommitted,
  45. bool enable_udt_validation) {
  46. // When `min_uncommitted` is provided, keys are not always committed
  47. // in sequence number order, and `snap_checker` is used to check whether
  48. // specific sequence number is in the database is visible to the transaction.
  49. // So `snap_checker` must be provided.
  50. assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
  51. Status result;
  52. bool need_to_read_sst = false;
  53. // Since it would be too slow to check the SST files, we will only use
  54. // the memtables to check whether there have been any recent writes
  55. // to this key after it was accessed in this transaction. But if the
  56. // Memtables do not contain a long enough history, we must fail the
  57. // transaction.
  58. if (earliest_seq == kMaxSequenceNumber) {
  59. // The age of this memtable is unknown. Cannot rely on it to check
  60. // for recent writes. This error shouldn't happen often in practice as
  61. // the Memtable should have a valid earliest sequence number except in some
  62. // corner cases (such as error cases during recovery).
  63. need_to_read_sst = true;
  64. if (cache_only) {
  65. result = Status::TryAgain(
  66. "Transaction could not check for conflicts as the MemTable does not "
  67. "contain a long enough history to check write at SequenceNumber: ",
  68. std::to_string(snap_seq));
  69. }
  70. } else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) {
  71. // Use <= for min_uncommitted since earliest_seq is actually the largest sec
  72. // before this memtable was created
  73. need_to_read_sst = true;
  74. if (cache_only) {
  75. // The age of this memtable is too new to use to check for recent
  76. // writes.
  77. char msg[300];
  78. snprintf(msg, sizeof(msg),
  79. "Transaction could not check for conflicts for operation at "
  80. "SequenceNumber %" PRIu64
  81. " as the MemTable only contains changes newer than "
  82. "SequenceNumber %" PRIu64
  83. ". Increasing the value of the "
  84. "max_write_buffer_size_to_maintain option could reduce the "
  85. "frequency "
  86. "of this error.",
  87. snap_seq, earliest_seq);
  88. result = Status::TryAgain(msg);
  89. }
  90. }
  91. if (result.ok()) {
  92. SequenceNumber seq = kMaxSequenceNumber;
  93. std::string timestamp;
  94. bool found_record_for_key = false;
  95. // When min_uncommitted == kMaxSequenceNumber, writes are committed in
  96. // sequence number order, so only keys larger than `snap_seq` can cause
  97. // conflict.
  98. // When min_uncommitted != kMaxSequenceNumber, keys lower than
  99. // min_uncommitted will not triggered conflicts, while keys larger than
  100. // min_uncommitted might create conflicts, so we need to read them out
  101. // from the DB, and call callback to snap_checker to determine. So only
  102. // keys lower than min_uncommitted can be skipped.
  103. SequenceNumber lower_bound_seq =
  104. (min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
  105. Status s = db_impl->GetLatestSequenceForKey(
  106. sv, key, !need_to_read_sst, lower_bound_seq, &seq,
  107. !read_ts ? nullptr : &timestamp, &found_record_for_key,
  108. /*is_blob_index=*/nullptr);
  109. if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
  110. result = s;
  111. } else if (found_record_for_key) {
  112. bool write_conflict = snap_checker == nullptr
  113. ? snap_seq < seq
  114. : !snap_checker->IsVisible(seq);
  115. // Perform conflict checking based on timestamp if applicable.
  116. if (enable_udt_validation && !write_conflict && read_ts != nullptr) {
  117. ColumnFamilyData* cfd = sv->cfd;
  118. assert(cfd);
  119. const Comparator* const ucmp = cfd->user_comparator();
  120. assert(ucmp);
  121. assert(read_ts->size() == ucmp->timestamp_size());
  122. assert(read_ts->size() == timestamp.size());
  123. // Write conflict if *ts < timestamp.
  124. write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0;
  125. }
  126. if (write_conflict) {
  127. result = Status::Busy();
  128. }
  129. }
  130. }
  131. return result;
  132. }
  133. Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
  134. const LockTracker& tracker,
  135. bool cache_only) {
  136. Status result;
  137. std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
  138. tracker.GetColumnFamilyIterator());
  139. assert(cf_it != nullptr);
  140. while (cf_it->HasNext()) {
  141. ColumnFamilyId cf = cf_it->Next();
  142. SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf);
  143. if (sv == nullptr) {
  144. result = Status::InvalidArgument("Could not access column family " +
  145. std::to_string(cf));
  146. break;
  147. }
  148. SequenceNumber earliest_seq =
  149. db_impl->GetEarliestMemTableSequenceNumber(sv, true);
  150. // For each of the keys in this transaction, check to see if someone has
  151. // written to this key since the start of the transaction.
  152. std::unique_ptr<LockTracker::KeyIterator> key_it(
  153. tracker.GetKeyIterator(cf));
  154. assert(key_it != nullptr);
  155. while (key_it->HasNext()) {
  156. const std::string& key = key_it->Next();
  157. PointLockStatus status = tracker.GetPointLockStatus(cf, key);
  158. const SequenceNumber key_seq = status.seq;
  159. // TODO: support timestamp-based conflict checking.
  160. // CheckKeysForConflicts() is currently used only by optimistic
  161. // transactions.
  162. result = CheckKey(db_impl, sv, earliest_seq, key_seq, key,
  163. /*read_ts=*/nullptr, cache_only);
  164. if (!result.ok()) {
  165. break;
  166. }
  167. }
  168. db_impl->ReturnAndCleanupSuperVersion(cf, sv);
  169. if (!result.ok()) {
  170. break;
  171. }
  172. }
  173. return result;
  174. }
  175. } // namespace ROCKSDB_NAMESPACE