write_unprepared_txn_db.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE

#include "utilities/transactions/write_unprepared_txn_db.h"

#include "db/arena_wrapped_db_iter.h"
#include "rocksdb/utilities/transaction_db.h"
#include "util/cast_util.h"

namespace ROCKSDB_NAMESPACE {

// Instead of reconstructing a Transaction object and calling rollback on it,
// we can be more efficient with RollbackRecoveredTransaction by skipping
// unnecessary steps (e.g. updating CommitMap, reconstructing keyset).
Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
    const DBImpl::RecoveredTransaction* rtxn) {
  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
  assert(rtxn->unprepared_);
  auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
  auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
  // In theory we could write with disableWAL = true during recovery, and
  // assume that if we crash again during recovery, we can just replay from
  // the very beginning. Unfortunately, the XIDs from the application may not
  // necessarily be unique across restarts, potentially leading to situations
  // like this:
  //
  // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
  // -- crash and recover with Put(a) rolled back as it was not prepared
  // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
  // COMMIT(xid = 1)
  // -- crash and recover with both a, b
  //
  // We could just write the rollback marker, but then we would have to extend
  // MemTableInserter during recovery to actually do writes into the DB
  // instead of just dropping the in-memory write batch.
  //
  WriteOptions w_options;

  class InvalidSnapshotReadCallback : public ReadCallback {
   public:
    InvalidSnapshotReadCallback(SequenceNumber snapshot)
        : ReadCallback(snapshot) {}

    inline bool IsVisibleFullCheck(SequenceNumber) override {
      // The seq provided as snapshot is the seq right before we locked and
      // wrote to it, so whatever is there, it is committed.
      return true;
    }

    // Ignore the refresh request since we are confident that our snapshot seq
    // is not going to be affected by concurrent compactions (not enabled yet.)
    void Refresh(SequenceNumber) override {}
  };

  // Iterate starting with largest sequence number.
  for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
    auto last_visible_txn = it->first - 1;
    const auto& batch = it->second.batch_;
    WriteBatch rollback_batch;

    struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
      DBImpl* db_;
      ReadOptions roptions;
      InvalidSnapshotReadCallback callback;
      WriteBatch* rollback_batch_;
      std::map<uint32_t, const Comparator*>& comparators_;
      std::map<uint32_t, ColumnFamilyHandle*>& handles_;
      using CFKeys = std::set<Slice, SetComparator>;
      std::map<uint32_t, CFKeys> keys_;
      bool rollback_merge_operands_;
      RollbackWriteBatchBuilder(
          DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
          std::map<uint32_t, const Comparator*>& comparators,
          std::map<uint32_t, ColumnFamilyHandle*>& handles,
          bool rollback_merge_operands)
          : db_(db),
            callback(snap_seq),
            // disable min_uncommitted optimization
            rollback_batch_(dst_batch),
            comparators_(comparators),
            handles_(handles),
            rollback_merge_operands_(rollback_merge_operands) {}

      Status Rollback(uint32_t cf, const Slice& key) {
        Status s;
        CFKeys& cf_keys = keys_[cf];
        if (cf_keys.size() == 0) {  // just inserted
          auto cmp = comparators_[cf];
          keys_[cf] = CFKeys(SetComparator(cmp));
        }
        auto res = cf_keys.insert(key);
        if (res.second ==
            false) {  // second is false if an element already existed.
          return s;
        }
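
        // Read the value currently visible for this key so the rollback batch
        // can restore it below; if nothing is found, a Delete is written
        // instead so the key ends up absent after rollback.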
        PinnableSlice pinnable_val;
        bool not_used;
        auto cf_handle = handles_[cf];
        DBImpl::GetImplOptions get_impl_options;
        get_impl_options.column_family = cf_handle;
        get_impl_options.value = &pinnable_val;
        get_impl_options.value_found = &not_used;
        get_impl_options.callback = &callback;
        s = db_->GetImpl(roptions, key, get_impl_options);
        assert(s.ok() || s.IsNotFound());
        if (s.ok()) {
          s = rollback_batch_->Put(cf_handle, key, pinnable_val);
          assert(s.ok());
        } else if (s.IsNotFound()) {
          // There has been no readable value before txn. By adding a delete we
          // make sure that there will be none afterwards either.
          s = rollback_batch_->Delete(cf_handle, key);
          assert(s.ok());
        } else {
          // Unexpected status. Return it to the user.
        }

        return s;
      }

      Status PutCF(uint32_t cf, const Slice& key,
                   const Slice& /*val*/) override {
        return Rollback(cf, key);
      }

      Status DeleteCF(uint32_t cf, const Slice& key) override {
        return Rollback(cf, key);
      }

      Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
        return Rollback(cf, key);
      }

      Status MergeCF(uint32_t cf, const Slice& key,
                     const Slice& /*val*/) override {
        if (rollback_merge_operands_) {
          return Rollback(cf, key);
        } else {
          return Status::OK();
        }
      }

      // Recovered batches do not contain 2PC markers.
      Status MarkNoop(bool) override { return Status::InvalidArgument(); }
      Status MarkBeginPrepare(bool) override {
        return Status::InvalidArgument();
      }
      Status MarkEndPrepare(const Slice&) override {
        return Status::InvalidArgument();
      }
      Status MarkCommit(const Slice&) override {
        return Status::InvalidArgument();
      }
      Status MarkRollback(const Slice&) override {
        return Status::InvalidArgument();
      }
    } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
                       *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
                       txn_db_options_.rollback_merge_operands);

    auto s = batch->Iterate(&rollback_handler);
    if (!s.ok()) {
      return s;
    }

    // The Rollback marker will be used as a batch separator
    WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);

    const uint64_t kNoLogRef = 0;
    const bool kDisableMemtable = true;
    const size_t kOneBatch = 1;
    uint64_t seq_used = kMaxSequenceNumber;
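    // Write the rollback batch through the normal write path. Note that
    // !kDisableMemtable is passed, so the restored values are applied to the
    // memtable in addition to being logged in the WAL.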
    s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
                            kNoLogRef, !kDisableMemtable, &seq_used, kOneBatch);
    if (!s.ok()) {
      return s;
    }

    // If two_write_queues, we must manually release the sequence number to
    // readers.
    if (db_impl_->immutable_db_options().two_write_queues) {
      db_impl_->SetLastPublishedSequence(seq_used);
    }
  }

  return Status::OK();
}

Status WriteUnpreparedTxnDB::Initialize(
    const std::vector<size_t>& compaction_enabled_cf_indices,
    const std::vector<ColumnFamilyHandle*>& handles) {
  // TODO(lth): Reduce code duplication in this function.
  auto dbimpl = static_cast_with_check<DBImpl, DB>(GetRootDB());
  assert(dbimpl != nullptr);
  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));

  // A callback to commit a single sub-batch
  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
   public:
    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
        : db_(db) {}
    Status Callback(SequenceNumber commit_seq,
                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
                    size_t /*index*/, size_t /*total*/) override {
      assert(!is_mem_disabled);
      db_->AddCommitted(commit_seq, commit_seq);
      return Status::OK();
    }

   private:
    WritePreparedTxnDB* db_;
  };
  db_impl_->SetRecoverableStatePreReleaseCallback(
      new CommitSubBatchPreReleaseCallback(this));

  // PessimisticTransactionDB::Initialize
  for (auto cf_ptr : handles) {
    AddColumnFamily(cf_ptr);
  }
  // Verify cf options
  for (auto handle : handles) {
    ColumnFamilyDescriptor cfd;
    Status s = handle->GetDescriptor(&cfd);
    if (!s.ok()) {
      return s;
    }
    s = VerifyCFOptions(cfd.options);
    if (!s.ok()) {
      return s;
    }
  }

  // Re-enable compaction for the column families that initially had
  // compaction enabled.
  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (auto index : compaction_enabled_cf_indices) {
    compaction_enabled_cf_handles.push_back(handles[index]);
  }

  // create 'real' transactions from recovered shell transactions
  auto rtxns = dbimpl->recovered_transactions();
  std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
  for (auto rtxn : rtxns) {
    auto recovered_trx = rtxn.second;
    assert(recovered_trx);
    assert(recovered_trx->batches_.size() >= 1);
    assert(recovered_trx->name_.length());

    // We can only roll back transactions after AdvanceMaxEvictedSeq is called,
    // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why
    // two iterations are required.
    if (recovered_trx->unprepared_) {
      continue;
    }

    WriteOptions w_options;
    w_options.sync = true;
    TransactionOptions t_options;

    auto first_log_number = recovered_trx->batches_.begin()->second.log_number_;
    auto first_seq = recovered_trx->batches_.begin()->first;
    auto last_prepare_batch_cnt =
        recovered_trx->batches_.begin()->second.batch_cnt_;
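
    // Reconstruct a live WriteUnpreparedTxn from the recovered shell
    // transaction, restoring its log number, id, and name from the first
    // recovered batch.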
    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
    assert(real_trx);
    auto wupt =
        static_cast_with_check<WriteUnpreparedTxn, Transaction>(real_trx);
    wupt->recovered_txn_ = true;

    real_trx->SetLogNumber(first_log_number);
    real_trx->SetId(first_seq);
    Status s = real_trx->SetName(recovered_trx->name_);
    if (!s.ok()) {
      return s;
    }
    wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;

    for (auto batch : recovered_trx->batches_) {
      const auto& seq = batch.first;
      const auto& batch_info = batch.second;
      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
      assert(batch_info.log_number_);

      ordered_seq_cnt[seq] = cnt;
      assert(wupt->unprep_seqs_.count(seq) == 0);
      wupt->unprep_seqs_[seq] = cnt;

      s = wupt->RebuildFromWriteBatch(batch_info.batch_);
      assert(s.ok());
      if (!s.ok()) {
        return s;
      }
    }
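
    // The recovered writes already live in the WAL from before the crash; from
    // here on the transaction only needs unprep_seqs_ (and the keys tracked
    // above), so reset its in-memory write batch.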
    const bool kClear = true;
    wupt->InitWriteBatch(kClear);

    real_trx->SetState(Transaction::PREPARED);
    if (!s.ok()) {
      return s;
    }
  }

  // AddPrepared must be called in order
  for (auto seq_cnt : ordered_seq_cnt) {
    auto seq = seq_cnt.first;
    auto cnt = seq_cnt.second;
    for (size_t i = 0; i < cnt; i++) {
      AddPrepared(seq + i);
    }
  }

  SequenceNumber prev_max = max_evicted_seq_;
  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
  AdvanceMaxEvictedSeq(prev_max, last_seq);
  // Create a gap between max and the next snapshot. This simplifies the logic
  // in IsInSnapshot by not having to consider the special case of max ==
  // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
  if (last_seq) {
    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
    db_impl_->versions_->SetLastSequence(last_seq + 1);
    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
  }

  Status s;
  // Rollback unprepared transactions.
  for (auto rtxn : rtxns) {
    auto recovered_trx = rtxn.second;
    if (recovered_trx->unprepared_) {
      s = RollbackRecoveredTransaction(recovered_trx);
      if (!s.ok()) {
        return s;
      }
      continue;
    }
  }

  if (s.ok()) {
    dbimpl->DeleteAllRecoveredTransactions();

    // Compaction should start only after max_evicted_seq_ is set AND recovered
    // transactions are either added to PrepareHeap or rolled back.
    s = EnableAutoCompaction(compaction_enabled_cf_handles);
  }

  return s;
}

Transaction* WriteUnpreparedTxnDB::BeginTransaction(
    const WriteOptions& write_options, const TransactionOptions& txn_options,
    Transaction* old_txn) {
  if (old_txn != nullptr) {
    ReinitializeTransaction(old_txn, write_options, txn_options);
    return old_txn;
  } else {
    return new WriteUnpreparedTxn(this, write_options, txn_options);
  }
}

// Struct to hold ownership of snapshot and read callback for iterator cleanup.
struct WriteUnpreparedTxnDB::IteratorState {
  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
                std::shared_ptr<ManagedSnapshot> s,
                SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
      : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
                 kBackedByDBSnapshot),
        snapshot(s) {}
  SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }

  WriteUnpreparedTxnReadCallback callback;
  std::shared_ptr<ManagedSnapshot> snapshot;
};

namespace {
static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
  delete reinterpret_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
}
}  // anonymous namespace

Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options,
                                            ColumnFamilyHandle* column_family,
                                            WriteUnpreparedTxn* txn) {
  // TODO(lth): Refactor so that this logic is shared with WritePrepared.
  constexpr bool ALLOW_BLOB = true;
  constexpr bool ALLOW_REFRESH = true;
  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
  SequenceNumber snapshot_seq = kMaxSequenceNumber;
  SequenceNumber min_uncommitted = 0;

  // Currently, the Prev() iterator logic does not work well without snapshot
  // validation. The logic simply iterates through values of a key in
  // ascending seqno order, stopping at the first non-visible value and
  // returning the last visible value.
  //
  // For example, if snapshot sequence is 3, and we have the following keys:
  // foo: v1 1
  // foo: v2 2
  // foo: v3 3
  // foo: v4 4
  // foo: v5 5
  //
  // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
  // which is the last visible value.
  //
  // For unprepared transactions, if we have snap_seq = 3, but the current
  // transaction has unprep_seq 5, then returning the first non-visible value
  // would be incorrect, as we should return v5, and not v3. The problem is that
  // there are committed values at snapshot_seq < commit_seq < unprep_seq.
  //
  // Snapshot validation can prevent this problem by ensuring that no committed
  // values exist at snapshot_seq < commit_seq, and thus any value with a
  // sequence number greater than snapshot_seq must be unprepared values. For
  // example, if the transaction had a snapshot at 3, then snapshot validation
  // would be performed during the Put(v5) call. It would find v4, and the Put
  // would fail with snapshot validation failure.
  //
  // TODO(lth): Improve Prev() logic to continue iterating until
  // max_visible_seq, and then return the last visible value, so that this
  // restriction can be lifted.
  const Snapshot* snapshot = nullptr;
  if (options.snapshot == nullptr) {
    snapshot = GetSnapshot();
    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
  } else {
    snapshot = options.snapshot;
  }

  snapshot_seq = snapshot->GetSequenceNumber();
  assert(snapshot_seq != kMaxSequenceNumber);

  // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We are
  // guaranteed that for keys that were modified by this transaction (and thus
  // might have unprepared values), no committed values exist at
  // largest_validated_seq < commit_seq (or the contrapositive: any committed
  // value must exist at commit_seq <= largest_validated_seq). This implies
  // that commit_seq <= largest_validated_seq <= snapshot_seq, and hence
  // commit_seq <= snapshot_seq. As explained above, the problem with Prev()
  // only happens when snapshot_seq < commit_seq.
  //
  // For keys that were not modified by this transaction, largest_validated_seq_
  // is meaningless, and Prev() should just work with the existing visibility
  // logic.
  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
      !txn->unprep_seqs_.empty()) {
    ROCKS_LOG_ERROR(info_log_,
                    "WriteUnprepared iterator creation failed since the "
                    "transaction has performed unvalidated writes");
    return nullptr;
  }
  min_uncommitted =
      static_cast_with_check<const SnapshotImpl, const Snapshot>(snapshot)
          ->min_uncommitted_;

  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
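  // The read callback inside IteratorState is constructed with the
  // transaction's own unprep_seqs_, so the transaction's unprepared writes
  // remain visible through the iterator.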
  auto* state =
      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);
  auto* db_iter =
      db_impl_->NewIteratorImpl(options, cfd, state->MaxVisibleSeq(),
                                &state->callback, !ALLOW_BLOB, !ALLOW_REFRESH);
  db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr);
  return db_iter;
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // ROCKSDB_LITE