write_unprepared_txn_db.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "utilities/transactions/write_unprepared_txn_db.h"

#include "db/arena_wrapped_db_iter.h"
#include "rocksdb/utilities/transaction_db.h"
#include "util/cast_util.h"

namespace ROCKSDB_NAMESPACE {

// Instead of reconstructing a Transaction object and calling rollback on it,
// we can be more efficient with RollbackRecoveredTransaction by skipping
// unnecessary steps (e.g. updating CommitMap, reconstructing keyset).
Status WriteUnpreparedTxnDB::RollbackRecoveredTransaction(
    const DBImpl::RecoveredTransaction* rtxn) {
  // TODO(lth): Reduce duplicate code with WritePrepared rollback logic.
  assert(rtxn->unprepared_);
  auto cf_map_shared_ptr = WritePreparedTxnDB::GetCFHandleMap();
  auto cf_comp_map_shared_ptr = WritePreparedTxnDB::GetCFComparatorMap();
  // In theory we could write with disableWAL = true during recovery, and
  // assume that if we crash again during recovery, we can just replay from
  // the very beginning. Unfortunately, the XIDs from the application may not
  // necessarily be unique across restarts, potentially leading to situations
  // like this:
  //
  // BEGIN_PREPARE(unprepared) Put(a) END_PREPARE(xid = 1)
  // -- crash and recover with Put(a) rolled back as it was not prepared
  // BEGIN_PREPARE(prepared) Put(b) END_PREPARE(xid = 1)
  // COMMIT(xid = 1)
  // -- crash and recover with both a, b
  //
  // We could just write the rollback marker, but then we would have to extend
  // MemTableInserter during recovery to actually do writes into the DB
  // instead of just dropping the in-memory write batch.
  //
  // TODO: plumb Env::IOActivity, Env::IOPriority
  WriteOptions w_options;

  class InvalidSnapshotReadCallback : public ReadCallback {
   public:
    InvalidSnapshotReadCallback(SequenceNumber snapshot)
        : ReadCallback(snapshot) {}

    inline bool IsVisibleFullCheck(SequenceNumber) override {
      // The seq provided as the snapshot is the seq right before we locked
      // and wrote to the key, so whatever is there is committed.
      return true;
    }

    // Ignore the refresh request since we are confident that our snapshot seq
    // is not going to be affected by concurrent compactions (not enabled yet).
    void Refresh(SequenceNumber) override {}
  };

  // Iterate starting with the largest sequence number.
  for (auto it = rtxn->batches_.rbegin(); it != rtxn->batches_.rend(); ++it) {
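    // Roll back using reads at (it->first - 1), the sequence number just
    // before this batch's first write: InvalidSnapshotReadCallback treats
    // everything at or below that seq as committed, recovering the pre-image
    // of each key.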
    auto last_visible_txn = it->first - 1;
    const auto& batch = it->second.batch_;
    WriteBatch rollback_batch(0 /* reserved_bytes */, 0 /* max_bytes */,
                              w_options.protection_bytes_per_key,
                              0 /* default_cf_ts_sz */);

    struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
      DBImpl* db_;
      ReadOptions roptions;
      InvalidSnapshotReadCallback callback;
      WriteBatch* rollback_batch_;
      std::map<uint32_t, const Comparator*>& comparators_;
      std::map<uint32_t, ColumnFamilyHandle*>& handles_;
      using CFKeys = std::set<Slice, SetComparator>;
      std::map<uint32_t, CFKeys> keys_;
      bool rollback_merge_operands_;
      RollbackWriteBatchBuilder(
          DBImpl* db, SequenceNumber snap_seq, WriteBatch* dst_batch,
          std::map<uint32_t, const Comparator*>& comparators,
          std::map<uint32_t, ColumnFamilyHandle*>& handles,
          bool rollback_merge_operands)
          : db_(db),
            callback(snap_seq),
            // disable min_uncommitted optimization
            rollback_batch_(dst_batch),
            comparators_(comparators),
            handles_(handles),
            rollback_merge_operands_(rollback_merge_operands) {}

      Status Rollback(uint32_t cf, const Slice& key) {
        Status s;
        CFKeys& cf_keys = keys_[cf];
        if (cf_keys.size() == 0) {  // just inserted
          auto cmp = comparators_[cf];
          keys_[cf] = CFKeys(SetComparator(cmp));
        }
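        // Each key is rolled back at most once per batch: if the key was
        // already seen, the set insert fails and we return early.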
        auto res = cf_keys.insert(key);
        if (res.second == false) {  // second is false if an element already
                                    // existed
          return s;
        }

        PinnableSlice pinnable_val;
        bool not_used;
        auto cf_handle = handles_[cf];
        DBImpl::GetImplOptions get_impl_options;
        get_impl_options.column_family = cf_handle;
        get_impl_options.value = &pinnable_val;
        get_impl_options.value_found = &not_used;
        get_impl_options.callback = &callback;
        s = db_->GetImpl(roptions, key, get_impl_options);
        assert(s.ok() || s.IsNotFound());
        if (s.ok()) {
          s = rollback_batch_->Put(cf_handle, key, pinnable_val);
          assert(s.ok());
        } else if (s.IsNotFound()) {
          // There was no readable value before the txn. By adding a delete we
          // make sure that there will be none afterwards either.
          s = rollback_batch_->Delete(cf_handle, key);
          assert(s.ok());
        } else {
          // Unexpected status. Return it to the user.
        }
        return s;
      }

      Status PutCF(uint32_t cf, const Slice& key,
                   const Slice& /*val*/) override {
        return Rollback(cf, key);
      }

      Status DeleteCF(uint32_t cf, const Slice& key) override {
        return Rollback(cf, key);
      }

      Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
        return Rollback(cf, key);
      }

      Status MergeCF(uint32_t cf, const Slice& key,
                     const Slice& /*val*/) override {
        if (rollback_merge_operands_) {
          return Rollback(cf, key);
        } else {
          return Status::OK();
        }
      }

      // Recovered batches do not contain 2PC markers.
      Status MarkNoop(bool) override { return Status::InvalidArgument(); }
      Status MarkBeginPrepare(bool) override {
        return Status::InvalidArgument();
      }
      Status MarkEndPrepare(const Slice&) override {
        return Status::InvalidArgument();
      }
      Status MarkCommit(const Slice&) override {
        return Status::InvalidArgument();
      }
      Status MarkRollback(const Slice&) override {
        return Status::InvalidArgument();
      }
    } rollback_handler(db_impl_, last_visible_txn, &rollback_batch,
                       *cf_comp_map_shared_ptr.get(), *cf_map_shared_ptr.get(),
                       txn_db_options_.rollback_merge_operands);

    auto s = batch->Iterate(&rollback_handler);
    if (!s.ok()) {
      return s;
    }

    // The Rollback marker will be used as a batch separator.
    s = WriteBatchInternal::MarkRollback(&rollback_batch, rtxn->name_);
    if (!s.ok()) {
      return s;
    }

    const uint64_t kNoLogRef = 0;
    const bool kDisableMemtable = true;
    const size_t kOneBatch = 1;
    uint64_t seq_used = kMaxSequenceNumber;
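    // The rollback batch goes through the normal write path, i.e. both WAL
    // and memtable (disable_memtable is passed as !kDisableMemtable, which is
    // false), so a crash after this point replays the rollback from the WAL.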
    s = db_impl_->WriteImpl(w_options, &rollback_batch, nullptr, nullptr,
                            nullptr, kNoLogRef, !kDisableMemtable, &seq_used,
                            kOneBatch);
    if (!s.ok()) {
      return s;
    }

    // If two_write_queues, we must manually release the sequence number to
    // readers.
    if (db_impl_->immutable_db_options().two_write_queues) {
      db_impl_->SetLastPublishedSequence(seq_used);
    }
  }
  return Status::OK();
}

Status WriteUnpreparedTxnDB::Initialize(
    const std::vector<size_t>& compaction_enabled_cf_indices,
    const std::vector<ColumnFamilyHandle*>& handles) {
  // TODO(lth): Reduce code duplication in this function.
  auto dbimpl = static_cast_with_check<DBImpl>(GetRootDB());
  assert(dbimpl != nullptr);
  db_impl_->SetSnapshotChecker(new WritePreparedSnapshotChecker(this));

  // A callback to commit a single sub-batch
  class CommitSubBatchPreReleaseCallback : public PreReleaseCallback {
   public:
    explicit CommitSubBatchPreReleaseCallback(WritePreparedTxnDB* db)
        : db_(db) {}
    Status Callback(SequenceNumber commit_seq,
                    bool is_mem_disabled __attribute__((__unused__)), uint64_t,
                    size_t /*index*/, size_t /*total*/) override {
      assert(!is_mem_disabled);
      db_->AddCommitted(commit_seq, commit_seq);
      return Status::OK();
    }

   private:
    WritePreparedTxnDB* db_;
  };
  db_impl_->SetRecoverableStatePreReleaseCallback(
      new CommitSubBatchPreReleaseCallback(this));

  // PessimisticTransactionDB::Initialize
  for (auto cf_ptr : handles) {
    AddColumnFamily(cf_ptr);
  }
  // Verify cf options
  for (auto handle : handles) {
    ColumnFamilyDescriptor cfd;
    Status s = handle->GetDescriptor(&cfd);
    if (!s.ok()) {
      return s;
    }
    s = VerifyCFOptions(cfd.options);
    if (!s.ok()) {
      return s;
    }
  }

  // Re-enable compaction for the column families that initially had
  // compaction enabled.
  std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
  compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
  for (auto index : compaction_enabled_cf_indices) {
    compaction_enabled_cf_handles.push_back(handles[index]);
  }

  // Create 'real' transactions from recovered shell transactions.
  auto rtxns = dbimpl->recovered_transactions();
  std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
  for (const auto& rtxn : rtxns) {
    auto recovered_trx = rtxn.second;
    assert(recovered_trx);
    assert(recovered_trx->batches_.size() >= 1);
    assert(recovered_trx->name_.length());

    // We can only rollback transactions after AdvanceMaxEvictedSeq is called,
    // but AddPrepared must occur before AdvanceMaxEvictedSeq, which is why
    // two iterations are required.
    if (recovered_trx->unprepared_) {
      continue;
    }

    // TODO: plumb Env::IOActivity, Env::IOPriority
    WriteOptions w_options;
    w_options.sync = true;
    TransactionOptions t_options;

    auto first_log_number =
        recovered_trx->batches_.begin()->second.log_number_;
    auto first_seq = recovered_trx->batches_.begin()->first;
    auto last_prepare_batch_cnt =
        recovered_trx->batches_.begin()->second.batch_cnt_;

    Transaction* real_trx = BeginTransaction(w_options, t_options, nullptr);
    assert(real_trx);
    auto wupt = static_cast_with_check<WriteUnpreparedTxn>(real_trx);
    wupt->recovered_txn_ = true;

    real_trx->SetLogNumber(first_log_number);
    real_trx->SetId(first_seq);
    Status s = real_trx->SetName(recovered_trx->name_);
    if (!s.ok()) {
      return s;
    }
    wupt->prepare_batch_cnt_ = last_prepare_batch_cnt;

    for (auto batch : recovered_trx->batches_) {
      const auto& seq = batch.first;
      const auto& batch_info = batch.second;
      auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
      assert(batch_info.log_number_);
      ordered_seq_cnt[seq] = cnt;
      assert(wupt->unprep_seqs_.count(seq) == 0);
      wupt->unprep_seqs_[seq] = cnt;
      s = wupt->RebuildFromWriteBatch(batch_info.batch_);
      assert(s.ok());
      if (!s.ok()) {
        return s;
      }
    }
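    // The recovered writes already live in the DB (replayed from the WAL)
    // and are tracked above via unprep_seqs_, so the rebuilt transaction
    // starts with an empty write batch.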
    const bool kClear = true;
    wupt->InitWriteBatch(kClear);
    real_trx->SetState(Transaction::PREPARED);
    if (!s.ok()) {
      return s;
    }
  }

  // AddPrepared must be called in order.
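  // A batch with batch_cnt_ == cnt occupies cnt consecutive sequence numbers
  // starting at seq, and each of them must be marked as prepared.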
  for (auto seq_cnt : ordered_seq_cnt) {
    auto seq = seq_cnt.first;
    auto cnt = seq_cnt.second;
    for (size_t i = 0; i < cnt; i++) {
      AddPrepared(seq + i);
    }
  }

  SequenceNumber prev_max = max_evicted_seq_;
  SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
  AdvanceMaxEvictedSeq(prev_max, last_seq);

  // Create a gap between max and the next snapshot. This simplifies the logic
  // in IsInSnapshot by not having to consider the special case of max ==
  // snapshot after recovery. This is tested in IsInSnapshotEmptyMapTest.
  if (last_seq) {
    db_impl_->versions_->SetLastAllocatedSequence(last_seq + 1);
    db_impl_->versions_->SetLastSequence(last_seq + 1);
    db_impl_->versions_->SetLastPublishedSequence(last_seq + 1);
  }

  Status s;
  // Rollback unprepared transactions.
  for (const auto& rtxn : rtxns) {
    auto recovered_trx = rtxn.second;
    if (recovered_trx->unprepared_) {
      s = RollbackRecoveredTransaction(recovered_trx);
      if (!s.ok()) {
        return s;
      }
    }
  }

  if (s.ok()) {
    dbimpl->DeleteAllRecoveredTransactions();
    // Compaction should start only after max_evicted_seq_ is set AND
    // recovered transactions are either added to PrepareHeap or rolled back.
    s = EnableAutoCompaction(compaction_enabled_cf_handles);
  }
  return s;
}

Transaction* WriteUnpreparedTxnDB::BeginTransaction(
    const WriteOptions& write_options, const TransactionOptions& txn_options,
    Transaction* old_txn) {
  if (old_txn != nullptr) {
    ReinitializeTransaction(old_txn, write_options, txn_options);
    return old_txn;
  } else {
    return new WriteUnpreparedTxn(this, write_options, txn_options);
  }
}

// Struct to hold ownership of snapshot and read callback for iterator cleanup.
struct WriteUnpreparedTxnDB::IteratorState {
  IteratorState(WritePreparedTxnDB* txn_db, SequenceNumber sequence,
                std::shared_ptr<ManagedSnapshot> s,
                SequenceNumber min_uncommitted, WriteUnpreparedTxn* txn)
      : callback(txn_db, sequence, min_uncommitted, txn->unprep_seqs_,
                 kBackedByDBSnapshot),
        snapshot(s) {}
  SequenceNumber MaxVisibleSeq() { return callback.max_visible_seq(); }

  WriteUnpreparedTxnReadCallback callback;
  std::shared_ptr<ManagedSnapshot> snapshot;
};

namespace {
static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) {
  delete static_cast<WriteUnpreparedTxnDB::IteratorState*>(arg1);
}
}  // anonymous namespace

Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& _read_options,
                                            ColumnFamilyHandle* column_family,
                                            WriteUnpreparedTxn* txn) {
  if (_read_options.io_activity != Env::IOActivity::kUnknown &&
      _read_options.io_activity != Env::IOActivity::kDBIterator) {
    return NewErrorIterator(Status::InvalidArgument(
        "Can only call NewIterator with `ReadOptions::io_activity` set to "
        "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`"));
  }
  ReadOptions read_options(_read_options);
  if (read_options.io_activity == Env::IOActivity::kUnknown) {
    read_options.io_activity = Env::IOActivity::kDBIterator;
  }

  // TODO(lth): Refactor so that this logic is shared with WritePrepared.
  constexpr bool expose_blob_index = false;
  constexpr bool allow_refresh = false;
  std::shared_ptr<ManagedSnapshot> own_snapshot = nullptr;
  SequenceNumber snapshot_seq = kMaxSequenceNumber;
  SequenceNumber min_uncommitted = 0;

  // Currently, the Prev() iterator logic does not work well without snapshot
  // validation. The logic simply iterates through the values of a key in
  // ascending seqno order, stopping at the first non-visible value and
  // returning the last visible value.
  //
  // For example, if the snapshot sequence is 3, and we have the following
  // keys:
  // foo: v1 1
  // foo: v2 2
  // foo: v3 3
  // foo: v4 4
  // foo: v5 5
  //
  // Then 1, 2, 3 will be visible, but 4 will be non-visible, so we return v3,
  // which is the last visible value.
  //
  // For unprepared transactions, if we have snap_seq = 3, but the current
  // transaction has unprep_seq 5, then returning the first non-visible value
  // would be incorrect, as we should return v5, not v3. The problem is that
  // there are committed values at snapshot_seq < commit_seq < unprep_seq.
  //
  // Snapshot validation can prevent this problem by ensuring that no
  // committed values exist at snapshot_seq < commit_seq, and thus any value
  // with a sequence number greater than snapshot_seq must be an unprepared
  // value. For example, if the transaction had a snapshot at 3, then snapshot
  // validation would be performed during the Put(v5) call. It would find v4,
  // and the Put would fail with a snapshot validation failure.
  //
  // TODO(lth): Improve Prev() logic to continue iterating until
  // max_visible_seq, and then return the last visible value, so that this
  // restriction can be lifted.
  const Snapshot* snapshot = nullptr;
  if (read_options.snapshot == nullptr) {
    snapshot = GetSnapshot();
    own_snapshot = std::make_shared<ManagedSnapshot>(db_impl_, snapshot);
  } else {
    snapshot = read_options.snapshot;
  }

  snapshot_seq = snapshot->GetSequenceNumber();
  assert(snapshot_seq != kMaxSequenceNumber);

  // Iteration is safe as long as largest_validated_seq <= snapshot_seq. We
  // are guaranteed that for keys that were modified by this transaction (and
  // thus might have unprepared values), no committed values exist at
  // largest_validated_seq < commit_seq (or the contrapositive: any committed
  // value must exist at commit_seq <= largest_validated_seq). This implies
  // commit_seq <= largest_validated_seq <= snapshot_seq, i.e. commit_seq <=
  // snapshot_seq. As explained above, the problem with Prev() only happens
  // when snapshot_seq < commit_seq.
  //
  // For keys that were not modified by this transaction,
  // largest_validated_seq_ is meaningless, and Prev() should just work with
  // the existing visibility logic.
  if (txn->largest_validated_seq_ > snapshot->GetSequenceNumber() &&
      !txn->unprep_seqs_.empty()) {
    ROCKS_LOG_ERROR(info_log_,
                    "WriteUnprepared iterator creation failed since the "
                    "transaction has performed unvalidated writes");
    return nullptr;
  }
  min_uncommitted =
      static_cast_with_check<const SnapshotImpl>(snapshot)->min_uncommitted_;

  auto* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
  auto* cfd = cfh->cfd();
  auto* state =
      new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn);
  SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_);
  auto* db_iter = db_impl_->NewIteratorImpl(
      read_options, cfh, super_version, state->MaxVisibleSeq(),
      &state->callback, expose_blob_index, allow_refresh);
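  // Ownership of IteratorState (the snapshot and read callback) passes to the
  // iterator; CleanupWriteUnpreparedTxnDBIterator deletes it when the
  // iterator is destroyed.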
  db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr);
  return db_iter;
}

}  // namespace ROCKSDB_NAMESPACE