db_follower_test.cc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. // Copyright (c) 2024-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "db/db_test_util.h"
  6. #include "port/stack_trace.h"
  7. #include "test_util/sync_point.h"
  8. namespace ROCKSDB_NAMESPACE {
  9. #ifdef OS_LINUX
  10. class DBFollowerTest : public DBTestBase {
  11. public:
  12. // Create directories for leader and follower
  13. // Create the leader DB object
  14. DBFollowerTest() : DBTestBase("/db_follower_test", /*env_do_fsync*/ false) {
  15. follower_name_ = dbname_ + "/follower";
  16. db_parent_ = dbname_;
  17. Close();
  18. Destroy(CurrentOptions());
  19. EXPECT_EQ(env_->CreateDirIfMissing(dbname_), Status::OK());
  20. dbname_ = dbname_ + "/leader";
  21. Reopen(CurrentOptions());
  22. }
  23. ~DBFollowerTest() {
  24. follower_.reset();
  25. EXPECT_EQ(DestroyDB(follower_name_, CurrentOptions()), Status::OK());
  26. Destroy(CurrentOptions());
  27. dbname_ = db_parent_;
  28. }
  29. protected:
  30. class DBFollowerTestFS : public FileSystemWrapper {
  31. public:
  32. explicit DBFollowerTestFS(const std::shared_ptr<FileSystem>& target)
  33. : FileSystemWrapper(target),
  34. cv_(&mutex_),
  35. barrier_(false),
  36. count_(0),
  37. reinit_count_(0) {}
  38. const char* Name() const override { return "DBFollowerTestFS"; }
  39. IOStatus NewSequentialFile(const std::string& fname,
  40. const FileOptions& file_opts,
  41. std::unique_ptr<FSSequentialFile>* result,
  42. IODebugContext* dbg = nullptr) override {
  43. class DBFollowerTestSeqFile : public FSSequentialFileWrapper {
  44. public:
  45. DBFollowerTestSeqFile(DBFollowerTestFS* fs,
  46. std::unique_ptr<FSSequentialFile>&& file,
  47. uint64_t /*size*/)
  48. : FSSequentialFileWrapper(file.get()),
  49. fs_(fs),
  50. file_(std::move(file)) {}
  51. IOStatus Read(size_t n, const IOOptions& options, Slice* result,
  52. char* scratch, IODebugContext* dbg) override {
  53. fs_->BarrierWait();
  54. return target()->Read(n, options, result, scratch, dbg);
  55. }
  56. private:
  57. DBFollowerTestFS* fs_;
  58. std::unique_ptr<FSSequentialFile> file_;
  59. };
  60. std::unique_ptr<FSSequentialFile> file;
  61. IOStatus s = target()->NewSequentialFile(fname, file_opts, &file, dbg);
  62. if (s.ok() && test::GetFileType(fname) == kDescriptorFile) {
  63. uint64_t size = 0;
  64. EXPECT_EQ(target()->GetFileSize(fname, IOOptions(), &size, nullptr),
  65. IOStatus::OK());
  66. result->reset(new DBFollowerTestSeqFile(this, std::move(file), size));
  67. } else {
  68. *result = std::move(file);
  69. }
  70. return s;
  71. }
  72. void BarrierInit(int count) {
  73. MutexLock l(&mutex_);
  74. barrier_ = true;
  75. count_ = count;
  76. }
  77. void BarrierWait() {
  78. MutexLock l(&mutex_);
  79. if (!barrier_) {
  80. return;
  81. }
  82. if (--count_ == 0) {
  83. if (reinit_count_ > 0) {
  84. count_ = reinit_count_;
  85. reinit_count_ = 0;
  86. } else {
  87. barrier_ = false;
  88. }
  89. cv_.SignalAll();
  90. } else {
  91. cv_.Wait();
  92. }
  93. }
  94. void BarrierWaitAndReinit(int count) {
  95. MutexLock l(&mutex_);
  96. if (!barrier_) {
  97. return;
  98. }
  99. reinit_count_ = count;
  100. if (--count_ == 0) {
  101. if (reinit_count_ > 0) {
  102. count_ = reinit_count_;
  103. reinit_count_ = 0;
  104. } else {
  105. barrier_ = false;
  106. }
  107. cv_.SignalAll();
  108. } else {
  109. cv_.Wait();
  110. }
  111. }
  112. private:
  113. port::Mutex mutex_;
  114. port::CondVar cv_;
  115. bool barrier_;
  116. int count_;
  117. int reinit_count_;
  118. };
  119. class DBFollowerTestSstPartitioner : public SstPartitioner {
  120. public:
  121. explicit DBFollowerTestSstPartitioner(uint64_t max_keys)
  122. : max_keys_(max_keys), num_keys_(0) {}
  123. const char* Name() const override { return "DBFollowerTestSstPartitioner"; }
  124. PartitionerResult ShouldPartition(
  125. const PartitionerRequest& /*request*/) override {
  126. if (++num_keys_ > max_keys_) {
  127. num_keys_ = 0;
  128. return PartitionerResult::kRequired;
  129. } else {
  130. return PartitionerResult::kNotRequired;
  131. }
  132. }
  133. bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
  134. const Slice& /*largest_user_key*/) override {
  135. return true;
  136. }
  137. private:
  138. uint64_t max_keys_;
  139. uint64_t num_keys_;
  140. };
  141. class DBFollowerTestSstPartitionerFactory : public SstPartitionerFactory {
  142. public:
  143. explicit DBFollowerTestSstPartitionerFactory(uint64_t max_keys)
  144. : max_keys_(max_keys) {}
  145. std::unique_ptr<SstPartitioner> CreatePartitioner(
  146. const SstPartitioner::Context& /*context*/) const override {
  147. std::unique_ptr<SstPartitioner> partitioner;
  148. partitioner.reset(new DBFollowerTestSstPartitioner(max_keys_));
  149. return partitioner;
  150. }
  151. const char* Name() const override {
  152. return "DBFollowerTestSstPartitionerFactory";
  153. }
  154. private:
  155. uint64_t max_keys_;
  156. };
  157. Status OpenAsFollower() {
  158. Options opts = CurrentOptions();
  159. if (!follower_env_) {
  160. follower_env_ = NewCompositeEnv(
  161. std::make_shared<DBFollowerTestFS>(env_->GetFileSystem()));
  162. }
  163. opts.env = follower_env_.get();
  164. opts.follower_refresh_catchup_period_ms = 100;
  165. return DB::OpenAsFollower(opts, follower_name_, dbname_, &follower_);
  166. }
  167. std::string FollowerGet(const std::string& k) {
  168. ReadOptions options;
  169. options.verify_checksums = true;
  170. std::string result;
  171. Status s = follower()->Get(options, k, &result);
  172. if (s.IsNotFound()) {
  173. result = "NOT_FOUND";
  174. } else if (!s.ok()) {
  175. result = s.ToString();
  176. }
  177. return result;
  178. }
  179. DB* follower() { return follower_.get(); }
  180. DBFollowerTestFS* follower_fs() {
  181. return static_cast<DBFollowerTestFS*>(follower_env_->GetFileSystem().get());
  182. }
  183. void CheckDirs() {
  184. std::vector<std::string> db_children;
  185. std::vector<std::string> follower_children;
  186. EXPECT_OK(env_->GetChildren(dbname_, &db_children));
  187. EXPECT_OK(env_->GetChildren(follower_name_, &follower_children));
  188. std::set<uint64_t> db_filenums;
  189. std::set<uint64_t> follower_filenums;
  190. for (auto& name : db_children) {
  191. if (test::GetFileType(name) != kTableFile) {
  192. continue;
  193. }
  194. db_filenums.insert(test::GetFileNumber(name));
  195. }
  196. for (auto& name : follower_children) {
  197. if (test::GetFileType(name) != kTableFile) {
  198. continue;
  199. }
  200. follower_filenums.insert(test::GetFileNumber(name));
  201. }
  202. db_filenums.merge(follower_filenums);
  203. EXPECT_EQ(follower_filenums.size(), db_filenums.size());
  204. }
  205. private:
  206. std::string follower_name_;
  207. std::string db_parent_;
  208. std::unique_ptr<Env> follower_env_;
  209. std::unique_ptr<DB> follower_;
  210. };
  211. TEST_F(DBFollowerTest, Basic) {
  212. ASSERT_OK(Put("k1", "v1"));
  213. ASSERT_OK(Flush());
  214. ASSERT_OK(Put("k2", "v2"));
  215. ASSERT_OK(Flush());
  216. ASSERT_OK(OpenAsFollower());
  217. std::string val;
  218. ASSERT_OK(follower()->Get(ReadOptions(), "k1", &val));
  219. ASSERT_EQ(val, "v1");
  220. CheckDirs();
  221. }
  222. TEST_F(DBFollowerTest, Flush) {
  223. SyncPoint::GetInstance()->LoadDependency({
  224. {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
  225. {"Leader::Done", "DBImplFollower::TryCatchupWithLeader:Begin2"},
  226. {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
  227. });
  228. SyncPoint::GetInstance()->EnableProcessing();
  229. ASSERT_OK(OpenAsFollower());
  230. TEST_SYNC_POINT("Leader::Start");
  231. ASSERT_OK(Put("k1", "v1"));
  232. ASSERT_OK(Flush());
  233. TEST_SYNC_POINT("Leader::Done");
  234. TEST_SYNC_POINT("Follower::WaitForCatchup");
  235. std::string val;
  236. ASSERT_OK(follower()->Get(ReadOptions(), "k1", &val));
  237. ASSERT_EQ(val, "v1");
  238. CheckDirs();
  239. SyncPoint::GetInstance()->DisableProcessing();
  240. }
  241. // This test creates 4 L0 files, immediately followed by a compaction to L1.
  242. // The follower replays the 4 flush records from the MANIFEST unsuccessfully,
  243. // and then successfully recovers a Version from the compaction record
  244. TEST_F(DBFollowerTest, RetryCatchup) {
  245. Options opts = CurrentOptions();
  246. opts.disable_auto_compactions = true;
  247. Reopen(opts);
  248. ASSERT_OK(OpenAsFollower());
  249. SyncPoint::GetInstance()->LoadDependency({
  250. {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
  251. {"DBImpl::BackgroundCompaction:Start",
  252. "DBImplFollower::TryCatchupWithLeader:Begin2"},
  253. {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
  254. "Begin1",
  255. "DBImpl::BackgroundCompaction:BeforeCompaction"},
  256. {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
  257. "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
  258. "Begin2"},
  259. {"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
  260. });
  261. SyncPoint::GetInstance()->EnableProcessing();
  262. TEST_SYNC_POINT("Leader::Start");
  263. ASSERT_OK(Put("k1", "v1"));
  264. ASSERT_OK(Flush());
  265. ASSERT_OK(Put("k1", "v2"));
  266. ASSERT_OK(Flush());
  267. ASSERT_OK(Put("k1", "v3"));
  268. ASSERT_OK(Flush());
  269. ASSERT_OK(Put("k1", "v4"));
  270. ASSERT_OK(Flush());
  271. ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, true));
  272. TEST_SYNC_POINT("Follower::WaitForCatchup");
  273. ASSERT_EQ(FollowerGet("k1"), "v4");
  274. CheckDirs();
  275. SyncPoint::GetInstance()->ClearAllCallBacks();
  276. SyncPoint::GetInstance()->DisableProcessing();
  277. }
  278. // This test validates the same as the previous test, except there is a
  279. // MANIFEST rollover between the flushes and compaction. The follower
  280. // does not switch to a new MANIFEST in ReadAndApply. So it would require
  281. // another round of refresh before catching up.
  282. TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
  283. Options opts = CurrentOptions();
  284. opts.disable_auto_compactions = true;
  285. Reopen(opts);
  286. ASSERT_OK(Put("k1", "v1"));
  287. ASSERT_OK(Flush());
  288. ASSERT_OK(Put("k1", "v2"));
  289. ASSERT_OK(Flush());
  290. ASSERT_OK(Put("k1", "v3"));
  291. ASSERT_OK(Flush());
  292. ASSERT_OK(OpenAsFollower());
  293. SyncPoint::GetInstance()->LoadDependency({
  294. {"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
  295. {"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
  296. {"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
  297. "Begin1",
  298. "Leader::Done"},
  299. {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
  300. "VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
  301. "Begin2"},
  302. {"DBImplFollower::TryCatchupWithLeader:End",
  303. "Follower::WaitForCatchup:1"},
  304. });
  305. SyncPoint::GetInstance()->EnableProcessing();
  306. TEST_SYNC_POINT("Leader::Start");
  307. ASSERT_OK(Put("k1", "v4"));
  308. ASSERT_OK(Flush());
  309. TEST_SYNC_POINT("Leader::Flushed");
  310. TEST_SYNC_POINT("Leader::Done");
  311. Reopen(opts);
  312. ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, true));
  313. TEST_SYNC_POINT("Follower::WaitForCatchup:1");
  314. SyncPoint::GetInstance()->LoadDependency({
  315. {"DBImplFollower::TryCatchupWithLeader:End",
  316. "Follower::WaitForCatchup:2"},
  317. });
  318. TEST_SYNC_POINT("Follower::WaitForCatchup:2");
  319. ASSERT_EQ(FollowerGet("k1"), "v4");
  320. SyncPoint::GetInstance()->ClearAllCallBacks();
  321. SyncPoint::GetInstance()->DisableProcessing();
  322. }
  323. // This test creates 4 L0 files and compacts them. The follower, during catchup,
  324. // successfully instantiates 4 Versions corresponding to the 4 files (but
  325. // doesn't install them yet), followed by deleting those 4 and adding a new
  326. // file from compaction. The test verifies that the 4 L0 files are deleted
  327. // correctly by the follower.
  328. // We use the Barrier* functions to ensure that the follower first sees the 4
  329. // L0 files and is able to link them, and then sees the compaction that
  330. // obsoletes those L0 files (so those L0 files are intermediates that it has
  331. // to explicitly delete). Suppose we don't have any barriers, its possible
  332. // the follower reads the L0 records and compaction records from the MANIFEST
  333. // in one read, which means those L0 files would have already been deleted
  334. // by the leader and the follower cannot link to them.
  335. TEST_F(DBFollowerTest, IntermediateObsoleteFiles) {
  336. Options opts = CurrentOptions();
  337. opts.disable_auto_compactions = true;
  338. Reopen(opts);
  339. ASSERT_OK(OpenAsFollower());
  340. follower_fs()->BarrierInit(2);
  341. ASSERT_OK(Put("k1", "v1"));
  342. ASSERT_OK(Flush());
  343. ASSERT_OK(Put("k1", "v2"));
  344. ASSERT_OK(Flush());
  345. ASSERT_OK(Put("k1", "v3"));
  346. ASSERT_OK(Flush());
  347. ASSERT_OK(Put("k1", "v4"));
  348. ASSERT_OK(Flush());
  349. follower_fs()->BarrierWaitAndReinit(2);
  350. ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, true));
  351. follower_fs()->BarrierWait();
  352. SyncPoint::GetInstance()->LoadDependency({
  353. {"DBImplFollower::TryCatchupWithLeader:End",
  354. "Follower::WaitForCatchup:1"},
  355. });
  356. SyncPoint::GetInstance()->EnableProcessing();
  357. TEST_SYNC_POINT("Follower::WaitForCatchup:1");
  358. CheckDirs();
  359. ASSERT_EQ(FollowerGet("k1"), "v4");
  360. }
  361. // This test verifies a scenario where the follower can recover a Version
  362. // partially (i.e some of the additions cannot be found), and the files
  363. // that are found are obsoleted by a subsequent VersionEdit.
  364. TEST_F(DBFollowerTest, PartialVersionRecovery) {
  365. Options opts = CurrentOptions();
  366. opts.disable_auto_compactions = true;
  367. opts.sst_partitioner_factory =
  368. std::make_shared<DBFollowerTestSstPartitionerFactory>(1);
  369. Reopen(opts);
  370. ASSERT_OK(Put("k1", "v1"));
  371. ASSERT_OK(Put("k2", "v1"));
  372. ASSERT_OK(Put("k3", "v1"));
  373. ASSERT_OK(Flush());
  374. MoveFilesToLevel(2);
  375. ASSERT_OK(Put("k1", "v2"));
  376. ASSERT_OK(Flush());
  377. ASSERT_OK(Put("k3", "v2"));
  378. ASSERT_OK(Flush());
  379. MoveFilesToLevel(1);
  380. ASSERT_OK(OpenAsFollower());
  381. ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
  382. {{"max_compaction_bytes", "1"}}));
  383. follower_fs()->BarrierInit(2);
  384. Slice key("k1");
  385. ASSERT_OK(dbfull()->TEST_CompactRange(1, &key, &key, nullptr, true));
  386. follower_fs()->BarrierWaitAndReinit(2);
  387. // The second compaction input overlaps the previous compaction outputs
  388. // by one file. This file is never added to VersionStorageInfo since it
  389. // was added and deleted before the catch up completes. We later verify that
  390. // the follower correctly deleted this file.
  391. key = Slice("k3");
  392. ASSERT_OK(dbfull()->TEST_CompactRange(1, &key, &key, nullptr, true));
  393. follower_fs()->BarrierWait();
  394. SyncPoint::GetInstance()->LoadDependency({
  395. {"DBImplFollower::TryCatchupWithLeader:End",
  396. "Follower::WaitForCatchup:1"},
  397. });
  398. SyncPoint::GetInstance()->EnableProcessing();
  399. TEST_SYNC_POINT("Follower::WaitForCatchup:1");
  400. CheckDirs();
  401. ASSERT_EQ(FollowerGet("k1"), "v2");
  402. ASSERT_EQ(FollowerGet("k2"), "v1");
  403. ASSERT_EQ(FollowerGet("k3"), "v2");
  404. SyncPoint::GetInstance()->DisableProcessing();
  405. }
  406. // This test verifies a scenario similar to the PartialVersionRecovery, except
  407. // with a MANIFEST rollover in between. When there is a rollover, the
  408. // follower's attempt ends without installing a new Version. The next catch up
  409. // attempt will recover a full Version.
  410. TEST_F(DBFollowerTest, PartialVersionRecoveryWithRollover) {
  411. Options opts = CurrentOptions();
  412. opts.disable_auto_compactions = true;
  413. opts.sst_partitioner_factory =
  414. std::make_shared<DBFollowerTestSstPartitionerFactory>(1);
  415. Reopen(opts);
  416. ASSERT_OK(Put("k1", "v1"));
  417. ASSERT_OK(Put("k2", "v1"));
  418. ASSERT_OK(Put("k3", "v1"));
  419. ASSERT_OK(Flush());
  420. MoveFilesToLevel(2);
  421. ASSERT_OK(Put("k1", "v2"));
  422. ASSERT_OK(Flush());
  423. ASSERT_OK(Put("k3", "v2"));
  424. ASSERT_OK(Flush());
  425. MoveFilesToLevel(1);
  426. opts.max_compaction_bytes = 1;
  427. Reopen(opts);
  428. ASSERT_OK(OpenAsFollower());
  429. follower_fs()->BarrierInit(2);
  430. Slice key("k1");
  431. ASSERT_OK(dbfull()->TEST_CompactRange(1, &key, &key, nullptr, true));
  432. follower_fs()->BarrierWaitAndReinit(2);
  433. Reopen(opts);
  434. key = Slice("k3");
  435. ASSERT_OK(dbfull()->TEST_CompactRange(1, &key, &key, nullptr, true));
  436. follower_fs()->BarrierWait();
  437. SyncPoint::GetInstance()->LoadDependency({
  438. {"DBImplFollower::TryCatchupWithLeader:Begin1",
  439. "Follower::WaitForCatchup:1"},
  440. {"Follower::WaitForCatchup:2",
  441. "DBImplFollower::TryCatchupWithLeader:Begin2"},
  442. });
  443. SyncPoint::GetInstance()->EnableProcessing();
  444. TEST_SYNC_POINT("Follower::WaitForCatchup:1");
  445. TEST_SYNC_POINT("Follower::WaitForCatchup:2");
  446. SyncPoint::GetInstance()->LoadDependency({
  447. {"DBImplFollower::TryCatchupWithLeader:End",
  448. "Follower::WaitForCatchup:3"},
  449. });
  450. TEST_SYNC_POINT("Follower::WaitForCatchup:3");
  451. CheckDirs();
  452. ASSERT_EQ(FollowerGet("k1"), "v2");
  453. ASSERT_EQ(FollowerGet("k2"), "v1");
  454. ASSERT_EQ(FollowerGet("k3"), "v2");
  455. SyncPoint::GetInstance()->DisableProcessing();
  456. }
  457. #endif
  458. } // namespace ROCKSDB_NAMESPACE
  459. int main(int argc, char** argv) {
  460. ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  461. ::testing::InitGoogleTest(&argc, argv);
  462. return RUN_ALL_TESTS();
  463. }