db_impl_follower.cc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. // Copyright (c) 2024-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "db/db_impl/db_impl_follower.h"
  6. #include <algorithm>
  7. #include <cinttypes>
  8. #include "db/arena_wrapped_db_iter.h"
  9. #include "db/merge_context.h"
  10. #include "env/composite_env_wrapper.h"
  11. #include "env/fs_on_demand.h"
  12. #include "logging/auto_roll_logger.h"
  13. #include "logging/logging.h"
  14. #include "monitoring/perf_context_imp.h"
  15. #include "rocksdb/configurable.h"
  16. #include "rocksdb/db.h"
  17. #include "util/cast_util.h"
  18. #include "util/write_batch_util.h"
  19. namespace ROCKSDB_NAMESPACE {
  20. DBImplFollower::DBImplFollower(const DBOptions& db_options,
  21. std::unique_ptr<Env>&& env,
  22. const std::string& dbname, std::string src_path)
  23. : DBImplSecondary(db_options, dbname, ""),
  24. env_guard_(std::move(env)),
  25. stop_requested_(false),
  26. src_path_(std::move(src_path)),
  27. cv_(&mu_) {
  28. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  29. "Opening the db in follower mode");
  30. LogFlush(immutable_db_options_.info_log);
  31. }
  32. DBImplFollower::~DBImplFollower() {
  33. Status s = Close();
  34. if (!s.ok()) {
  35. ROCKS_LOG_INFO(immutable_db_options_.info_log, "Error closing DB : %s",
  36. s.ToString().c_str());
  37. }
  38. }
  39. // Recover a follower DB instance by reading the MANIFEST. The verification
  40. // as part of the MANIFEST replay will ensure that local links to the
  41. // leader's files are created, thus ensuring we can continue reading them
  42. // even if the leader deletes those files due to compaction.
  43. // TODO:
  44. // 1. Devise a mechanism to prevent misconfiguration by, for example,
  45. // keeping a local copy of the IDENTITY file and cross checking
  46. // 2. Make the recovery more robust by retrying if the first attempt
  47. // fails.
  48. Status DBImplFollower::Recover(
  49. const std::vector<ColumnFamilyDescriptor>& column_families,
  50. bool /*readonly*/, bool /*error_if_wal_file_exists*/,
  51. bool /*error_if_data_exists_in_wals*/, bool /*is_retry*/, uint64_t*,
  52. RecoveryContext* /*recovery_ctx*/, bool* /*can_retry*/) {
  53. mutex_.AssertHeld();
  54. JobContext job_context(0);
  55. Status s;
  56. s = static_cast<ReactiveVersionSet*>(versions_.get())
  57. ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
  58. &manifest_reader_status_);
  59. if (!s.ok()) {
  60. if (manifest_reader_status_) {
  61. manifest_reader_status_->PermitUncheckedError();
  62. }
  63. return s;
  64. }
  65. if (s.ok()) {
  66. default_cf_handle_ = new ColumnFamilyHandleImpl(
  67. versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
  68. default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
  69. // Start the periodic catch-up thread
  70. // TODO: See if it makes sense to have a threadpool, rather than a thread
  71. // per follower DB instance
  72. catch_up_thread_.reset(
  73. new port::Thread(&DBImplFollower::PeriodicRefresh, this));
  74. }
  75. return s;
  76. }
  77. // Try to catch up by tailing the MANIFEST.
  78. // TODO:
  79. // 1. Cleanup obsolete files afterward
  80. // 2. Add some error notifications and statistics
  81. Status DBImplFollower::TryCatchUpWithLeader() {
  82. assert(versions_.get() != nullptr);
  83. assert(manifest_reader_.get() != nullptr);
  84. Status s;
  85. TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin1");
  86. TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin2");
  87. // read the manifest and apply new changes to the follower instance
  88. std::unordered_set<ColumnFamilyData*> cfds_changed;
  89. JobContext job_context(0, true /*create_superversion*/);
  90. {
  91. InstrumentedMutexLock lock_guard(&mutex_);
  92. std::vector<std::string> files_to_delete;
  93. s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
  94. ->ReadAndApply(&mutex_, &manifest_reader_,
  95. manifest_reader_status_.get(), &cfds_changed,
  96. &files_to_delete);
  97. ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
  98. pending_outputs_inserted_elem_.reset(new std::list<uint64_t>::iterator(
  99. CaptureCurrentFileNumberInPendingOutputs()));
  100. ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
  101. static_cast<uint64_t>(versions_->LastSequence()));
  102. ROCKS_LOG_INFO(
  103. immutable_db_options_.info_log, "Next file number is %" PRIu64,
  104. static_cast<uint64_t>(versions_->current_next_file_number()));
  105. for (ColumnFamilyData* cfd : cfds_changed) {
  106. if (cfd->IsDropped()) {
  107. ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
  108. cfd->GetName().c_str());
  109. continue;
  110. }
  111. VersionStorageInfo::LevelSummaryStorage tmp;
  112. ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
  113. "[%s] Level summary: %s\n", cfd->GetName().c_str(),
  114. cfd->current()->storage_info()->LevelSummary(&tmp));
  115. }
  116. if (s.ok()) {
  117. for (auto cfd : cfds_changed) {
  118. if (cfd->mem()->GetEarliestSequenceNumber() <
  119. versions_->LastSequence()) {
  120. // Construct a new memtable with earliest sequence number set to the
  121. // last sequence number in the VersionSet. This matters when
  122. // DBImpl::MultiCFSnapshot tries to get consistent references
  123. // to super versions in a lock free manner, it checks the earliest
  124. // sequence number to detect if there was a change in version in
  125. // the meantime.
  126. MemTable* new_mem = cfd->ConstructNewMemtable(
  127. cfd->GetLatestMutableCFOptions(), versions_->LastSequence());
  128. cfd->mem()->SetNextLogNumber(cfd->GetLogNumber());
  129. cfd->mem()->ConstructFragmentedRangeTombstones();
  130. cfd->imm()->Add(cfd->mem(), &job_context.memtables_to_free);
  131. new_mem->Ref();
  132. cfd->SetMemtable(new_mem);
  133. }
  134. // This will check if the old memtable is still referenced
  135. cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
  136. &job_context.memtables_to_free);
  137. auto& sv_context = job_context.superversion_contexts.back();
  138. cfd->InstallSuperVersion(&sv_context, &mutex_);
  139. sv_context.NewSuperVersion();
  140. }
  141. }
  142. for (auto& file : files_to_delete) {
  143. IOStatus io_s = fs_->DeleteFile(file, IOOptions(), nullptr);
  144. if (!io_s.ok()) {
  145. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  146. "Cannot delete file %s: %s", file.c_str(),
  147. io_s.ToString().c_str());
  148. }
  149. }
  150. }
  151. job_context.Clean();
  152. // Cleanup unused, obsolete files.
  153. JobContext purge_files_job_context(0);
  154. {
  155. InstrumentedMutexLock lock_guard(&mutex_);
  156. // Currently, follower instance does not create any database files, thus
  157. // is unnecessary for the follower to force full scan.
  158. FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
  159. }
  160. if (purge_files_job_context.HaveSomethingToDelete()) {
  161. PurgeObsoleteFiles(purge_files_job_context);
  162. }
  163. purge_files_job_context.Clean();
  164. TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:End");
  165. return s;
  166. }
  167. void DBImplFollower::PeriodicRefresh() {
  168. while (!stop_requested_.load()) {
  169. MutexLock l(&mu_);
  170. int64_t wait_until =
  171. immutable_db_options_.clock->NowMicros() +
  172. immutable_db_options_.follower_refresh_catchup_period_ms * 1000;
  173. immutable_db_options_.clock->TimedWait(
  174. &cv_, std::chrono::microseconds(wait_until));
  175. if (stop_requested_.load()) {
  176. break;
  177. }
  178. Status s;
  179. for (uint64_t i = 0;
  180. i < immutable_db_options_.follower_catchup_retry_count &&
  181. !stop_requested_.load();
  182. ++i) {
  183. s = TryCatchUpWithLeader();
  184. if (s.ok()) {
  185. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  186. "Successful catch up on attempt %llu",
  187. static_cast<unsigned long long>(i));
  188. break;
  189. }
  190. wait_until = immutable_db_options_.clock->NowMicros() +
  191. immutable_db_options_.follower_catchup_retry_wait_ms * 1000;
  192. immutable_db_options_.clock->TimedWait(
  193. &cv_, std::chrono::microseconds(wait_until));
  194. }
  195. if (!s.ok()) {
  196. ROCKS_LOG_INFO(immutable_db_options_.info_log, "Catch up unsuccessful");
  197. }
  198. }
  199. }
  200. Status DBImplFollower::Close() {
  201. if (catch_up_thread_) {
  202. stop_requested_.store(true);
  203. {
  204. MutexLock l(&mu_);
  205. cv_.SignalAll();
  206. }
  207. catch_up_thread_->join();
  208. catch_up_thread_.reset();
  209. }
  210. ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
  211. return DBImpl::Close();
  212. }
  213. Status DB::OpenAsFollower(const Options& options, const std::string& dbname,
  214. const std::string& leader_path,
  215. std::unique_ptr<DB>* dbptr) {
  216. dbptr->reset();
  217. DBOptions db_options(options);
  218. ColumnFamilyOptions cf_options(options);
  219. std::vector<ColumnFamilyDescriptor> column_families;
  220. column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
  221. std::vector<ColumnFamilyHandle*> handles;
  222. Status s = DB::OpenAsFollower(db_options, dbname, leader_path,
  223. column_families, &handles, dbptr);
  224. if (s.ok()) {
  225. assert(handles.size() == 1);
  226. delete handles[0];
  227. }
  228. return s;
  229. }
  230. Status DB::OpenAsFollower(
  231. const DBOptions& db_options, const std::string& dbname,
  232. const std::string& src_path,
  233. const std::vector<ColumnFamilyDescriptor>& column_families,
  234. std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
  235. dbptr->reset();
  236. FileSystem* fs = db_options.env->GetFileSystem().get();
  237. {
  238. IOStatus io_s;
  239. if (db_options.create_if_missing) {
  240. io_s = fs->CreateDirIfMissing(dbname, IOOptions(), nullptr);
  241. } else {
  242. io_s = fs->FileExists(dbname, IOOptions(), nullptr);
  243. }
  244. if (!io_s.ok()) {
  245. return static_cast<Status>(io_s);
  246. }
  247. }
  248. std::unique_ptr<Env> new_env(new CompositeEnvWrapper(
  249. db_options.env, NewOnDemandFileSystem(db_options.env->GetFileSystem(),
  250. src_path, dbname)));
  251. DBOptions tmp_opts(db_options);
  252. Status s;
  253. tmp_opts.env = new_env.get();
  254. if (nullptr == tmp_opts.info_log) {
  255. s = CreateLoggerFromOptions(dbname, tmp_opts, &tmp_opts.info_log);
  256. if (!s.ok()) {
  257. tmp_opts.info_log = nullptr;
  258. return s;
  259. }
  260. }
  261. handles->clear();
  262. DBImplFollower* impl =
  263. new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path);
  264. impl->versions_.reset(new ReactiveVersionSet(
  265. dbname, &impl->immutable_db_options_, impl->file_options_,
  266. impl->table_cache_.get(), impl->write_buffer_manager_,
  267. &impl->write_controller_, impl->io_tracer_));
  268. impl->column_family_memtables_.reset(
  269. new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
  270. impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
  271. impl->mutex_.Lock();
  272. s = impl->Recover(column_families, /*read_only=*/true,
  273. /*error_if_wal_file_exists=*/false,
  274. /*error_if_data_exists_in_wals=*/false);
  275. if (s.ok()) {
  276. for (const auto& cf : column_families) {
  277. auto cfd =
  278. impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
  279. if (nullptr == cfd) {
  280. s = Status::InvalidArgument("Column family not found", cf.name);
  281. break;
  282. }
  283. handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
  284. }
  285. }
  286. SuperVersionContext sv_context(false /* create_superversion */);
  287. if (s.ok()) {
  288. for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
  289. sv_context.NewSuperVersion();
  290. cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
  291. }
  292. }
  293. impl->mutex_.Unlock();
  294. sv_context.Clean();
  295. if (s.ok()) {
  296. dbptr->reset(impl);
  297. for (auto h : *handles) {
  298. impl->NewThreadStatusCfInfo(
  299. static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
  300. }
  301. } else {
  302. for (auto h : *handles) {
  303. delete h;
  304. }
  305. handles->clear();
  306. delete impl;
  307. }
  308. return s;
  309. }
  310. } // namespace ROCKSDB_NAMESPACE