// db_filesnapshot.cc
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. #include <algorithm>
  7. #include <cstdint>
  8. #include <memory>
  9. #include <string>
  10. #include <vector>
  11. #include "db/db_impl/db_impl.h"
  12. #include "db/job_context.h"
  13. #include "db/version_set.h"
  14. #include "file/file_util.h"
  15. #include "file/filename.h"
  16. #include "logging/logging.h"
  17. #include "port/port.h"
  18. #include "rocksdb/db.h"
  19. #include "rocksdb/env.h"
  20. #include "rocksdb/metadata.h"
  21. #include "rocksdb/transaction_log.h"
  22. #include "rocksdb/types.h"
  23. #include "test_util/sync_point.h"
  24. #include "util/file_checksum_helper.h"
  25. #include "util/mutexlock.h"
  26. namespace ROCKSDB_NAMESPACE {
  27. Status DBImpl::FlushForGetLiveFiles() {
  28. return DBImpl::FlushAllColumnFamilies(FlushOptions(),
  29. FlushReason::kGetLiveFiles);
  30. }
  31. Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
  32. uint64_t* manifest_file_size, bool flush_memtable) {
  33. *manifest_file_size = 0;
  34. mutex_.Lock();
  35. if (flush_memtable) {
  36. Status status = FlushForGetLiveFiles();
  37. if (!status.ok()) {
  38. mutex_.Unlock();
  39. ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
  40. status.ToString().c_str());
  41. return status;
  42. }
  43. }
  44. // Make a set of all of the live table and blob files
  45. std::vector<uint64_t> live_table_files;
  46. std::vector<uint64_t> live_blob_files;
  47. for (auto cfd : *versions_->GetColumnFamilySet()) {
  48. if (cfd->IsDropped()) {
  49. continue;
  50. }
  51. cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
  52. }
  53. ret.clear();
  54. ret.reserve(live_table_files.size() + live_blob_files.size() +
  55. 3); // for CURRENT + MANIFEST + OPTIONS
  56. // create names of the live files. The names are not absolute
  57. // paths, instead they are relative to dbname_.
  58. for (const auto& table_file_number : live_table_files) {
  59. ret.emplace_back(MakeTableFileName("", table_file_number));
  60. }
  61. for (const auto& blob_file_number : live_blob_files) {
  62. ret.emplace_back(BlobFileName("", blob_file_number));
  63. }
  64. ret.emplace_back(CurrentFileName(""));
  65. ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
  66. // In read-only mode the OPTIONS file number is zero when no OPTIONS file
  67. // exist at all. In this cases we do not record any OPTIONS file in the live
  68. // file list.
  69. if (versions_->options_file_number() != 0) {
  70. ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
  71. }
  72. // find length of manifest file while holding the mutex lock
  73. *manifest_file_size = versions_->manifest_file_size();
  74. mutex_.Unlock();
  75. return Status::OK();
  76. }
  77. Status DBImpl::GetSortedWalFiles(VectorWalPtr& files) {
  78. return GetSortedWalFilesImpl(files,
  79. /*need_seqnos*/ true);
  80. }
  81. Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
  82. // Record tracked WALs as a (minimum) cross-check for directory scan
  83. std::vector<uint64_t> required_by_manifest;
  84. // If caller disabled deletions, this function should return files that are
  85. // guaranteed not to be deleted until deletions are re-enabled. We need to
  86. // wait for pending purges to finish since WalManager doesn't know which
  87. // files are going to be purged. Additional purges won't be scheduled as
  88. // long as deletions are disabled (so the below loop must terminate).
  89. // Also note that we disable deletions anyway to avoid the case where a
  90. // file is deleted in the middle of the scan, causing IO error.
  91. Status deletions_disabled = DisableFileDeletions();
  92. {
  93. InstrumentedMutexLock l(&mutex_);
  94. while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
  95. bg_cv_.Wait();
  96. }
  97. // Record tracked WALs as a (minimum) cross-check for directory scan
  98. const auto& manifest_wals = versions_->GetWalSet().GetWals();
  99. required_by_manifest.reserve(manifest_wals.size());
  100. for (const auto& wal : manifest_wals) {
  101. required_by_manifest.push_back(wal.first);
  102. }
  103. }
  104. // NOTE: need to include archived WALs because needed WALs might have been
  105. // archived since getting required_by_manifest set
  106. Status s = wal_manager_.GetSortedWalFiles(files, need_seqnos,
  107. /*include_archived*/ true);
  108. // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
  109. if (deletions_disabled.ok()) {
  110. Status s2 = EnableFileDeletions();
  111. assert(s2.ok());
  112. s2.PermitUncheckedError();
  113. } else {
  114. assert(deletions_disabled.IsNotSupported());
  115. }
  116. if (s.ok()) {
  117. // Verify includes those required by manifest (one sorted list is superset
  118. // of the other)
  119. auto required = required_by_manifest.begin();
  120. auto included = files.begin();
  121. while (required != required_by_manifest.end()) {
  122. if (included == files.end() || *required < (*included)->LogNumber()) {
  123. // FAIL - did not find
  124. return Status::Corruption(
  125. "WAL file " + std::to_string(*required) +
  126. " required by manifest but not in directory list");
  127. }
  128. if (*required == (*included)->LogNumber()) {
  129. ++required;
  130. ++included;
  131. } else {
  132. assert(*required > (*included)->LogNumber());
  133. ++included;
  134. }
  135. }
  136. }
  137. if (s.ok()) {
  138. size_t wal_count = files.size();
  139. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  140. "Number of WAL files %" ROCKSDB_PRIszt " (%" ROCKSDB_PRIszt
  141. " required by manifest)",
  142. wal_count, required_by_manifest.size());
  143. #ifndef NDEBUG
  144. std::ostringstream wal_names;
  145. for (const auto& wal : files) {
  146. wal_names << wal->PathName() << " ";
  147. }
  148. std::ostringstream wal_required_by_manifest_names;
  149. for (const auto& wal : required_by_manifest) {
  150. wal_required_by_manifest_names << wal << ".log ";
  151. }
  152. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  153. "Log files : %s .Log files required by manifest: %s.",
  154. wal_names.str().c_str(),
  155. wal_required_by_manifest_names.str().c_str());
  156. #endif // NDEBUG
  157. }
  158. return s;
  159. }
  160. Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) {
  161. uint64_t current_logfile_number;
  162. {
  163. InstrumentedMutexLock l(&mutex_);
  164. current_logfile_number = cur_wal_number_;
  165. }
  166. return wal_manager_.GetLiveWalFile(current_logfile_number, current_wal_file);
  167. }
  168. Status DBImpl::GetLiveFilesStorageInfo(
  169. const LiveFilesStorageInfoOptions& opts,
  170. std::vector<LiveFileStorageInfo>* files) {
  171. // To avoid returning partial results, only move results to files on success.
  172. assert(files);
  173. files->clear();
  174. std::vector<LiveFileStorageInfo> results;
  175. // NOTE: This implementation was largely migrated from Checkpoint.
  176. Status s;
  177. VectorWalPtr live_wal_files;
  178. bool flush_memtable = true;
  179. if (!immutable_db_options_.allow_2pc) {
  180. if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
  181. flush_memtable = false;
  182. } else if (opts.wal_size_for_flush > 0) {
  183. // FIXME: avoid querying the filesystem for current WAL state
  184. // If the outstanding WAL files are small, we skip the flush.
  185. // Don't take archived log size into account when calculating wal
  186. // size for flush, and don't need to verify consistency with manifest
  187. // here & now.
  188. s = wal_manager_.GetSortedWalFiles(live_wal_files,
  189. /* need_seqnos */ false,
  190. /*include_archived*/ false);
  191. if (!s.ok()) {
  192. return s;
  193. }
  194. // Don't flush column families if total log size is smaller than
  195. // log_size_for_flush. We copy the log files instead.
  196. // We may be able to cover 2PC case too.
  197. uint64_t total_wal_size = 0;
  198. for (auto& wal : live_wal_files) {
  199. assert(wal->Type() == kAliveLogFile);
  200. total_wal_size += wal->SizeFileBytes();
  201. }
  202. if (total_wal_size < opts.wal_size_for_flush) {
  203. flush_memtable = false;
  204. }
  205. live_wal_files.clear();
  206. }
  207. }
  208. // This is a modified version of GetLiveFiles, to get access to more
  209. // metadata.
  210. mutex_.Lock();
  211. if (flush_memtable) {
  212. bool wal_locked = lock_wal_count_ > 0;
  213. if (wal_locked) {
  214. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  215. "Can't FlushForGetLiveFiles while WAL is locked");
  216. } else {
  217. Status status = FlushForGetLiveFiles();
  218. if (!status.ok()) {
  219. mutex_.Unlock();
  220. ROCKS_LOG_ERROR(immutable_db_options_.info_log,
  221. "Cannot Flush data %s\n", status.ToString().c_str());
  222. return status;
  223. }
  224. }
  225. }
  226. // Make a set of all of the live table and blob files
  227. for (auto cfd : *versions_->GetColumnFamilySet()) {
  228. if (cfd->IsDropped()) {
  229. continue;
  230. }
  231. VersionStorageInfo& vsi = *cfd->current()->storage_info();
  232. auto& cf_paths = cfd->ioptions().cf_paths;
  233. auto GetDir = [&](size_t path_id) {
  234. // Matching TableFileName() behavior
  235. if (path_id >= cf_paths.size()) {
  236. assert(false);
  237. return cf_paths.back().path;
  238. } else {
  239. return cf_paths[path_id].path;
  240. }
  241. };
  242. for (int level = 0; level < vsi.num_levels(); ++level) {
  243. const auto& level_files = vsi.LevelFiles(level);
  244. for (const auto& meta : level_files) {
  245. assert(meta);
  246. results.emplace_back();
  247. LiveFileStorageInfo& info = results.back();
  248. info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
  249. info.directory = GetDir(meta->fd.GetPathId());
  250. info.file_number = meta->fd.GetNumber();
  251. info.file_type = kTableFile;
  252. info.size = meta->fd.GetFileSize();
  253. if (opts.include_checksum_info) {
  254. info.file_checksum_func_name = meta->file_checksum_func_name;
  255. info.file_checksum = meta->file_checksum;
  256. if (info.file_checksum_func_name.empty()) {
  257. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  258. info.file_checksum = kUnknownFileChecksum;
  259. }
  260. }
  261. info.temperature = meta->temperature;
  262. }
  263. }
  264. const auto& blob_files = vsi.GetBlobFiles();
  265. for (const auto& meta : blob_files) {
  266. assert(meta);
  267. results.emplace_back();
  268. LiveFileStorageInfo& info = results.back();
  269. info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
  270. info.directory = GetDir(/* path_id */ 0);
  271. info.file_number = meta->GetBlobFileNumber();
  272. info.file_type = kBlobFile;
  273. info.size = meta->GetBlobFileSize();
  274. if (opts.include_checksum_info) {
  275. info.file_checksum_func_name = meta->GetChecksumMethod();
  276. info.file_checksum = meta->GetChecksumValue();
  277. if (info.file_checksum_func_name.empty()) {
  278. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  279. info.file_checksum = kUnknownFileChecksum;
  280. }
  281. }
  282. // TODO?: info.temperature
  283. }
  284. }
  285. // Capture some final info before releasing mutex
  286. const uint64_t manifest_number = versions_->manifest_file_number();
  287. const uint64_t manifest_size = versions_->manifest_file_size();
  288. const uint64_t options_number = versions_->options_file_number();
  289. const uint64_t options_size = versions_->options_file_size_;
  290. const uint64_t min_log_num = MinLogNumberToKeep();
  291. // Ensure consistency with manifest for track_and_verify_wals_in_manifest
  292. const uint64_t max_log_num = cur_wal_number_;
  293. mutex_.Unlock();
  294. std::string manifest_fname = DescriptorFileName(manifest_number);
  295. { // MANIFEST
  296. results.emplace_back();
  297. LiveFileStorageInfo& info = results.back();
  298. info.relative_filename = manifest_fname;
  299. info.directory = GetName();
  300. info.file_number = manifest_number;
  301. info.file_type = kDescriptorFile;
  302. info.size = manifest_size;
  303. info.trim_to_size = true;
  304. if (opts.include_checksum_info) {
  305. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  306. info.file_checksum = kUnknownFileChecksum;
  307. }
  308. }
  309. { // CURRENT
  310. results.emplace_back();
  311. LiveFileStorageInfo& info = results.back();
  312. info.relative_filename = kCurrentFileName;
  313. info.directory = GetName();
  314. info.file_type = kCurrentFile;
  315. // CURRENT could be replaced so we have to record the contents as needed.
  316. info.replacement_contents = manifest_fname + "\n";
  317. info.size = manifest_fname.size() + 1;
  318. if (opts.include_checksum_info) {
  319. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  320. info.file_checksum = kUnknownFileChecksum;
  321. }
  322. }
  323. // In read-only mode the OPTIONS file number is zero when no OPTIONS file
  324. // exist at all. In this cases we do not record any OPTIONS file in the live
  325. // file list.
  326. if (options_number != 0) {
  327. results.emplace_back();
  328. LiveFileStorageInfo& info = results.back();
  329. info.relative_filename = OptionsFileName(options_number);
  330. info.directory = GetName();
  331. info.file_number = options_number;
  332. info.file_type = kOptionsFile;
  333. info.size = options_size;
  334. if (opts.include_checksum_info) {
  335. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  336. info.file_checksum = kUnknownFileChecksum;
  337. }
  338. }
  339. // Some legacy testing stuff TODO: carefully clean up obsolete parts
  340. TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
  341. TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
  342. TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
  343. if (s.ok()) {
  344. // FlushWAL is required to ensure we can physically copy everything
  345. // logically written to the WAL. (Sync not strictly required for
  346. // active WAL to be copied rather than hard linked, even when
  347. // Checkpoint guarantees that the copied-to file is sync-ed. Plus we can't
  348. // help track_and_verify_wals_in_manifest after manifest_size is
  349. // already determined.)
  350. s = FlushWAL(/*sync=*/false);
  351. if (s.IsNotSupported()) { // read-only DB or similar
  352. s = Status::OK();
  353. }
  354. }
  355. TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
  356. TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
  357. // Even after WAL flush, there could be multiple WALs that are not
  358. // fully synced. Although the output DB of a Checkpoint or Backup needs
  359. // to be fully synced on return, we don't strictly need to sync this
  360. // DB (the input DB). If we allow Checkpoint to hard link an inactive
  361. // WAL that isn't fully synced, that could result in an insufficiently
  362. // sync-ed Checkpoint. Here we get the set of WALs that are potentially
  363. // unsynced or still being written to, to prevent them from being hard
  364. // linked. Enforcing max_log_num from above ensures any new WALs after
  365. // GetOpenWalSizes() and before GetSortedWalFiles() are not included in
  366. // the results.
  367. // NOTE: we might still hard link a file that is open for writing, even
  368. // if we don't do any more writes to it.
  369. //
  370. // In a step toward reducing unnecessary file metadata queries, we also
  371. // get and use our known flushed sizes for those WALs.
  372. // FIXME: eventually we should not be using filesystem queries at all for
  373. // the required set of WAL files.
  374. //
  375. // However for recycled log files, we just copy the whole file,
  376. // for better or worse.
  377. //
  378. std::map<uint64_t, uint64_t> open_wal_number_to_size;
  379. bool recycling_log_files = immutable_db_options_.recycle_log_file_num > 0;
  380. if (s.ok() && !recycling_log_files) {
  381. s = GetOpenWalSizes(open_wal_number_to_size);
  382. }
  383. // [old comment] If we have more than one column family, we also need to get
  384. // WAL files.
  385. if (s.ok()) {
  386. // FIXME: avoid querying the filesystem for current WAL state
  387. s = GetSortedWalFilesImpl(live_wal_files,
  388. /* need_seqnos */ false);
  389. }
  390. if (!s.ok()) {
  391. return s;
  392. }
  393. size_t wal_count = live_wal_files.size();
  394. // Link WAL files. Copy exact size of last one because it is the only one
  395. // that has changes after the last flush.
  396. auto wal_dir = immutable_db_options_.GetWalDir();
  397. for (size_t i = 0; s.ok() && i < wal_count; ++i) {
  398. if ((live_wal_files[i]->Type() == kAliveLogFile) &&
  399. (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num) &&
  400. live_wal_files[i]->LogNumber() <= max_log_num) {
  401. results.emplace_back();
  402. LiveFileStorageInfo& info = results.back();
  403. auto f = live_wal_files[i]->PathName();
  404. assert(!f.empty() && f[0] == '/');
  405. info.relative_filename = f.substr(1);
  406. info.directory = wal_dir;
  407. info.file_number = live_wal_files[i]->LogNumber();
  408. info.file_type = kWalFile;
  409. if (recycling_log_files) {
  410. info.size = live_wal_files[i]->SizeFileBytes();
  411. // Recyclable WAL files must be copied instead of hard linked
  412. info.trim_to_size = true;
  413. } else {
  414. auto it = open_wal_number_to_size.find(info.file_number);
  415. if (it == open_wal_number_to_size.end()) {
  416. // Known fully synced and no future writes (in part from
  417. // max_log_num check). Ok to hard link
  418. info.size = live_wal_files[i]->SizeFileBytes();
  419. assert(!info.trim_to_size);
  420. } else {
  421. // Marked as (possibly) still open -> use our known flushed size
  422. // and force file copy instead of hard link
  423. info.size = it->second;
  424. info.trim_to_size = true;
  425. // FIXME: this is needed as long as db_stress uses
  426. // SetReadUnsyncedData(false), because it will only be able to
  427. // copy the synced portion of the WAL, which under
  428. // SetReadUnsyncedData(false) is given by the reported file size.
  429. info.size = std::min(info.size, live_wal_files[i]->SizeFileBytes());
  430. }
  431. }
  432. if (opts.include_checksum_info) {
  433. info.file_checksum_func_name = kUnknownFileChecksumFuncName;
  434. info.file_checksum = kUnknownFileChecksum;
  435. }
  436. }
  437. }
  438. if (s.ok()) {
  439. // Only move results to output on success.
  440. *files = std::move(results);
  441. }
  442. return s;
  443. }
  444. } // namespace ROCKSDB_NAMESPACE