checkpoint_impl.cc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2012 Facebook.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file.
  9. #include "utilities/checkpoint/checkpoint_impl.h"
  10. #include <algorithm>
  11. #include <cinttypes>
  12. #include <string>
  13. #include <tuple>
  14. #include <unordered_set>
  15. #include <vector>
  16. #include "db/wal_manager.h"
  17. #include "file/file_util.h"
  18. #include "file/filename.h"
  19. #include "logging/logging.h"
  20. #include "port/port.h"
  21. #include "rocksdb/db.h"
  22. #include "rocksdb/env.h"
  23. #include "rocksdb/metadata.h"
  24. #include "rocksdb/options.h"
  25. #include "rocksdb/transaction_log.h"
  26. #include "rocksdb/types.h"
  27. #include "rocksdb/utilities/checkpoint.h"
  28. #include "test_util/sync_point.h"
  29. #include "util/cast_util.h"
  30. #include "util/file_checksum_helper.h"
  31. namespace ROCKSDB_NAMESPACE {
  32. Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) {
  33. *checkpoint_ptr = new CheckpointImpl(db);
  34. return Status::OK();
  35. }
  36. Status Checkpoint::CreateCheckpoint(const std::string& /*checkpoint_dir*/,
  37. uint64_t /*log_size_for_flush*/,
  38. uint64_t* /*sequence_number_ptr*/) {
  39. return Status::NotSupported("");
  40. }
  41. Status CheckpointImpl::CleanStagingDirectory(
  42. const std::string& full_private_path, Logger* info_log) {
  43. std::vector<std::string> subchildren;
  44. Status s = db_->GetEnv()->FileExists(full_private_path);
  45. if (s.IsNotFound()) {
  46. // Nothing to clean
  47. return Status::OK();
  48. } else if (!s.ok()) {
  49. return s;
  50. }
  51. assert(s.ok());
  52. ROCKS_LOG_INFO(info_log, "File exists %s -- %s", full_private_path.c_str(),
  53. s.ToString().c_str());
  54. s = db_->GetEnv()->GetChildren(full_private_path, &subchildren);
  55. if (s.ok()) {
  56. for (auto& subchild : subchildren) {
  57. Status del_s;
  58. std::string subchild_path = full_private_path + "/" + subchild;
  59. del_s = db_->GetEnv()->DeleteFile(subchild_path);
  60. ROCKS_LOG_INFO(info_log, "Delete file %s -- %s", subchild_path.c_str(),
  61. del_s.ToString().c_str());
  62. if (!del_s.ok() && s.ok()) {
  63. s = del_s;
  64. }
  65. }
  66. }
  67. // Then delete the private dir
  68. if (s.ok()) {
  69. s = db_->GetEnv()->DeleteDir(full_private_path);
  70. ROCKS_LOG_INFO(info_log, "Delete dir %s -- %s", full_private_path.c_str(),
  71. s.ToString().c_str());
  72. }
  73. return s;
  74. }
  75. Status Checkpoint::ExportColumnFamily(
  76. ColumnFamilyHandle* /*handle*/, const std::string& /*export_dir*/,
  77. ExportImportFilesMetaData** /*metadata*/) {
  78. return Status::NotSupported("");
  79. }
  80. // Builds an openable snapshot of RocksDB
  81. Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
  82. uint64_t log_size_for_flush,
  83. uint64_t* sequence_number_ptr) {
  84. DBOptions db_options = db_->GetDBOptions();
  85. Status file_exists_s = db_->GetEnv()->FileExists(checkpoint_dir);
  86. if (file_exists_s.ok()) {
  87. return Status::InvalidArgument("Directory exists");
  88. } else if (!file_exists_s.IsNotFound()) {
  89. assert(file_exists_s.IsIOError());
  90. return file_exists_s;
  91. } else {
  92. assert(file_exists_s.IsNotFound());
  93. };
  94. Status s;
  95. ROCKS_LOG_INFO(
  96. db_options.info_log,
  97. "Started the snapshot process -- creating snapshot in directory %s",
  98. checkpoint_dir.c_str());
  99. size_t final_nonslash_idx = checkpoint_dir.find_last_not_of('/');
  100. if (final_nonslash_idx == std::string::npos) {
  101. // npos means it's only slashes or empty. Non-empty means it's the root
  102. // directory, but it shouldn't be because we verified above the directory
  103. // doesn't exist.
  104. assert(checkpoint_dir.empty());
  105. s.PermitUncheckedError();
  106. return Status::InvalidArgument("invalid checkpoint directory name");
  107. }
  108. std::string full_private_path =
  109. checkpoint_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
  110. ROCKS_LOG_INFO(db_options.info_log,
  111. "Snapshot process -- using temporary directory %s",
  112. full_private_path.c_str());
  113. s = CleanStagingDirectory(full_private_path, db_options.info_log.get());
  114. if (!s.ok()) {
  115. return Status::Aborted(
  116. "Failed to clean the temporary directory " + full_private_path +
  117. " needed before checkpoint creation : " + s.ToString());
  118. }
  119. // create snapshot directory
  120. s = db_->GetEnv()->CreateDir(full_private_path);
  121. uint64_t sequence_number = 0;
  122. if (s.ok()) {
  123. // enable file deletions
  124. s = db_->DisableFileDeletions();
  125. const bool disabled_file_deletions = s.ok();
  126. if (s.ok() || s.IsNotSupported()) {
  127. s = CreateCustomCheckpoint(
  128. [&](const std::string& src_dirname, const std::string& fname,
  129. FileType) {
  130. ROCKS_LOG_INFO(db_options.info_log, "Hard Linking %s",
  131. fname.c_str());
  132. return db_->GetFileSystem()->LinkFile(
  133. src_dirname + "/" + fname, full_private_path + "/" + fname,
  134. IOOptions(), nullptr);
  135. } /* link_file_cb */,
  136. [&](const std::string& src_dirname, const std::string& fname,
  137. uint64_t size_limit_bytes, FileType,
  138. const std::string& /* checksum_func_name */,
  139. const std::string& /* checksum_val */,
  140. const Temperature temperature) {
  141. ROCKS_LOG_INFO(db_options.info_log, "Copying %s", fname.c_str());
  142. return CopyFile(db_->GetFileSystem(), src_dirname + "/" + fname,
  143. temperature, full_private_path + "/" + fname,
  144. temperature, size_limit_bytes, db_options.use_fsync,
  145. nullptr);
  146. } /* copy_file_cb */,
  147. [&](const std::string& fname, const std::string& contents, FileType) {
  148. ROCKS_LOG_INFO(db_options.info_log, "Creating %s", fname.c_str());
  149. return CreateFile(db_->GetFileSystem(),
  150. full_private_path + "/" + fname, contents,
  151. db_options.use_fsync);
  152. } /* create_file_cb */,
  153. &sequence_number, log_size_for_flush);
  154. // we copied all the files, enable file deletions
  155. if (disabled_file_deletions) {
  156. Status ss = db_->EnableFileDeletions();
  157. assert(ss.ok());
  158. ss.PermitUncheckedError();
  159. }
  160. }
  161. }
  162. if (s.ok()) {
  163. // move tmp private backup to real snapshot directory
  164. s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir);
  165. }
  166. if (s.ok()) {
  167. std::unique_ptr<FSDirectory> checkpoint_directory;
  168. s = db_->GetFileSystem()->NewDirectory(checkpoint_dir, IOOptions(),
  169. &checkpoint_directory, nullptr);
  170. if (s.ok() && checkpoint_directory != nullptr) {
  171. s = checkpoint_directory->FsyncWithDirOptions(
  172. IOOptions(), nullptr,
  173. DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
  174. }
  175. }
  176. if (s.ok()) {
  177. if (sequence_number_ptr != nullptr) {
  178. *sequence_number_ptr = sequence_number;
  179. }
  180. // here we know that we succeeded and installed the new snapshot
  181. ROCKS_LOG_INFO(db_options.info_log, "Snapshot DONE. All is good");
  182. ROCKS_LOG_INFO(db_options.info_log, "Snapshot sequence number: %" PRIu64,
  183. sequence_number);
  184. } else {
  185. ROCKS_LOG_INFO(db_options.info_log, "Snapshot failed -- %s",
  186. s.ToString().c_str());
  187. // clean all the files and directory we might have created
  188. Status del_s =
  189. CleanStagingDirectory(full_private_path, db_options.info_log.get());
  190. ROCKS_LOG_INFO(db_options.info_log,
  191. "Clean files or directory we might have created %s: %s",
  192. full_private_path.c_str(), del_s.ToString().c_str());
  193. del_s.PermitUncheckedError();
  194. }
  195. return s;
  196. }
  197. Status CheckpointImpl::CreateCustomCheckpoint(
  198. std::function<Status(const std::string& src_dirname,
  199. const std::string& src_fname, FileType type)>
  200. link_file_cb,
  201. std::function<
  202. Status(const std::string& src_dirname, const std::string& src_fname,
  203. uint64_t size_limit_bytes, FileType type,
  204. const std::string& checksum_func_name,
  205. const std::string& checksum_val, const Temperature temperature)>
  206. copy_file_cb,
  207. std::function<Status(const std::string& fname, const std::string& contents,
  208. FileType type)>
  209. create_file_cb,
  210. uint64_t* sequence_number, uint64_t log_size_for_flush,
  211. bool get_live_table_checksum) {
  212. *sequence_number = db_->GetLatestSequenceNumber();
  213. LiveFilesStorageInfoOptions opts;
  214. opts.include_checksum_info = get_live_table_checksum;
  215. opts.wal_size_for_flush = log_size_for_flush;
  216. std::vector<LiveFileStorageInfo> infos;
  217. {
  218. Status s = db_->GetLiveFilesStorageInfo(opts, &infos);
  219. if (!s.ok()) {
  220. return s;
  221. }
  222. }
  223. // Verify that everything except WAL files are in same directory
  224. // (db_paths / cf_paths not supported)
  225. std::unordered_set<std::string> dirs;
  226. for (auto& info : infos) {
  227. if (info.file_type != kWalFile) {
  228. dirs.insert(info.directory);
  229. }
  230. }
  231. if (dirs.size() > 1) {
  232. return Status::NotSupported(
  233. "db_paths / cf_paths not supported for Checkpoint nor BackupEngine");
  234. }
  235. bool same_fs = true;
  236. for (auto& info : infos) {
  237. Status s;
  238. if (!info.replacement_contents.empty()) {
  239. // Currently should only be used for CURRENT file.
  240. assert(info.file_type == kCurrentFile);
  241. if (info.size != info.replacement_contents.size()) {
  242. s = Status::Corruption("Inconsistent size metadata for " +
  243. info.relative_filename);
  244. } else {
  245. s = create_file_cb(info.relative_filename, info.replacement_contents,
  246. info.file_type);
  247. }
  248. } else {
  249. if (same_fs && !info.trim_to_size) {
  250. s = link_file_cb(info.directory, info.relative_filename,
  251. info.file_type);
  252. if (s.IsNotSupported()) {
  253. same_fs = false;
  254. s = Status::OK();
  255. }
  256. s.MustCheck();
  257. }
  258. if (!same_fs || info.trim_to_size) {
  259. assert(info.file_checksum_func_name.empty() ==
  260. !opts.include_checksum_info);
  261. // no assertion on file_checksum because empty is used for both "not
  262. // set" and "unknown"
  263. if (opts.include_checksum_info) {
  264. s = copy_file_cb(info.directory, info.relative_filename, info.size,
  265. info.file_type, info.file_checksum_func_name,
  266. info.file_checksum, info.temperature);
  267. } else {
  268. s = copy_file_cb(info.directory, info.relative_filename, info.size,
  269. info.file_type, kUnknownFileChecksumFuncName,
  270. kUnknownFileChecksum, info.temperature);
  271. }
  272. }
  273. }
  274. if (!s.ok()) {
  275. return s;
  276. }
  277. }
  278. return Status::OK();
  279. }
  280. // Exports all live SST files of a specified Column Family onto export_dir,
  281. // returning SST files information in metadata.
  282. Status CheckpointImpl::ExportColumnFamily(
  283. ColumnFamilyHandle* handle, const std::string& export_dir,
  284. ExportImportFilesMetaData** metadata) {
  285. auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
  286. const auto cf_name = cfh->GetName();
  287. const auto db_options = db_->GetDBOptions();
  288. assert(metadata != nullptr);
  289. assert(*metadata == nullptr);
  290. auto s = db_->GetEnv()->FileExists(export_dir);
  291. if (s.ok()) {
  292. return Status::InvalidArgument("Specified export_dir exists");
  293. } else if (!s.IsNotFound()) {
  294. assert(s.IsIOError());
  295. return s;
  296. }
  297. const auto final_nonslash_idx = export_dir.find_last_not_of('/');
  298. if (final_nonslash_idx == std::string::npos) {
  299. return Status::InvalidArgument("Specified export_dir invalid");
  300. }
  301. ROCKS_LOG_INFO(db_options.info_log,
  302. "[%s] export column family onto export directory %s",
  303. cf_name.c_str(), export_dir.c_str());
  304. // Create a temporary export directory.
  305. const auto tmp_export_dir =
  306. export_dir.substr(0, final_nonslash_idx + 1) + ".tmp";
  307. s = db_->GetEnv()->CreateDir(tmp_export_dir);
  308. if (s.ok()) {
  309. // FIXME: should respect atomic_flush and flush all CFs if needed.
  310. s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
  311. }
  312. ColumnFamilyMetaData db_metadata;
  313. if (s.ok()) {
  314. // Export live sst files with file deletions disabled.
  315. s = db_->DisableFileDeletions();
  316. if (s.ok()) {
  317. db_->GetColumnFamilyMetaData(handle, &db_metadata);
  318. s = ExportFilesInMetaData(
  319. db_options, db_metadata,
  320. [&](const std::string& src_dirname, const std::string& fname) {
  321. ROCKS_LOG_INFO(db_options.info_log, "[%s] HardLinking %s",
  322. cf_name.c_str(), fname.c_str());
  323. return db_->GetEnv()->LinkFile(src_dirname + fname,
  324. tmp_export_dir + fname);
  325. } /*link_file_cb*/,
  326. [&](const std::string& src_dirname, const std::string& fname) {
  327. ROCKS_LOG_INFO(db_options.info_log, "[%s] Copying %s",
  328. cf_name.c_str(), fname.c_str());
  329. // FIXME: temperature handling
  330. return CopyFile(db_->GetFileSystem(), src_dirname + fname,
  331. Temperature::kUnknown, tmp_export_dir + fname,
  332. Temperature::kUnknown, 0, db_options.use_fsync,
  333. nullptr);
  334. } /*copy_file_cb*/);
  335. const auto enable_status = db_->EnableFileDeletions();
  336. if (s.ok()) {
  337. s = enable_status;
  338. }
  339. }
  340. }
  341. auto moved_to_user_specified_dir = false;
  342. if (s.ok()) {
  343. // Move temporary export directory to the actual export directory.
  344. s = db_->GetEnv()->RenameFile(tmp_export_dir, export_dir);
  345. }
  346. if (s.ok()) {
  347. // Fsync export directory.
  348. moved_to_user_specified_dir = true;
  349. std::unique_ptr<FSDirectory> dir_ptr;
  350. s = db_->GetFileSystem()->NewDirectory(export_dir, IOOptions(), &dir_ptr,
  351. nullptr);
  352. if (s.ok()) {
  353. assert(dir_ptr != nullptr);
  354. s = dir_ptr->FsyncWithDirOptions(
  355. IOOptions(), nullptr,
  356. DirFsyncOptions(DirFsyncOptions::FsyncReason::kDirRenamed));
  357. }
  358. }
  359. if (s.ok()) {
  360. // Export of files succeeded. Fill in the metadata information.
  361. auto result_metadata = new ExportImportFilesMetaData();
  362. result_metadata->db_comparator_name = handle->GetComparator()->Name();
  363. for (const auto& level_metadata : db_metadata.levels) {
  364. for (const auto& file_metadata : level_metadata.files) {
  365. LiveFileMetaData live_file_metadata;
  366. live_file_metadata.size = file_metadata.size;
  367. live_file_metadata.name = file_metadata.name;
  368. live_file_metadata.file_number = file_metadata.file_number;
  369. live_file_metadata.db_path = export_dir;
  370. live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
  371. live_file_metadata.largest_seqno = file_metadata.largest_seqno;
  372. live_file_metadata.smallestkey = file_metadata.smallestkey;
  373. live_file_metadata.largestkey = file_metadata.largestkey;
  374. live_file_metadata.oldest_blob_file_number =
  375. file_metadata.oldest_blob_file_number;
  376. live_file_metadata.epoch_number = file_metadata.epoch_number;
  377. live_file_metadata.level = level_metadata.level;
  378. live_file_metadata.smallest = file_metadata.smallest;
  379. live_file_metadata.largest = file_metadata.largest;
  380. result_metadata->files.push_back(live_file_metadata);
  381. }
  382. *metadata = result_metadata;
  383. }
  384. ROCKS_LOG_INFO(db_options.info_log, "[%s] Export succeeded.",
  385. cf_name.c_str());
  386. } else {
  387. // Failure: Clean up all the files/directories created.
  388. ROCKS_LOG_INFO(db_options.info_log, "[%s] Export failed. %s",
  389. cf_name.c_str(), s.ToString().c_str());
  390. std::vector<std::string> subchildren;
  391. const auto cleanup_dir =
  392. moved_to_user_specified_dir ? export_dir : tmp_export_dir;
  393. db_->GetEnv()->GetChildren(cleanup_dir, &subchildren);
  394. for (const auto& subchild : subchildren) {
  395. const auto subchild_path = cleanup_dir + "/" + subchild;
  396. const auto status = db_->GetEnv()->DeleteFile(subchild_path);
  397. if (!status.ok()) {
  398. ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup file %s: %s",
  399. subchild_path.c_str(), status.ToString().c_str());
  400. }
  401. }
  402. const auto status = db_->GetEnv()->DeleteDir(cleanup_dir);
  403. if (!status.ok()) {
  404. ROCKS_LOG_WARN(db_options.info_log, "Failed to cleanup dir %s: %s",
  405. cleanup_dir.c_str(), status.ToString().c_str());
  406. }
  407. }
  408. return s;
  409. }
  410. Status CheckpointImpl::ExportFilesInMetaData(
  411. const DBOptions& db_options, const ColumnFamilyMetaData& metadata,
  412. std::function<Status(const std::string& src_dirname,
  413. const std::string& src_fname)>
  414. link_file_cb,
  415. std::function<Status(const std::string& src_dirname,
  416. const std::string& src_fname)>
  417. copy_file_cb) {
  418. Status s;
  419. auto hardlink_file = true;
  420. // Copy/hard link files in metadata.
  421. size_t num_files = 0;
  422. for (const auto& level_metadata : metadata.levels) {
  423. for (const auto& file_metadata : level_metadata.files) {
  424. uint64_t number;
  425. FileType type;
  426. const auto ok = ParseFileName(file_metadata.name, &number, &type);
  427. if (!ok) {
  428. s = Status::Corruption("Could not parse file name");
  429. break;
  430. }
  431. // We should only get sst files here.
  432. assert(type == kTableFile);
  433. assert(file_metadata.size > 0 && file_metadata.name[0] == '/');
  434. const auto src_fname = file_metadata.name;
  435. ++num_files;
  436. if (hardlink_file) {
  437. s = link_file_cb(db_->GetName(), src_fname);
  438. if (num_files == 1 && s.IsNotSupported()) {
  439. // Fallback to copy if link failed due to cross-device directories.
  440. hardlink_file = false;
  441. s = Status::OK();
  442. }
  443. }
  444. if (!hardlink_file) {
  445. s = copy_file_cb(db_->GetName(), src_fname);
  446. }
  447. if (!s.ok()) {
  448. break;
  449. }
  450. }
  451. }
  452. ROCKS_LOG_INFO(db_options.info_log, "Number of table files %" ROCKSDB_PRIszt,
  453. num_files);
  454. return s;
  455. }
  456. } // namespace ROCKSDB_NAMESPACE