sst_file_manager_impl.cc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "file/sst_file_manager_impl.h"
  6. #include <cinttypes>
  7. #include <vector>
  8. #include "db/db_impl/db_impl.h"
  9. #include "logging/logging.h"
  10. #include "port/port.h"
  11. #include "rocksdb/env.h"
  12. #include "rocksdb/sst_file_manager.h"
  13. #include "test_util/sync_point.h"
  14. #include "util/mutexlock.h"
  15. namespace ROCKSDB_NAMESPACE {
  16. SstFileManagerImpl::SstFileManagerImpl(
  17. const std::shared_ptr<SystemClock>& clock,
  18. const std::shared_ptr<FileSystem>& fs,
  19. const std::shared_ptr<Logger>& logger, int64_t rate_bytes_per_sec,
  20. double max_trash_db_ratio, uint64_t bytes_max_delete_chunk)
  21. : clock_(clock),
  22. fs_(fs),
  23. logger_(logger),
  24. total_files_size_(0),
  25. compaction_buffer_size_(0),
  26. cur_compactions_reserved_size_(0),
  27. max_allowed_space_(0),
  28. delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec,
  29. logger.get(), this, max_trash_db_ratio,
  30. bytes_max_delete_chunk),
  31. cv_(&mu_),
  32. closing_(false),
  33. bg_thread_(nullptr),
  34. reserved_disk_buffer_(0),
  35. free_space_trigger_(0),
  36. cur_instance_(nullptr) {}
  37. SstFileManagerImpl::~SstFileManagerImpl() {
  38. Close();
  39. bg_err_.PermitUncheckedError();
  40. }
  41. void SstFileManagerImpl::Close() {
  42. {
  43. MutexLock l(&mu_);
  44. if (closing_) {
  45. return;
  46. }
  47. closing_ = true;
  48. cv_.SignalAll();
  49. }
  50. if (bg_thread_) {
  51. bg_thread_->join();
  52. }
  53. }
  54. Status SstFileManagerImpl::OnAddFile(const std::string& file_path) {
  55. uint64_t file_size;
  56. Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
  57. if (s.ok()) {
  58. MutexLock l(&mu_);
  59. OnAddFileImpl(file_path, file_size);
  60. }
  61. TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
  62. const_cast<std::string*>(&file_path));
  63. return s;
  64. }
  65. Status SstFileManagerImpl::OnAddFile(const std::string& file_path,
  66. uint64_t file_size) {
  67. MutexLock l(&mu_);
  68. OnAddFileImpl(file_path, file_size);
  69. TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile",
  70. const_cast<std::string*>(&file_path));
  71. return Status::OK();
  72. }
  73. Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) {
  74. {
  75. MutexLock l(&mu_);
  76. OnDeleteFileImpl(file_path);
  77. }
  78. TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile",
  79. const_cast<std::string*>(&file_path));
  80. return Status::OK();
  81. }
  82. void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) {
  83. MutexLock l(&mu_);
  84. uint64_t size_added_by_compaction = 0;
  85. for (size_t i = 0; i < c->num_input_levels(); i++) {
  86. for (size_t j = 0; j < c->num_input_files(i); j++) {
  87. FileMetaData* filemeta = c->input(i, j);
  88. size_added_by_compaction += filemeta->fd.GetFileSize();
  89. }
  90. }
  91. assert(cur_compactions_reserved_size_ >= size_added_by_compaction);
  92. cur_compactions_reserved_size_ -= size_added_by_compaction;
  93. }
  94. Status SstFileManagerImpl::OnMoveFile(const std::string& old_path,
  95. const std::string& new_path,
  96. uint64_t* file_size) {
  97. {
  98. MutexLock l(&mu_);
  99. if (file_size != nullptr) {
  100. *file_size = tracked_files_[old_path];
  101. }
  102. OnAddFileImpl(new_path, tracked_files_[old_path]);
  103. OnDeleteFileImpl(old_path);
  104. }
  105. TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile");
  106. return Status::OK();
  107. }
  108. Status SstFileManagerImpl::OnUntrackFile(const std::string& file_path) {
  109. {
  110. MutexLock l(&mu_);
  111. OnDeleteFileImpl(file_path);
  112. }
  113. TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnUntrackFile",
  114. const_cast<std::string*>(&file_path));
  115. return Status::OK();
  116. }
  117. void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) {
  118. MutexLock l(&mu_);
  119. max_allowed_space_ = max_allowed_space;
  120. }
  121. void SstFileManagerImpl::SetCompactionBufferSize(
  122. uint64_t compaction_buffer_size) {
  123. MutexLock l(&mu_);
  124. compaction_buffer_size_ = compaction_buffer_size;
  125. }
  126. bool SstFileManagerImpl::IsMaxAllowedSpaceReached() {
  127. MutexLock l(&mu_);
  128. if (max_allowed_space_ <= 0) {
  129. return false;
  130. }
  131. return total_files_size_ >= max_allowed_space_;
  132. }
  133. bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() {
  134. MutexLock l(&mu_);
  135. if (max_allowed_space_ <= 0) {
  136. return false;
  137. }
  138. return total_files_size_ + cur_compactions_reserved_size_ >=
  139. max_allowed_space_;
  140. }
  141. bool SstFileManagerImpl::EnoughRoomForCompaction(
  142. ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
  143. const Status& bg_error) {
  144. MutexLock l(&mu_);
  145. uint64_t size_added_by_compaction = 0;
  146. // First check if we even have the space to do the compaction
  147. for (size_t i = 0; i < inputs.size(); i++) {
  148. for (size_t j = 0; j < inputs[i].size(); j++) {
  149. FileMetaData* filemeta = inputs[i][j];
  150. size_added_by_compaction += filemeta->fd.GetFileSize();
  151. }
  152. }
  153. // Update cur_compactions_reserved_size_ so concurrent compaction
  154. // don't max out space
  155. size_t needed_headroom = cur_compactions_reserved_size_ +
  156. size_added_by_compaction + compaction_buffer_size_;
  157. if (max_allowed_space_ != 0 &&
  158. (needed_headroom + total_files_size_ > max_allowed_space_)) {
  159. return false;
  160. }
  161. // Implement more aggressive checks only if this DB instance has already
  162. // seen a NoSpace() error. This is tin order to contain a single potentially
  163. // misbehaving DB instance and prevent it from slowing down compactions of
  164. // other DB instances
  165. if (bg_error.IsNoSpace() && CheckFreeSpace()) {
  166. auto fn =
  167. TableFileName(cfd->ioptions().cf_paths, inputs[0][0]->fd.GetNumber(),
  168. inputs[0][0]->fd.GetPathId());
  169. uint64_t free_space = 0;
  170. Status s = fs_->GetFreeSpace(fn, IOOptions(), &free_space, nullptr);
  171. s.PermitUncheckedError(); // TODO: Check the status
  172. // needed_headroom is based on current size reserved by compactions,
  173. // minus any files created by running compactions as they would count
  174. // against the reserved size. If user didn't specify any compaction
  175. // buffer, add reserved_disk_buffer_ that's calculated by default so the
  176. // compaction doesn't end up leaving nothing for logs and flush SSTs
  177. if (compaction_buffer_size_ == 0) {
  178. needed_headroom += reserved_disk_buffer_;
  179. }
  180. if (free_space < needed_headroom + size_added_by_compaction) {
  181. // We hit the condition of not enough disk space
  182. ROCKS_LOG_ERROR(logger_,
  183. "free space [%" PRIu64
  184. " bytes] is less than "
  185. "needed headroom [%" ROCKSDB_PRIszt " bytes]\n",
  186. free_space, needed_headroom);
  187. return false;
  188. }
  189. }
  190. cur_compactions_reserved_size_ += size_added_by_compaction;
  191. // Take a snapshot of cur_compactions_reserved_size_ for when we encounter
  192. // a NoSpace error.
  193. free_space_trigger_ = cur_compactions_reserved_size_;
  194. return true;
  195. }
  196. uint64_t SstFileManagerImpl::GetCompactionsReservedSize() {
  197. MutexLock l(&mu_);
  198. return cur_compactions_reserved_size_;
  199. }
  200. uint64_t SstFileManagerImpl::GetTotalSize() {
  201. MutexLock l(&mu_);
  202. return total_files_size_;
  203. }
  204. std::unordered_map<std::string, uint64_t>
  205. SstFileManagerImpl::GetTrackedFiles() {
  206. MutexLock l(&mu_);
  207. return tracked_files_;
  208. }
  209. int64_t SstFileManagerImpl::GetDeleteRateBytesPerSecond() {
  210. return delete_scheduler_.GetRateBytesPerSecond();
  211. }
  212. void SstFileManagerImpl::SetDeleteRateBytesPerSecond(int64_t delete_rate) {
  213. return delete_scheduler_.SetRateBytesPerSecond(delete_rate);
  214. }
  215. double SstFileManagerImpl::GetMaxTrashDBRatio() {
  216. return delete_scheduler_.GetMaxTrashDBRatio();
  217. }
  218. void SstFileManagerImpl::SetMaxTrashDBRatio(double r) {
  219. return delete_scheduler_.SetMaxTrashDBRatio(r);
  220. }
  221. uint64_t SstFileManagerImpl::GetTotalTrashSize() {
  222. return delete_scheduler_.GetTotalTrashSize();
  223. }
  224. void SstFileManagerImpl::ReserveDiskBuffer(uint64_t size,
  225. const std::string& path) {
  226. MutexLock l(&mu_);
  227. reserved_disk_buffer_ += size;
  228. if (path_.empty()) {
  229. path_ = path;
  230. }
  231. }
  232. void SstFileManagerImpl::ClearError() {
  233. while (true) {
  234. MutexLock l(&mu_);
  235. if (error_handler_list_.empty() || closing_) {
  236. return;
  237. }
  238. uint64_t free_space = 0;
  239. Status s = fs_->GetFreeSpace(path_, IOOptions(), &free_space, nullptr);
  240. free_space = max_allowed_space_ > 0
  241. ? std::min(max_allowed_space_, free_space)
  242. : free_space;
  243. if (s.ok()) {
  244. // In case of multi-DB instances, some of them may have experienced a
  245. // soft error and some a hard error. In the SstFileManagerImpl, a hard
  246. // error will basically override previously reported soft errors. Once
  247. // we clear the hard error, we don't keep track of previous errors for
  248. // now
  249. if (bg_err_.severity() == Status::Severity::kHardError) {
  250. if (free_space < reserved_disk_buffer_) {
  251. ROCKS_LOG_ERROR(logger_,
  252. "free space [%" PRIu64
  253. " bytes] is less than "
  254. "required disk buffer [%" PRIu64 " bytes]\n",
  255. free_space, reserved_disk_buffer_);
  256. ROCKS_LOG_ERROR(logger_, "Cannot clear hard error\n");
  257. s = Status::NoSpace();
  258. }
  259. } else if (bg_err_.severity() == Status::Severity::kSoftError) {
  260. if (free_space < free_space_trigger_) {
  261. ROCKS_LOG_WARN(logger_,
  262. "free space [%" PRIu64
  263. " bytes] is less than "
  264. "free space for compaction trigger [%" PRIu64
  265. " bytes]\n",
  266. free_space, free_space_trigger_);
  267. ROCKS_LOG_WARN(logger_, "Cannot clear soft error\n");
  268. s = Status::NoSpace();
  269. }
  270. }
  271. }
  272. // Someone could have called CancelErrorRecovery() and the list could have
  273. // become empty, so check again here
  274. if (s.ok()) {
  275. assert(!error_handler_list_.empty());
  276. auto error_handler = error_handler_list_.front();
  277. // Since we will release the mutex, set cur_instance_ to signal to the
  278. // shutdown thread, if it calls // CancelErrorRecovery() the meantime,
  279. // to indicate that this DB instance is busy. The DB instance is
  280. // guaranteed to not be deleted before RecoverFromBGError() returns,
  281. // since the ErrorHandler::recovery_in_prog_ flag would be true
  282. cur_instance_ = error_handler;
  283. mu_.Unlock();
  284. s = error_handler->RecoverFromBGError();
  285. TEST_SYNC_POINT("SstFileManagerImpl::ErrorCleared");
  286. mu_.Lock();
  287. // The DB instance might have been deleted while we were
  288. // waiting for the mutex, so check cur_instance_ to make sure its
  289. // still non-null
  290. if (cur_instance_) {
  291. // Check for error again, since the instance may have recovered but
  292. // immediately got another error. If that's the case, and the new
  293. // error is also a NoSpace() non-fatal error, leave the instance in
  294. // the list
  295. Status err = cur_instance_->GetBGError();
  296. if (s.ok() && err.subcode() == IOStatus::SubCode::kNoSpace &&
  297. err.severity() < Status::Severity::kFatalError) {
  298. s = err;
  299. }
  300. cur_instance_ = nullptr;
  301. }
  302. if (s.ok() || s.IsShutdownInProgress() ||
  303. (!s.ok() && s.severity() >= Status::Severity::kFatalError)) {
  304. // If shutdown is in progress, abandon this handler instance
  305. // and continue with the others
  306. error_handler_list_.pop_front();
  307. }
  308. }
  309. if (!error_handler_list_.empty()) {
  310. // If there are more instances to be recovered, reschedule after 5
  311. // seconds
  312. int64_t wait_until = clock_->NowMicros() + 5000000;
  313. cv_.TimedWait(wait_until);
  314. }
  315. // Check again for error_handler_list_ empty, as a DB instance shutdown
  316. // could have removed it from the queue while we were in timed wait
  317. if (error_handler_list_.empty()) {
  318. ROCKS_LOG_INFO(logger_, "Clearing error\n");
  319. bg_err_ = Status::OK();
  320. return;
  321. }
  322. }
  323. }
  324. void SstFileManagerImpl::StartErrorRecovery(ErrorHandler* handler,
  325. Status bg_error) {
  326. MutexLock l(&mu_);
  327. if (bg_error.severity() == Status::Severity::kSoftError) {
  328. if (bg_err_.ok()) {
  329. // Setting bg_err_ basically means we're in degraded mode
  330. // Assume that all pending compactions will fail similarly. The trigger
  331. // for clearing this condition is set to current compaction reserved
  332. // size, so we stop checking disk space available in
  333. // EnoughRoomForCompaction once this much free space is available
  334. bg_err_ = bg_error;
  335. }
  336. } else if (bg_error.severity() == Status::Severity::kHardError) {
  337. bg_err_ = bg_error;
  338. } else {
  339. assert(false);
  340. }
  341. // If this is the first instance of this error, kick of a thread to poll
  342. // and recover from this condition
  343. if (error_handler_list_.empty()) {
  344. error_handler_list_.push_back(handler);
  345. // Release lock before calling join. Its ok to do so because
  346. // error_handler_list_ is now non-empty, so no other invocation of this
  347. // function will execute this piece of code
  348. mu_.Unlock();
  349. if (bg_thread_) {
  350. bg_thread_->join();
  351. }
  352. // Start a new thread. The previous one would have exited.
  353. bg_thread_.reset(new port::Thread(&SstFileManagerImpl::ClearError, this));
  354. mu_.Lock();
  355. } else {
  356. // Check if this DB instance is already in the list
  357. for (auto iter = error_handler_list_.begin();
  358. iter != error_handler_list_.end(); ++iter) {
  359. if ((*iter) == handler) {
  360. return;
  361. }
  362. }
  363. error_handler_list_.push_back(handler);
  364. }
  365. }
  366. bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
  367. MutexLock l(&mu_);
  368. if (cur_instance_ == handler) {
  369. // This instance is currently busy attempting to recover
  370. // Nullify it so the recovery thread doesn't attempt to access it again
  371. cur_instance_ = nullptr;
  372. return false;
  373. }
  374. for (auto iter = error_handler_list_.begin();
  375. iter != error_handler_list_.end(); ++iter) {
  376. if ((*iter) == handler) {
  377. error_handler_list_.erase(iter);
  378. return true;
  379. }
  380. }
  381. return false;
  382. }
  383. Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
  384. const std::string& path_to_sync,
  385. const bool force_bg) {
  386. TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
  387. const_cast<std::string*>(&file_path));
  388. return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
  389. }
  390. Status SstFileManagerImpl::ScheduleUnaccountedFileDeletion(
  391. const std::string& file_path, const std::string& dir_to_sync,
  392. const bool force_bg, std::optional<int32_t> bucket) {
  393. TEST_SYNC_POINT_CALLBACK(
  394. "SstFileManagerImpl::ScheduleUnaccountedFileDeletion",
  395. const_cast<std::string*>(&file_path));
  396. return delete_scheduler_.DeleteUnaccountedFile(file_path, dir_to_sync,
  397. force_bg, bucket);
  398. }
  399. void SstFileManagerImpl::WaitForEmptyTrash() {
  400. delete_scheduler_.WaitForEmptyTrash();
  401. }
  402. std::optional<int32_t> SstFileManagerImpl::NewTrashBucket() {
  403. return delete_scheduler_.NewTrashBucket();
  404. }
  405. void SstFileManagerImpl::WaitForEmptyTrashBucket(int32_t bucket) {
  406. delete_scheduler_.WaitForEmptyTrashBucket(bucket);
  407. }
  408. void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
  409. uint64_t file_size) {
  410. auto tracked_file = tracked_files_.find(file_path);
  411. if (tracked_file != tracked_files_.end()) {
  412. // File was added before, we will just update the size
  413. total_files_size_ -= tracked_file->second;
  414. total_files_size_ += file_size;
  415. } else {
  416. total_files_size_ += file_size;
  417. }
  418. tracked_files_[file_path] = file_size;
  419. }
  420. void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) {
  421. auto tracked_file = tracked_files_.find(file_path);
  422. if (tracked_file == tracked_files_.end()) {
  423. // File is not tracked
  424. return;
  425. }
  426. total_files_size_ -= tracked_file->second;
  427. tracked_files_.erase(tracked_file);
  428. }
  429. SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<Logger> info_log,
  430. std::string trash_dir,
  431. int64_t rate_bytes_per_sec,
  432. bool delete_existing_trash, Status* status,
  433. double max_trash_db_ratio,
  434. uint64_t bytes_max_delete_chunk) {
  435. const auto& fs = env->GetFileSystem();
  436. return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec,
  437. delete_existing_trash, status, max_trash_db_ratio,
  438. bytes_max_delete_chunk);
  439. }
  440. SstFileManager* NewSstFileManager(Env* env, std::shared_ptr<FileSystem> fs,
  441. std::shared_ptr<Logger> info_log,
  442. const std::string& trash_dir,
  443. int64_t rate_bytes_per_sec,
  444. bool delete_existing_trash, Status* status,
  445. double max_trash_db_ratio,
  446. uint64_t bytes_max_delete_chunk) {
  447. const auto& clock = env->GetSystemClock();
  448. SstFileManagerImpl* res =
  449. new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec,
  450. max_trash_db_ratio, bytes_max_delete_chunk);
  451. // trash_dir is deprecated and not needed anymore, but if user passed it
  452. // we will still remove files in it.
  453. Status s = Status::OK();
  454. if (delete_existing_trash && trash_dir != "") {
  455. std::vector<std::string> files_in_trash;
  456. s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr);
  457. if (s.ok()) {
  458. for (const std::string& trash_file : files_in_trash) {
  459. std::string path_in_trash = trash_dir + "/" + trash_file;
  460. res->OnAddFile(path_in_trash);
  461. Status file_delete =
  462. res->ScheduleFileDeletion(path_in_trash, trash_dir);
  463. if (s.ok() && !file_delete.ok()) {
  464. s = file_delete;
  465. }
  466. }
  467. }
  468. }
  469. if (status) {
  470. *status = s;
  471. } else {
  472. // No one passed us a Status, so they must not care about the error...
  473. s.PermitUncheckedError();
  474. }
  475. return res;
  476. }
  477. } // namespace ROCKSDB_NAMESPACE