// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "db/error_handler.h"

#include "db/db_impl/db_impl.h"
#include "db/event_helpers.h"
#include "file/sst_file_manager_impl.h"
#include "logging/logging.h"
#include "port/lang.h"

namespace ROCKSDB_NAMESPACE {

// Maps to help decide the severity of an error based on the
// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
// is set or not. There are 3 maps, going from most specific to least specific
// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and
// paranoid_checks). The less specific map serves as a catch all in case we
// miss a specific error code or subcode.
std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
         Status::Severity>
    ErrorSeverityMap = {
        // Errors during BG compaction
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         true),
         Status::Severity::kSoftError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         false),
         Status::Severity::kFatalError},
        // Errors during BG flush
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         Status::SubCode::kNoSpace, true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         Status::SubCode::kNoSpace, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         Status::SubCode::kSpaceLimit, true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         Status::SubCode::kIOFenced, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         Status::SubCode::kIOFenced, false),
         Status::Severity::kFatalError},
        // Errors during Write
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         false),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         false),
         Status::Severity::kFatalError},
        // Errors during MANIFEST write
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         false),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         false),
         Status::Severity::kFatalError},
        // Errors during BG flush with WAL disabled
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, Status::SubCode::kSpaceLimit,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         false),
         Status::Severity::kFatalError},
        // Errors during MANIFEST write when WAL is disabled
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         true),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, Status::SubCode::kNoSpace,
                         false),
         Status::Severity::kHardError},
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, Status::SubCode::kIOFenced,
                         false),
         Status::Severity::kFatalError},
};

std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
         Status::Severity>
    DefaultErrorSeverityMap = {
        // Errors during BG compaction
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kCorruption, true),
         Status::Severity::kUnrecoverableError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kCorruption, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kCompaction,
                         Status::Code::kIOError, false),
         Status::Severity::kNoError},
        // Errors during BG flush
        {std::make_tuple(BackgroundErrorReason::kFlush,
                         Status::Code::kCorruption, true),
         Status::Severity::kUnrecoverableError},
        {std::make_tuple(BackgroundErrorReason::kFlush,
                         Status::Code::kCorruption, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
                         false),
         Status::Severity::kNoError},
        // Errors during Write
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kCorruption, true),
         Status::Severity::kUnrecoverableError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kCorruption, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback,
                         Status::Code::kIOError, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kManifestWrite,
                         Status::Code::kIOError, false),
         Status::Severity::kFatalError},
        // Errors during BG flush with WAL disabled
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kCorruption, true),
         Status::Severity::kUnrecoverableError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kCorruption, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
                         Status::Code::kIOError, false),
         Status::Severity::kNoError},
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
                         Status::Code::kIOError, false),
         Status::Severity::kFatalError},
};

std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
    DefaultReasonMap = {
        // Errors during BG compaction
        {std::make_tuple(BackgroundErrorReason::kCompaction, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kCompaction, false),
         Status::Severity::kNoError},
        // Errors during BG flush
        {std::make_tuple(BackgroundErrorReason::kFlush, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kFlush, false),
         Status::Severity::kNoError},
        // Errors during Write
        {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
         Status::Severity::kFatalError},
        // Errors during Memtable update
        {std::make_tuple(BackgroundErrorReason::kMemTable, true),
         Status::Severity::kFatalError},
        {std::make_tuple(BackgroundErrorReason::kMemTable, false),
         Status::Severity::kFatalError},
};
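
// Illustrative sketch (editor's note, not part of the original source): the
// three maps above are consulted from most specific to least specific. For
// example, with paranoid_checks enabled, an IOError with SubCode::kNoSpace
// during a flush matches ErrorSeverityMap (kHardError); an IOError with an
// unlisted subcode falls through to DefaultErrorSeverityMap
// (kFlush + kIOError + paranoid -> kFatalError); and an error whose Code is
// not listed at all ends up in DefaultReasonMap
// (kFlush + paranoid -> kFatalError). A minimal sketch of that lookup order,
// mirroring HandleKnownErrors() below:
//
//   Status::Severity LookupSeverity(BackgroundErrorReason reason,
//                                   const Status& s, bool paranoid) {
//     auto it = ErrorSeverityMap.find(
//         std::make_tuple(reason, s.code(), s.subcode(), paranoid));
//     if (it != ErrorSeverityMap.end()) return it->second;
//     auto it2 = DefaultErrorSeverityMap.find(
//         std::make_tuple(reason, s.code(), paranoid));
//     if (it2 != DefaultErrorSeverityMap.end()) return it2->second;
//     auto it3 = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
//     // Same fallback as HandleKnownErrors(): default to kFatalError.
//     return it3 != DefaultReasonMap.end() ? it3->second
//                                          : Status::Severity::kFatalError;
//   }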

void ErrorHandler::CancelErrorRecoveryForShutDown() {
  db_mutex_->AssertHeld();

  // We'll release the lock before calling sfm, so make sure no new
  // recovery gets scheduled at that point
  auto_recovery_ = false;
  SstFileManagerImpl* sfm =
      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
  if (sfm) {
    // This may or may not cancel a pending recovery
    db_mutex_->Unlock();
    bool cancelled = sfm->CancelErrorRecovery(this);
    db_mutex_->Lock();
    if (cancelled) {
      recovery_in_prog_ = false;
    }
  }

  // If auto recovery is also running to resume from the retryable error,
  // we should wait for it to finish and end the auto recovery.
  EndAutoRecovery();
}

// This is the main function for looking at an error during a background
// operation and deciding the severity and the error recovery strategy. The
// high level algorithm is as follows -
// 1. Classify the severity of the error based on the ErrorSeverityMap,
//    DefaultErrorSeverityMap and DefaultReasonMap defined earlier
// 2. Call a Status code specific override function to adjust the severity
//    if needed. The reason for this is that our ability to recover may depend
//    on the exact options enabled in DBOptions
// 3. Determine if auto recovery is possible. A listener notification callback
//    is called, which can disable the auto recovery even if we decide it is
//    feasible
// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
//    the actual recovery. If no sst file manager is specified in DBOptions,
//    a default one is allocated during DB::Open(), so there will always be
//    one.
// This can also get called as part of a recovery operation. In that case, we
// also track the error separately in recovery_error_ so we can tell at the
// end whether recovery succeeded or not.
void ErrorHandler::HandleKnownErrors(const Status& bg_err,
                                     BackgroundErrorReason reason) {
  db_mutex_->AssertHeld();
  if (bg_err.ok()) {
    return;
  }
  bool paranoid = db_options_.paranoid_checks;
  Status::Severity sev = Status::Severity::kFatalError;
  Status new_bg_err;
  DBRecoverContext context;
  bool found = false;

  {
    auto entry = ErrorSeverityMap.find(
        std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
    if (entry != ErrorSeverityMap.end()) {
      sev = entry->second;
      found = true;
    }
  }

  if (!found) {
    auto entry = DefaultErrorSeverityMap.find(
        std::make_tuple(reason, bg_err.code(), paranoid));
    if (entry != DefaultErrorSeverityMap.end()) {
      sev = entry->second;
      found = true;
    }
  }

  if (!found) {
    auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
    if (entry != DefaultReasonMap.end()) {
      sev = entry->second;
    }
  }

  new_bg_err = Status(bg_err, sev);

  // Check if recovery is currently in progress. If it is, we will save this
  // error so we can check it at the end to see if recovery succeeded or not
  if (recovery_in_prog_ && recovery_error_.ok()) {
    recovery_error_ = status_to_io_status(Status(new_bg_err));
  }

  bool auto_recovery = auto_recovery_;
  if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
    auto_recovery = false;
  }

  // Allow some error specific overrides
  if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
      new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
    new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
  }

  if (!new_bg_err.ok()) {
    Status s = new_bg_err;
    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
                                          db_mutex_, &auto_recovery);
    if (!s.ok() && (s.severity() > bg_error_.severity())) {
      bg_error_ = s;
    } else {
      ROCKS_LOG_INFO(db_options_.info_log,
                     "ErrorHandler: Hit less severe background error\n");
      // This error is less severe than the previously encountered error.
      // Don't take any further action
      return;
    }
  }

  bool stop = bg_error_.severity() >= Status::Severity::kHardError;
  ROCKS_LOG_INFO(
      db_options_.info_log,
      "ErrorHandler: Set regular background error, auto_recovery=%d, stop=%d\n",
      int{auto_recovery}, int{stop});
  recover_context_ = context;
  if (auto_recovery) {
    recovery_in_prog_ = true;

    // Kick-off error specific recovery
    if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
        new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
      RecoverFromNoSpace();
    }
  }
  if (stop) {
    is_db_stopped_.store(true, std::memory_order_release);
  }
}
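
// Illustrative sketch (editor's note, not part of the original source): step 3
// of the algorithm above lets listeners veto auto recovery. Assuming the
// public EventListener API, a user-supplied listener could do so roughly like
// this (NoAutoRecoveryListener is a hypothetical name):
//
//   #include "rocksdb/listener.h"
//
//   class NoAutoRecoveryListener : public EventListener {
//    public:
//     void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
//                               Status /*bg_error*/,
//                               bool* auto_recovery) override {
//       // Force recovery to go through a manual DB::Resume() instead of the
//       // automatic path.
//       if (auto_recovery != nullptr) {
//         *auto_recovery = false;
//       }
//     }
//   };
//
// The listener would be installed through DBOptions::listeners before
// DB::Open(), and is notified here via EventHelpers::NotifyOnBackgroundError().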

// This is the main function for looking at IO related errors during
// background operations. The main logic is:
// A file scope IO error is treated as a retryable IO error in the write path.
// In RocksDB, if a file has a write IO error and it is at file scope, RocksDB
// never writes to the same file again. RocksDB will create a new file and
// rewrite the whole content. Thus, it is retryable.
// There are three main categories of error handling:
// 1) if the error is caused by data loss, the error is mapped to an
//    unrecoverable error. Application/user must take action to handle
//    this situation (the file scope case is excluded).
// 2) if the error is a retryable IO error (i.e., it is a file scope IO error,
//    or its retryable flag is set and it is not a data loss error), auto
//    resume (DBImpl::ResumeImpl) may be called, and the auto resume can be
//    controlled by the resume count and resume interval options. There are
//    three sub-cases:
//    a) if the error happens during compaction, it is mapped to a soft error.
//       The compaction thread will reschedule a new compaction. This doesn't
//       call auto resume.
//    b) if the error happens during flush and the WAL is empty, it is mapped
//       to a soft error. Note that this includes the case that the IO error
//       happens in SST or MANIFEST write during flush. Auto resume will be
//       called.
//    c) all other errors are mapped to hard errors. Auto resume will be
//       called.
// 3) for other cases, HandleKnownErrors(const Status& bg_err,
//    BackgroundErrorReason reason) will be called to handle other error cases
//    such as delegating to SstFileManager to handle no space errors.
void ErrorHandler::SetBGError(const Status& bg_status,
                              BackgroundErrorReason reason, bool wal_related) {
  db_mutex_->AssertHeld();
  Status tmp_status = bg_status;
  IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));

  if (bg_io_err.ok()) {
    return;
  }
  ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s, reason %d",
                 bg_io_err.ToString().c_str(), static_cast<int>(reason));

  RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
              {} /* int_histograms */);

  Status new_bg_io_err = bg_io_err;
  DBRecoverContext context;
  if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
      bg_io_err.GetDataLoss()) {
    // First, data loss (non file scope) is treated as an unrecoverable error.
    // So it can directly overwrite any existing bg_error_.
    bool auto_recovery = false;
    Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
    CheckAndSetRecoveryAndBGError(bg_err);
    ROCKS_LOG_INFO(
        db_options_.info_log,
        "ErrorHandler: Set background IO error as unrecoverable error\n");
    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                          &bg_err, db_mutex_, &auto_recovery);
    recover_context_ = context;
    return;
  }
  if (wal_related) {
    assert(reason == BackgroundErrorReason::kWriteCallback ||
           reason == BackgroundErrorReason::kMemTable ||
           reason == BackgroundErrorReason::kFlush);
  }
  if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
    // With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
    // Memtables and WAL then become inconsistent. A successful memtable flush
    // on one CF can cause CFs to be inconsistent upon restart. Before we fix
    // the bug in auto recovery from WAL write failures that can flush one CF
    // at a time, we set the error severity to fatal to disallow auto recovery.
    // TODO: remove parameter `wal_related` once we can automatically recover
    // from WAL write failures.
    bool auto_recovery = false;
    Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
    CheckAndSetRecoveryAndBGError(bg_err);
    ROCKS_LOG_WARN(db_options_.info_log,
                   "ErrorHandler: A potentially WAL error happened, set "
                   "background IO error as fatal error\n");
    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                          &bg_err, db_mutex_, &auto_recovery);
    recover_context_ = context;
    return;
  }
  if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
      (bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
       bg_io_err.GetRetryable())) {
    // Second, check if the error is a retryable IO error (a file scope IO
    // error is also treated as a retryable IO error in the RocksDB write
    // path). If it is a retryable error and its severity is higher than
    // bg_error_, overwrite bg_error_ with the new error. At the current
    // stage, a retryable IO error from compaction is treated as a soft
    // error. In other cases, the retryable IO error is treated as a hard
    // error. Note that all NoSpace errors should be handled by
    // SstFileManager::StartErrorRecovery(); therefore, no matter whether such
    // an error is retryable or file scope, this logic is bypassed for it.
    RecordStats({ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT},
                {} /* int_histograms */);
    ROCKS_LOG_INFO(db_options_.info_log,
                   "ErrorHandler: Set background retryable IO error\n");
    if (BackgroundErrorReason::kCompaction == reason) {
      // We map the retryable IO error during compaction to a soft error,
      // since compaction can reschedule by itself. We will not set the BG
      // error in this case.
      // TODO: a better way to set or clear the retryable IO error which
      // happens during compaction SST file write.
      RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */);
      ROCKS_LOG_INFO(
          db_options_.info_log,
          "ErrorHandler: Compaction will schedule by itself to resume\n");
      bool auto_recovery = false;
      EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                            &new_bg_io_err, db_mutex_,
                                            &auto_recovery);
      // Not used in this code path.
      new_bg_io_err.PermitUncheckedError();
      return;
    }
    Status::Severity severity;
    if (BackgroundErrorReason::kFlushNoWAL == reason ||
        BackgroundErrorReason::kManifestWriteNoWAL == reason) {
      // When the BG retryable IO error reason is flush without WAL,
      // we map it to a soft error. At the same time, all background work
      // should be stopped except the BG work from recovery. Therefore, we
      // set soft_error_no_bg_work_ to true. In addition, since the DB
      // continues to receive writes while the BG error is a soft error, to
      // avoid too many small memtables being generated during auto resume,
      // the flush reason is set to kErrorRecoveryRetryFlush.
      severity = Status::Severity::kSoftError;
      soft_error_no_bg_work_ = true;
      context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
    } else {
      severity = Status::Severity::kHardError;
    }
    Status bg_err(new_bg_io_err, severity);
    CheckAndSetRecoveryAndBGError(bg_err);
    recover_context_ = context;
    bool auto_recovery = db_options_.max_bgerror_resume_count > 0;
    EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
                                          &new_bg_io_err, db_mutex_,
                                          &auto_recovery);
    StartRecoverFromRetryableBGIOError(bg_io_err);
    return;
  }
  HandleKnownErrors(new_bg_io_err, reason);
}
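
// Illustrative sketch (editor's note, not part of the original source):
// category 2 above is typically reached when a FileSystem implementation
// returns an IOStatus with the retryable flag (or file scope) set. Assuming
// the public IOStatus setters, an error path might look roughly like this
// (the function name is hypothetical):
//
//   IOStatus FailedAppendExample() {
//     IOStatus io_s = IOStatus::IOError("transient write failure");
//     io_s.SetRetryable(true);   // routes the error to category 2 above
//     // io_s.SetDataLoss(true) would instead route it to category 1.
//     return io_s;
//   }
//
// SetBGError() then maps such an error to a soft error (compaction,
// flush/MANIFEST write without WAL) or a hard error (everything else) and may
// kick off RecoverFromRetryableBGIOError() on a recovery thread.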

void ErrorHandler::AddFilesToQuarantine(
    autovector<const autovector<uint64_t>*> files_to_quarantine) {
  db_mutex_->AssertHeld();
  std::ostringstream quarantine_files_oss;
  bool is_first_one = true;
  for (const auto* files : files_to_quarantine) {
    assert(files);
    for (uint64_t file_number : *files) {
      files_to_quarantine_.push_back(file_number);
      quarantine_files_oss << (is_first_one ? "" : ", ") << file_number;
      is_first_one = false;
    }
  }
  ROCKS_LOG_INFO(db_options_.info_log,
                 "ErrorHandler: added file numbers %s to quarantine.\n",
                 quarantine_files_oss.str().c_str());
}

void ErrorHandler::ClearFilesToQuarantine() {
  db_mutex_->AssertHeld();
  files_to_quarantine_.clear();
  ROCKS_LOG_INFO(db_options_.info_log,
                 "ErrorHandler: cleared files in quarantine.\n");
}

Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
                                          bool* auto_recovery) {
  if (bg_error.severity() >= Status::Severity::kFatalError) {
    return bg_error;
  }

  if (db_options_.sst_file_manager.get() == nullptr) {
    // We rely on SFM to poll for enough disk space and recover
    *auto_recovery = false;
    return bg_error;
  }

  if (db_options_.allow_2pc &&
      (bg_error.severity() <= Status::Severity::kSoftError)) {
    // Don't know how to recover, as the contents of the current WAL file may
    // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
    // we can just flush the memtable and discard the log
    *auto_recovery = false;
    return Status(bg_error, Status::Severity::kFatalError);
  }

  {
    uint64_t free_space;
    if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
                                      &free_space) == Status::NotSupported()) {
      *auto_recovery = false;
    }
  }

  return bg_error;
}

void ErrorHandler::RecoverFromNoSpace() {
  SstFileManagerImpl* sfm =
      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());

  // Inform SFM of the error, so it can kick-off the recovery
  if (sfm) {
    sfm->StartErrorRecovery(this, bg_error_);
  }
}
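
// Illustrative sketch (editor's note, not part of the original source): the
// comment on HandleKnownErrors() notes that NoSpace recovery is driven by
// SstFileManagerImpl and that DB::Open() installs a default one when none is
// configured. Assuming the public SstFileManager factory, an application can
// also supply its own, roughly like this:
//
//   #include "rocksdb/options.h"
//   #include "rocksdb/sst_file_manager.h"
//
//   Options options;
//   options.sst_file_manager.reset(NewSstFileManager(Env::Default()));
//
// When a NoSpace error puts the DB into an error state, RecoverFromNoSpace()
// above hands the error to the SFM, which is expected to watch for free space
// and drive recovery through this ErrorHandler once space becomes available.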

Status ErrorHandler::ClearBGError() {
  db_mutex_->AssertHeld();

  // Signal that recovery succeeded
  if (recovery_error_.ok()) {
    // If this assertion fails, it likely means the bg error was not set after
    // a file was quarantined during a MANIFEST write.
    assert(files_to_quarantine_.empty());
    Status old_bg_error = bg_error_;
    // old_bg_error is only for notifying listeners, so may not be checked
    old_bg_error.PermitUncheckedError();
    // Clear and check the recovery IO and BG error
    is_db_stopped_.store(false, std::memory_order_release);
    bg_error_ = Status::OK();
    recovery_error_ = IOStatus::OK();
    bg_error_.PermitUncheckedError();
    recovery_error_.PermitUncheckedError();
    recovery_in_prog_ = false;
    soft_error_no_bg_work_ = false;
    if (!db_->shutdown_initiated_) {
      // NotifyOnErrorRecoveryEnd() may release and re-acquire db_mutex_.
      // Prevent the DB from being closed while we notify listeners. DB close
      // will wait until allow_db_shutdown_ = true, see ReadyForShutdown().
      allow_db_shutdown_ = false;
      EventHelpers::NotifyOnErrorRecoveryEnd(
          db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
      allow_db_shutdown_ = true;
    }
  }
  return recovery_error_;
}

Status ErrorHandler::RecoverFromBGError(bool is_manual) {
  InstrumentedMutexLock l(db_mutex_);
  bool no_bg_work_original_flag = soft_error_no_bg_work_;
  if (is_manual) {
    // If it's a manual recovery and there's a background recovery in progress
    // return busy status
    if (recovery_in_prog_) {
      return Status::Busy("Recovery already in progress");
    }
    recovery_in_prog_ = true;

    // In manual resume, we allow the bg work to run. If it is an auto resume,
    // the bg work should follow this flag.
    soft_error_no_bg_work_ = false;

    // In manual resume, if the bg error is a soft error that also requires
    // no bg work, the error must be recovered by calling flush with flush
    // reason kErrorRecoveryRetryFlush. Otherwise, the flush reason is set to
    // kErrorRecovery.
    if (no_bg_work_original_flag) {
      recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
    } else {
      recover_context_.flush_reason = FlushReason::kErrorRecovery;
    }
  }

  if (bg_error_.severity() == Status::Severity::kSoftError &&
      recover_context_.flush_reason == FlushReason::kErrorRecovery) {
    // Simply clear the background error and return
    recovery_error_ = IOStatus::OK();
    return ClearBGError();
  }

  // Reset recovery_error_. We will use this to record any errors that happen
  // during the recovery process. While recovering, the only operations that
  // can generate background errors should be the flush operations
  recovery_error_ = IOStatus::OK();
  recovery_error_.PermitUncheckedError();
  Status s = db_->ResumeImpl(recover_context_);
  if (s.ok()) {
    soft_error_no_bg_work_ = false;
  } else {
    soft_error_no_bg_work_ = no_bg_work_original_flag;
  }

  // For manual recovery, shutdown, and fatal error cases, set
  // recovery_in_prog_ to false. For automatic background recovery, leave it
  // as is regardless of success or failure as it will be retried
  if (is_manual || s.IsShutdownInProgress() ||
      bg_error_.severity() >= Status::Severity::kFatalError) {
    recovery_in_prog_ = false;
  }
  return s;
}
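
// Illustrative sketch (editor's note, not part of the original source):
// RecoverFromBGError(true) is the manual-recovery path, reached from
// application code through DB::Resume(). Assuming `db` and `write_options`
// are the application's existing handles, the flow is roughly:
//
//   Status s = db->Put(write_options, "key", "value");
//   if (!s.ok()) {
//     // A background error may have stopped writes; once the underlying
//     // condition (e.g. disk full) is addressed, try a manual resume.
//     Status r = db->Resume();
//     if (r.IsBusy()) {
//       // A background recovery is already in progress (the Busy status
//       // returned by RecoverFromBGError() above); retry later.
//     }
//   }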

void ErrorHandler::StartRecoverFromRetryableBGIOError(
    const IOStatus& io_error) {
  db_mutex_->AssertHeld();
  if (bg_error_.ok() || io_error.ok()) {
    return;
  }
  if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
    // Auto resume from BG errors is not enabled, or a recovery is already in
    // progress
    return;
  }
  if (end_recovery_) {
    // Can temporarily release db mutex
    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
                                           Status::ShutdownInProgress(),
                                           db_mutex_);
    db_mutex_->AssertHeld();
    return;
  }
  RecordStats({ERROR_HANDLER_AUTORESUME_COUNT}, {} /* int_histograms */);
  ROCKS_LOG_INFO(
      db_options_.info_log,
      "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
  // Needs to be set in the same lock hold as setting BG error, otherwise
  // intervening writes could see a BG error without a recovery and bail out.
  recovery_in_prog_ = true;

  if (recovery_thread_) {
    // Ensure only one thread can execute the join().
    std::unique_ptr<port::Thread> old_recovery_thread(
        std::move(recovery_thread_));
    // In this case, the current thread should wait for the previous recovery
    // thread to finish and then create a new thread to recover from the bg
    // error.
    db_mutex_->Unlock();
    TEST_SYNC_POINT(
        "StartRecoverFromRetryableBGIOError:BeforeWaitingForOtherThread");
    old_recovery_thread->join();
    TEST_SYNC_POINT(
        "StartRecoverFromRetryableBGIOError:AfterWaitingForOtherThread");
    db_mutex_->Lock();
  }

  recovery_thread_.reset(
      new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
}

// Automatically recover from a retryable BG IO error. Must be called after
// the db mutex has been released.
void ErrorHandler::RecoverFromRetryableBGIOError() {
  assert(recovery_in_prog_);
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2");
  InstrumentedMutexLock l(db_mutex_);
  if (end_recovery_) {
    EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
                                           Status::ShutdownInProgress(),
                                           db_mutex_);
    recovery_in_prog_ = false;
    return;
  }
  DBRecoverContext context = recover_context_;
  context.flush_after_recovery = true;
  int resume_count = db_options_.max_bgerror_resume_count;
  uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
  uint64_t retry_count = 0;
  // Recover from the retryable error. This runs on the separate recovery
  // thread created in StartRecoverFromRetryableBGIOError().
  while (resume_count > 0) {
    if (end_recovery_) {
      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
                                             Status::ShutdownInProgress(),
                                             db_mutex_);
      recovery_in_prog_ = false;
      return;
    }
    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
    TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
    recovery_error_ = IOStatus::OK();
    retry_count++;
    Status s = db_->ResumeImpl(context);
    RecordStats({ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT},
                {} /* int_histograms */);
    if (s.IsShutdownInProgress() ||
        bg_error_.severity() >= Status::Severity::kFatalError) {
      // If a DB shutdown is in progress, or the error severity is higher
      // than hard error, stop auto resume and return.
      recovery_in_prog_ = false;
      RecordStats({} /* ticker_types */,
                  {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
      EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
                                             bg_error_, db_mutex_);
      return;
    }
    if (!recovery_error_.ok() &&
        recovery_error_.severity() <= Status::Severity::kHardError &&
        recovery_error_.GetRetryable()) {
      // If a new BG IO error happens during auto recovery, and it is
      // retryable with severity of hard error or lower, auto resume sleeps
      // for a period of time and then retries, if still allowed.
      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
      TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
      int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
      cv_.TimedWait(wait_until);
    } else {
      // There are three possibilities: 1) recovery_error_ is set during
      // resume and the error is not retryable, 2) recovery is successful,
      // 3) another error happens during resume and cannot be resumed here.
      if (recovery_error_.ok() && s.ok()) {
        // Recovered from the retryable IO error and there are no other BG
        // errors. Clear the bg_error and notify the user.
        TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
        RecordStats({ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT},
                    {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
        return;
      } else {
        // In this case: 1) recovery_error_ is more serious or not retryable,
        // or 2) another error happened. The auto recovery stops.
        recovery_in_prog_ = false;
        RecordStats({} /* ticker_types */,
                    {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
        EventHelpers::NotifyOnErrorRecoveryEnd(
            db_options_.listeners, bg_error_,
            !recovery_error_.ok() ? recovery_error_ : s, db_mutex_);
        return;
      }
    }
    resume_count--;
  }
  recovery_in_prog_ = false;
  EventHelpers::NotifyOnErrorRecoveryEnd(
      db_options_.listeners, bg_error_,
      Status::Aborted("Exceeded resume retry count"), db_mutex_);
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
  RecordStats({} /* ticker_types */,
              {{ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count}});
}
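
// Illustrative sketch (editor's note, not part of the original source): the
// retry loop above is bounded by two DBOptions. A conservative configuration
// might look roughly like this:
//
//   Options options;
//   // Give up auto resume after 5 attempts...
//   options.max_bgerror_resume_count = 5;
//   // ...waiting 2 seconds (the option is in microseconds) between attempts.
//   options.bgerror_resume_retry_interval = 2 * 1000 * 1000;
//
// Setting max_bgerror_resume_count to 0 (or a negative value) disables auto
// resume, in which case StartRecoverFromRetryableBGIOError() returns early
// and recovery requires a manual DB::Resume().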

void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
  if (recovery_in_prog_ && recovery_error_.ok()) {
    recovery_error_ = status_to_io_status(Status(bg_err));
  }
  if (bg_err.severity() > bg_error_.severity()) {
    bg_error_ = bg_err;
  }
  if (bg_error_.severity() >= Status::Severity::kHardError) {
    is_db_stopped_.store(true, std::memory_order_release);
  }
}

void ErrorHandler::EndAutoRecovery() {
  db_mutex_->AssertHeld();
  if (!end_recovery_) {
    end_recovery_ = true;
  }
  if (recovery_thread_) {
    // Ensure only one thread can execute the join().
    std::unique_ptr<port::Thread> old_recovery_thread(
        std::move(recovery_thread_));
    db_mutex_->Unlock();
    cv_.SignalAll();
    old_recovery_thread->join();
    db_mutex_->Lock();
  }
  TEST_SYNC_POINT("PostEndAutoRecovery");
}

void ErrorHandler::RecordStats(
    const std::vector<Tickers>& ticker_types,
    const std::vector<std::tuple<Histograms, uint64_t>>& int_histograms) {
  if (bg_error_stats_ == nullptr) {
    return;
  }
  for (const auto& ticker_type : ticker_types) {
    RecordTick(bg_error_stats_.get(), ticker_type);
  }

  for (const auto& hist : int_histograms) {
    RecordInHistogram(bg_error_stats_.get(), std::get<0>(hist),
                      std::get<1>(hist));
  }
}

}  // namespace ROCKSDB_NAMESPACE