| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725 |
- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
- // This source code is licensed under both the GPLv2 (found in the
- // COPYING file in the root directory) and Apache 2.0 License
- // (found in the LICENSE.Apache file in the root directory).
- //
- // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file. See the AUTHORS file for names of contributors.
- #include <fcntl.h>
- #include <sys/stat.h>
- #include <sys/types.h>
- #include <cinttypes>
- #include "db/db_impl/db_impl.h"
- #include "db/db_test_util.h"
- #include "db/log_format.h"
- #include "db/version_set.h"
- #include "file/filename.h"
- #include "port/stack_trace.h"
- #include "rocksdb/cache.h"
- #include "rocksdb/convenience.h"
- #include "rocksdb/db.h"
- #include "rocksdb/env.h"
- #include "rocksdb/options.h"
- #include "rocksdb/table.h"
- #include "rocksdb/utilities/transaction_db.h"
- #include "rocksdb/write_batch.h"
- #include "table/block_based/block_based_table_builder.h"
- #include "table/meta_blocks.h"
- #include "table/mock_table.h"
- #include "test_util/testharness.h"
- #include "test_util/testutil.h"
- #include "util/cast_util.h"
- #include "util/random.h"
- #include "util/string_util.h"
- namespace ROCKSDB_NAMESPACE {
// Length, in bytes, of every value string generated by CorruptionTest::Value().
static constexpr int kValueSize{1000};
- namespace {
- // A wrapper that allows injection of errors.
- class ErrorFS : public FileSystemWrapper {
- public:
- bool writable_file_error_;
- int num_writable_file_errors_;
- explicit ErrorFS(const std::shared_ptr<FileSystem>& _target)
- : FileSystemWrapper(_target),
- writable_file_error_(false),
- num_writable_file_errors_(0) {}
- const char* Name() const override { return "ErrorEnv"; }
- IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
- std::unique_ptr<FSWritableFile>* result,
- IODebugContext* dbg) override {
- result->reset();
- if (writable_file_error_) {
- ++num_writable_file_errors_;
- return IOStatus::IOError(fname, "fake error");
- }
- return target()->NewWritableFile(fname, opts, result, dbg);
- }
- };
- } // anonymous namespace
- class CorruptionTest : public testing::Test {
- public:
- std::shared_ptr<Env> env_guard_;
- std::shared_ptr<ErrorFS> fs_;
- std::unique_ptr<Env> env_;
- Env* base_env_;
- std::string dbname_;
- std::shared_ptr<Cache> tiny_cache_;
- Options options_;
- DB* db_;
- CorruptionTest() {
- // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
- // set it to 0), test SequenceNumberRecovery will fail, likely because of a
- // bug in recovery code. Keep it 4 for now to make the test passes.
- tiny_cache_ = NewLRUCache(100, 4);
- base_env_ = Env::Default();
- EXPECT_OK(
- test::CreateEnvFromSystem(ConfigOptions(), &base_env_, &env_guard_));
- EXPECT_NE(base_env_, nullptr);
- fs_.reset(new ErrorFS(base_env_->GetFileSystem()));
- env_ = NewCompositeEnv(fs_);
- options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
- options_.env = env_.get();
- dbname_ = test::PerThreadDBPath(env_.get(), "corruption_test");
- Status s = DestroyDB(dbname_, options_);
- EXPECT_OK(s);
- db_ = nullptr;
- options_.create_if_missing = true;
- BlockBasedTableOptions table_options;
- table_options.block_size_deviation = 0; // make unit test pass for now
- options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
- Reopen();
- options_.create_if_missing = false;
- }
- ~CorruptionTest() override {
- SyncPoint::GetInstance()->DisableProcessing();
- SyncPoint::GetInstance()->LoadDependency({});
- SyncPoint::GetInstance()->ClearAllCallBacks();
- delete db_;
- db_ = nullptr;
- if (getenv("KEEP_DB")) {
- fprintf(stdout, "db is still at %s\n", dbname_.c_str());
- } else {
- Options opts;
- opts.env = base_env_;
- EXPECT_OK(DestroyDB(dbname_, opts));
- }
- }
- void CloseDb() {
- delete db_;
- db_ = nullptr;
- }
- Status TryReopen(Options* options = nullptr) {
- delete db_;
- db_ = nullptr;
- Options opt = (options ? *options : options_);
- if (opt.env == Options().env) {
- // If env is not overridden, replace it with ErrorEnv.
- // Otherwise, the test already uses a non-default Env.
- opt.env = env_.get();
- }
- opt.arena_block_size = 4096;
- BlockBasedTableOptions table_options;
- table_options.block_cache = tiny_cache_;
- table_options.block_size_deviation = 0;
- opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
- return DB::Open(opt, dbname_, &db_);
- }
- void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
- void RepairDB() {
- delete db_;
- db_ = nullptr;
- ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
- }
- void Build(int n, int start, int flush_every) {
- std::string key_space, value_space;
- WriteBatch batch;
- for (int i = 0; i < n; i++) {
- if (flush_every != 0 && i != 0 && i % flush_every == 0) {
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- }
- // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
- Slice key = Key(i + start, &key_space);
- batch.Clear();
- ASSERT_OK(batch.Put(key, Value(i + start, &value_space)));
- ASSERT_OK(db_->Write(WriteOptions(), &batch));
- }
- }
- void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); }
- void Check(int min_expected, int max_expected) {
- Check(min_expected, max_expected, ReadOptions(false, true));
- }
- void Check(int min_expected, int max_expected, ReadOptions read_options) {
- uint64_t next_expected = 0;
- uint64_t missed = 0;
- int bad_keys = 0;
- int bad_values = 0;
- int correct = 0;
- std::string value_space;
- // Do not verify checksums. If we verify checksums then the
- // db itself will raise errors because data is corrupted.
- // Instead, we want the reads to be successful and this test
- // will detect whether the appropriate corruptions have
- // occurred.
- Iterator* iter = db_->NewIterator(read_options);
- for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
- ASSERT_OK(iter->status());
- uint64_t key;
- Slice in(iter->key());
- if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
- key < next_expected) {
- bad_keys++;
- continue;
- }
- missed += (key - next_expected);
- next_expected = key + 1;
- if (iter->value() != Value(static_cast<int>(key), &value_space)) {
- bad_values++;
- } else {
- correct++;
- }
- }
- iter->status().PermitUncheckedError();
- delete iter;
- fprintf(
- stderr,
- "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
- min_expected, max_expected, correct, bad_keys, bad_values,
- static_cast<unsigned long long>(missed));
- ASSERT_LE(min_expected, correct);
- ASSERT_GE(max_expected, correct);
- }
- void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
- // Pick file to corrupt
- std::vector<std::string> filenames;
- ASSERT_OK(env_->GetChildren(dbname_, &filenames));
- uint64_t number;
- FileType type;
- std::string fname;
- int picked_number = -1;
- for (size_t i = 0; i < filenames.size(); i++) {
- if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
- static_cast<int>(number) > picked_number) { // Pick latest file
- fname = dbname_ + "/" + filenames[i];
- picked_number = static_cast<int>(number);
- }
- }
- ASSERT_TRUE(!fname.empty()) << filetype;
- ASSERT_OK(test::CorruptFile(env_.get(), fname, offset, bytes_to_corrupt,
- /*verify_checksum*/ filetype == kTableFile));
- }
- // corrupts exactly one file at level `level`. if no file found at level,
- // asserts
- void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
- std::vector<LiveFileMetaData> metadata;
- db_->GetLiveFilesMetaData(&metadata);
- for (const auto& m : metadata) {
- if (m.level == level) {
- ASSERT_OK(test::CorruptFile(env_.get(), dbname_ + "/" + m.name, offset,
- bytes_to_corrupt));
- return;
- }
- }
- FAIL() << "no file found at level";
- }
- int Property(const std::string& name) {
- std::string property;
- int result;
- if (db_->GetProperty(name, &property) &&
- sscanf(property.c_str(), "%d", &result) == 1) {
- return result;
- } else {
- return -1;
- }
- }
- // Return the ith key
- Slice Key(int i, std::string* storage) {
- char buf[100];
- snprintf(buf, sizeof(buf), "%016d", i);
- storage->assign(buf, strlen(buf));
- return Slice(*storage);
- }
- // Return the value to associate with the specified key
- Slice Value(int k, std::string* storage) {
- if (k == 0) {
- // Ugh. Random seed of 0 used to produce no entropy. This code
- // preserves the implementation that was in place when all of the
- // magic values in this file were picked.
- *storage = std::string(kValueSize, ' ');
- } else {
- Random r(k);
- *storage = r.RandomString(kValueSize);
- }
- return Slice(*storage);
- }
- void GetSortedWalFiles(std::vector<uint64_t>& file_nums) {
- std::vector<std::string> tmp_files;
- ASSERT_OK(env_->GetChildren(dbname_, &tmp_files));
- FileType type = kWalFile;
- for (const auto& file : tmp_files) {
- uint64_t number = 0;
- if (ParseFileName(file, &number, &type) && type == kWalFile) {
- file_nums.push_back(number);
- }
- }
- std::sort(file_nums.begin(), file_nums.end());
- }
- void CorruptFileWithTruncation(FileType file, uint64_t number,
- uint64_t bytes_to_truncate = 0) {
- std::string path;
- switch (file) {
- case FileType::kWalFile:
- path = LogFileName(dbname_, number);
- break;
- // TODO: Add other file types as this method is being used for those file
- // types.
- default:
- return;
- }
- uint64_t old_size = 0;
- ASSERT_OK(env_->GetFileSize(path, &old_size));
- assert(old_size > bytes_to_truncate);
- uint64_t new_size = old_size - bytes_to_truncate;
- // If bytes_to_truncate == 0, it will do full truncation.
- if (bytes_to_truncate == 0) {
- new_size = 0;
- }
- ASSERT_OK(test::TruncateFile(env_.get(), path, new_size));
- }
- };
- TEST_F(CorruptionTest, Recovery) {
- Build(100);
- Check(100, 100);
- #ifdef OS_WIN
- // On Wndows OS Disk cache does not behave properly
- // We do not call FlushBuffers on every Flush. If we do not close
- // the log file prior to the corruption we end up with the first
- // block not corrupted but only the second. However, under the debugger
- // things work just fine but never pass when running normally
- // For that reason people may want to run with unbuffered I/O. That option
- // is not available for WAL though.
- CloseDb();
- #endif
- Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record
- Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block
- ASSERT_TRUE(!TryReopen().ok());
- options_.paranoid_checks = false;
- Reopen(&options_);
- // The 64 records in the first two log blocks are completely lost.
- Check(36, 36);
- }
- TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) {
- // Repro for bug where WALs following the point-in-time recovery were not
- // retained leading to the next recovery failing.
- CloseDb();
- options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
- const std::string test_cf_name = "test_cf";
- std::vector<ColumnFamilyDescriptor> cf_descs;
- cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
- cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
- uint64_t log_num;
- {
- options_.create_missing_column_families = true;
- std::vector<ColumnFamilyHandle*> cfhs;
- ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
- assert(db_ != nullptr); // suppress false clang-analyze report
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v"));
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
- std::vector<uint64_t> file_nums;
- GetSortedWalFiles(file_nums);
- log_num = file_nums.back();
- for (auto* cfh : cfhs) {
- delete cfh;
- }
- CloseDb();
- }
- CorruptFileWithTruncation(FileType::kWalFile, log_num,
- /*bytes_to_truncate=*/1);
- {
- // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation.
- options_.avoid_flush_during_recovery = true;
- std::vector<ColumnFamilyHandle*> cfhs;
- ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
- assert(db_ != nullptr); // suppress false clang-analyze report
- // Flush one but not both CFs and write some data so there's a seqno gap
- // between the PITR corruption and the next DB session's first WAL.
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2"));
- ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1]));
- for (auto* cfh : cfhs) {
- delete cfh;
- }
- CloseDb();
- }
- // With the bug, this DB open would remove the WALs following the PITR
- // corruption. Then, the next recovery would fail.
- for (int i = 0; i < 2; ++i) {
- std::vector<ColumnFamilyHandle*> cfhs;
- ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
- assert(db_ != nullptr); // suppress false clang-analyze report
- for (auto* cfh : cfhs) {
- delete cfh;
- }
- CloseDb();
- }
- }
- TEST_F(CorruptionTest, RecoverWriteError) {
- fs_->writable_file_error_ = true;
- Status s = TryReopen();
- ASSERT_TRUE(!s.ok());
- }
- TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
- // Do enough writing to force minor compaction
- fs_->writable_file_error_ = true;
- const int num =
- static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
- std::string value_storage;
- Status s;
- bool failed = false;
- for (int i = 0; i < num; i++) {
- WriteBatch batch;
- ASSERT_OK(batch.Put("a", Value(100, &value_storage)));
- s = db_->Write(WriteOptions(), &batch);
- if (!s.ok()) {
- failed = true;
- }
- ASSERT_TRUE(!failed || !s.ok());
- }
- ASSERT_TRUE(!s.ok());
- ASSERT_GE(fs_->num_writable_file_errors_, 1);
- fs_->writable_file_error_ = false;
- Reopen();
- }
- TEST_F(CorruptionTest, TableFile) {
- Build(100);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
- ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
- Corrupt(kTableFile, 100, 1);
- Check(99, 99);
- ASSERT_NOK(dbi->VerifyChecksum());
- }
- TEST_F(CorruptionTest, VerifyChecksumReadahead) {
- Options options;
- options.level_compaction_dynamic_level_bytes = false;
- SpecialEnv senv(base_env_);
- options.env = &senv;
- // Disable block cache as we are going to check checksum for
- // the same file twice and measure number of reads.
- BlockBasedTableOptions table_options_no_bc;
- table_options_no_bc.no_block_cache = true;
- options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
- Reopen(&options);
- Build(10000);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
- ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
- senv.count_random_reads_ = true;
- senv.random_read_counter_.Reset();
- ASSERT_OK(dbi->VerifyChecksum());
- // Make sure the counter is enabled.
- ASSERT_GT(senv.random_read_counter_.Read(), 0);
- // The SST file is about 10MB. Default readahead size is 256KB.
- // Give a conservative 20 reads for metadata blocks, The number
- // of random reads should be within 10 MB / 256KB + 20 = 60.
- ASSERT_LT(senv.random_read_counter_.Read(), 60);
- senv.random_read_bytes_counter_ = 0;
- ReadOptions ro;
- ro.readahead_size = size_t{32 * 1024};
- ASSERT_OK(dbi->VerifyChecksum(ro));
- // The SST file is about 10MB. We set readahead size to 32KB.
- // Give 0 to 20 reads for metadata blocks, and allow real read
- // to range from 24KB to 48KB. The lower bound would be:
- // 10MB / 48KB + 0 = 213
- // The higher bound is
- // 10MB / 24KB + 20 = 447.
- ASSERT_GE(senv.random_read_counter_.Read(), 213);
- ASSERT_LE(senv.random_read_counter_.Read(), 447);
- // Test readahead shouldn't break mmap mode (where it should be
- // disabled).
- options.allow_mmap_reads = true;
- Reopen(&options);
- dbi = static_cast<DBImpl*>(db_);
- ASSERT_OK(dbi->VerifyChecksum(ro));
- CloseDb();
- }
- TEST_F(CorruptionTest, TableFileIndexData) {
- Options options;
- options.level_compaction_dynamic_level_bytes = false;
- // very big, we'll trigger flushes manually
- options.write_buffer_size = 100 * 1024 * 1024;
- Reopen(&options);
- // build 2 tables, flush at 5000
- Build(10000, 5000);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- // corrupt an index block of an entire file
- Corrupt(kTableFile, -2000, 500);
- options.paranoid_checks = false;
- Reopen(&options);
- dbi = static_cast_with_check<DBImpl>(db_);
- // one full file may be readable, since only one was corrupted
- // the other file should be fully non-readable, since index was corrupted
- Check(0, 5000, ReadOptions(true, true));
- ASSERT_NOK(dbi->VerifyChecksum());
- // In paranoid mode, the db cannot be opened due to the corrupted file.
- ASSERT_TRUE(TryReopen().IsCorruption());
- }
- TEST_F(CorruptionTest, TableFileFooterMagic) {
- Build(100);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- Check(100, 100);
- // Corrupt the whole footer
- Corrupt(kTableFile, -100, 100);
- Status s = TryReopen();
- ASSERT_TRUE(s.IsCorruption());
- // Contains useful message, and magic number should be the first thing
- // reported as corrupt.
- ASSERT_TRUE(s.ToString().find("magic number") != std::string::npos);
- // with file name
- ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos);
- }
- TEST_F(CorruptionTest, TableFileFooterNotMagic) {
- Build(100);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- Check(100, 100);
- // Corrupt footer except magic number
- Corrupt(kTableFile, -100, 92);
- Status s = TryReopen();
- ASSERT_TRUE(s.IsCorruption());
- // The next thing checked after magic number is format_version
- ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos);
- // with file name
- ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos);
- }
- TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
- // Validate that when paranoid flag is true, DB::Open() fails if one of the
- // file corrupted. Validate that when paranoid flag is false, DB::Open()
- // succeed if one of the file corrupted, and the healthy file is readable.
- CloseDb();
- const std::string test_cf_name = "test_cf";
- std::vector<ColumnFamilyDescriptor> cf_descs;
- cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
- cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
- {
- options_.create_missing_column_families = true;
- std::vector<ColumnFamilyHandle*> cfhs;
- ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
- assert(db_ != nullptr); // suppress false clang-analyze report
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k1", "v1"));
- ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
- for (auto* cfh : cfhs) {
- delete cfh;
- }
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- // ********************************************
- // Corrupt the file by making the file bigger
- std::vector<LiveFileMetaData> metadata;
- db_->GetLiveFilesMetaData(&metadata);
- std::string filename = dbname_ + metadata[0].name;
- const auto& fs = options_.env->GetFileSystem();
- {
- std::unique_ptr<FSWritableFile> f;
- ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr));
- ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr));
- ASSERT_OK(f->Close(IOOptions(), nullptr));
- }
- CloseDb();
- }
- // DB failed to open due to one of the file is corrupted, as paranoid flag is
- // true
- options_.paranoid_checks = true;
- std::vector<ColumnFamilyHandle*> cfhs;
- auto s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_);
- ASSERT_TRUE(s.IsCorruption());
- ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
- // DB opened successfully, as paranoid flag is false, validate the one that is
- // healthy is still accessible
- options_.paranoid_checks = false;
- ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
- assert(db_ != nullptr); // suppress false clang-analyze report
- std::string v;
- ASSERT_OK(db_->Get(ReadOptions(), cfhs[1], "k1", &v));
- ASSERT_EQ(v, "v1");
- // Validate the default column family is corrupted
- Check(0, 0);
- s = db_->Get(ReadOptions(), cfhs[0], "k1", &v);
- ASSERT_TRUE(s.IsCorruption());
- delete cfhs[1];
- delete cfhs[0];
- }
- TEST_F(CorruptionTest, TableFileWrongSize) {
- Build(100);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- Check(100, 100);
- // ********************************************
- // Make the file bigger by appending to it
- std::vector<LiveFileMetaData> metadata;
- db_->GetLiveFilesMetaData(&metadata);
- ASSERT_EQ(1U, metadata.size());
- std::string filename = dbname_ + metadata[0].name;
- const auto& fs = options_.env->GetFileSystem();
- {
- std::unique_ptr<FSWritableFile> f;
- ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr));
- ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr));
- ASSERT_OK(f->Close(IOOptions(), nullptr));
- }
- // DB actually accepts this without paranoid checks, relying on size
- // recorded in manifest to locate the SST footer.
- options_.paranoid_checks = false;
- Reopen();
- // As footer could not be extraced, file is completely unreadable
- Check(0, 0);
- std::string v;
- auto s = db_->Get(ReadOptions(), "k1", &v);
- ASSERT_TRUE(s.IsCorruption());
- // But reports the issue with paranoid checks
- options_.paranoid_checks = true;
- s = TryReopen();
- ASSERT_TRUE(s.IsCorruption());
- ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
- // ********************************************
- // Make the file smaller with truncation.
- // First leaving a partial footer, and then completely removing footer.
- for (size_t bytes_lost : {8, 100}) {
- ASSERT_OK(test::TruncateFile(env_.get(), filename,
- metadata[0].size - bytes_lost));
- // Reported well with paranoid checks
- options_.paranoid_checks = true;
- s = TryReopen();
- ASSERT_TRUE(s.IsCorruption());
- ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
- // Without paranoid checks, not reported until read
- options_.paranoid_checks = false;
- Reopen();
- Check(0, 0); // Missing data
- }
- }
- TEST_F(CorruptionTest, MissingDescriptor) {
- Build(1000);
- RepairDB();
- Reopen();
- Check(1000, 1000);
- }
- TEST_F(CorruptionTest, SequenceNumberRecovery) {
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
- RepairDB();
- Reopen();
- std::string v;
- ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
- ASSERT_EQ("v5", v);
- // Write something. If sequence number was not recovered properly,
- // it will be hidden by an earlier write.
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
- ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
- ASSERT_EQ("v6", v);
- Reopen();
- ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
- ASSERT_EQ("v6", v);
- }
- TEST_F(CorruptionTest, CorruptedDescriptor) {
- ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- CompactRangeOptions cro;
- cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
- ASSERT_OK(
- dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
- Corrupt(kDescriptorFile, 0, 1000);
- Status s = TryReopen();
- ASSERT_TRUE(!s.ok());
- RepairDB();
- Reopen();
- std::string v;
- ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
- ASSERT_EQ("hello", v);
- }
- TEST_F(CorruptionTest, CompactionInputError) {
- Options options;
- options.level_compaction_dynamic_level_bytes = false;
- options.env = env_.get();
- Reopen(&options);
- Build(10);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
- ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
- ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
- Corrupt(kTableFile, 100, 1);
- Check(9, 9);
- ASSERT_NOK(dbi->VerifyChecksum());
- // Force compactions by writing lots of values
- Build(10000);
- Check(10000, 10000);
- ASSERT_NOK(dbi->VerifyChecksum());
- }
- TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
- Options options;
- options.level_compaction_dynamic_level_bytes = false;
- options.env = env_.get();
- options.paranoid_checks = true;
- options.write_buffer_size = 131072;
- options.max_write_buffer_number = 2;
- Reopen(&options);
- DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
- // Fill levels >= 1
- for (int level = 1; level < dbi->NumberLevels(); level++) {
- ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
- ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
- ASSERT_OK(dbi->TEST_FlushMemTable());
- for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
- ++comp_level) {
- ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
- }
- }
- Reopen(&options);
- dbi = static_cast_with_check<DBImpl>(db_);
- Build(10);
- ASSERT_OK(dbi->TEST_FlushMemTable());
- ASSERT_OK(dbi->TEST_WaitForCompact());
- ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
- CorruptTableFileAtLevel(0, 100, 1);
- Check(9, 9);
- ASSERT_NOK(dbi->VerifyChecksum());
- // Write must eventually fail because of corrupted table
- Status s;
- std::string tmp1, tmp2;
- bool failed = false;
- for (int i = 0; i < 10000; i++) {
- s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
- if (!s.ok()) {
- failed = true;
- }
- // if one write failed, every subsequent write must fail, too
- ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
- }
- ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
- }
- // After corrupting a flushed table file, a key written afterwards must still
- // be readable, both before and after it is flushed.
- TEST_F(CorruptionTest, UnrelatedKeys) {
-   Build(10);
-   DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-   ASSERT_OK(impl->TEST_FlushMemTable());
-   Corrupt(kTableFile, 100, 1);
-   ASSERT_NOK(impl->VerifyChecksum());
-   std::string key_buf;
-   std::string value_buf;
-   ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &key_buf),
-                      Value(1000, &value_buf)));
-   std::string fetched;
-   // First pass reads the key as written; second pass flushes it first.
-   for (const bool flush_first : {false, true}) {
-     if (flush_first) {
-       ASSERT_OK(impl->TEST_FlushMemTable());
-     }
-     ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &key_buf), &fetched));
-     ASSERT_EQ(Value(1000, &value_buf).ToString(), fetched);
-   }
- }
- // Writes a single range tombstone, locates the range-deletion meta block of
- // the resulting table file, corrupts one byte at that block's offset and
- // expects reopening the DB to report corruption.
- TEST_F(CorruptionTest, RangeDeletionCorrupted) {
-   ASSERT_OK(
-       db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
-   ASSERT_OK(db_->Flush(FlushOptions()));
-   std::vector<LiveFileMetaData> live_files;
-   db_->GetLiveFilesMetaData(&live_files);
-   ASSERT_EQ(static_cast<size_t>(1), live_files.size());
-   const std::string sst_path = dbname_ + live_files[0].name;
-   FileOptions file_opts;
-   const auto& fs = options_.env->GetFileSystem();
-   std::unique_ptr<RandomAccessFileReader> reader;
-   ASSERT_OK(RandomAccessFileReader::Create(fs, sst_path, file_opts, &reader,
-                                            nullptr));
-   uint64_t sst_size;
-   ASSERT_OK(
-       fs->GetFileSize(sst_path, file_opts.io_options, &sst_size, nullptr));
-   // Look up where the range-deletion block lives inside the file.
-   BlockHandle tombstone_block;
-   const ReadOptions read_options;
-   ASSERT_OK(FindMetaBlockInFile(reader.get(), sst_size,
-                                 kBlockBasedTableMagicNumber,
-                                 ImmutableOptions(options_), read_options,
-                                 kRangeDelBlockName, &tombstone_block));
-   // The untouched file must still open cleanly.
-   ASSERT_OK(TryReopen());
-   const int corrupt_offset = static_cast<int>(tombstone_block.offset());
-   ASSERT_OK(test::CorruptFile(env_.get(), sst_path, corrupt_offset, 1));
-   ASSERT_TRUE(TryReopen().IsCorruption());
- }
- // With paranoid_checks enabled, reopening the DB must report corruption when
- // a live table file has been tampered with on the file system. Two scenarios
- // are exercised: iteration 0 overwrites the file with different content,
- // iteration 1 deletes it.
- TEST_F(CorruptionTest, FileSystemStateCorrupted) {
-   for (int iter = 0; iter < 2; ++iter) {
-     Options options;
-     options.level_compaction_dynamic_level_bytes = false;
-     options.env = env_.get();
-     options.paranoid_checks = true;
-     options.create_if_missing = true;
-     Reopen(&options);
-     Build(10);
-     ASSERT_OK(db_->Flush(FlushOptions()));
-     DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-     std::vector<LiveFileMetaData> metadata;
-     dbi->GetLiveFilesMetaData(&metadata);
-     // Compare against a size_t so both operands have the same signedness,
-     // matching the style used by RangeDeletionCorrupted.
-     ASSERT_GT(metadata.size(), static_cast<size_t>(0));
-     const std::string filename = dbname_ + metadata[0].name;
-     // Close the DB before touching its files.
-     delete db_;
-     db_ = nullptr;
-     if (iter == 0) {  // corrupt file size
-       std::unique_ptr<WritableFile> file;
-       ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions()));
-       ASSERT_OK(file->Append(Slice("corrupted sst")));
-       file.reset();
-     } else {  // delete the file
-       ASSERT_OK(env_->DeleteFile(filename));
-     }
-     // Either kind of damage must surface as a Corruption status on reopen.
-     const Status reopen_status = TryReopen(&options);
-     ASSERT_TRUE(reopen_status.IsCorruption());
-     ASSERT_OK(DestroyDB(dbname_, options_));
-   }
- }
- // Corruption modes iterated by the paranoid-file-check tests below;
- // kCorruptNone is the case those tests expect to succeed.
- static const auto corruption_modes = {
-     mock::MockTableFactory::kCorruptNone,
-     mock::MockTableFactory::kCorruptKey,
-     mock::MockTableFactory::kCorruptValue,
-     mock::MockTableFactory::kCorruptReorderKey,
- };
- // For each corruption mode, opens a fresh DB backed by a MockTableFactory in
- // that mode with paranoid_file_checks on, and verifies that Flush() succeeds
- // only when no corruption is injected.
- TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = true;
-   options.create_if_missing = true;
-   for (const auto& mode : corruption_modes) {
-     // Start every iteration from an empty database.
-     delete db_;
-     db_ = nullptr;
-     const Status destroy_status = DestroyDB(dbname_, options);
-     ASSERT_OK(destroy_status);
-     auto mock_factory = std::make_shared<mock::MockTableFactory>();
-     options.table_factory = mock_factory;
-     mock_factory->SetCorruptionMode(mode);
-     ASSERT_OK(DB::Open(options, dbname_, &db_));
-     assert(db_ != nullptr);  // suppress false clang-analyze report
-     Build(10);
-     const Status flush_status = db_->Flush(FlushOptions());
-     if (mode != mock::MockTableFactory::kCorruptNone) {
-       ASSERT_NOK(flush_status);
-     } else {
-       ASSERT_OK(flush_status);
-     }
-   }
- }
- // For each corruption mode, builds and flushes data with an uncorrupted
- // MockTableFactory, then switches the factory to the mode under test and
- // verifies that a forced full compaction succeeds only for kCorruptNone.
- TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = true;
-   options.create_if_missing = true;
-   for (const auto& mode : corruption_modes) {
-     // Start every iteration from an empty database.
-     delete db_;
-     db_ = nullptr;
-     const Status destroy_status = DestroyDB(dbname_, options);
-     ASSERT_OK(destroy_status);
-     auto mock_factory = std::make_shared<mock::MockTableFactory>();
-     options.table_factory = mock_factory;
-     ASSERT_OK(DB::Open(options, dbname_, &db_));
-     assert(db_ != nullptr);  // suppress false clang-analyze report
-     Build(100, 2);
-     DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-     ASSERT_OK(impl->TEST_FlushMemTable());
-     // The corruption mode is set only after the flush, right before the
-     // compaction runs.
-     mock_factory->SetCorruptionMode(mode);
-     CompactRangeOptions compact_opts;
-     compact_opts.bottommost_level_compaction =
-         BottommostLevelCompaction::kForce;
-     const Status compact_status = impl->CompactRange(
-         compact_opts, impl->DefaultColumnFamily(), nullptr, nullptr);
-     if (mode != mock::MockTableFactory::kCorruptNone) {
-       ASSERT_NOK(compact_status);
-     } else {
-       ASSERT_OK(compact_status);
-     }
-   }
- }
- // With paranoid_file_checks on, issues range deletions (one of them before a
- // snapshot is taken) ahead of the point writes, then checks that either a
- // flush or a flush followed by a forced compaction succeeds.
- TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = true;
-   options.create_if_missing = true;
-   // Deletes the key range [Key(first), Key(last)) in the default column
-   // family of the currently open DB.
-   const auto delete_range = [&](int first, int last) {
-     std::string first_buf;
-     std::string last_buf;
-     return db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
-                             Key(first, &first_buf), Key(last, &last_buf));
-   };
-   for (const bool do_flush : {true, false}) {
-     delete db_;
-     db_ = nullptr;
-     ASSERT_OK(DestroyDB(dbname_, options));
-     ASSERT_OK(DB::Open(options, dbname_, &db_));
-     assert(db_ != nullptr);  // suppress false clang-analyze report
-     ASSERT_OK(delete_range(3, 7));
-     auto snapshot = db_->GetSnapshot();
-     ASSERT_NE(snapshot, nullptr);
-     ASSERT_OK(delete_range(8, 9));
-     ASSERT_OK(delete_range(2, 5));
-     Build(10);
-     if (!do_flush) {
-       DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-       ASSERT_OK(impl->TEST_FlushMemTable());
-       CompactRangeOptions compact_opts;
-       compact_opts.bottommost_level_compaction =
-           BottommostLevelCompaction::kForce;
-       ASSERT_OK(impl->CompactRange(compact_opts, impl->DefaultColumnFamily(),
-                                    nullptr, nullptr));
-     } else {
-       ASSERT_OK(db_->Flush(FlushOptions()));
-     }
-     db_->ReleaseSnapshot(snapshot);
-   }
- }
- // With paranoid_file_checks on, interleaves point writes with range
- // deletions (one of them before a snapshot is taken), then checks that
- // either a flush or a flush followed by a forced compaction succeeds.
- TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = true;
-   options.create_if_missing = true;
-   // Deletes the key range [Key(first), Key(last)) in the default column
-   // family of the currently open DB.
-   const auto delete_range = [&](int first, int last) {
-     std::string first_buf;
-     std::string last_buf;
-     return db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
-                             Key(first, &first_buf), Key(last, &last_buf));
-   };
-   for (const bool do_flush : {true, false}) {
-     delete db_;
-     db_ = nullptr;
-     ASSERT_OK(DestroyDB(dbname_, options));
-     ASSERT_OK(DB::Open(options, dbname_, &db_));
-     assert(db_ != nullptr);  // suppress false clang-analyze report
-     Build(10, 0, 0);
-     ASSERT_OK(delete_range(5, 15));
-     auto snapshot = db_->GetSnapshot();
-     ASSERT_NE(snapshot, nullptr);
-     ASSERT_OK(delete_range(8, 9));
-     ASSERT_OK(delete_range(12, 17));
-     ASSERT_OK(delete_range(2, 4));
-     Build(10, 10, 0);
-     if (!do_flush) {
-       DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-       ASSERT_OK(impl->TEST_FlushMemTable());
-       CompactRangeOptions compact_opts;
-       compact_opts.bottommost_level_compaction =
-           BottommostLevelCompaction::kForce;
-       ASSERT_OK(impl->CompactRange(compact_opts, impl->DefaultColumnFamily(),
-                                    nullptr, nullptr));
-     } else {
-       ASSERT_OK(db_->Flush(FlushOptions()));
-     }
-     db_->ReleaseSnapshot(snapshot);
-   }
- }
- // With paranoid_file_checks on, performs the point writes first and the
- // range deletions afterwards (one of them before a snapshot is taken), then
- // checks that either a flush or a flush followed by a forced compaction
- // succeeds.
- TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = true;
-   options.create_if_missing = true;
-   // Deletes the key range [Key(first), Key(last)) in the default column
-   // family of the currently open DB.
-   const auto delete_range = [&](int first, int last) {
-     std::string first_buf;
-     std::string last_buf;
-     return db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
-                             Key(first, &first_buf), Key(last, &last_buf));
-   };
-   for (const bool do_flush : {true, false}) {
-     delete db_;
-     db_ = nullptr;
-     ASSERT_OK(DestroyDB(dbname_, options));
-     ASSERT_OK(DB::Open(options, dbname_, &db_));
-     assert(db_ != nullptr);  // suppress false clang-analyze report
-     Build(10);
-     ASSERT_OK(delete_range(3, 7));
-     auto snapshot = db_->GetSnapshot();
-     ASSERT_NE(snapshot, nullptr);
-     ASSERT_OK(delete_range(6, 8));
-     ASSERT_OK(delete_range(2, 5));
-     if (!do_flush) {
-       DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-       ASSERT_OK(impl->TEST_FlushMemTable());
-       CompactRangeOptions compact_opts;
-       compact_opts.bottommost_level_compaction =
-           BottommostLevelCompaction::kForce;
-       ASSERT_OK(impl->CompactRange(compact_opts, impl->DefaultColumnFamily(),
-                                    nullptr, nullptr));
-     } else {
-       ASSERT_OK(db_->Flush(FlushOptions()));
-     }
-     db_->ReleaseSnapshot(snapshot);
-   }
- }
- // Opens a fresh DB whose MockTableFactory is in kCorruptKey mode (with
- // allow_data_in_errors set) and verifies that a forced full compaction fails
- // with a Corruption status.
- TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.create_if_missing = true;
-   options.allow_data_in_errors = true;
-   const auto corruption_mode = mock::MockTableFactory::kCorruptKey;
-   delete db_;
-   db_ = nullptr;
-   ASSERT_OK(DestroyDB(dbname_, options));
-   // Here the corruption mode is installed before the DB is opened.
-   auto mock_factory = std::make_shared<mock::MockTableFactory>();
-   mock_factory->SetCorruptionMode(corruption_mode);
-   options.table_factory = mock_factory;
-   ASSERT_OK(DB::Open(options, dbname_, &db_));
-   assert(db_ != nullptr);  // suppress false clang-analyze report
-   Build(100, 2);
-   DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-   ASSERT_OK(impl->TEST_FlushMemTable());
-   CompactRangeOptions compact_opts;
-   compact_opts.bottommost_level_compaction =
-       BottommostLevelCompaction::kForce;
-   const Status compact_status = impl->CompactRange(
-       compact_opts, impl->DefaultColumnFamily(), nullptr, nullptr);
-   ASSERT_NOK(compact_status);
-   ASSERT_TRUE(compact_status.IsCorruption());
- }
- // Builds and flushes data while the MockTableFactory is in
- // kCorruptReorderKey mode, switches the factory back to kCorruptNone, and
- // verifies that a forced full compaction still fails even though
- // paranoid_file_checks is off.
- TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   options.paranoid_file_checks = false;
-   options.create_if_missing = true;
-   delete db_;
-   db_ = nullptr;
-   ASSERT_OK(DestroyDB(dbname_, options));
-   auto mock_factory = std::make_shared<mock::MockTableFactory>();
-   options.table_factory = mock_factory;
-   ASSERT_OK(DB::Open(options, dbname_, &db_));
-   assert(db_ != nullptr);  // suppress false clang-analyze report
-   // Key reordering is active only while the data is built and flushed.
-   mock_factory->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
-   Build(100, 2);
-   DBImpl* impl = static_cast_with_check<DBImpl>(db_);
-   ASSERT_OK(impl->TEST_FlushMemTable());
-   mock_factory->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
-   CompactRangeOptions compact_opts;
-   compact_opts.bottommost_level_compaction =
-       BottommostLevelCompaction::kForce;
-   ASSERT_NOK(impl->CompactRange(compact_opts, impl->DefaultColumnFamily(),
-                                 nullptr, nullptr));
- }
- // Writes four keys to the DB already opened by the fixture, then uses a sync
- // point callback to step the memtable iterator backwards during the flush so
- // that keys come out of order, and verifies that the flush fails.
- TEST_F(CorruptionTest, FlushKeyOrderCheck) {
-   // NOTE: the previous version configured a local Options object here that
-   // was never passed to Reopen()/DB::Open(); it had no effect and was removed.
-   for (const char* key : {"foo1", "foo2", "foo3", "foo4"}) {
-     ASSERT_OK(db_->Put(WriteOptions(), key, "v1"));
-   }
-   int cnt = 0;
-   // Generate some out of order keys from the memtable
-   SyncPoint::GetInstance()->SetCallBack(
-       "MemTableIterator::Next:0", [&](void* arg) {
-         MemTableRep::Iterator* mem_iter =
-             static_cast<MemTableRep::Iterator*>(arg);
-         // On the third Next() call, move the iterator back two entries.
-         if (++cnt == 3) {
-           mem_iter->Prev();
-           mem_iter->Prev();
-         }
-       });
-   SyncPoint::GetInstance()->EnableProcessing();
-   const Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
-   // Unregister the callback before asserting: it captures `cnt` by reference,
-   // and a fatal assertion failure would otherwise return from the test with
-   // the callback still installed.
-   SyncPoint::GetInstance()->DisableProcessing();
-   SyncPoint::GetInstance()->ClearAllCallBacks();
-   ASSERT_NOK(s);
- }
- // With a CRC32C file checksum generator configured, verifies that
- // VerifyFileChecksums() passes on intact files, reports Corruption after the
- // table files have been modified, and that the checksum-mismatch sync point
- // is hit exactly once.
- TEST_F(CorruptionTest, VerifyWholeTableChecksum) {
-   CloseDb();
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.env = env_.get();
-   ASSERT_OK(DestroyDB(dbname_, options));
-   options.create_if_missing = true;
-   options.file_checksum_gen_factory =
-       ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
-   Reopen(&options);
-   Build(10, 5);
-   ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
-   CloseDb();
-   // Corrupt the first byte of each table file, this must be data block.
-   Corrupt(kTableFile, 0, 1);
-   ASSERT_OK(TryReopen(&options));
-   SyncPoint::GetInstance()->DisableProcessing();
-   SyncPoint::GetInstance()->ClearAllCallBacks();
-   int count{0};
-   SyncPoint::GetInstance()->SetCallBack(
-       "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
-         auto* s = static_cast<Status*>(arg);
-         ASSERT_NE(s, nullptr);
-         ++count;
-         ASSERT_NOK(*s);
-       });
-   SyncPoint::GetInstance()->EnableProcessing();
-   const Status verify_status = db_->VerifyFileChecksums(ReadOptions());
-   // Unregister the callback before asserting: it captures `count` by
-   // reference, so it must not stay installed once this test body returns.
-   SyncPoint::GetInstance()->DisableProcessing();
-   SyncPoint::GetInstance()->ClearAllCallBacks();
-   ASSERT_TRUE(verify_status.IsCorruption());
-   ASSERT_EQ(1, count);
- }
- // Parameterized fixture for the crash-during-recovery tests. The test
- // parameter is a tuple of two booleans:
- //   <0> avoid_flush_during_recovery
- //   <1> track_and_verify_wals_in_manifest
- class CrashDuringRecoveryWithCorruptionTest
-     : public CorruptionTest,
-       public ::testing::WithParamInterface<std::tuple<bool, bool>> {
-  public:
-   explicit CrashDuringRecoveryWithCorruptionTest()
-       : CorruptionTest(),
-         avoid_flush_during_recovery_{std::get<0>(GetParam())},
-         track_and_verify_wals_in_manifest_{std::get<1>(GetParam())} {}
-  protected:
-   // First tuple element of the test parameter.
-   const bool avoid_flush_during_recovery_;
-   // Second tuple element of the test parameter.
-   const bool track_and_verify_wals_in_manifest_;
- };
- // Run every test of the fixture with all four combinations of the two flags.
- INSTANTIATE_TEST_CASE_P(
-     CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
-     ::testing::Values(std::make_tuple(true, false),     //
-                       std::make_tuple(false, false),    //
-                       std::make_tuple(true, true),      //
-                       std::make_tuple(false, true)));
- // In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
- // won't flush the data from WAL to L0 for all column families if possible. As a
- // result, not all column families can increase their log_numbers, and
- // min_log_number_to_keep won't change.
- // It may prematurely persist a new MANIFEST even before we can declare the DB
- // is in consistent state after recovery (this is when the new WAL is synced)
- // and advances log_numbers for some column families.
- //
- // If there is power failure before we sync the new WAL, we will end up in
- // a situation in which after persisting the MANIFEST, RocksDB will see some
- // column families' log_numbers larger than the corrupted wal, and
- // "Column family inconsistency: SST file contains data beyond the point of
- // corruption" error will be hit, causing recovery to fail.
- //
- // After adding the fix, only after new WAL is synced, RocksDB persist a new
- // MANIFEST with column families to ensure RocksDB is in consistent state.
- // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
- // synced immediately afterwards. The sequence number of the sentinel
- // WriteBatch will be the next sequence number immediately after the largest
- // sequence number recovered from previous WALs and MANIFEST because of which DB
- // will be in consistent state.
- // If a future recovery starts from the new MANIFEST, then it means the new WAL
- // is successfully synced. Due to the sentinel empty write batch at the
- // beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
- // If a future recovery starts from the old MANIFEST, it means that writing the
- // new MANIFEST failed, so it won't hit the "SST ahead of WAL" error.
- //
- // The combination of corrupting a WAL and injecting an error during subsequent
- // re-open exposes the bug of prematurely persisting a new MANIFEST with
- // advanced ColumnFamilyData::log_number.
- TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
-   CloseDb();
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.track_and_verify_wals_in_manifest =
-       track_and_verify_wals_in_manifest_;
-   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
-   options.avoid_flush_during_recovery = false;
-   options.env = env_.get();
-   ASSERT_OK(DestroyDB(dbname_, options));
-   options.create_if_missing = true;
-   options.max_write_buffer_number = 8;
-   Reopen(&options);
-   Status s;
-   const std::string test_cf_name = "test_cf";
-   ColumnFamilyHandle* cfh = nullptr;
-   s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
-   ASSERT_OK(s);
-   delete cfh;
-   CloseDb();
-   std::vector<ColumnFamilyDescriptor> cf_descs;
-   cf_descs.emplace_back(kDefaultColumnFamilyName, options);
-   cf_descs.emplace_back(test_cf_name, options);
-   std::vector<ColumnFamilyHandle*> handles;
-   // 1. Open and populate the DB. Write and flush default_cf several times to
-   // advance wal number so that some column families have advanced log_number
-   // while other don't.
-   {
-     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-     auto* dbimpl = static_cast_with_check<DBImpl>(db_);
-     // Checked with a gtest assertion so it is also enforced when assert() is
-     // compiled out.
-     ASSERT_NE(dbimpl, nullptr);
-     // Write one key to test_cf.
-     ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
-     ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));
-     // Write to default_cf and flush this cf several times to advance wal
-     // number. TEST_SwitchMemtable makes sure WALs are not synced and test can
-     // corrupt un-sync WAL.
-     for (int i = 0; i < 2; ++i) {
-       ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
-                          "value" + std::to_string(i)));
-       ASSERT_OK(dbimpl->TEST_SwitchMemtable());
-     }
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-     CloseDb();
-   }
-   // 2. Corrupt second last un-synced wal file to emulate power reset which
-   // caused the DB to lose the un-synced WAL.
-   {
-     std::vector<uint64_t> file_nums;
-     GetSortedWalFiles(file_nums);
-     const size_t size = file_nums.size();
-     // Guards the `size - 2` index below in every build mode.
-     ASSERT_GE(size, static_cast<size_t>(2));
-     const uint64_t log_num = file_nums[size - 2];
-     CorruptFileWithTruncation(FileType::kWalFile, log_num,
-                               /*bytes_to_truncate=*/8);
-   }
-   // 3. After first crash reopen the DB which contains corrupted WAL. Default
-   // family has higher log number than corrupted wal number.
-   //
-   // Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data
-   // from WAL to L0 for all column families (test_cf_name in this case). As a
-   // result, not all column families can increase their log_numbers, and
-   // min_log_number_to_keep won't change.
-   //
-   // Case2: If avoid_flush_during_recovery = false, all column families have
-   // flushed their data from WAL to L0 during recovery, and none of them will
-   // ever need to read the WALs again.
-   // 4. Fault is injected to fail the recovery.
-   {
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-     SyncPoint::GetInstance()->SetCallBack(
-         "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
-           auto* tmp_s = static_cast<Status*>(arg);
-           ASSERT_NE(tmp_s, nullptr);
-           *tmp_s = Status::IOError("Injected");
-         });
-     SyncPoint::GetInstance()->EnableProcessing();
-     handles.clear();
-     options.avoid_flush_during_recovery = true;
-     s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
-     ASSERT_TRUE(s.IsIOError());
-     ASSERT_EQ("IO error: Injected", s.ToString());
-     for (auto* h : handles) {
-       delete h;
-     }
-     // Do not keep dangling handle pointers around for the next open.
-     handles.clear();
-     CloseDb();
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-   }
-   // 5. After second crash reopen the db with second corruption. Default family
-   // has higher log number than corrupted wal number.
-   //
-   // Case1: If avoid_flush_during_recovery = true, we persist a new
-   // MANIFEST with advanced log_numbers for some column families only after
-   // syncing the WAL. So during second crash, RocksDB will skip the corrupted
-   // WAL files as they have been moved to different folder. Since newly synced
-   // WAL file's sequence number (sentinel WriteBatch) will be the next
-   // sequence number immediately after the largest sequence number recovered
-   // from previous WALs and MANIFEST, db will be in consistent state and opens
-   // successfully.
-   //
-   // Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below
-   // this number. So during a second crash after persisting the new MANIFEST,
-   // RocksDB will skip the corrupted WAL(s) because they are all below this
-   // bound. Therefore, we won't hit the "column family inconsistency" error
-   // message.
-   {
-     options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
-     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-     // Verify that data is not lost.
-     {
-       std::string v;
-       ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
-       ASSERT_EQ("dontcare", v);
-       v.clear();
-       ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v));
-       ASSERT_EQ("value" + std::to_string(0), v);
-       // Since it's corrupting second last wal, below key is not found.
-       v.clear();
-       ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v),
-                 Status::NotFound());
-     }
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-     CloseDb();
-   }
- }
- // In case of TransactionDB, it enables two-phase-commit. The prepare section of
- // an uncommitted transaction always need to be kept. Even if we perform flush
- // during recovery, we may still need to hold an old WAL. The
- // min_log_number_to_keep won't change, and "Column family inconsistency: SST
- // file contains data beyond the point of corruption" error will be hit, causing
- // recovery to fail.
- //
- // After adding the fix, only after new WAL is synced, RocksDB persist a new
- // MANIFEST with column families to ensure RocksDB is in consistent state.
- // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
- // synced immediately afterwards. The sequence number of the sentinel
- // WriteBatch will be the next sequence number immediately after the largest
- // sequence number recovered from previous WALs and MANIFEST because of which DB
- // will be in consistent state.
- // If a future recovery starts from the new MANIFEST, then it means the new WAL
- // is successfully synced. Due to the sentinel empty write batch at the
- // beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
- // If a future recovery starts from the old MANIFEST, it means that writing the
- // new MANIFEST failed, so it won't hit the "SST ahead of WAL" error.
- //
- // The combination of corrupting a WAL and injecting an error during subsequent
- // re-open exposes the bug of prematurely persisting a new MANIFEST with
- // advanced ColumnFamilyData::log_number.
- TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
-   CloseDb();
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
-   options.track_and_verify_wals_in_manifest =
-       track_and_verify_wals_in_manifest_;
-   options.avoid_flush_during_recovery = false;
-   options.env = env_.get();
-   ASSERT_OK(DestroyDB(dbname_, options));
-   options.create_if_missing = true;
-   options.max_write_buffer_number = 3;
-   Reopen(&options);
-   // Create cf test_cf_name.
-   ColumnFamilyHandle* cfh = nullptr;
-   const std::string test_cf_name = "test_cf";
-   Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
-   ASSERT_OK(s);
-   delete cfh;
-   CloseDb();
-   std::vector<ColumnFamilyDescriptor> cf_descs;
-   cf_descs.emplace_back(kDefaultColumnFamilyName, options);
-   cf_descs.emplace_back(test_cf_name, options);
-   std::vector<ColumnFamilyHandle*> handles;
-   TransactionDB* txn_db = nullptr;
-   TransactionDBOptions txn_db_opts;
-   // 1. Open and populate the DB. Write and flush default_cf several times to
-   // advance wal number so that some column families have advanced log_number
-   // while other don't.
-   {
-     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
-                                   &handles, &txn_db));
-     auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
-     ASSERT_NE(txn, nullptr);
-     // Put cf1
-     ASSERT_OK(txn->Put(handles[1], "foo", "value"));
-     ASSERT_OK(txn->SetName("txn0"));
-     ASSERT_OK(txn->Prepare());
-     ASSERT_OK(txn_db->Flush(FlushOptions()));
-     // The prepared transaction is dropped without Commit(); step 5 expects
-     // "foo" to be invisible.
-     delete txn;
-     txn = nullptr;
-     auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
-     // Checked with a gtest assertion so it is also enforced when assert() is
-     // compiled out.
-     ASSERT_NE(dbimpl, nullptr);
-     // Put and flush cf0
-     for (int i = 0; i < 2; ++i) {
-       ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i),
-                             "value" + std::to_string(i)));
-       ASSERT_OK(dbimpl->TEST_SwitchMemtable());
-     }
-     // Put cf1
-     txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
-     ASSERT_NE(txn, nullptr);
-     ASSERT_OK(txn->Put(handles[1], "foo1", "value1"));
-     ASSERT_OK(txn->Commit());
-     delete txn;
-     txn = nullptr;
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-     delete txn_db;
-     // Avoid leaving a dangling pointer while the DB is closed.
-     txn_db = nullptr;
-   }
-   // 2. Corrupt second last wal to emulate power reset which caused the DB to
-   // lose the un-synced WAL.
-   {
-     std::vector<uint64_t> file_nums;
-     GetSortedWalFiles(file_nums);
-     const size_t size = file_nums.size();
-     // Guards the `size - 2` index below in every build mode.
-     ASSERT_GE(size, static_cast<size_t>(2));
-     const uint64_t log_num = file_nums[size - 2];
-     CorruptFileWithTruncation(FileType::kWalFile, log_num,
-                               /*bytes_to_truncate=*/8);
-   }
-   // 3. After first crash reopen the DB which contains corrupted WAL. Default
-   // family has higher log number than corrupted wal number. There may be old
-   // WAL files that it must not delete because they can contain data of
-   // uncommitted transactions. As a result, min_log_number_to_keep won't change.
-   {
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-     SyncPoint::GetInstance()->SetCallBack(
-         "DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
-           auto* tmp_s = static_cast<Status*>(arg);
-           ASSERT_NE(tmp_s, nullptr);
-           *tmp_s = Status::IOError("Injected");
-         });
-     SyncPoint::GetInstance()->EnableProcessing();
-     handles.clear();
-     s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
-                             &txn_db);
-     ASSERT_TRUE(s.IsIOError());
-     ASSERT_EQ("IO error: Injected", s.ToString());
-     for (auto* h : handles) {
-       delete h;
-     }
-     // Do not keep dangling handle pointers around for the next open.
-     handles.clear();
-     CloseDb();
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-   }
-   // 4. Corrupt max_wal_num.
-   {
-     std::vector<uint64_t> file_nums;
-     GetSortedWalFiles(file_nums);
-     // Guards the `size - 1` index below against an empty WAL list.
-     ASSERT_FALSE(file_nums.empty());
-     const size_t size = file_nums.size();
-     const uint64_t log_num = file_nums[size - 1];
-     CorruptFileWithTruncation(FileType::kWalFile, log_num);
-   }
-   // 5. After second crash reopen the db with second corruption. Default family
-   // has higher log number than corrupted wal number.
-   // We persist a new MANIFEST with advanced log_numbers for some column
-   // families only after syncing the WAL. So during second crash, RocksDB will
-   // skip the corrupted WAL files as they have been moved to different folder.
-   // Since newly synced WAL file's sequence number (sentinel WriteBatch) will be
-   // the next sequence number immediately after the largest sequence number
-   // recovered from previous WALs and MANIFEST, db will be in consistent state
-   // and opens successfully.
-   {
-     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
-                                   &handles, &txn_db));
-     // Verify that data is not lost.
-     {
-       std::string v;
-       // Key not visible since it's not committed.
-       ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v),
-                 Status::NotFound());
-       v.clear();
-       ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v));
-       ASSERT_EQ("value" + std::to_string(0), v);
-       // Last WAL is corrupted which contains two keys below.
-       v.clear();
-       ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v),
-                 Status::NotFound());
-       v.clear();
-       ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v),
-                 Status::NotFound());
-     }
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-     delete txn_db;
-     txn_db = nullptr;
-   }
- }
- // This test is similar to
- // CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls
- // flush and corrupts the last WAL. It calls flush to sync some of the WALs; the
- // remaining ones are unsynced, and one of them is then corrupted to simulate a
- // crash.
- //
- // In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB
- // won't flush the data from WAL to L0 for all column families if possible. As a
- // result, not all column families can increase their log_numbers, and
- // min_log_number_to_keep won't change.
- // It may prematurely persist a new MANIFEST even before we can declare the DB
- // is in consistent state after recovery (this is when the new WAL is synced)
- // and advances log_numbers for some column families.
- //
- // If there is power failure before we sync the new WAL, we will end up in
- // a situation in which after persisting the MANIFEST, RocksDB will see some
- // column families' log_numbers larger than the corrupted wal, and
- // "Column family inconsistency: SST file contains data beyond the point of
- // corruption" error will be hit, causing recovery to fail.
- //
- // After adding the fix, only after new WAL is synced, RocksDB persist a new
- // MANIFEST with column families to ensure RocksDB is in consistent state.
- // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is
- // synced immediately afterwards. The sequence number of the sentinel
- // WriteBatch will be the next sequence number immediately after the largest
- // sequence number recovered from previous WALs and MANIFEST because of which DB
- // will be in consistent state.
- // If a future recovery starts from the new MANIFEST, then it means the new WAL
- // is successfully synced. Due to the sentinel empty write batch at the
- // beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
- // If a future recovery starts from the old MANIFEST, it means that writing the
- // new MANIFEST failed, so it won't hit the "SST ahead of WAL" error.
- // The combination of corrupting a WAL and injecting an error during subsequent
- // re-open exposes the bug of prematurely persisting a new MANIFEST with
- // advanced ColumnFamilyData::log_number.
- TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) {
-   CloseDb();
-   Options options;
-   options.level_compaction_dynamic_level_bytes = false;
-   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
-   options.avoid_flush_during_recovery = false;
-   options.env = env_.get();
-   options.create_if_missing = true;
-   ASSERT_OK(DestroyDB(dbname_, options));
-   Reopen(&options);
-   ColumnFamilyHandle* cfh = nullptr;
-   const std::string test_cf_name = "test_cf";
-   Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
-   ASSERT_OK(s);
-   delete cfh;
-   CloseDb();
-   std::vector<ColumnFamilyDescriptor> cf_descs;
-   cf_descs.emplace_back(kDefaultColumnFamilyName, options);
-   cf_descs.emplace_back(test_cf_name, options);
-   std::vector<ColumnFamilyHandle*> handles;
-   {
-     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-     // Write one key to test_cf.
-     ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
-     // Write to default_cf and flush this cf several times to advance wal
-     // number.
-     for (int i = 0; i < 2; ++i) {
-       ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
-                          "value" + std::to_string(i)));
-       ASSERT_OK(db_->Flush(FlushOptions()));
-     }
-     // Written after the last flush; the final verification expects this key
-     // to be missing once the last WAL has been corrupted.
-     ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-     CloseDb();
-   }
-   // Corrupt the last WAL file (the highest number in the sorted list) to
-   // emulate a power reset which caused the DB to lose the un-synced WAL.
-   {
-     std::vector<uint64_t> file_nums;
-     GetSortedWalFiles(file_nums);
-     // Guards the `size - 1` index below against an empty WAL list.
-     ASSERT_FALSE(file_nums.empty());
-     const size_t size = file_nums.size();
-     const uint64_t log_num = file_nums[size - 1];
-     CorruptFileWithTruncation(FileType::kWalFile, log_num,
-                               /*bytes_to_truncate=*/8);
-   }
-   // Fault is injected to fail the recovery.
-   {
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-     SyncPoint::GetInstance()->SetCallBack(
-         "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
-           auto* tmp_s = static_cast<Status*>(arg);
-           ASSERT_NE(tmp_s, nullptr);
-           *tmp_s = Status::IOError("Injected");
-         });
-     SyncPoint::GetInstance()->EnableProcessing();
-     handles.clear();
-     options.avoid_flush_during_recovery = true;
-     s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
-     ASSERT_TRUE(s.IsIOError());
-     ASSERT_EQ("IO error: Injected", s.ToString());
-     for (auto* h : handles) {
-       delete h;
-     }
-     // Do not keep dangling handle pointers around for the next open.
-     handles.clear();
-     CloseDb();
-     SyncPoint::GetInstance()->DisableProcessing();
-     SyncPoint::GetInstance()->ClearAllCallBacks();
-   }
-   // Reopen db again
-   {
-     options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
-     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-     // Verify that data is not lost.
-     {
-       std::string v;
-       ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
-       ASSERT_EQ("dontcare", v);
-       for (int i = 0; i < 2; ++i) {
-         v.clear();
-         ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
-         ASSERT_EQ("value" + std::to_string(i), v);
-       }
-       // Since it's corrupting last wal after Flush, below key is not found.
-       v.clear();
-       ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
-                 Status::NotFound());
-     }
-     for (auto* h : handles) {
-       delete h;
-     }
-     handles.clear();
-   }
- }
- } // namespace ROCKSDB_NAMESPACE
- // Test binary entry point: installs the stack trace handler, initializes
- // GoogleTest, registers custom objects from the command line, then runs every
- // registered test and returns the result of RUN_ALL_TESTS().
- int main(int argc, char** argv) {
-   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
-   ::testing::InitGoogleTest(&argc, argv);
-   RegisterCustomObjects(argc, argv);
-   const int exit_code = RUN_ALL_TESTS();
-   return exit_code;
- }
|