external_sst_file_ingestion_job.cc 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #ifndef ROCKSDB_LITE
  6. #include "db/external_sst_file_ingestion_job.h"
  7. #include <algorithm>
  8. #include <cinttypes>
  9. #include <string>
  10. #include <unordered_set>
  11. #include <vector>
  12. #include "db/db_impl/db_impl.h"
  13. #include "db/version_edit.h"
  14. #include "file/file_util.h"
  15. #include "file/random_access_file_reader.h"
  16. #include "table/merging_iterator.h"
  17. #include "table/scoped_arena_iterator.h"
  18. #include "table/sst_file_writer_collectors.h"
  19. #include "table/table_builder.h"
  20. #include "test_util/sync_point.h"
  21. #include "util/stop_watch.h"
  22. namespace ROCKSDB_NAMESPACE {
  23. Status ExternalSstFileIngestionJob::Prepare(
  24. const std::vector<std::string>& external_files_paths,
  25. uint64_t next_file_number, SuperVersion* sv) {
  26. Status status;
  27. // Read the information of files we are ingesting
  28. for (const std::string& file_path : external_files_paths) {
  29. IngestedFileInfo file_to_ingest;
  30. status = GetIngestedFileInfo(file_path, &file_to_ingest, sv);
  31. if (!status.ok()) {
  32. return status;
  33. }
  34. files_to_ingest_.push_back(file_to_ingest);
  35. }
  36. for (const IngestedFileInfo& f : files_to_ingest_) {
  37. if (f.cf_id !=
  38. TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
  39. f.cf_id != cfd_->GetID()) {
  40. return Status::InvalidArgument(
  41. "External file column family id dont match");
  42. }
  43. }
  44. const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
  45. auto num_files = files_to_ingest_.size();
  46. if (num_files == 0) {
  47. return Status::InvalidArgument("The list of files is empty");
  48. } else if (num_files > 1) {
  49. // Verify that passed files dont have overlapping ranges
  50. autovector<const IngestedFileInfo*> sorted_files;
  51. for (size_t i = 0; i < num_files; i++) {
  52. sorted_files.push_back(&files_to_ingest_[i]);
  53. }
  54. std::sort(
  55. sorted_files.begin(), sorted_files.end(),
  56. [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
  57. return sstableKeyCompare(ucmp, info1->smallest_internal_key,
  58. info2->smallest_internal_key) < 0;
  59. });
  60. for (size_t i = 0; i < num_files - 1; i++) {
  61. if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
  62. sorted_files[i + 1]->smallest_internal_key) >= 0) {
  63. files_overlap_ = true;
  64. break;
  65. }
  66. }
  67. }
  68. if (ingestion_options_.ingest_behind && files_overlap_) {
  69. return Status::NotSupported("Files have overlapping ranges");
  70. }
  71. for (IngestedFileInfo& f : files_to_ingest_) {
  72. if (f.num_entries == 0 && f.num_range_deletions == 0) {
  73. return Status::InvalidArgument("File contain no entries");
  74. }
  75. if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
  76. return Status::Corruption("Generated table have corrupted keys");
  77. }
  78. }
  79. // Copy/Move external files into DB
  80. std::unordered_set<size_t> ingestion_path_ids;
  81. for (IngestedFileInfo& f : files_to_ingest_) {
  82. f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
  83. f.copy_file = false;
  84. const std::string path_outside_db = f.external_file_path;
  85. const std::string path_inside_db =
  86. TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
  87. f.fd.GetPathId());
  88. if (ingestion_options_.move_files) {
  89. status =
  90. fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
  91. if (status.ok()) {
  92. // It is unsafe to assume application had sync the file and file
  93. // directory before ingest the file. For integrity of RocksDB we need
  94. // to sync the file.
  95. std::unique_ptr<FSWritableFile> file_to_sync;
  96. status = fs_->ReopenWritableFile(path_inside_db, env_options_,
  97. &file_to_sync, nullptr);
  98. if (status.ok()) {
  99. TEST_SYNC_POINT(
  100. "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
  101. status = SyncIngestedFile(file_to_sync.get());
  102. TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
  103. if (!status.ok()) {
  104. ROCKS_LOG_WARN(db_options_.info_log,
  105. "Failed to sync ingested file %s: %s",
  106. path_inside_db.c_str(), status.ToString().c_str());
  107. }
  108. }
  109. } else if (status.IsNotSupported() &&
  110. ingestion_options_.failed_move_fall_back_to_copy) {
  111. // Original file is on a different FS, use copy instead of hard linking.
  112. f.copy_file = true;
  113. }
  114. } else {
  115. f.copy_file = true;
  116. }
  117. if (f.copy_file) {
  118. TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
  119. nullptr);
  120. // CopyFile also sync the new file.
  121. status = CopyFile(fs_, path_outside_db, path_inside_db, 0,
  122. db_options_.use_fsync);
  123. }
  124. TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
  125. if (!status.ok()) {
  126. break;
  127. }
  128. f.internal_file_path = path_inside_db;
  129. ingestion_path_ids.insert(f.fd.GetPathId());
  130. }
  131. TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
  132. if (status.ok()) {
  133. for (auto path_id : ingestion_path_ids) {
  134. status = directories_->GetDataDir(path_id)->Fsync();
  135. if (!status.ok()) {
  136. ROCKS_LOG_WARN(db_options_.info_log,
  137. "Failed to sync directory %" ROCKSDB_PRIszt
  138. " while ingest file: %s",
  139. path_id, status.ToString().c_str());
  140. break;
  141. }
  142. }
  143. }
  144. TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
  145. // TODO: The following is duplicated with Cleanup().
  146. if (!status.ok()) {
  147. // We failed, remove all files that we copied into the db
  148. for (IngestedFileInfo& f : files_to_ingest_) {
  149. if (f.internal_file_path.empty()) {
  150. continue;
  151. }
  152. Status s = env_->DeleteFile(f.internal_file_path);
  153. if (!s.ok()) {
  154. ROCKS_LOG_WARN(db_options_.info_log,
  155. "AddFile() clean up for file %s failed : %s",
  156. f.internal_file_path.c_str(), s.ToString().c_str());
  157. }
  158. }
  159. }
  160. return status;
  161. }
  162. Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
  163. SuperVersion* super_version) {
  164. autovector<Range> ranges;
  165. for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
  166. ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
  167. file_to_ingest.largest_internal_key.user_key());
  168. }
  169. Status status =
  170. cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed);
  171. if (status.ok() && *flush_needed &&
  172. !ingestion_options_.allow_blocking_flush) {
  173. status = Status::InvalidArgument("External file requires flush");
  174. }
  175. return status;
  176. }
  177. // REQUIRES: we have become the only writer by entering both write_thread_ and
  178. // nonmem_write_thread_
  179. Status ExternalSstFileIngestionJob::Run() {
  180. Status status;
  181. SuperVersion* super_version = cfd_->GetSuperVersion();
  182. #ifndef NDEBUG
  183. // We should never run the job with a memtable that is overlapping
  184. // with the files we are ingesting
  185. bool need_flush = false;
  186. status = NeedsFlush(&need_flush, super_version);
  187. assert(status.ok() && need_flush == false);
  188. #endif
  189. bool force_global_seqno = false;
  190. if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
  191. // We need to assign a global sequence number to all the files even
  192. // if the dont overlap with any ranges since we have snapshots
  193. force_global_seqno = true;
  194. }
  195. // It is safe to use this instead of LastAllocatedSequence since we are
  196. // the only active writer, and hence they are equal
  197. SequenceNumber last_seqno = versions_->LastSequence();
  198. edit_.SetColumnFamily(cfd_->GetID());
  199. // The levels that the files will be ingested into
  200. for (IngestedFileInfo& f : files_to_ingest_) {
  201. SequenceNumber assigned_seqno = 0;
  202. if (ingestion_options_.ingest_behind) {
  203. status = CheckLevelForIngestedBehindFile(&f);
  204. } else {
  205. status = AssignLevelAndSeqnoForIngestedFile(
  206. super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
  207. last_seqno, &f, &assigned_seqno);
  208. }
  209. if (!status.ok()) {
  210. return status;
  211. }
  212. status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
  213. TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
  214. &assigned_seqno);
  215. if (assigned_seqno > last_seqno) {
  216. assert(assigned_seqno == last_seqno + 1);
  217. last_seqno = assigned_seqno;
  218. ++consumed_seqno_count_;
  219. }
  220. if (!status.ok()) {
  221. return status;
  222. }
  223. // We use the import time as the ancester time. This is the time the data
  224. // is written to the database.
  225. int64_t temp_current_time = 0;
  226. uint64_t current_time = kUnknownFileCreationTime;
  227. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
  228. if (env_->GetCurrentTime(&temp_current_time).ok()) {
  229. current_time = oldest_ancester_time =
  230. static_cast<uint64_t>(temp_current_time);
  231. }
  232. edit_.AddFile(
  233. f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
  234. f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
  235. f.assigned_seqno, false, kInvalidBlobFileNumber, oldest_ancester_time,
  236. current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName);
  237. }
  238. return status;
  239. }
  240. void ExternalSstFileIngestionJob::UpdateStats() {
  241. // Update internal stats for new ingested files
  242. uint64_t total_keys = 0;
  243. uint64_t total_l0_files = 0;
  244. uint64_t total_time = env_->NowMicros() - job_start_time_;
  245. EventLoggerStream stream = event_logger_->Log();
  246. stream << "event"
  247. << "ingest_finished";
  248. stream << "files_ingested";
  249. stream.StartArray();
  250. for (IngestedFileInfo& f : files_to_ingest_) {
  251. InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1);
  252. stats.micros = total_time;
  253. // If actual copy occurred for this file, then we need to count the file
  254. // size as the actual bytes written. If the file was linked, then we ignore
  255. // the bytes written for file metadata.
  256. // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
  257. if (f.copy_file) {
  258. stats.bytes_written = f.fd.GetFileSize();
  259. } else {
  260. stats.bytes_moved = f.fd.GetFileSize();
  261. }
  262. stats.num_output_files = 1;
  263. cfd_->internal_stats()->AddCompactionStats(f.picked_level,
  264. Env::Priority::USER, stats);
  265. cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
  266. f.fd.GetFileSize());
  267. total_keys += f.num_entries;
  268. if (f.picked_level == 0) {
  269. total_l0_files += 1;
  270. }
  271. ROCKS_LOG_INFO(
  272. db_options_.info_log,
  273. "[AddFile] External SST file %s was ingested in L%d with path %s "
  274. "(global_seqno=%" PRIu64 ")\n",
  275. f.external_file_path.c_str(), f.picked_level,
  276. f.internal_file_path.c_str(), f.assigned_seqno);
  277. stream << "file" << f.internal_file_path << "level" << f.picked_level;
  278. }
  279. stream.EndArray();
  280. stream << "lsm_state";
  281. stream.StartArray();
  282. auto vstorage = cfd_->current()->storage_info();
  283. for (int level = 0; level < vstorage->num_levels(); ++level) {
  284. stream << vstorage->NumLevelFiles(level);
  285. }
  286. stream.EndArray();
  287. cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
  288. total_keys);
  289. cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
  290. files_to_ingest_.size());
  291. cfd_->internal_stats()->AddCFStats(
  292. InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
  293. }
  294. void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
  295. if (!status.ok()) {
  296. // We failed to add the files to the database
  297. // remove all the files we copied
  298. for (IngestedFileInfo& f : files_to_ingest_) {
  299. if (f.internal_file_path.empty()) {
  300. continue;
  301. }
  302. Status s = env_->DeleteFile(f.internal_file_path);
  303. if (!s.ok()) {
  304. ROCKS_LOG_WARN(db_options_.info_log,
  305. "AddFile() clean up for file %s failed : %s",
  306. f.internal_file_path.c_str(), s.ToString().c_str());
  307. }
  308. }
  309. consumed_seqno_count_ = 0;
  310. files_overlap_ = false;
  311. } else if (status.ok() && ingestion_options_.move_files) {
  312. // The files were moved and added successfully, remove original file links
  313. for (IngestedFileInfo& f : files_to_ingest_) {
  314. Status s = env_->DeleteFile(f.external_file_path);
  315. if (!s.ok()) {
  316. ROCKS_LOG_WARN(
  317. db_options_.info_log,
  318. "%s was added to DB successfully but failed to remove original "
  319. "file link : %s",
  320. f.external_file_path.c_str(), s.ToString().c_str());
  321. }
  322. }
  323. }
  324. }
  325. Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
  326. const std::string& external_file, IngestedFileInfo* file_to_ingest,
  327. SuperVersion* sv) {
  328. file_to_ingest->external_file_path = external_file;
  329. // Get external file size
  330. Status status = fs_->GetFileSize(external_file, IOOptions(),
  331. &file_to_ingest->file_size, nullptr);
  332. if (!status.ok()) {
  333. return status;
  334. }
  335. // Create TableReader for external file
  336. std::unique_ptr<TableReader> table_reader;
  337. std::unique_ptr<FSRandomAccessFile> sst_file;
  338. std::unique_ptr<RandomAccessFileReader> sst_file_reader;
  339. status = fs_->NewRandomAccessFile(external_file, env_options_,
  340. &sst_file, nullptr);
  341. if (!status.ok()) {
  342. return status;
  343. }
  344. sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file),
  345. external_file));
  346. status = cfd_->ioptions()->table_factory->NewTableReader(
  347. TableReaderOptions(*cfd_->ioptions(),
  348. sv->mutable_cf_options.prefix_extractor.get(),
  349. env_options_, cfd_->internal_comparator()),
  350. std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
  351. if (!status.ok()) {
  352. return status;
  353. }
  354. if (ingestion_options_.verify_checksums_before_ingest) {
  355. // If customized readahead size is needed, we can pass a user option
  356. // all the way to here. Right now we just rely on the default readahead
  357. // to keep things simple.
  358. ReadOptions ro;
  359. ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
  360. status = table_reader->VerifyChecksum(
  361. ro, TableReaderCaller::kExternalSSTIngestion);
  362. }
  363. if (!status.ok()) {
  364. return status;
  365. }
  366. // Get the external file properties
  367. auto props = table_reader->GetTableProperties();
  368. const auto& uprops = props->user_collected_properties;
  369. // Get table version
  370. auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
  371. if (version_iter == uprops.end()) {
  372. return Status::Corruption("External file version not found");
  373. }
  374. file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
  375. auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
  376. if (file_to_ingest->version == 2) {
  377. // version 2 imply that we have global sequence number
  378. if (seqno_iter == uprops.end()) {
  379. return Status::Corruption(
  380. "External file global sequence number not found");
  381. }
  382. // Set the global sequence number
  383. file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
  384. auto offsets_iter = props->properties_offsets.find(
  385. ExternalSstFilePropertyNames::kGlobalSeqno);
  386. if (offsets_iter == props->properties_offsets.end() ||
  387. offsets_iter->second == 0) {
  388. file_to_ingest->global_seqno_offset = 0;
  389. return Status::Corruption("Was not able to find file global seqno field");
  390. }
  391. file_to_ingest->global_seqno_offset = static_cast<size_t>(offsets_iter->second);
  392. } else if (file_to_ingest->version == 1) {
  393. // SST file V1 should not have global seqno field
  394. assert(seqno_iter == uprops.end());
  395. file_to_ingest->original_seqno = 0;
  396. if (ingestion_options_.allow_blocking_flush ||
  397. ingestion_options_.allow_global_seqno) {
  398. return Status::InvalidArgument(
  399. "External SST file V1 does not support global seqno");
  400. }
  401. } else {
  402. return Status::InvalidArgument("External file version is not supported");
  403. }
  404. // Get number of entries in table
  405. file_to_ingest->num_entries = props->num_entries;
  406. file_to_ingest->num_range_deletions = props->num_range_deletions;
  407. ParsedInternalKey key;
  408. ReadOptions ro;
  409. // During reading the external file we can cache blocks that we read into
  410. // the block cache, if we later change the global seqno of this file, we will
  411. // have block in cache that will include keys with wrong seqno.
  412. // We need to disable fill_cache so that we read from the file without
  413. // updating the block cache.
  414. ro.fill_cache = false;
  415. std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
  416. ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
  417. /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
  418. std::unique_ptr<InternalIterator> range_del_iter(
  419. table_reader->NewRangeTombstoneIterator(ro));
  420. // Get first (smallest) and last (largest) key from file.
  421. file_to_ingest->smallest_internal_key =
  422. InternalKey("", 0, ValueType::kTypeValue);
  423. file_to_ingest->largest_internal_key =
  424. InternalKey("", 0, ValueType::kTypeValue);
  425. bool bounds_set = false;
  426. iter->SeekToFirst();
  427. if (iter->Valid()) {
  428. if (!ParseInternalKey(iter->key(), &key)) {
  429. return Status::Corruption("external file have corrupted keys");
  430. }
  431. if (key.sequence != 0) {
  432. return Status::Corruption("external file have non zero sequence number");
  433. }
  434. file_to_ingest->smallest_internal_key.SetFrom(key);
  435. iter->SeekToLast();
  436. if (!ParseInternalKey(iter->key(), &key)) {
  437. return Status::Corruption("external file have corrupted keys");
  438. }
  439. if (key.sequence != 0) {
  440. return Status::Corruption("external file have non zero sequence number");
  441. }
  442. file_to_ingest->largest_internal_key.SetFrom(key);
  443. bounds_set = true;
  444. }
  445. // We may need to adjust these key bounds, depending on whether any range
  446. // deletion tombstones extend past them.
  447. const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
  448. if (range_del_iter != nullptr) {
  449. for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
  450. range_del_iter->Next()) {
  451. if (!ParseInternalKey(range_del_iter->key(), &key)) {
  452. return Status::Corruption("external file have corrupted keys");
  453. }
  454. RangeTombstone tombstone(key, range_del_iter->value());
  455. InternalKey start_key = tombstone.SerializeKey();
  456. if (!bounds_set ||
  457. sstableKeyCompare(ucmp, start_key,
  458. file_to_ingest->smallest_internal_key) < 0) {
  459. file_to_ingest->smallest_internal_key = start_key;
  460. }
  461. InternalKey end_key = tombstone.SerializeEndKey();
  462. if (!bounds_set ||
  463. sstableKeyCompare(ucmp, end_key,
  464. file_to_ingest->largest_internal_key) > 0) {
  465. file_to_ingest->largest_internal_key = end_key;
  466. }
  467. bounds_set = true;
  468. }
  469. }
  470. file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
  471. file_to_ingest->table_properties = *props;
  472. return status;
  473. }
  474. Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
  475. SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
  476. SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
  477. SequenceNumber* assigned_seqno) {
  478. Status status;
  479. *assigned_seqno = 0;
  480. if (force_global_seqno) {
  481. *assigned_seqno = last_seqno + 1;
  482. if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
  483. file_to_ingest->picked_level = 0;
  484. return status;
  485. }
  486. }
  487. bool overlap_with_db = false;
  488. Arena arena;
  489. ReadOptions ro;
  490. ro.total_order_seek = true;
  491. int target_level = 0;
  492. auto* vstorage = cfd_->current()->storage_info();
  493. for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
  494. if (lvl > 0 && lvl < vstorage->base_level()) {
  495. continue;
  496. }
  497. if (vstorage->NumLevelFiles(lvl) > 0) {
  498. bool overlap_with_level = false;
  499. status = sv->current->OverlapWithLevelIterator(
  500. ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
  501. file_to_ingest->largest_internal_key.user_key(), lvl,
  502. &overlap_with_level);
  503. if (!status.ok()) {
  504. return status;
  505. }
  506. if (overlap_with_level) {
  507. // We must use L0 or any level higher than `lvl` to be able to overwrite
  508. // the keys that we overlap with in this level, We also need to assign
  509. // this file a seqno to overwrite the existing keys in level `lvl`
  510. overlap_with_db = true;
  511. break;
  512. }
  513. if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
  514. const std::vector<FileMetaData*>& level_files =
  515. vstorage->LevelFiles(lvl);
  516. const SequenceNumber level_largest_seqno =
  517. (*max_element(level_files.begin(), level_files.end(),
  518. [](FileMetaData* f1, FileMetaData* f2) {
  519. return f1->fd.largest_seqno < f2->fd.largest_seqno;
  520. }))
  521. ->fd.largest_seqno;
  522. // should only assign seqno to current level's largest seqno when
  523. // the file fits
  524. if (level_largest_seqno != 0 &&
  525. IngestedFileFitInLevel(file_to_ingest, lvl)) {
  526. *assigned_seqno = level_largest_seqno;
  527. } else {
  528. continue;
  529. }
  530. }
  531. } else if (compaction_style == kCompactionStyleUniversal) {
  532. continue;
  533. }
  534. // We dont overlap with any keys in this level, but we still need to check
  535. // if our file can fit in it
  536. if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
  537. target_level = lvl;
  538. }
  539. }
  540. // If files overlap, we have to ingest them at level 0 and assign the newest
  541. // sequence number
  542. if (files_overlap_) {
  543. target_level = 0;
  544. *assigned_seqno = last_seqno + 1;
  545. }
  546. TEST_SYNC_POINT_CALLBACK(
  547. "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
  548. &overlap_with_db);
  549. file_to_ingest->picked_level = target_level;
  550. if (overlap_with_db && *assigned_seqno == 0) {
  551. *assigned_seqno = last_seqno + 1;
  552. }
  553. return status;
  554. }
  555. Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
  556. IngestedFileInfo* file_to_ingest) {
  557. auto* vstorage = cfd_->current()->storage_info();
  558. // first check if new files fit in the bottommost level
  559. int bottom_lvl = cfd_->NumberLevels() - 1;
  560. if(!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
  561. return Status::InvalidArgument(
  562. "Can't ingest_behind file as it doesn't fit "
  563. "at the bottommost level!");
  564. }
  565. // second check if despite allow_ingest_behind=true we still have 0 seqnums
  566. // at some upper level
  567. for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
  568. for (auto file : vstorage->LevelFiles(lvl)) {
  569. if (file->fd.smallest_seqno == 0) {
  570. return Status::InvalidArgument(
  571. "Can't ingest_behind file as despite allow_ingest_behind=true "
  572. "there are files with 0 seqno in database at upper levels!");
  573. }
  574. }
  575. }
  576. file_to_ingest->picked_level = bottom_lvl;
  577. return Status::OK();
  578. }
  579. Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
  580. IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
  581. if (file_to_ingest->original_seqno == seqno) {
  582. // This file already have the correct global seqno
  583. return Status::OK();
  584. } else if (!ingestion_options_.allow_global_seqno) {
  585. return Status::InvalidArgument("Global seqno is required, but disabled");
  586. } else if (file_to_ingest->global_seqno_offset == 0) {
  587. return Status::InvalidArgument(
  588. "Trying to set global seqno for a file that dont have a global seqno "
  589. "field");
  590. }
  591. if (ingestion_options_.write_global_seqno) {
  592. // Determine if we can write global_seqno to a given offset of file.
  593. // If the file system does not support random write, then we should not.
  594. // Otherwise we should.
  595. std::unique_ptr<FSRandomRWFile> rwfile;
  596. Status status =
  597. fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_,
  598. &rwfile, nullptr);
  599. if (status.ok()) {
  600. std::string seqno_val;
  601. PutFixed64(&seqno_val, seqno);
  602. status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val,
  603. IOOptions(), nullptr);
  604. if (status.ok()) {
  605. TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
  606. status = SyncIngestedFile(rwfile.get());
  607. TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
  608. if (!status.ok()) {
  609. ROCKS_LOG_WARN(db_options_.info_log,
  610. "Failed to sync ingested file %s after writing global "
  611. "sequence number: %s",
  612. file_to_ingest->internal_file_path.c_str(),
  613. status.ToString().c_str());
  614. }
  615. }
  616. if (!status.ok()) {
  617. return status;
  618. }
  619. } else if (!status.IsNotSupported()) {
  620. return status;
  621. }
  622. }
  623. file_to_ingest->assigned_seqno = seqno;
  624. return Status::OK();
  625. }
  626. bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
  627. const IngestedFileInfo* file_to_ingest, int level) {
  628. if (level == 0) {
  629. // Files can always fit in L0
  630. return true;
  631. }
  632. auto* vstorage = cfd_->current()->storage_info();
  633. Slice file_smallest_user_key(
  634. file_to_ingest->smallest_internal_key.user_key());
  635. Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
  636. if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
  637. &file_largest_user_key)) {
  638. // File overlap with another files in this level, we cannot
  639. // add it to this level
  640. return false;
  641. }
  642. if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
  643. file_largest_user_key, level)) {
  644. // File overlap with a running compaction output that will be stored
  645. // in this level, we cannot add this file to this level
  646. return false;
  647. }
  648. // File did not overlap with level files, our compaction output
  649. return true;
  650. }
  651. template <typename TWritableFile>
  652. Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
  653. assert(file != nullptr);
  654. if (db_options_.use_fsync) {
  655. return file->Fsync(IOOptions(), nullptr);
  656. } else {
  657. return file->Sync(IOOptions(), nullptr);
  658. }
  659. }
  660. } // namespace ROCKSDB_NAMESPACE
  661. #endif // !ROCKSDB_LITE