external_sst_file_ingestion_job.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "db/external_sst_file_ingestion_job.h"

#include <algorithm>
#include <cinttypes>
#include <string>
#include <unordered_set>
#include <vector>

#include "db/db_impl/db_impl.h"
#include "db/version_edit.h"
#include "file/file_util.h"
#include "file/random_access_file_reader.h"
#include "logging/logging.h"
#include "table/merging_iterator.h"
#include "table/sst_file_writer_collectors.h"
#include "table/table_builder.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/udt_util.h"

namespace ROCKSDB_NAMESPACE {

Status ExternalSstFileIngestionJob::Prepare(
    const std::vector<std::string>& external_files_paths,
    const std::vector<std::string>& files_checksums,
    const std::vector<std::string>& files_checksum_func_names,
    const std::optional<RangeOpt>& atomic_replace_range,
    const Temperature& file_temperature, uint64_t next_file_number,
    SuperVersion* sv) {
  Status status;
  // Read the information of files we are ingesting
  for (const std::string& file_path : external_files_paths) {
    IngestedFileInfo file_to_ingest;
    // For temperature, first assume it matches provided hint
    file_to_ingest.file_temperature = file_temperature;
    status =
        GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to get ingested file info: %s: %s",
                     file_path.c_str(), status.ToString().c_str());
      return status;
    }
    // Files generated in another DB or CF may have a different column family
    // ID, so we let it pass here.
    if (file_to_ingest.cf_id !=
            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
        file_to_ingest.cf_id != cfd_->GetID() &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "External file column family id don't match");
    }
    if (file_to_ingest.num_entries == 0 &&
        file_to_ingest.num_range_deletions == 0) {
      return Status::InvalidArgument("File contain no entries");
    }
    if (!file_to_ingest.smallest_internal_key.Valid() ||
        !file_to_ingest.largest_internal_key.Valid()) {
      return Status::Corruption("Generated table have corrupted keys");
    }
    files_to_ingest_.emplace_back(std::move(file_to_ingest));
  }

  auto num_files = files_to_ingest_.size();
  if (num_files == 0) {
    return Status::InvalidArgument("The list of files is empty");
  } else if (num_files > 1) {
    // Verify that passed files don't have overlapping ranges
    autovector<const IngestedFileInfo*> sorted_files;
    for (size_t i = 0; i < num_files; i++) {
      sorted_files.push_back(&files_to_ingest_[i]);
    }
    std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);
    for (size_t i = 0; i + 1 < num_files; i++) {
      if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1],
                                       /* known_sorted= */ true)) {
        files_overlap_ = true;
        break;
      }
    }
  }
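  // Note on the check above: sorting the files by their smallest key and then
  // testing only adjacent pairs is sufficient, because if any two files
  // overlapped, some adjacent pair in the sorted order would overlap as well.
  // Rough sketch of the idea (not the exact comparator used by
  // file_range_checker_):
  //   sort files by smallest key;
  //   for i in [0, n - 1): overlap |= files[i].largest >= files[i + 1].smallest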
  if (atomic_replace_range.has_value()) {
    atomic_replace_range_.emplace();
    if (atomic_replace_range->start && atomic_replace_range->limit) {
      // User keys to internal keys (with timestamps)
      const size_t ts_sz = ucmp_->timestamp_size();
      std::string start_with_ts, limit_with_ts;
      auto [start, limit] = MaybeAddTimestampsToRange(
          atomic_replace_range->start, atomic_replace_range->limit, ts_sz,
          &start_with_ts, &limit_with_ts);
      assert(start.has_value());
      assert(limit.has_value());
      atomic_replace_range_->smallest_internal_key.Set(
          *start, kMaxSequenceNumber, kValueTypeForSeek);
      atomic_replace_range_->largest_internal_key.Set(
          *limit, kMaxSequenceNumber, kValueTypeForSeek);
      // Check files to ingest against replace range
      for (size_t i = 0; i < num_files; i++) {
        if (!file_range_checker_.Contains(*atomic_replace_range_,
                                          files_to_ingest_[i])) {
          return Status::InvalidArgument(
              "Atomic replace range does not contain all files");
        }
      }
    } else {
      // Currently if either bound is not present, both must be
      assert(atomic_replace_range->start.has_value() == false);
      assert(atomic_replace_range->limit.has_value() == false);
      assert(atomic_replace_range_->smallest_internal_key.unset());
      assert(atomic_replace_range_->largest_internal_key.unset());
    }
  }

  if (files_overlap_) {
    if (ingestion_options_.ingest_behind) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested with ingestion "
          "behind mode.");
    }
    // Overlapping files need at least two different sequence numbers. If the
    // settings disable global seqno, ingestion will fail anyway, so fail
    // fast in Prepare().
    if (!ingestion_options_.allow_global_seqno &&
        !ingestion_options_.allow_db_generated_files) {
      return Status::InvalidArgument(
          "Global seqno is required, but disabled (because external files key "
          "range overlaps).");
    }
    if (ucmp_->timestamp_size() > 0) {
      return Status::NotSupported(
          "Files with overlapping ranges cannot be ingested to column "
          "family with user-defined timestamp enabled.");
    }
  }
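  // Caller-side sketch of the relevant options (field names per the public
  // IngestExternalFileOptions; verify against the RocksDB version in use):
  //   IngestExternalFileOptions opts;
  //   opts.allow_global_seqno = true;  // required when the input files overlap
  //   opts.ingest_behind = false;      // incompatible with overlapping inputs
  //   db->IngestExternalFile(cf_handle, {"a.sst", "b.sst"}, opts);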
  // Copy/Move external files into DB
  std::unordered_set<size_t> ingestion_path_ids;
  for (IngestedFileInfo& f : files_to_ingest_) {
    f.copy_file = false;
    const std::string path_outside_db = f.external_file_path;
    const std::string path_inside_db = TableFileName(
        cfd_->ioptions().cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
    if (ingestion_options_.move_files || ingestion_options_.link_files) {
      status =
          fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
      if (status.ok()) {
        // It is unsafe to assume the application has synced the file and the
        // file directory before ingesting it. For the integrity of RocksDB we
        // need to sync the file.
        // TODO(xingbo), We should in general be moving away from production
        // uses of ReuseWritableFile (except explicitly for WAL recycling),
        // ReopenWritableFile, and NewRandomRWFile. We should create a
        // FileSystem::SyncFile/FsyncFile API that by default does the
        // re-open+sync+close combo but can (a) be reused easily, and (b) be
        // overridden to do that more cleanly, e.g. in EncryptedEnv.
        // https://github.com/facebook/rocksdb/issues/13741
        std::unique_ptr<FSWritableFile> file_to_sync;
        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
                                           &file_to_sync, nullptr);
        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
                                 &s);
        // Some file systems (especially remote/distributed) don't support
        // reopening a file for writing and don't require reopening and
        // syncing the file. Ignore the NotSupported error in that case.
        if (!s.IsNotSupported()) {
          status = s;
          if (status.ok()) {
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
            status = SyncIngestedFile(file_to_sync.get());
            TEST_SYNC_POINT(
                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
            if (!status.ok()) {
              ROCKS_LOG_WARN(db_options_.info_log,
                             "Failed to sync ingested file %s: %s",
                             path_inside_db.c_str(), status.ToString().c_str());
            }
          }
        }
      } else if (status.IsNotSupported() &&
                 ingestion_options_.failed_move_fall_back_to_copy) {
        // Original file is on a different FS, use copy instead of hard linking.
        f.copy_file = true;
        ROCKS_LOG_INFO(db_options_.info_log,
                       "Tried to link file %s but it's not supported : %s",
                       path_outside_db.c_str(), status.ToString().c_str());
      } else {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      f.copy_file = true;
    }
    if (f.copy_file) {
      TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
                               nullptr);
      // Always determining the destination temperature from the ingested-to
      // level would be difficult because in general we only find out the level
      // ingested to later, during Run().
      // However, we can guarantee "last level" temperature for when the user
      // requires ingestion to the last level.
      Temperature dst_temp =
          (ingestion_options_.ingest_behind ||
           ingestion_options_.fail_if_not_bottommost_level)
              ? sv->mutable_cf_options.last_level_temperature
              : sv->mutable_cf_options.default_write_temperature;
      // Note: CopyFile also syncs the new file.
      status = CopyFile(fs_.get(), path_outside_db, f.file_temperature,
                        path_inside_db, dst_temp, 0, db_options_.use_fsync,
                        io_tracer_);
      // The destination of the copy will be ingested
      f.file_temperature = dst_temp;
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s",
                       path_outside_db.c_str(), path_inside_db.c_str(),
                       status.ToString().c_str());
      }
    } else {
      // Note: we currently assume that linking files does not cross
      // temperatures, so no need to change f.file_temperature
    }
    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
    if (!status.ok()) {
      break;
    }
    f.internal_file_path = path_inside_db;
    // Initialize the checksum information of ingested files.
    f.file_checksum = kUnknownFileChecksum;
    f.file_checksum_func_name = kUnknownFileChecksumFuncName;
    ingestion_path_ids.insert(f.fd.GetPathId());
  }
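  // Summary of the loop above, assuming the public option names are unchanged:
  // with move_files or link_files the job first attempts a hard link, falls
  // back to a copy only when linking fails with NotSupported and
  // failed_move_fall_back_to_copy is set, and otherwise every file is copied.
  //   IngestExternalFileOptions opts;
  //   opts.move_files = true;                     // try hard link first
  //   opts.failed_move_fall_back_to_copy = true;  // copy across filesystems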
  TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
  if (status.ok()) {
    for (auto path_id : ingestion_path_ids) {
      status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
          IOOptions(), nullptr,
          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to sync directory %" ROCKSDB_PRIszt
                       " while ingest file: %s",
                       path_id, status.ToString().c_str());
        break;
      }
    }
  }
  TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");

  // Generate and check the sst file checksum. Note that, if
  // IngestExternalFileOptions::write_global_seqno is true, we will not update
  // the checksum information in files_to_ingest_ here, since the file is
  // updated with the new global_seqno. After global_seqno is updated, DB will
  // generate the new checksum and store it in the Manifest. In all other cases
  // if ingestion_options_.write_global_seqno == true and
  // verify_file_checksum is false, we only check the checksum function name.
  if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
    if (ingestion_options_.verify_file_checksum == false &&
        files_checksums.size() == files_to_ingest_.size() &&
        files_checksum_func_names.size() == files_to_ingest_.size()) {
      // Only when verify_file_checksum == false and checksums for the ingested
      // files are provided will the DB use the provided checksums rather than
      // generating checksums for the ingested files.
      need_generate_file_checksum_ = false;
    } else {
      need_generate_file_checksum_ = true;
    }
    std::vector<std::string> generated_checksums;
    std::vector<std::string> generated_checksum_func_names;
    // Step 1: generate the checksum for each ingested sst file.
    if (need_generate_file_checksum_) {
      for (size_t i = 0; i < files_to_ingest_.size(); i++) {
        std::string generated_checksum;
        std::string generated_checksum_func_name;
        std::string requested_checksum_func_name =
            i < files_checksum_func_names.size() ? files_checksum_func_names[i]
                                                 : "";
        // TODO: rate limit file reads for checksum calculation during file
        // ingestion.
        // TODO: plumb Env::IOActivity
        ReadOptions ro;
        IOStatus io_s = GenerateOneFileChecksum(
            fs_.get(), files_to_ingest_[i].internal_file_path,
            db_options_.file_checksum_gen_factory.get(),
            requested_checksum_func_name, &generated_checksum,
            &generated_checksum_func_name,
            ingestion_options_.verify_checksums_readahead_size,
            db_options_.allow_mmap_reads, io_tracer_,
            db_options_.rate_limiter.get(), ro, db_options_.stats,
            db_options_.clock);
        if (!io_s.ok()) {
          status = io_s;
          ROCKS_LOG_WARN(db_options_.info_log,
                         "Sst file checksum generation of file: %s failed: %s",
                         files_to_ingest_[i].internal_file_path.c_str(),
                         status.ToString().c_str());
          break;
        }
        if (ingestion_options_.write_global_seqno == false) {
          files_to_ingest_[i].file_checksum = generated_checksum;
          files_to_ingest_[i].file_checksum_func_name =
              generated_checksum_func_name;
        }
        generated_checksums.push_back(generated_checksum);
        generated_checksum_func_names.push_back(generated_checksum_func_name);
      }
    }
    // Step 2: based on the verify_file_checksum and ingested checksum
    // information, do the verification.
    if (status.ok()) {
      if (files_checksums.size() == files_to_ingest_.size() &&
          files_checksum_func_names.size() == files_to_ingest_.size()) {
        // Verify the checksum and checksum function name.
        if (ingestion_options_.verify_file_checksum) {
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            if (files_checksum_func_names[i] !=
                generated_checksum_func_names[i]) {
              status = Status::InvalidArgument(
                  "DB file checksum gen factory " +
                  std::string(db_options_.file_checksum_gen_factory->Name()) +
                  " generated checksum function name " +
                  generated_checksum_func_names[i] + " for file " +
                  external_files_paths[i] +
                  " which does not match requested/provided " +
                  files_checksum_func_names[i]);
              break;
            }
            if (files_checksums[i] != generated_checksums[i]) {
              status = Status::Corruption(
                  "Checksum verification mismatch for ingestion file " +
                  external_files_paths[i] + " using function " +
                  generated_checksum_func_names[i] + ". Expected: " +
                  Slice(files_checksums[i]).ToString(/*hex=*/true) +
                  " Computed: " +
                  Slice(generated_checksums[i]).ToString(/*hex=*/true));
              break;
            }
          }
        } else {
          // If verify_file_checksum is not enabled, we only verify that the
          // factory recognizes the checksum function name. If it does not
          // match, fail the ingestion. If it matches, we trust the ingested
          // checksum information and store it in the Manifest.
          for (size_t i = 0; i < files_to_ingest_.size(); i++) {
            FileChecksumGenContext gen_context;
            gen_context.file_name = files_to_ingest_[i].internal_file_path;
            gen_context.requested_checksum_func_name =
                files_checksum_func_names[i];
            auto file_checksum_gen =
                db_options_.file_checksum_gen_factory
                    ->CreateFileChecksumGenerator(gen_context);
            if (file_checksum_gen == nullptr ||
                files_checksum_func_names[i] != file_checksum_gen->Name()) {
              status = Status::InvalidArgument(
                  "Checksum function name " + files_checksum_func_names[i] +
                  " for file " + external_files_paths[i] +
                  " not recognized by DB checksum gen factory" +
                  db_options_.file_checksum_gen_factory->Name() +
                  (file_checksum_gen ? (" Returned function " +
                                        std::string(file_checksum_gen->Name()))
                                     : ""));
              break;
            }
            files_to_ingest_[i].file_checksum = files_checksums[i];
            files_to_ingest_[i].file_checksum_func_name =
                files_checksum_func_names[i];
          }
        }
      } else if (files_checksums.size() != files_checksum_func_names.size() ||
                 files_checksums.size() != 0) {
        // The checksum and checksum function name vectors are not both empty,
        // and they are incomplete.
        status = Status::InvalidArgument(
            "The checksum information of ingested sst files are nonempty and "
            "the size of checksums or the size of the checksum function "
            "names does not match with the number of ingested sst files");
      }
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s",
                       status.ToString().c_str());
      }
    }
  }
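  // Recap of the checksum handling above: checksums are either generated here
  // (Step 1) or taken from the caller, and Step 2 cross-checks them against
  // what was provided. When checksums come from the caller (e.g. through the
  // multi-file IngestExternalFiles() API; the exact argument layout depends on
  // the RocksDB version), the checksum and checksum-function-name lists must
  // both be empty or both match the number of ingested files.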
  if (status.ok()) {
    DivideInputFilesIntoBatches();
  }
  return status;
}

void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
  if (!files_overlap_) {
    // No overlap, treat as one batch without needing to track the overall
    // batch range.
    file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ false);
    for (auto& file : files_to_ingest_) {
      file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
    }
    return;
  }
  file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
  for (auto& file : files_to_ingest_) {
    if (!file_batches_to_ingest_.back().unset() &&
        file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file,
                                     /* known_sorted= */ false)) {
      file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
    }
    file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
  }
}
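// A hypothetical example of the batching above: given files with user-key
// ranges F1 = [a, c], F2 = [b, d], F3 = [x, z] in that order, F1 starts
// batch 1; F2 overlaps batch 1's running range, so it opens batch 2; F3 does
// not overlap batch 2's running range [b, d], so it joins batch 2. The result
// is two batches: {F1} and {F2, F3}.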
Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
                                               SuperVersion* super_version) {
  Status status;
  if (atomic_replace_range_.has_value() && atomic_replace_range_->unset()) {
    // For replacing the whole CF, we can simply check whether the memtable is
    // empty
    *flush_needed = !super_version->mem->IsEmpty();
  } else {
    autovector<UserKeyRange> ranges;
    if (atomic_replace_range_.has_value()) {
      assert(!atomic_replace_range_->smallest_internal_key.unset());
      assert(!atomic_replace_range_->largest_internal_key.unset());
      // NOTE: we already checked in Prepare() that the atomic_replace_range
      // covers all the files_to_ingest
      // FIXME: need to make upper bound key exclusive (not easy here because
      // the existing internal APIs deal in inclusive upper bound user keys)
      ranges.emplace_back(
          atomic_replace_range_->smallest_internal_key.user_key(),
          atomic_replace_range_->largest_internal_key.user_key());
    } else {
      ranges.reserve(files_to_ingest_.size());
      for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
        ranges.emplace_back(file_to_ingest.start_ukey,
                            file_to_ingest.limit_ukey);
      }
    }
    status = cfd_->RangesOverlapWithMemtables(
        ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to check ranges overlap with memtables: %s",
                     status.ToString().c_str());
    }
  }
  if (status.ok() && *flush_needed) {
    if (!ingestion_options_.allow_blocking_flush) {
      status = Status::InvalidArgument("External file requires flush");
    }
    if (ucmp_->timestamp_size() > 0) {
      status = Status::InvalidArgument(
          "Column family enables user-defined timestamps, please make "
          "sure the key range (without timestamp) of external file does not "
          "overlap with key range in the memtables.");
    }
  }
  return status;
}
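// In short, NeedsFlush() reports whether any memtable overlaps the ingested
// key ranges (or, for a full-CF atomic replace, whether the mutable memtable
// is non-empty). Roughly speaking, flushing first keeps the ingested data's
// assigned sequence number ordered correctly against any overlapping keys
// still buffered in the memtables; this is a high-level reading of the checks
// above, not an additional guarantee.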
// REQUIRES: we have become the only writer by entering both write_thread_ and
// nonmem_write_thread_
Status ExternalSstFileIngestionJob::Run() {
  SuperVersion* super_version = cfd_->GetSuperVersion();
  // If the column family is flushed after Prepare and before Run, we should
  // have a specific state of Memtables. The mutable Memtable should be empty,
  // and the immutable Memtable list should be empty.
  if (flushed_before_run_ && (super_version->imm->NumNotFlushed() != 0 ||
                              !super_version->mem->IsEmpty())) {
    return Status::TryAgain(
        "Inconsistent memtable state detected when flushed before run.");
  }
  Status status;
#ifndef NDEBUG
  // We should never run the job with a memtable that is overlapping
  // with the files we are ingesting
  bool need_flush = false;
  status = NeedsFlush(&need_flush, super_version);
  if (!status.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to check if flush is needed: %s",
                   status.ToString().c_str());
    return status;
  }
  if (need_flush) {
    return Status::TryAgain("need_flush");
  }
  assert(status.ok() && need_flush == false);
#endif
  bool force_global_seqno = false;
  if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
    // We need to assign a global sequence number to all the files even
    // if they don't overlap with any ranges, since we have snapshots
    force_global_seqno = true;
  }
  // It is safe to use this instead of LastAllocatedSequence since we are
  // the only active writer, and hence they are equal
  SequenceNumber last_seqno = versions_->LastSequence();
  edit_.SetColumnFamily(cfd_->GetID());
  if (atomic_replace_range_.has_value()) {
    auto* vstorage = super_version->current->storage_info();
    if (atomic_replace_range_->unset()) {
      if (cfd_->compaction_picker()->IsCompactionInProgress()) {
        return Status::InvalidArgument(
            "Atomic replace range (full) overlaps with pending compaction");
      }
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
        for (auto file : vstorage->LevelFiles(lvl)) {
          // Set up to delete file to be replaced
          edit_.DeleteFile(lvl, file->fd.GetNumber());
        }
      }
    } else {
      assert(!atomic_replace_range_->smallest_internal_key.unset());
      assert(!atomic_replace_range_->largest_internal_key.unset());
      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
        if (cfd_->RangeOverlapWithCompaction(
                atomic_replace_range_->smallest_internal_key.user_key(),
                atomic_replace_range_->largest_internal_key.user_key(), lvl)) {
          return Status::InvalidArgument(
              "Atomic replace range overlaps with pending compaction");
        }
        for (auto file : vstorage->LevelFiles(lvl)) {
          if (file_range_checker_.Overlaps(*atomic_replace_range_,
                                           file->smallest, file->largest)) {
            if (file_range_checker_.Contains(*atomic_replace_range_,
                                             file->smallest, file->largest)) {
              // Set up to delete file to be replaced
              edit_.DeleteFile(lvl, file->fd.GetNumber());
            } else {
              // TODO: generate and ingest a tombstone file also
              return Status::InvalidArgument(
                  "Atomic replace range partially overlaps with existing file");
            }
          }
        }
      }
    }
  }
  // Find levels to ingest into
  std::optional<int> prev_batch_uppermost_level;
  // Batches at the front of file_batches_to_ingest_ contain older updates and
  // are placed in deeper levels (larger level numbers).
  for (auto& batch : file_batches_to_ingest_) {
    int batch_uppermost_level = 0;
    status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
                                     &last_seqno, &batch_uppermost_level,
                                     prev_batch_uppermost_level);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to assign levels for one batch: %s",
                     status.ToString().c_str());
      return status;
    }
    prev_batch_uppermost_level = batch_uppermost_level;
  }
  CreateEquivalentFileIngestingCompactions();
  return status;
}
Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
    FileBatchInfo& batch, SuperVersion* super_version, bool force_global_seqno,
    SequenceNumber* last_seqno, int* batch_uppermost_level,
    std::optional<int> prev_batch_uppermost_level) {
  Status status;
  assert(batch_uppermost_level);
  *batch_uppermost_level = std::numeric_limits<int>::max();
  for (IngestedFileInfo* file : batch.files) {
    assert(file);
    SequenceNumber assigned_seqno = 0;
    if (ingestion_options_.ingest_behind) {
      status = CheckLevelForIngestedBehindFile(file);
    } else {
      status = AssignLevelAndSeqnoForIngestedFile(
          super_version, force_global_seqno, cfd_->ioptions().compaction_style,
          *last_seqno, file, &assigned_seqno, prev_batch_uppermost_level);
    }
    // Modify the smallest/largest internal key to include the sequence number
    // that we just learned. Only overwrite sequence number zero. There could
    // be a nonzero sequence number already to indicate a range tombstone's
    // exclusive endpoint.
    ParsedInternalKey smallest_parsed, largest_parsed;
    if (status.ok()) {
      status = ParseInternalKey(*(file->smallest_internal_key.rep()),
                                &smallest_parsed, false /* log_err_key */);
    }
    if (status.ok()) {
      status = ParseInternalKey(*(file->largest_internal_key.rep()),
                                &largest_parsed, false /* log_err_key */);
    }
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s",
                     status.ToString().c_str());
      return status;
    }
    // If any ingested file overlaps with the DB, it will fail here.
    if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) {
      return Status::InvalidArgument(
          "An ingested file overlaps with existing data in the DB and has been "
          "assigned a non-zero sequence number, which is not allowed when "
          "'allow_db_generated_files' is enabled.");
    }
    if (smallest_parsed.sequence == 0 && assigned_seqno != 0) {
      UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno,
                        smallest_parsed.type);
    }
    if (largest_parsed.sequence == 0 && assigned_seqno != 0) {
      UpdateInternalKey(file->largest_internal_key.rep(), assigned_seqno,
                        largest_parsed.type);
    }
    status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno);
    if (!status.ok()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Failed to assign global sequence number for ingested file: %s",
          status.ToString().c_str());
      return status;
    }
    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
                             &assigned_seqno);
    assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1);
    if (assigned_seqno > *last_seqno) {
      *last_seqno = assigned_seqno;
    }
    max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno);
    status = GenerateChecksumForIngestedFile(file);
    if (!status.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "Failed to generate checksum for ingested file: %s",
                     status.ToString().c_str());
      return status;
    }
    // We use the import time as the ancestor time. This is the time the data
    // is written to the database.
    int64_t temp_current_time = 0;
    uint64_t current_time = kUnknownFileCreationTime;
    uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
    if (clock_->GetCurrentTime(&temp_current_time).ok()) {
      current_time = oldest_ancester_time =
          static_cast<uint64_t>(temp_current_time);
    }
    uint64_t tail_size = FileMetaData::CalculateTailSize(
        file->fd.GetFileSize(), file->table_properties);
    bool marked_for_compaction =
        file->table_properties.num_range_deletions == 1 &&
        (file->table_properties.num_entries ==
         file->table_properties.num_range_deletions);
    SequenceNumber smallest_seqno = file->assigned_seqno;
    SequenceNumber largest_seqno = file->assigned_seqno;
    if (ingestion_options_.allow_db_generated_files) {
      assert(file->assigned_seqno == 0);
      assert(file->smallest_seqno != kMaxSequenceNumber);
      assert(file->largest_seqno != kMaxSequenceNumber);
      smallest_seqno = file->smallest_seqno;
      largest_seqno = file->largest_seqno;
      max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno);
    }
    FileMetaData f_metadata(
        file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(),
        file->smallest_internal_key, file->largest_internal_key, smallest_seqno,
        largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber,
        oldest_ancester_time, current_time,
        ingestion_options_.ingest_behind
            ? kReservedEpochNumberForFileIngestedBehind
            : cfd_->NewEpochNumber(),  // orders files ingested to L0
        file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
        tail_size, file->user_defined_timestamps_persisted);
    f_metadata.temperature = file->file_temperature;
    f_metadata.marked_for_compaction = marked_for_compaction;
    edit_.AddFile(file->picked_level, f_metadata);
    *batch_uppermost_level =
        std::min(*batch_uppermost_level, file->picked_level);
  }
  return Status::OK();
}
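// Note on the seqno handling above: a file that does not need a global seqno
// keeps assigned_seqno == 0 and the key bounds written by SstFileWriter;
// otherwise last_seqno + 1 is recorded in the file metadata and patched into
// any key bound whose embedded sequence number was still zero. This is a
// reading of the logic above, not an additional invariant.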
void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
  // A map from output level to input of compactions equivalent to this
  // ingestion job.
  // TODO: simplify the logic below to create one compaction per ingested file
  // instead of per output level, once we figure out how to treat ingested
  // files with adjacent range deletion tombstones going to the same output
  // level in the same job as non-overlapping compactions.
  std::map<int, CompactionInputFiles>
      output_level_to_file_ingesting_compaction_input;
  for (const auto& pair : edit_.GetNewFiles()) {
    int output_level = pair.first;
    const FileMetaData& f_metadata = pair.second;
    CompactionInputFiles& input =
        output_level_to_file_ingesting_compaction_input[output_level];
    if (input.files.empty()) {
      // Treat the source level of ingested files as level 0
      input.level = 0;
    }
    compaction_input_metdatas_.push_back(new FileMetaData(f_metadata));
    input.files.push_back(compaction_input_metdatas_.back());
  }
  for (const auto& pair : output_level_to_file_ingesting_compaction_input) {
    int output_level = pair.first;
    const CompactionInputFiles& input = pair.second;
    const auto& mutable_cf_options = cfd_->GetLatestMutableCFOptions();
    file_ingesting_compactions_.push_back(new Compaction(
        cfd_->current()->storage_info(), cfd_->ioptions(), mutable_cf_options,
        mutable_db_options_, {input}, output_level,
        /* output file size limit not applicable */
        MaxFileSizeForLevel(mutable_cf_options, output_level,
                            cfd_->ioptions().compaction_style),
        LLONG_MAX /* max compaction bytes, not applicable */,
        0 /* output path ID, not applicable */, mutable_cf_options.compression,
        mutable_cf_options.compression_opts, Temperature::kUnknown,
        0 /* max_subcompaction, not applicable */,
        {} /* grandparents, not applicable */,
        std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
        CompactionReason::kExternalSstIngestion, "" /* trim_ts */,
        -1 /* score, not applicable */,
        files_overlap_ /* l0_files_might_overlap, not applicable */));
  }
}
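// These placeholder compactions are never executed. Registering them with the
// compaction picker (RegisterRange below) appears intended to mark the
// ingested key ranges and output levels as busy so that concurrent compaction
// picking does not choose conflicting files while the ingestion result is
// being installed; consult the compaction picker for the authoritative
// semantics.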
void ExternalSstFileIngestionJob::RegisterRange() {
  for (const auto& c : file_ingesting_compactions_) {
    cfd_->compaction_picker()->RegisterCompaction(c);
  }
}

void ExternalSstFileIngestionJob::UnregisterRange() {
  for (const auto& c : file_ingesting_compactions_) {
    cfd_->compaction_picker()->UnregisterCompaction(c);
    delete c;
  }
  file_ingesting_compactions_.clear();
  for (const auto& f : compaction_input_metdatas_) {
    delete f;
  }
  compaction_input_metdatas_.clear();
}

void ExternalSstFileIngestionJob::UpdateStats() {
  // Update internal stats for new ingested files
  uint64_t total_keys = 0;
  uint64_t total_l0_files = 0;
  uint64_t total_time = clock_->NowMicros() - job_start_time_;
  EventLoggerStream stream = event_logger_->Log();
  stream << "event" << "ingest_finished";
  stream << "files_ingested";
  stream.StartArray();
  for (IngestedFileInfo& f : files_to_ingest_) {
    InternalStats::CompactionStats stats(
        CompactionReason::kExternalSstIngestion, 1);
    stats.micros = total_time;
    // If actual copy occurred for this file, then we need to count the file
    // size as the actual bytes written. If the file was linked, then we ignore
    // the bytes written for file metadata.
    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
    if (f.copy_file) {
      stats.bytes_written = f.fd.GetFileSize();
    } else {
      stats.bytes_moved = f.fd.GetFileSize();
    }
    stats.num_output_files = 1;
    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
                                               Env::Priority::USER, stats);
    cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
                                       f.fd.GetFileSize());
    total_keys += f.num_entries;
    if (f.picked_level == 0) {
      total_l0_files += 1;
    }
    ROCKS_LOG_INFO(
        db_options_.info_log,
        "[AddFile] External SST file %s was ingested in L%d with path %s "
        "(global_seqno=%" PRIu64 ")\n",
        f.external_file_path.c_str(), f.picked_level,
        f.internal_file_path.c_str(), f.assigned_seqno);
    stream << "file" << f.internal_file_path << "level" << f.picked_level;
  }
  stream.EndArray();
  stream << "lsm_state";
  stream.StartArray();
  auto vstorage = cfd_->current()->storage_info();
  for (int level = 0; level < vstorage->num_levels(); ++level) {
    stream << vstorage->NumLevelFiles(level);
  }
  stream.EndArray();
  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
                                     total_keys);
  cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
                                     files_to_ingest_.size());
  cfd_->internal_stats()->AddCFStats(
      InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
}
void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
  IOOptions io_opts;
  if (!status.ok()) {
    // We failed to add the files to the database;
    // remove all the files we copied.
    DeleteInternalFiles();
    files_overlap_ = false;
  } else if (status.ok() && ingestion_options_.move_files) {
    // The files were moved and added successfully, remove original file links
    for (IngestedFileInfo& f : files_to_ingest_) {
      Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
      if (!s.ok()) {
        ROCKS_LOG_WARN(
            db_options_.info_log,
            "%s was added to DB successfully but failed to remove original "
            "file link : %s",
            f.external_file_path.c_str(), s.ToString().c_str());
      }
    }
  }
}

void ExternalSstFileIngestionJob::DeleteInternalFiles() {
  IOOptions io_opts;
  for (IngestedFileInfo& f : files_to_ingest_) {
    if (f.internal_file_path.empty()) {
      continue;
    }
    Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
    if (!s.ok()) {
      ROCKS_LOG_WARN(db_options_.info_log,
                     "AddFile() clean up for file %s failed : %s",
                     f.internal_file_path.c_str(), s.ToString().c_str());
    }
  }
}
Status ExternalSstFileIngestionJob::ResetTableReader(
    const std::string& external_file, uint64_t new_file_number,
    bool user_defined_timestamps_persisted, SuperVersion* sv,
    IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  std::unique_ptr<FSRandomAccessFile> sst_file;
  FileOptions fo{env_options_};
  fo.temperature = file_to_ingest->file_temperature;
  Status status =
      fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
  if (!status.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "Failed to create random access file for external file %s: %s",
        external_file.c_str(), status.ToString().c_str());
    return status;
  }
  Temperature updated_temp = sst_file->GetTemperature();
  if (updated_temp != Temperature::kUnknown &&
      updated_temp != file_to_ingest->file_temperature) {
    // The hint was missing or wrong. Track temperature reported by storage.
    file_to_ingest->file_temperature = updated_temp;
  }
  std::unique_ptr<RandomAccessFileReader> sst_file_reader(
      new RandomAccessFileReader(std::move(sst_file), external_file,
                                 nullptr /*Env*/, io_tracer_));
  table_reader->reset();
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  status = sv->mutable_cf_options.table_factory->NewTableReader(
      ro,
      TableReaderOptions(
          cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
          sv->mutable_cf_options.compression_manager.get(), env_options_,
          cfd_->internal_comparator(),
          sv->mutable_cf_options.block_protection_bytes_per_key,
          /*skip_filters*/ false, /*immortal*/ false,
          /*force_direct_prefetch*/ false, /*level*/ -1,
          /*block_cache_tracer*/ nullptr,
          /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
          /*cur_file_num*/ new_file_number,
          /* unique_id */ {}, /* largest_seqno */ 0,
          /* tail_size */ 0, user_defined_timestamps_persisted),
      std::move(sst_file_reader), file_to_ingest->file_size, table_reader,
      // No need to prefetch index/filter if caching is not needed.
      /*prefetch_index_and_filter_in_cache=*/ingestion_options_.fill_cache);
  return status;
}
Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
    const std::string& external_file, uint64_t new_file_number,
    SuperVersion* sv, IngestedFileInfo* file_to_ingest,
    std::unique_ptr<TableReader>* table_reader) {
  // Get the external file properties
  auto props = table_reader->get()->GetTableProperties();
  assert(props.get());
  const auto& uprops = props->user_collected_properties;
  // Get table version
  auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
  if (version_iter == uprops.end()) {
    assert(!SstFileWriter::CreatedBySstFileWriter(*props));
    if (!ingestion_options_.allow_db_generated_files) {
      return Status::Corruption("External file version not found");
    } else {
      // 0 is a special version for when a file from a live DB does not have
      // the version table property
      file_to_ingest->version = 0;
    }
  } else {
    assert(SstFileWriter::CreatedBySstFileWriter(*props));
    file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
  }
  auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
  if (file_to_ingest->version == 2) {
    // Version 2 implies that we have a global sequence number
    if (seqno_iter == uprops.end()) {
      return Status::Corruption(
          "External file global sequence number not found");
    }
    // Set the global sequence number
    file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
    if (props->external_sst_file_global_seqno_offset == 0) {
      file_to_ingest->global_seqno_offset = 0;
      return Status::Corruption("Was not able to find file global seqno field");
    }
    file_to_ingest->global_seqno_offset =
        static_cast<size_t>(props->external_sst_file_global_seqno_offset);
  } else if (file_to_ingest->version == 1) {
    // SST file V1 should not have the global seqno field
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    if (ingestion_options_.allow_blocking_flush ||
        ingestion_options_.allow_global_seqno) {
      return Status::InvalidArgument(
          "External SST file V1 does not support global seqno");
    }
  } else if (file_to_ingest->version == 0) {
    // allow_db_generated_files is true
    assert(seqno_iter == uprops.end());
    file_to_ingest->original_seqno = 0;
    file_to_ingest->global_seqno_offset = 0;
  } else {
    return Status::InvalidArgument("External file version " +
                                   std::to_string(file_to_ingest->version) +
                                   " is not supported");
  }
  file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
  // This assignment works fine even though `table_reader` may later be reset,
  // since that will not affect how table properties are parsed, and this
  // assignment is making a copy.
  file_to_ingest->table_properties = *props;
  // Get number of entries in table
  file_to_ingest->num_entries = props->num_entries;
  file_to_ingest->num_range_deletions = props->num_range_deletions;
  // Validate table properties related to comparator name and user defined
  // timestamps persisted flag.
  file_to_ingest->user_defined_timestamps_persisted =
      static_cast<bool>(props->user_defined_timestamps_persisted);
  bool mark_sst_file_has_no_udt = false;
  Status s = ValidateUserDefinedTimestampsOptions(
      cfd_->user_comparator(), props->comparator_name,
      cfd_->ioptions().persist_user_defined_timestamps,
      file_to_ingest->user_defined_timestamps_persisted,
      &mark_sst_file_has_no_udt);
  if (s.ok() && mark_sst_file_has_no_udt) {
    // A column family that enables the user-defined timestamps in Memtable
    // only feature can also ingest external files created with user-defined
    // timestamps disabled. In that case, we need to re-mark the
    // user_defined_timestamps_persisted flag for the file.
    file_to_ingest->user_defined_timestamps_persisted = false;
  } else if (!s.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log,
        "ValidateUserDefinedTimestampsOptions failed for external file %s: %s",
        external_file.c_str(), s.ToString().c_str());
    return s;
  }
  // `TableReader` is initialized with the `user_defined_timestamps_persisted`
  // flag set to true. If its value changed to false after this sanity check,
  // we need to reset the `TableReader`.
  if (ucmp_->timestamp_size() > 0 &&
      !file_to_ingest->user_defined_timestamps_persisted) {
    s = ResetTableReader(external_file, new_file_number,
                         file_to_ingest->user_defined_timestamps_persisted, sv,
                         file_to_ingest, table_reader);
  }
  return s;
}
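// Version semantics as implemented above: version 2 (current SstFileWriter
// output) must carry the global seqno property and a nonzero seqno offset;
// version 1 predates global seqnos and is rejected unless both
// allow_blocking_flush and allow_global_seqno are disabled; version 0 is the
// pseudo-version used for DB-generated files ingested with
// allow_db_generated_files.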
  959. Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
  960. const std::string& external_file, uint64_t new_file_number,
  961. IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
  962. file_to_ingest->external_file_path = external_file;
  963. // Get external file size
  964. Status status = fs_->GetFileSize(external_file, IOOptions(),
  965. &file_to_ingest->file_size, nullptr);
  966. if (!status.ok()) {
  967. ROCKS_LOG_WARN(db_options_.info_log,
  968. "Failed to get file size for external file %s: %s",
  969. external_file.c_str(), status.ToString().c_str());
  970. return status;
  971. }
  972. // Assign FD with number
  973. file_to_ingest->fd =
  974. FileDescriptor(new_file_number, 0, file_to_ingest->file_size);
  975. // Create TableReader for external file
  976. std::unique_ptr<TableReader> table_reader;
  977. // Initially create the `TableReader` with flag
  978. // `user_defined_timestamps_persisted` to be true since that's the most common
  979. // case
  980. status = ResetTableReader(external_file, new_file_number,
  981. /*user_defined_timestamps_persisted=*/true, sv,
  982. file_to_ingest, &table_reader);
  983. if (!status.ok()) {
  984. ROCKS_LOG_WARN(db_options_.info_log,
  985. "Failed to reset table reader for external file %s: %s",
  986. external_file.c_str(), status.ToString().c_str());
  987. return status;
  988. }
  989. status = SanityCheckTableProperties(external_file, new_file_number, sv,
  990. file_to_ingest, &table_reader);
  991. if (!status.ok()) {
  992. ROCKS_LOG_WARN(
  993. db_options_.info_log,
  994. "Failed to sanity check table properties for external file %s: %s",
  995. external_file.c_str(), status.ToString().c_str());
  996. return status;
  997. }
  998. const bool allow_data_in_errors = db_options_.allow_data_in_errors;
  999. ParsedInternalKey key;
  1000. if (ingestion_options_.allow_db_generated_files) {
  1001. // We are ingesting a DB generated SST file for which we don't reassign
  1002. // sequence numbers. We need its smallest sequence number and largest
  1003. // sequence number for FileMetaData.
  1004. Status seqno_status = GetSeqnoBoundaryForFile(
  1005. table_reader.get(), sv, file_to_ingest, allow_data_in_errors);
  1006. if (!seqno_status.ok()) {
  1007. ROCKS_LOG_WARN(
  1008. db_options_.info_log,
  1009. "Failed to get sequence number boundary for external file %s: %s",
  1010. external_file.c_str(), seqno_status.ToString().c_str());
  1011. return seqno_status;
  1012. }
  1013. assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
  1014. assert(file_to_ingest->largest_seqno < kMaxSequenceNumber);
  1015. } else {
  1016. SequenceNumber largest_seqno =
  1017. table_reader.get()->GetTableProperties()->key_largest_seqno;
  1018. // UINT64_MAX means unknown and the file is generated before table property
  1019. // `key_largest_seqno` is introduced.
  1020. if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
  1021. return Status::Corruption(
  1022. "External file has non zero largest sequence number " +
  1023. std::to_string(largest_seqno));
  1024. }
  1025. }
  1026. if (ingestion_options_.verify_checksums_before_ingest) {
  1027. // If customized readahead size is needed, we can pass a user option
  1028. // all the way to here. Right now we just rely on the default readahead
  1029. // to keep things simple.
  1030. // TODO: plumb Env::IOActivity, Env::IOPriority
  1031. ReadOptions ro;
  1032. ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
  1033. ro.fill_cache = ingestion_options_.fill_cache;
  1034. status = table_reader->VerifyChecksum(
  1035. ro, TableReaderCaller::kExternalSSTIngestion);
  1036. if (!status.ok()) {
  1037. ROCKS_LOG_WARN(db_options_.info_log,
  1038. "Failed to verify checksum for table reader: %s",
  1039. status.ToString().c_str());
  1040. return status;
  1041. }
  1042. }
  1043. // TODO: plumb Env::IOActivity, Env::IOPriority
  1044. ReadOptions ro;
  1045. ro.fill_cache = ingestion_options_.fill_cache;
  1046. std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
  1047. ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
  1048. /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
  1049. // Get first (smallest) and last (largest) key from file.
  1050. iter->SeekToFirst();
  1051. if (iter->Valid()) {
  1052. Status pik_status =
  1053. ParseInternalKey(iter->key(), &key, allow_data_in_errors);
  1054. if (!pik_status.ok()) {
  1055. return Status::Corruption("Corrupted key in external file. ",
  1056. pik_status.getState());
  1057. }
  1058. if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
  1059. return Status::Corruption("External file has non zero sequence number");
  1060. }
  1061. file_to_ingest->smallest_internal_key.SetFrom(key);
  1062. Slice largest;
  1063. if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") ==
  1064. 0) {
  1065. // PlainTable iterator does not support SeekToLast().
  1066. largest = iter->key();
  1067. for (; iter->Valid(); iter->Next()) {
  1068. if (cfd_->internal_comparator().Compare(iter->key(), largest) > 0) {
  1069. largest = iter->key();
  1070. }
  1071. }
  1072. if (!iter->status().ok()) {
  1073. return iter->status();
  1074. }
  1075. } else {
  1076. iter->SeekToLast();
  1077. if (!iter->Valid()) {
  1078. if (iter->status().ok()) {
  1079. // The file contains at least 1 key since iter is valid after
  1080. // SeekToFirst().
  1081. return Status::Corruption("Can not find largest key in sst file");
  1082. } else {
  1083. return iter->status();
  1084. }
  1085. }
  1086. largest = iter->key();
  1087. }
  1088. pik_status = ParseInternalKey(largest, &key, allow_data_in_errors);
  1089. if (!pik_status.ok()) {
  1090. return Status::Corruption("Corrupted key in external file. ",
  1091. pik_status.getState());
  1092. }
  1093. if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
  1094. return Status::Corruption("External file has non zero sequence number");
  1095. }
  1096. file_to_ingest->largest_internal_key.SetFrom(key);
  1097. } else if (!iter->status().ok()) {
  1098. return iter->status();
  1099. }
  1100. std::unique_ptr<InternalIterator> range_del_iter(
  1101. table_reader->NewRangeTombstoneIterator(ro));
  1102. // We may need to adjust these key bounds, depending on whether any range
  1103. // deletion tombstones extend past them.
  1104. if (range_del_iter != nullptr) {
  1105. for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
  1106. range_del_iter->Next()) {
  1107. Status pik_status =
  1108. ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
  1109. if (!pik_status.ok()) {
  1110. return Status::Corruption("Corrupted key in external file. ",
  1111. pik_status.getState());
  1112. }
  1113. if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
  1114. return Status::Corruption(
  1115. "External file has a range deletion with non zero sequence "
  1116. "number.");
  1117. }
  1118. RangeTombstone tombstone(key, range_del_iter->value());
  1119. file_range_checker_.MaybeUpdateRange(tombstone.SerializeKey(),
  1120. tombstone.SerializeEndKey(),
  1121. file_to_ingest);
  1122. }
  1123. }
  const size_t ts_sz = ucmp_->timestamp_size();
  Slice smallest = file_to_ingest->smallest_internal_key.user_key();
  Slice largest = file_to_ingest->largest_internal_key.user_key();
  if (ts_sz > 0) {
    AppendUserKeyWithMaxTimestamp(&file_to_ingest->start_ukey, smallest, ts_sz);
    AppendUserKeyWithMinTimestamp(&file_to_ingest->limit_ukey, largest, ts_sz);
  } else {
    file_to_ingest->start_ukey.assign(smallest.data(), smallest.size());
    file_to_ingest->limit_ukey.assign(largest.data(), largest.size());
  }

  auto s =
      GetSstInternalUniqueId(file_to_ingest->table_properties.db_id,
                             file_to_ingest->table_properties.db_session_id,
                             file_to_ingest->table_properties.orig_file_number,
                             &(file_to_ingest->unique_id));
  if (!s.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
                   "Failed to get SST unique id for file %s",
                   file_to_ingest->internal_file_path.c_str());
    file_to_ingest->unique_id = kNullUniqueId64x2;
  }

  return status;
}
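
// Picks the level the file will be ingested into and decides whether it needs
// a fresh global sequence number. The file is placed in the lowest level where
// it neither overlaps existing keys nor an ongoing compaction's output range;
// if it overlaps live data (or must be placed in L0), it is assigned
// last_seqno + 1 so its entries shadow the older ones.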
Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
    SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
    SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
    SequenceNumber* assigned_seqno,
    std::optional<int> prev_batch_uppermost_level) {
  Status status;
  *assigned_seqno = 0;
  const size_t ts_sz = ucmp_->timestamp_size();
  assert(!prev_batch_uppermost_level.has_value() ||
         prev_batch_uppermost_level.value() < cfd_->NumberLevels());
  bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() &&
                            prev_batch_uppermost_level.value() == 0) ||
                           compaction_style == kCompactionStyleFIFO;
  if (force_global_seqno || (!ingestion_options_.allow_db_generated_files &&
                             (files_overlap_ || must_assign_to_l0))) {
    *assigned_seqno = last_seqno + 1;
    if (must_assign_to_l0) {
      assert(ts_sz == 0);
      file_to_ingest->picked_level = 0;
      if (ingestion_options_.fail_if_not_bottommost_level &&
          cfd_->NumberLevels() > 1) {
        status = Status::TryAgain(
            "Files cannot be ingested to Lmax. Please make sure key range of "
            "Lmax does not overlap with files to ingest.");
      }
      return status;
    }
  }

  bool overlap_with_db = false;
  Arena arena;
  // TODO: plumb Env::IOActivity, Env::IOPriority
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  ro.total_order_seek = true;
  int target_level = 0;
  auto* vstorage = cfd_->current()->storage_info();
  assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files);
  int assigned_level_exclusive_end = cfd_->NumberLevels();
  if (must_assign_to_l0) {
    assigned_level_exclusive_end = 0;
  } else if (prev_batch_uppermost_level.has_value()) {
    assigned_level_exclusive_end = prev_batch_uppermost_level.value();
  }
  // When ingesting db generated files, we require that ingested files do not
  // overlap with any file in the DB. So we need to check all levels.
  int overlap_checking_exclusive_end =
      ingestion_options_.allow_db_generated_files
          ? cfd_->NumberLevels()
          : assigned_level_exclusive_end;
  for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) {
    if (lvl > 0 && lvl < vstorage->base_level()) {
      continue;
    }
    if (lvl < assigned_level_exclusive_end &&
        atomic_replace_range_.has_value()) {
      target_level = lvl;
      continue;
    }
    if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
                                         file_to_ingest->limit_ukey, lvl)) {
      // We must use L0 or any level higher than `lvl` to be able to overwrite
      // the compaction output keys that we overlap with in this level. We
      // also need to assign this file a seqno to overwrite the compaction
      // output keys in level `lvl`.
      overlap_with_db = true;
      break;
    } else if (vstorage->NumLevelFiles(lvl) > 0) {
      bool overlap_with_level = false;
      status = sv->current->OverlapWithLevelIterator(
          ro, env_options_, file_to_ingest->start_ukey,
          file_to_ingest->limit_ukey, lvl, &overlap_with_level);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to check overlap with level iterator: %s",
                       status.ToString().c_str());
        return status;
      }
      if (overlap_with_level) {
        // We must use L0 or any level higher than `lvl` to be able to
        // overwrite the keys that we overlap with in this level. We also need
        // to assign this file a seqno to overwrite the existing keys in level
        // `lvl`.
        overlap_with_db = true;
        break;
      }
    }
    // We don't overlap with any keys in this level, but we still need to
    // check if our file can fit in it.
    if (lvl < assigned_level_exclusive_end &&
        IngestedFileFitInLevel(file_to_ingest, lvl)) {
      target_level = lvl;
    }
  }

  if (ingestion_options_.fail_if_not_bottommost_level &&
      target_level < cfd_->NumberLevels() - 1) {
    status = Status::TryAgain(
        "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
        "and ongoing compaction's output to Lmax does not overlap with files "
        "to ingest. Input files overlapping with each other can cause some "
        "file to be assigned to non Lmax level.");
    return status;
  }

  TEST_SYNC_POINT_CALLBACK(
      "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
      &overlap_with_db);
  file_to_ingest->picked_level = target_level;
  if (overlap_with_db) {
    if (ts_sz > 0) {
      status = Status::InvalidArgument(
          "Column family enables user-defined timestamps, please make sure "
          "the key range (without timestamp) of external file does not "
          "overlap with key range (without timestamp) in the db");
      return status;
    }
    if (*assigned_seqno == 0) {
      *assigned_seqno = last_seqno + 1;
    }
  }
  return status;
}
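
// For ingest_behind, the file must fit entirely in the bottommost level, and
// the ingestion is rejected if any upper level already holds a file whose
// smallest seqno is 0, since such entries would be indistinguishable in age
// from the ingested (seqno 0) data.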
Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
    IngestedFileInfo* file_to_ingest) {
  assert(!atomic_replace_range_.has_value());
  auto* vstorage = cfd_->current()->storage_info();
  // First, check if new files fit in the last level
  int last_lvl = cfd_->NumberLevels() - 1;
  if (!IngestedFileFitInLevel(file_to_ingest, last_lvl)) {
    return Status::InvalidArgument(
        "Can't ingest_behind file as it doesn't fit "
        "at the last level!");
  }

  // Second, check if despite cf_allow_ingest_behind=true we still have 0
  // seqnums at some upper level
  for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
    for (auto file : vstorage->LevelFiles(lvl)) {
      if (file->fd.smallest_seqno == 0) {
        return Status::InvalidArgument(
            "Can't ingest_behind file as despite cf_allow_ingest_behind=true "
            "there are files with 0 seqno in database at upper levels!");
      }
    }
  }

  file_to_ingest->picked_level = last_lvl;
  return Status::OK();
}
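
// Applies the chosen global sequence number: a no-op if the file already
// carries it, otherwise it is recorded in IngestedFileInfo and, when
// write_global_seqno is set, written into the file's global_seqno property
// slot and synced to disk.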
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
    IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
  if (ingestion_options_.allow_db_generated_files) {
    assert(seqno == 0);
    assert(file_to_ingest->original_seqno == 0);
  }
  if (file_to_ingest->original_seqno == seqno) {
    // This file already has the correct global seqno.
    return Status::OK();
  } else if (!ingestion_options_.allow_global_seqno) {
    return Status::InvalidArgument("Global seqno is required, but disabled");
  } else if (ingestion_options_.write_global_seqno &&
             file_to_ingest->global_seqno_offset == 0) {
    return Status::InvalidArgument(
        "Trying to set global seqno for a file that don't have a global seqno "
        "field");
  }

  if (ingestion_options_.write_global_seqno) {
    // Determine if we can write global_seqno to a given offset of file.
    // If the file system does not support random write, then we should not.
    // Otherwise we should.
    std::unique_ptr<FSRandomRWFile> rwfile;
    Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
                                         env_options_, &rwfile, nullptr);
    TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
                             &status);
    if (status.ok()) {
      FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_,
                              file_to_ingest->internal_file_path);
      std::string seqno_val;
      PutFixed64(&seqno_val, seqno);
      status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
                            IOOptions(), nullptr);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Failed to write global seqno to %s: %s",
                       file_to_ingest->internal_file_path.c_str(),
                       status.ToString().c_str());
        return status;
      }
      if (status.ok()) {
        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
        status = SyncIngestedFile(fsptr.get());
        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
        if (!status.ok()) {
          ROCKS_LOG_WARN(db_options_.info_log,
                         "Failed to sync ingested file %s after writing global "
                         "sequence number: %s",
                         file_to_ingest->internal_file_path.c_str(),
                         status.ToString().c_str());
        }
      }
      if (!status.ok()) {
        return status;
      }
    } else if (!status.IsNotSupported()) {
      ROCKS_LOG_WARN(
          db_options_.info_log,
          "Failed to open ingested file %s for random read/write: %s",
          file_to_ingest->internal_file_path.c_str(),
          status.ToString().c_str());
      return status;
    }
  }

  file_to_ingest->assigned_seqno = seqno;
  return Status::OK();
}
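
// Regenerates the file checksum after the global sequence number has been
// rewritten in place (write_global_seqno == true), so that the recorded
// checksum reflects the file's final on-disk contents.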
IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
    IngestedFileInfo* file_to_ingest) {
  if (db_options_.file_checksum_gen_factory == nullptr ||
      need_generate_file_checksum_ == false ||
      ingestion_options_.write_global_seqno == false) {
    // If file_checksum_gen_factory is not set, we are not able to generate
    // the checksum. If write_global_seqno is false, we will use the file
    // checksum generated during Prepare(). This step will be skipped.
    return IOStatus::OK();
  }
  std::string file_checksum;
  std::string file_checksum_func_name;
  std::string requested_checksum_func_name;
  // TODO: rate limit file reads for checksum calculation during file
  // ingestion.
  // TODO: plumb Env::IOActivity
  ReadOptions ro;
  IOStatus io_s = GenerateOneFileChecksum(
      fs_.get(), file_to_ingest->internal_file_path,
      db_options_.file_checksum_gen_factory.get(),
      requested_checksum_func_name, &file_checksum, &file_checksum_func_name,
      ingestion_options_.verify_checksums_readahead_size,
      db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
      ro, db_options_.stats, db_options_.clock);
  if (!io_s.ok()) {
    ROCKS_LOG_WARN(
        db_options_.info_log, "Failed to generate checksum for %s: %s",
        file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str());
    return io_s;
  }
  file_to_ingest->file_checksum = std::move(file_checksum);
  file_to_ingest->file_checksum_func_name = std::move(file_checksum_func_name);
  return IOStatus::OK();
}
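
// A file "fits" in a level if its user-key range does not overlap any file
// already present in that level; L0 accepts anything because its files are
// allowed to overlap.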
bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
    const IngestedFileInfo* file_to_ingest, int level) {
  if (level == 0) {
    // Files can always fit in L0
    return true;
  }
  auto* vstorage = cfd_->current()->storage_info();
  Slice file_smallest_user_key(file_to_ingest->start_ukey);
  Slice file_largest_user_key(file_to_ingest->limit_ukey);
  if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
                               &file_largest_user_key)) {
    // The file overlaps with other files in this level; we cannot
    // add it to this level.
    return false;
  }
  // The file does not overlap with any files in this level.
  return true;
}
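
// Syncs an ingested file that was modified in place (e.g. after the global
// seqno rewrite), using Fsync() when DBOptions::use_fsync is set and Sync()
// otherwise.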
template <typename TWritableFile>
Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
  assert(file != nullptr);
  if (db_options_.use_fsync) {
    return file->Fsync(IOOptions(), nullptr);
  } else {
    return file->Sync(IOOptions(), nullptr);
  }
}
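
// Determines the smallest and largest sequence numbers contained in the file,
// preferring the key_smallest_seqno / key_largest_seqno table properties and
// falling back to a full scan of point keys and range tombstones when those
// properties are missing (e.g. files written by older RocksDB versions).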
Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile(
    TableReader* table_reader, SuperVersion* sv,
    IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) {
  const auto tp = table_reader->GetTableProperties();
  const bool has_largest_seqno = tp->HasKeyLargestSeqno();
  SequenceNumber largest_seqno = tp->key_largest_seqno;
  if (has_largest_seqno) {
    file_to_ingest->largest_seqno = largest_seqno;
    if (largest_seqno == 0) {
      file_to_ingest->smallest_seqno = 0;
      return Status::OK();
    }
    if (tp->HasKeySmallestSeqno()) {
      file_to_ingest->smallest_seqno = tp->key_smallest_seqno;
      return Status::OK();
    }
  }

  // Older SST files may not record seqno boundaries in table properties, so
  // we scan the file to find them.
  TEST_SYNC_POINT(
      "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan");
  SequenceNumber smallest_seqno = kMaxSequenceNumber;
  SequenceNumber largest_seqno_from_iter = 0;
  ReadOptions ro;
  ro.fill_cache = ingestion_options_.fill_cache;
  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
  ParsedInternalKey key;
  iter->SeekToFirst();
  while (iter->Valid()) {
    Status pik_status =
        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
    if (!pik_status.ok()) {
      return Status::Corruption("Corrupted key in external file. ",
                                pik_status.getState());
    }
    smallest_seqno = std::min(smallest_seqno, key.sequence);
    largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence);
    iter->Next();
  }
  if (!iter->status().ok()) {
    return iter->status();
  }

  if (table_reader->GetTableProperties()->num_range_deletions > 0) {
    std::unique_ptr<InternalIterator> range_del_iter(
        table_reader->NewRangeTombstoneIterator(ro));
    if (range_del_iter != nullptr) {
      for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
           range_del_iter->Next()) {
        Status pik_status =
            ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
        if (!pik_status.ok()) {
          return Status::Corruption("Corrupted key in external file. ",
                                    pik_status.getState());
        }
        smallest_seqno = std::min(smallest_seqno, key.sequence);
        largest_seqno_from_iter =
            std::max(largest_seqno_from_iter, key.sequence);
      }
      if (!range_del_iter->status().ok()) {
        return range_del_iter->status();
      }
    }
  }

  file_to_ingest->smallest_seqno = smallest_seqno;
  if (!has_largest_seqno) {
    file_to_ingest->largest_seqno = largest_seqno_from_iter;
  } else {
    assert(largest_seqno == largest_seqno_from_iter);
    file_to_ingest->largest_seqno = largest_seqno;
  }
  if (file_to_ingest->largest_seqno == kMaxSequenceNumber) {
    return Status::InvalidArgument(
        "Unknown largest seqno for db generated file.");
  }
  if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) {
    return Status::InvalidArgument(
        "Unknown smallest seqno for db generated file.");
  }
  return Status::OK();
}

}  // namespace ROCKSDB_NAMESPACE