db_impl_readonly.cc 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "db/db_impl/db_impl_readonly.h"
  6. #include "db/arena_wrapped_db_iter.h"
  7. #include "db/db_impl/compacted_db_impl.h"
  8. #include "db/db_impl/db_impl.h"
  9. #include "db/manifest_ops.h"
  10. #include "db/merge_context.h"
  11. #include "logging/logging.h"
  12. #include "monitoring/perf_context_imp.h"
  13. #include "util/cast_util.h"
  14. namespace ROCKSDB_NAMESPACE {
  15. DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
  16. const std::string& dbname)
  17. : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
  18. /*batch_per_txn*/ true, /*read_only*/ true) {
  19. ROCKS_LOG_INFO(immutable_db_options_.info_log,
  20. "Opening the db in read only mode");
  21. LogFlush(immutable_db_options_.info_log);
  22. }
  23. DBImplReadOnly::~DBImplReadOnly() = default;
  24. // Implementations of the DB interface
  25. Status DBImplReadOnly::GetImpl(const ReadOptions& read_options,
  26. const Slice& key,
  27. GetImplOptions& get_impl_options) {
  28. assert(get_impl_options.value != nullptr ||
  29. get_impl_options.columns != nullptr ||
  30. get_impl_options.merge_operands != nullptr);
  31. assert(get_impl_options.column_family);
  32. Status s;
  33. if (read_options.timestamp) {
  34. s = FailIfTsMismatchCf(get_impl_options.column_family,
  35. *(read_options.timestamp));
  36. if (!s.ok()) {
  37. return s;
  38. }
  39. } else {
  40. s = FailIfCfHasTs(get_impl_options.column_family);
  41. if (!s.ok()) {
  42. return s;
  43. }
  44. }
  45. // Clear the timestamps for returning results so that we can distinguish
  46. // between tombstone or key that has never been written
  47. if (get_impl_options.timestamp) {
  48. get_impl_options.timestamp->clear();
  49. }
  50. PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
  51. StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
  52. PERF_TIMER_GUARD(get_snapshot_time);
  53. const Comparator* ucmp = get_impl_options.column_family->GetComparator();
  54. assert(ucmp);
  55. std::string* ts =
  56. ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
  57. SequenceNumber snapshot = versions_->LastSequence();
  58. GetWithTimestampReadCallback read_cb(snapshot);
  59. auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
  60. get_impl_options.column_family);
  61. auto cfd = cfh->cfd();
  62. if (tracer_) {
  63. InstrumentedMutexLock lock(&trace_mutex_);
  64. if (tracer_) {
  65. tracer_->Get(get_impl_options.column_family, key);
  66. }
  67. }
  68. // In read-only mode Get(), no super version operation is needed (i.e.
  69. // GetAndRefSuperVersion and ReturnAndCleanupSuperVersion)
  70. SuperVersion* super_version = cfd->GetSuperVersion();
  71. if (read_options.timestamp && read_options.timestamp->size() > 0) {
  72. s = FailIfReadCollapsedHistory(cfd, super_version,
  73. *(read_options.timestamp));
  74. if (!s.ok()) {
  75. return s;
  76. }
  77. }
  78. // Prepare to store a list of merge operations if merge occurs.
  79. MergeContext merge_context;
  80. // TODO - Large Result Optimization for Read Only DB
  81. // (https://github.com/facebook/rocksdb/pull/10458)
  82. SequenceNumber max_covering_tombstone_seq = 0;
  83. LookupKey lkey(key, snapshot, read_options.timestamp);
  84. PERF_TIMER_STOP(get_snapshot_time);
  85. // Look up starts here
  86. if (super_version->mem->Get(
  87. lkey,
  88. get_impl_options.value ? get_impl_options.value->GetSelf() : nullptr,
  89. get_impl_options.columns, ts, &s, &merge_context,
  90. &max_covering_tombstone_seq, read_options,
  91. false /* immutable_memtable */, &read_cb,
  92. /*is_blob_index=*/nullptr, /*do_merge=*/get_impl_options.get_value)) {
  93. if (get_impl_options.value) {
  94. get_impl_options.value->PinSelf();
  95. }
  96. RecordTick(stats_, MEMTABLE_HIT);
  97. } else {
  98. PERF_TIMER_GUARD(get_from_output_files_time);
  99. PinnedIteratorsManager pinned_iters_mgr;
  100. super_version->current->Get(
  101. read_options, lkey, get_impl_options.value, get_impl_options.columns,
  102. ts, &s, &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
  103. /*value_found*/ nullptr,
  104. /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
  105. /*is_blob*/ nullptr,
  106. /*do_merge=*/get_impl_options.get_value);
  107. RecordTick(stats_, MEMTABLE_MISS);
  108. }
  109. {
  110. RecordTick(stats_, NUMBER_KEYS_READ);
  111. size_t size = 0;
  112. if (get_impl_options.value) {
  113. size = get_impl_options.value->size();
  114. } else if (get_impl_options.columns) {
  115. size = get_impl_options.columns->serialized_size();
  116. } else if (get_impl_options.merge_operands) {
  117. *get_impl_options.number_of_operands =
  118. static_cast<int>(merge_context.GetNumOperands());
  119. for (const Slice& sl : merge_context.GetOperands()) {
  120. size += sl.size();
  121. get_impl_options.merge_operands->PinSelf(sl);
  122. get_impl_options.merge_operands++;
  123. }
  124. }
  125. RecordTick(stats_, BYTES_READ, size);
  126. RecordInHistogram(stats_, BYTES_PER_READ, size);
  127. PERF_COUNTER_ADD(get_read_bytes, size);
  128. }
  129. return s;
  130. }
  131. Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options,
  132. ColumnFamilyHandle* column_family) {
  133. if (_read_options.io_activity != Env::IOActivity::kUnknown &&
  134. _read_options.io_activity != Env::IOActivity::kDBIterator) {
  135. return NewErrorIterator(Status::InvalidArgument(
  136. "Can only call NewIterator with `ReadOptions::io_activity` is "
  137. "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`"));
  138. }
  139. ReadOptions read_options(_read_options);
  140. if (read_options.io_activity == Env::IOActivity::kUnknown) {
  141. read_options.io_activity = Env::IOActivity::kDBIterator;
  142. }
  143. assert(column_family);
  144. if (read_options.timestamp) {
  145. const Status s =
  146. FailIfTsMismatchCf(column_family, *(read_options.timestamp));
  147. if (!s.ok()) {
  148. return NewErrorIterator(s);
  149. }
  150. } else {
  151. const Status s = FailIfCfHasTs(column_family);
  152. if (!s.ok()) {
  153. return NewErrorIterator(s);
  154. }
  155. }
  156. auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
  157. auto cfd = cfh->cfd();
  158. SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
  159. if (read_options.timestamp && read_options.timestamp->size() > 0) {
  160. const Status s = FailIfReadCollapsedHistory(cfd, super_version,
  161. *(read_options.timestamp));
  162. if (!s.ok()) {
  163. cfd->GetSuperVersion()->Unref();
  164. return NewErrorIterator(s);
  165. }
  166. }
  167. SequenceNumber latest_snapshot = versions_->LastSequence();
  168. SequenceNumber read_seq =
  169. read_options.snapshot != nullptr
  170. ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
  171. : latest_snapshot;
  172. ReadCallback* read_callback = nullptr; // No read callback provided.
  173. return NewArenaWrappedDbIterator(
  174. env_, read_options, cfh, super_version, read_seq, read_callback, this,
  175. /*expose_blob_index=*/false, /*allow_refresh=*/false,
  176. /*allow_mark_memtable_for_flush=*/false);
  177. }
  178. Status DBImplReadOnly::NewIterators(
  179. const ReadOptions& read_options,
  180. const std::vector<ColumnFamilyHandle*>& column_families,
  181. std::vector<Iterator*>* iterators) {
  182. if (read_options.timestamp) {
  183. for (auto* cf : column_families) {
  184. assert(cf);
  185. const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp));
  186. if (!s.ok()) {
  187. return s;
  188. }
  189. }
  190. } else {
  191. for (auto* cf : column_families) {
  192. assert(cf);
  193. const Status s = FailIfCfHasTs(cf);
  194. if (!s.ok()) {
  195. return s;
  196. }
  197. }
  198. }
  199. ReadCallback* read_callback = nullptr; // No read callback provided.
  200. if (iterators == nullptr) {
  201. return Status::InvalidArgument("iterators not allowed to be nullptr");
  202. }
  203. iterators->clear();
  204. iterators->reserve(column_families.size());
  205. SequenceNumber latest_snapshot = versions_->LastSequence();
  206. SequenceNumber read_seq =
  207. read_options.snapshot != nullptr
  208. ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
  209. : latest_snapshot;
  210. autovector<std::tuple<ColumnFamilyHandleImpl*, SuperVersion*>> cfh_to_sv;
  211. const bool check_read_ts =
  212. read_options.timestamp && read_options.timestamp->size() > 0;
  213. for (auto cfh : column_families) {
  214. auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
  215. auto* sv = cfd->GetSuperVersion()->Ref();
  216. cfh_to_sv.emplace_back(static_cast_with_check<ColumnFamilyHandleImpl>(cfh),
  217. sv);
  218. if (check_read_ts) {
  219. const Status s =
  220. FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp));
  221. if (!s.ok()) {
  222. for (auto prev_entry : cfh_to_sv) {
  223. std::get<1>(prev_entry)->Unref();
  224. }
  225. return s;
  226. }
  227. }
  228. }
  229. assert(cfh_to_sv.size() == column_families.size());
  230. for (auto [cfh, sv] : cfh_to_sv) {
  231. auto* db_iter = NewArenaWrappedDbIterator(
  232. env_, read_options, cfh, sv, read_seq, read_callback, this,
  233. /*expose_blob_index=*/false, /*allow_refresh=*/false,
  234. /*allow_mark_memtable_for_flush=*/false);
  235. iterators->push_back(db_iter);
  236. }
  237. return Status::OK();
  238. }
  239. namespace {
  240. // Return OK if dbname exists in the file system or create it if
  241. // create_if_missing
  242. Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
  243. const std::string& dbname) {
  244. Status s;
  245. if (!db_options.create_if_missing) {
  246. // Attempt to read "CURRENT" file
  247. const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
  248. std::string manifest_path;
  249. uint64_t manifest_file_number;
  250. s = GetCurrentManifestPath(dbname, fs.get(), /*is_retry=*/false,
  251. &manifest_path, &manifest_file_number);
  252. } else {
  253. // Historic behavior that doesn't necessarily make sense
  254. s = db_options.env->CreateDirIfMissing(dbname);
  255. }
  256. return s;
  257. }
  258. } // namespace
  259. Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
  260. std::unique_ptr<DB>* dbptr,
  261. bool /*error_if_wal_file_exists*/) {
  262. Status s = OpenForReadOnlyCheckExistence(options, dbname);
  263. if (!s.ok()) {
  264. return s;
  265. }
  266. *dbptr = nullptr;
  267. // Try to first open DB as fully compacted DB
  268. s = CompactedDBImpl::Open(options, dbname, dbptr);
  269. if (s.ok()) {
  270. return s;
  271. }
  272. DBOptions db_options(options);
  273. ColumnFamilyOptions cf_options(options);
  274. std::vector<ColumnFamilyDescriptor> column_families;
  275. column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
  276. std::vector<ColumnFamilyHandle*> handles;
  277. s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
  278. db_options, dbname, column_families, &handles, dbptr);
  279. if (s.ok()) {
  280. assert(handles.size() == 1);
  281. // i can delete the handle since DBImpl is always holding a
  282. // reference to default column family
  283. delete handles[0];
  284. }
  285. return s;
  286. }
  287. Status DB::OpenForReadOnly(
  288. const DBOptions& db_options, const std::string& dbname,
  289. const std::vector<ColumnFamilyDescriptor>& column_families,
  290. std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
  291. bool error_if_wal_file_exists) {
  292. // If dbname does not exist in the file system, should not do anything
  293. Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
  294. if (!s.ok()) {
  295. return s;
  296. }
  297. return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
  298. db_options, dbname, column_families, handles, dbptr,
  299. error_if_wal_file_exists);
  300. }
  301. Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
  302. const DBOptions& db_options, const std::string& dbname,
  303. const std::vector<ColumnFamilyDescriptor>& column_families,
  304. std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
  305. bool error_if_wal_file_exists) {
  306. *dbptr = nullptr;
  307. handles->clear();
  308. SuperVersionContext sv_context(/* create_superversion */ true);
  309. DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
  310. impl->mutex_.Lock();
  311. Status s = impl->Recover(column_families, true /* read only */,
  312. error_if_wal_file_exists);
  313. if (s.ok()) {
  314. // set column family handles
  315. for (const auto& cf : column_families) {
  316. auto cfd =
  317. impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
  318. if (cfd == nullptr) {
  319. s = Status::InvalidArgument("Column family not found", cf.name);
  320. break;
  321. }
  322. handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
  323. }
  324. }
  325. if (s.ok()) {
  326. for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
  327. sv_context.NewSuperVersion();
  328. cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
  329. }
  330. }
  331. impl->mutex_.Unlock();
  332. sv_context.Clean();
  333. if (s.ok()) {
  334. dbptr->reset(impl);
  335. for (auto* h : *handles) {
  336. impl->NewThreadStatusCfInfo(
  337. static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
  338. }
  339. } else {
  340. for (auto h : *handles) {
  341. delete h;
  342. }
  343. handles->clear();
  344. delete impl;
  345. }
  346. return s;
  347. }
  348. } // namespace ROCKSDB_NAMESPACE