blob_source.cc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. // Copyright (c) Meta Platforms, Inc. and affiliates.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "db/blob/blob_source.h"
  6. #include <cassert>
  7. #include <string>
  8. #include "cache/cache_reservation_manager.h"
  9. #include "cache/charged_cache.h"
  10. #include "db/blob/blob_contents.h"
  11. #include "db/blob/blob_file_reader.h"
  12. #include "db/blob/blob_log_format.h"
  13. #include "monitoring/statistics_impl.h"
  14. #include "options/cf_options.h"
  15. #include "table/get_context.h"
  16. #include "table/multiget_context.h"
  17. namespace ROCKSDB_NAMESPACE {
  18. BlobSource::BlobSource(const ImmutableOptions& immutable_options,
  19. const MutableCFOptions& mutable_cf_options,
  20. const std::string& db_id,
  21. const std::string& db_session_id,
  22. BlobFileCache* blob_file_cache)
  23. : db_id_(db_id),
  24. db_session_id_(db_session_id),
  25. statistics_(immutable_options.statistics.get()),
  26. blob_file_cache_(blob_file_cache),
  27. blob_cache_(immutable_options.blob_cache),
  28. lowest_used_cache_tier_(immutable_options.lowest_used_cache_tier) {
  29. auto bbto =
  30. mutable_cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
  31. if (bbto &&
  32. bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
  33. .charged == CacheEntryRoleOptions::Decision::kEnabled) {
  34. blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
  35. immutable_options.blob_cache, bbto->block_cache)};
  36. }
  37. }
  38. BlobSource::~BlobSource() = default;
  39. Status BlobSource::GetBlobFromCache(
  40. const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const {
  41. assert(blob_cache_);
  42. assert(!cache_key.empty());
  43. assert(cached_blob);
  44. assert(cached_blob->IsEmpty());
  45. Cache::Handle* cache_handle = nullptr;
  46. cache_handle = GetEntryFromCache(cache_key);
  47. if (cache_handle != nullptr) {
  48. *cached_blob =
  49. CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
  50. assert(cached_blob->GetValue());
  51. PERF_COUNTER_ADD(blob_cache_hit_count, 1);
  52. RecordTick(statistics_, BLOB_DB_CACHE_HIT);
  53. RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
  54. cached_blob->GetValue()->size());
  55. return Status::OK();
  56. }
  57. RecordTick(statistics_, BLOB_DB_CACHE_MISS);
  58. return Status::NotFound("Blob not found in cache");
  59. }
  60. Status BlobSource::PutBlobIntoCache(
  61. const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
  62. CacheHandleGuard<BlobContents>* cached_blob) const {
  63. assert(blob_cache_);
  64. assert(!cache_key.empty());
  65. assert(blob);
  66. assert(*blob);
  67. assert(cached_blob);
  68. assert(cached_blob->IsEmpty());
  69. TypedHandle* cache_handle = nullptr;
  70. const Status s = InsertEntryIntoCache(cache_key, blob->get(), &cache_handle,
  71. Cache::Priority::BOTTOM);
  72. if (s.ok()) {
  73. blob->release();
  74. assert(cache_handle != nullptr);
  75. *cached_blob =
  76. CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
  77. assert(cached_blob->GetValue());
  78. RecordTick(statistics_, BLOB_DB_CACHE_ADD);
  79. RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
  80. cached_blob->GetValue()->size());
  81. } else {
  82. RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
  83. }
  84. return s;
  85. }
  86. BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const {
  87. return blob_cache_.LookupFull(key, nullptr /* context */,
  88. Cache::Priority::BOTTOM, statistics_,
  89. lowest_used_cache_tier_);
  90. }
  91. void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
  92. PinnableSlice* value) {
  93. assert(cached_blob);
  94. assert(cached_blob->GetValue());
  95. assert(value);
  96. // To avoid copying the cached blob into the buffer provided by the
  97. // application, we can simply transfer ownership of the cache handle to
  98. // the target PinnableSlice. This has the potential to save a lot of
  99. // CPU, especially with large blob values.
  100. value->Reset();
  101. constexpr Cleanable* cleanable = nullptr;
  102. value->PinSlice(cached_blob->GetValue()->data(), cleanable);
  103. cached_blob->TransferTo(value);
  104. }
  105. void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
  106. PinnableSlice* value) {
  107. assert(owned_blob);
  108. assert(*owned_blob);
  109. assert(value);
  110. BlobContents* const blob = owned_blob->release();
  111. assert(blob);
  112. value->Reset();
  113. value->PinSlice(
  114. blob->data(),
  115. [](void* arg1, void* /* arg2 */) {
  116. delete static_cast<BlobContents*>(arg1);
  117. },
  118. blob, nullptr);
  119. }
  120. Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
  121. TypedHandle** cache_handle,
  122. Cache::Priority priority) const {
  123. return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(),
  124. cache_handle, priority,
  125. lowest_used_cache_tier_);
  126. }
  127. Status BlobSource::GetBlob(const ReadOptions& read_options,
  128. const Slice& user_key, uint64_t file_number,
  129. uint64_t offset, uint64_t file_size,
  130. uint64_t value_size,
  131. CompressionType compression_type,
  132. FilePrefetchBuffer* prefetch_buffer,
  133. PinnableSlice* value, uint64_t* bytes_read) {
  134. assert(value);
  135. Status s;
  136. const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
  137. CacheHandleGuard<BlobContents> blob_handle;
  138. // First, try to get the blob from the cache
  139. //
  140. // If blob cache is enabled, we'll try to read from it.
  141. if (blob_cache_) {
  142. Slice key = cache_key.AsSlice();
  143. s = GetBlobFromCache(key, &blob_handle);
  144. if (s.ok()) {
  145. PinCachedBlob(&blob_handle, value);
  146. // For consistency, the size of on-disk (possibly compressed) blob record
  147. // is assigned to bytes_read.
  148. uint64_t adjustment =
  149. read_options.verify_checksums
  150. ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
  151. user_key.size())
  152. : 0;
  153. assert(offset >= adjustment);
  154. uint64_t record_size = value_size + adjustment;
  155. if (bytes_read) {
  156. *bytes_read = record_size;
  157. }
  158. return s;
  159. }
  160. }
  161. assert(blob_handle.IsEmpty());
  162. const bool no_io = read_options.read_tier == kBlockCacheTier;
  163. if (no_io) {
  164. s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
  165. return s;
  166. }
  167. // Can't find the blob from the cache. Since I/O is allowed, read from the
  168. // file.
  169. std::unique_ptr<BlobContents> blob_contents;
  170. {
  171. CacheHandleGuard<BlobFileReader> blob_file_reader;
  172. s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
  173. &blob_file_reader);
  174. if (!s.ok()) {
  175. return s;
  176. }
  177. assert(blob_file_reader.GetValue());
  178. if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) {
  179. return Status::Corruption("Compression type mismatch when reading blob");
  180. }
  181. MemoryAllocator* const allocator =
  182. (blob_cache_ && read_options.fill_cache)
  183. ? blob_cache_.get()->memory_allocator()
  184. : nullptr;
  185. uint64_t read_size = 0;
  186. s = blob_file_reader.GetValue()->GetBlob(
  187. read_options, user_key, offset, value_size, compression_type,
  188. prefetch_buffer, allocator, &blob_contents, &read_size);
  189. if (!s.ok()) {
  190. return s;
  191. }
  192. if (bytes_read) {
  193. *bytes_read = read_size;
  194. }
  195. }
  196. if (blob_cache_ && read_options.fill_cache) {
  197. // If filling cache is allowed and a cache is configured, try to put the
  198. // blob to the cache.
  199. Slice key = cache_key.AsSlice();
  200. s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
  201. if (!s.ok()) {
  202. return s;
  203. }
  204. PinCachedBlob(&blob_handle, value);
  205. } else {
  206. PinOwnedBlob(&blob_contents, value);
  207. }
  208. assert(s.ok());
  209. return s;
  210. }
  211. void BlobSource::MultiGetBlob(const ReadOptions& read_options,
  212. autovector<BlobFileReadRequests>& blob_reqs,
  213. uint64_t* bytes_read) {
  214. assert(blob_reqs.size() > 0);
  215. uint64_t total_bytes_read = 0;
  216. uint64_t bytes_read_in_file = 0;
  217. for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) {
  218. // sort blob_reqs_in_file by file offset.
  219. std::sort(
  220. blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
  221. [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
  222. return lhs.offset < rhs.offset;
  223. });
  224. MultiGetBlobFromOneFile(read_options, file_number, file_size,
  225. blob_reqs_in_file, &bytes_read_in_file);
  226. total_bytes_read += bytes_read_in_file;
  227. }
  228. if (bytes_read) {
  229. *bytes_read = total_bytes_read;
  230. }
  231. }
  232. void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
  233. uint64_t file_number,
  234. uint64_t /*file_size*/,
  235. autovector<BlobReadRequest>& blob_reqs,
  236. uint64_t* bytes_read) {
  237. const size_t num_blobs = blob_reqs.size();
  238. assert(num_blobs > 0);
  239. assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
  240. #ifndef NDEBUG
  241. for (size_t i = 0; i < num_blobs - 1; ++i) {
  242. assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
  243. }
  244. #endif // !NDEBUG
  245. using Mask = uint64_t;
  246. Mask cache_hit_mask = 0;
  247. uint64_t total_bytes = 0;
  248. const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
  249. if (blob_cache_) {
  250. size_t cached_blob_count = 0;
  251. for (size_t i = 0; i < num_blobs; ++i) {
  252. auto& req = blob_reqs[i];
  253. CacheHandleGuard<BlobContents> blob_handle;
  254. const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
  255. const Slice key = cache_key.AsSlice();
  256. const Status s = GetBlobFromCache(key, &blob_handle);
  257. if (s.ok()) {
  258. assert(req.status);
  259. *req.status = s;
  260. PinCachedBlob(&blob_handle, req.result);
  261. // Update the counter for the number of valid blobs read from the cache.
  262. ++cached_blob_count;
  263. // For consistency, the size of each on-disk (possibly compressed) blob
  264. // record is accumulated to total_bytes.
  265. uint64_t adjustment =
  266. read_options.verify_checksums
  267. ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
  268. req.user_key->size())
  269. : 0;
  270. assert(req.offset >= adjustment);
  271. total_bytes += req.len + adjustment;
  272. cache_hit_mask |= (Mask{1} << i); // cache hit
  273. }
  274. }
  275. // All blobs were read from the cache.
  276. if (cached_blob_count == num_blobs) {
  277. if (bytes_read) {
  278. *bytes_read = total_bytes;
  279. }
  280. return;
  281. }
  282. }
  283. const bool no_io = read_options.read_tier == kBlockCacheTier;
  284. if (no_io) {
  285. for (size_t i = 0; i < num_blobs; ++i) {
  286. if (!(cache_hit_mask & (Mask{1} << i))) {
  287. BlobReadRequest& req = blob_reqs[i];
  288. assert(req.status);
  289. *req.status =
  290. Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
  291. }
  292. }
  293. return;
  294. }
  295. {
  296. // Find the rest of blobs from the file since I/O is allowed.
  297. autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
  298. _blob_reqs;
  299. uint64_t _bytes_read = 0;
  300. for (size_t i = 0; i < num_blobs; ++i) {
  301. if (!(cache_hit_mask & (Mask{1} << i))) {
  302. _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>());
  303. }
  304. }
  305. CacheHandleGuard<BlobFileReader> blob_file_reader;
  306. Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
  307. &blob_file_reader);
  308. if (!s.ok()) {
  309. for (size_t i = 0; i < _blob_reqs.size(); ++i) {
  310. BlobReadRequest* const req = _blob_reqs[i].first;
  311. assert(req);
  312. assert(req->status);
  313. *req->status = s;
  314. }
  315. return;
  316. }
  317. assert(blob_file_reader.GetValue());
  318. MemoryAllocator* const allocator =
  319. (blob_cache_ && read_options.fill_cache)
  320. ? blob_cache_.get()->memory_allocator()
  321. : nullptr;
  322. blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
  323. _blob_reqs, &_bytes_read);
  324. if (blob_cache_ && read_options.fill_cache) {
  325. // If filling cache is allowed and a cache is configured, try to put
  326. // the blob(s) to the cache.
  327. for (auto& [req, blob_contents] : _blob_reqs) {
  328. assert(req);
  329. if (req->status->ok()) {
  330. CacheHandleGuard<BlobContents> blob_handle;
  331. const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
  332. const Slice key = cache_key.AsSlice();
  333. s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
  334. if (!s.ok()) {
  335. *req->status = s;
  336. } else {
  337. PinCachedBlob(&blob_handle, req->result);
  338. }
  339. }
  340. }
  341. } else {
  342. for (auto& [req, blob_contents] : _blob_reqs) {
  343. assert(req);
  344. if (req->status->ok()) {
  345. PinOwnedBlob(&blob_contents, req->result);
  346. }
  347. }
  348. }
  349. total_bytes += _bytes_read;
  350. if (bytes_read) {
  351. *bytes_read = total_bytes;
  352. }
  353. }
  354. }
  355. bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
  356. uint64_t offset, size_t* charge) const {
  357. const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
  358. const Slice key = cache_key.AsSlice();
  359. CacheHandleGuard<BlobContents> blob_handle;
  360. const Status s = GetBlobFromCache(key, &blob_handle);
  361. if (s.ok() && blob_handle.GetValue() != nullptr) {
  362. if (charge) {
  363. const Cache* const cache = blob_handle.GetCache();
  364. assert(cache);
  365. Cache::Handle* const handle = blob_handle.GetCacheHandle();
  366. assert(handle);
  367. *charge = cache->GetUsage(handle);
  368. }
  369. return true;
  370. }
  371. return false;
  372. }
  373. } // namespace ROCKSDB_NAMESPACE