block_based_table_reader.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "table/block_based/block_based_table_reader.h"
#include <algorithm>
#include <array>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_filter_block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/full_filter_block.h"
#include "table/block_based/partitioned_filter_block.h"
#include "table/block_fetcher.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"

#include "monitoring/perf_context_imp.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/util.h"
#include "util/xxhash.h"

namespace ROCKSDB_NAMESPACE {

extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;

typedef BlockBasedTable::IndexReader IndexReader;

// Experiments found that a 256 KB readahead size provides the best
// performance for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

BlockBasedTable::~BlockBasedTable() { delete rep_; }

std::atomic<uint64_t> BlockBasedTable::next_cache_key_id_(0);
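
// BlocklikeTraits maps each block-like type that can live in the block cache
// (raw BlockContents, parsed data/index Blocks, parsed full filter blocks,
// and uncompression dictionaries) to a uniform Create() factory plus a
// GetNumRestarts() accessor used by the generic read/cache paths below.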
template <typename TBlocklike>
class BlocklikeTraits;

template <>
class BlocklikeTraits<BlockContents> {
 public:
  static BlockContents* Create(BlockContents&& contents,
      SequenceNumber /* global_seqno */,
      size_t /* read_amp_bytes_per_bit */,
      Statistics* /* statistics */,
      bool /* using_zstd */,
      const FilterPolicy* /* filter_policy */) {
    return new BlockContents(std::move(contents));
  }

  static uint32_t GetNumRestarts(const BlockContents& /* contents */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<ParsedFullFilterBlock> {
 public:
  static ParsedFullFilterBlock* Create(BlockContents&& contents,
      SequenceNumber /* global_seqno */,
      size_t /* read_amp_bytes_per_bit */,
      Statistics* /* statistics */,
      bool /* using_zstd */,
      const FilterPolicy* filter_policy) {
    return new ParsedFullFilterBlock(filter_policy, std::move(contents));
  }

  static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) {
    return 0;
  }
};

template <>
class BlocklikeTraits<Block> {
 public:
  static Block* Create(BlockContents&& contents, SequenceNumber global_seqno,
      size_t read_amp_bytes_per_bit, Statistics* statistics,
      bool /* using_zstd */,
      const FilterPolicy* /* filter_policy */) {
    return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
        statistics);
  }

  static uint32_t GetNumRestarts(const Block& block) {
    return block.NumRestarts();
  }
};

template <>
class BlocklikeTraits<UncompressionDict> {
 public:
  static UncompressionDict* Create(BlockContents&& contents,
      SequenceNumber /* global_seqno */,
      size_t /* read_amp_bytes_per_bit */,
      Statistics* /* statistics */,
      bool using_zstd,
      const FilterPolicy* /* filter_policy */) {
    return new UncompressionDict(contents.data, std::move(contents.allocation),
        using_zstd);
  }

  static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) {
    return 0;
  }
};

namespace {

// Read the block identified by "handle" from "file".
// The only relevant option is options.verify_checksums for now.
// On failure return non-OK.
// On success fill *result and return OK - caller owns *result
// @param uncompression_dict Data for presetting the compression library's
//    dictionary.
template <typename TBlocklike>
Status ReadBlockFromFile(
    RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
    const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
    std::unique_ptr<TBlocklike>* result, const ImmutableCFOptions& ioptions,
    bool do_uncompress, bool maybe_compressed, BlockType block_type,
    const UncompressionDict& uncompression_dict,
    const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
    bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) {
  assert(result);

  BlockContents contents;
  BlockFetcher block_fetcher(
      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
      do_uncompress, maybe_compressed, block_type, uncompression_dict,
      cache_options, memory_allocator, nullptr, for_compaction);
  Status s = block_fetcher.ReadBlockContents();
  if (s.ok()) {
    result->reset(BlocklikeTraits<TBlocklike>::Create(
        std::move(contents), global_seqno, read_amp_bytes_per_bit,
        ioptions.statistics, using_zstd, filter_policy));
  }

  return s;
}

inline MemoryAllocator* GetMemoryAllocator(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache.get()
      ? table_options.block_cache->memory_allocator()
      : nullptr;
}

inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock(
    const BlockBasedTableOptions& table_options) {
  return table_options.block_cache_compressed.get()
      ? table_options.block_cache_compressed->memory_allocator()
      : nullptr;
}

// Delete the entry residing in the cache.
template <class Entry>
void DeleteCachedEntry(const Slice& /*key*/, void* value) {
  auto entry = reinterpret_cast<Entry*>(value);
  delete entry;
}

// Release the cached entry and decrement its ref count.
// Force the entry to be erased from the cache.
void ForceReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, true /* force_erase */);
}

// Release the cached entry and decrement its ref count.
// Do not force erase.
void ReleaseCachedEntry(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle, false /* force_erase */);
}

// For a hash-based index, return true if prefix_extractor and
// prefix_extractor_block mismatch, false otherwise. This flag will be used
// as total_order_seek via NewIndexIterator.
bool PrefixExtractorChanged(const TableProperties* table_properties,
    const SliceTransform* prefix_extractor) {
  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
  // is set but prefix_extractor_block is not set, also disable hash index.
  if (prefix_extractor == nullptr || table_properties == nullptr ||
      table_properties->prefix_extractor_name.empty()) {
    return true;
  }

  // prefix_extractor and prefix_extractor_block are both non-empty
  if (table_properties->prefix_extractor_name.compare(
          prefix_extractor->Name()) != 0) {
    return true;
  } else {
    return false;
  }
}
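
// Copy the given buffer into a fresh heap allocation obtained from the block
// cache's memory allocator (if one is configured), so the copy can be handed
// to the cache and outlive the source buffer.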
CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
  CacheAllocationPtr heap_buf;
  heap_buf = AllocateBlock(buf.size(), allocator);
  memcpy(heap_buf.get(), buf.data(), buf.size());
  return heap_buf;
}

}  // namespace

// Encapsulates common functionality for the various index reader
// implementations. Provides access to the index block regardless of whether
// it is owned by the reader or stored in the cache, or whether it is pinned
// in the cache or not.
class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader {
 public:
  IndexReaderCommon(const BlockBasedTable* t,
      CachableEntry<Block>&& index_block)
      : table_(t), index_block_(std::move(index_block)) {
    assert(table_ != nullptr);
  }

 protected:
  static Status ReadIndexBlock(const BlockBasedTable* table,
      FilePrefetchBuffer* prefetch_buffer,
      const ReadOptions& read_options, bool use_cache,
      GetContext* get_context,
      BlockCacheLookupContext* lookup_context,
      CachableEntry<Block>* index_block);

  const BlockBasedTable* table() const { return table_; }

  const InternalKeyComparator* internal_comparator() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return &table_->get_rep()->internal_comparator;
  }

  bool index_has_first_key() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_has_first_key;
  }

  bool index_key_includes_seq() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_key_includes_seq;
  }

  bool index_value_is_full() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->index_value_is_full;
  }

  bool cache_index_blocks() const {
    assert(table_ != nullptr);
    assert(table_->get_rep() != nullptr);
    return table_->get_rep()->table_options.cache_index_and_filter_blocks;
  }

  Status GetOrReadIndexBlock(bool no_io, GetContext* get_context,
      BlockCacheLookupContext* lookup_context,
      CachableEntry<Block>* index_block) const;

  size_t ApproximateIndexBlockMemoryUsage() const {
    assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr);
    return index_block_.GetOwnValue()
        ? index_block_.GetValue()->ApproximateMemoryUsage()
        : 0;
  }

 private:
  const BlockBasedTable* table_;
  CachableEntry<Block> index_block_;
};

Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
    const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
    const ReadOptions& read_options, bool use_cache, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) {
  PERF_TIMER_GUARD(read_index_block_nanos);

  assert(table != nullptr);
  assert(index_block != nullptr);
  assert(index_block->IsEmpty());

  const Rep* const rep = table->get_rep();
  assert(rep != nullptr);

  const Status s = table->RetrieveBlock(
      prefetch_buffer, read_options, rep->footer.index_handle(),
      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
      get_context, lookup_context, /* for_compaction */ false, use_cache);

  return s;
}
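
// Return the index block already held by this reader if it owns or pins one;
// otherwise fetch it through the block cache (restricted to the cache only
// when no_io is set).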
Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock(
    bool no_io, GetContext* get_context,
    BlockCacheLookupContext* lookup_context,
    CachableEntry<Block>* index_block) const {
  assert(index_block != nullptr);

  if (!index_block_.IsEmpty()) {
    index_block->SetUnownedValue(index_block_.GetValue());
    return Status::OK();
  }

  ReadOptions read_options;
  if (no_io) {
    read_options.read_tier = kBlockCacheTier;
  }

  return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options,
      cache_index_blocks(), get_context, lookup_context, index_block);
}

// Index that allows binary search lookup in a two-level index structure.
class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read the partition index from the file and create an instance for
  // `PartitionIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
      FilePrefetchBuffer* prefetch_buffer, bool use_cache,
      bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context,
      std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
              /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new PartitionIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  // return a two-level iterator: first level is on the partition index
  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    InternalIteratorBase<IndexValue>* it = nullptr;

    Statistics* kNullStats = nullptr;
    // Filters are already checked before seeking the index
    if (!partition_map_.empty()) {
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = NewTwoLevelIterator(
          new BlockBasedTable::PartitionedIndexIteratorState(table(),
              &partition_map_),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()));
    } else {
      ReadOptions ro;
      ro.fill_cache = read_options.fill_cache;
      // We don't return pinned data from index blocks, so no need
      // to set `block_contents_pinned`.
      it = new BlockBasedTableIterator<IndexBlockIter, IndexValue>(
          table(), ro, *internal_comparator(),
          index_block.GetValue()->NewIndexIterator(
              internal_comparator(), internal_comparator()->user_comparator(),
              nullptr, kNullStats, true, index_has_first_key(),
              index_key_includes_seq(), index_value_is_full()),
          false, true, /* prefix_extractor */ nullptr, BlockType::kIndex,
          lookup_context ? lookup_context->caller
              : TableReaderCaller::kUncategorized);
    }

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;

    // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
    // on-stack BlockIter while the state is on heap. Currently it assumes
    // the first level iter is always on heap and will attempt to delete it
    // in its destructor.
  }

  void CacheDependencies(bool pin) override {
    // Before reading the partitions, prefetch them to avoid lots of IOs
    BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
    const BlockBasedTable::Rep* rep = table()->rep_;
    IndexBlockIter biter;
    BlockHandle handle;
    Statistics* kNullStats = nullptr;

    CachableEntry<Block> index_block;
    Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
        &lookup_context, &index_block);
    if (!s.ok()) {
      ROCKS_LOG_WARN(rep->ioptions.info_log,
          "Error retrieving top-level index block while trying to "
          "cache index partitions: %s",
          s.ToString().c_str());
      return;
    }

    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), &biter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());
    // Index partitions are assumed to be consecutive. Prefetch them all.
    // Read the first block offset
    biter.SeekToFirst();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t prefetch_off = handle.offset();

    // Read the last block's offset
    biter.SeekToLast();
    if (!biter.Valid()) {
      // Empty index.
      return;
    }
    handle = biter.value().handle;
    uint64_t last_off = handle.offset() + block_size(handle);
    uint64_t prefetch_len = last_off - prefetch_off;
    std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
    rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer);
    s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off,
        static_cast<size_t>(prefetch_len));

    // After prefetch, read the partitions one by one
    biter.SeekToFirst();
    auto ro = ReadOptions();
    for (; biter.Valid(); biter.Next()) {
      handle = biter.value().handle;
      CachableEntry<Block> block;
      // TODO: Support counter batch update for partitioned index and
      // filter blocks
      s = table()->MaybeReadBlockAndLoadToCache(
          prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
          &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context,
          /*contents=*/nullptr);

      assert(s.ok() || block.GetValue() == nullptr);
      if (s.ok() && block.GetValue() != nullptr) {
        if (block.IsCached()) {
          if (pin) {
            partition_map_[handle.offset()] = std::move(block);
          }
        }
      }
    }
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<PartitionIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    // TODO(myabandeh): more accurate estimate of partition_map_ mem usage
    return usage;
  }

 private:
  PartitionIndexReader(const BlockBasedTable* t,
      CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
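
  // Maps the offset of each index partition that was pinned by
  // CacheDependencies() to its cached block, so iterators can reuse the
  // pinned partitions without additional cache lookups.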
  std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
};

// Index that allows binary search lookup for the first key of each block.
// This class can be viewed as a thin wrapper for `Block` class which already
// supports binary search.
class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  // Read index from the file and create an instance for
  // `BinarySearchIndexReader`.
  // On success, index_reader will be populated; otherwise it will remain
  // unmodified.
  static Status Create(const BlockBasedTable* table,
      FilePrefetchBuffer* prefetch_buffer, bool use_cache,
      bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context,
      std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(table->get_rep());
    assert(!pin || prefetch);
    assert(index_reader != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
              /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    index_reader->reset(
        new BinarySearchIndexReader(table, std::move(index_block)));

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool /* disable_prefix_seek */,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BinarySearchIndexReader*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  BinarySearchIndexReader(const BlockBasedTable* t,
      CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}
};

// Index that leverages an internal hash table to speed up the lookup for a
// given key.
class HashIndexReader : public BlockBasedTable::IndexReaderCommon {
 public:
  static Status Create(const BlockBasedTable* table,
      FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_index_iter, bool use_cache,
      bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context,
      std::unique_ptr<IndexReader>* index_reader) {
    assert(table != nullptr);
    assert(index_reader != nullptr);
    assert(!pin || prefetch);

    const BlockBasedTable::Rep* rep = table->get_rep();
    assert(rep != nullptr);

    CachableEntry<Block> index_block;
    if (prefetch || !use_cache) {
      const Status s =
          ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache,
              /*get_context=*/nullptr, lookup_context, &index_block);
      if (!s.ok()) {
        return s;
      }

      if (use_cache && !pin) {
        index_block.Reset();
      }
    }

    // Note, failure to create prefix hash index does not need to be a
    // hard error. We can still fall back to the original binary search index.
    // So, Create will succeed regardless, from this point on.
    index_reader->reset(new HashIndexReader(table, std::move(index_block)));

    // Get prefixes block
    BlockHandle prefixes_handle;
    Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
        &prefixes_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    // Get index metadata block
    BlockHandle prefixes_meta_handle;
    s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
        &prefixes_meta_handle);
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    RandomAccessFileReader* const file = rep->file.get();
    const Footer& footer = rep->footer;
    const ImmutableCFOptions& ioptions = rep->ioptions;
    const PersistentCacheOptions& cache_options = rep->persistent_cache_options;
    MemoryAllocator* const memory_allocator =
        GetMemoryAllocator(rep->table_options);

    // Read contents for the blocks
    BlockContents prefixes_contents;
    BlockFetcher prefixes_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
        &prefixes_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexPrefixes,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      return s;
    }
    BlockContents prefixes_meta_contents;
    BlockFetcher prefixes_meta_block_fetcher(
        file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
        &prefixes_meta_contents, ioptions, true /*decompress*/,
        true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
        UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
    s = prefixes_meta_block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      // TODO: log error
      return Status::OK();
    }

    BlockPrefixIndex* prefix_index = nullptr;
    assert(rep->internal_prefix_transform.get() != nullptr);
    s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(),
        prefixes_contents.data,
        prefixes_meta_contents.data, &prefix_index);
    // TODO: log error
    if (s.ok()) {
      HashIndexReader* const hash_index_reader =
          static_cast<HashIndexReader*>(index_reader->get());
      hash_index_reader->prefix_index_.reset(prefix_index);
    }

    return Status::OK();
  }

  InternalIteratorBase<IndexValue>* NewIterator(
      const ReadOptions& read_options, bool disable_prefix_seek,
      IndexBlockIter* iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) override {
    const bool no_io = (read_options.read_tier == kBlockCacheTier);
    CachableEntry<Block> index_block;
    const Status s =
        GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block);
    if (!s.ok()) {
      if (iter != nullptr) {
        iter->Invalidate(s);
        return iter;
      }

      return NewErrorInternalIterator<IndexValue>(s);
    }

    Statistics* kNullStats = nullptr;
    const bool total_order_seek =
        read_options.total_order_seek || disable_prefix_seek;
    // We don't return pinned data from index blocks, so no need
    // to set `block_contents_pinned`.
    auto it = index_block.GetValue()->NewIndexIterator(
        internal_comparator(), internal_comparator()->user_comparator(), iter,
        kNullStats, total_order_seek, index_has_first_key(),
        index_key_includes_seq(), index_value_is_full(),
        false /* block_contents_pinned */, prefix_index_.get());

    assert(it != nullptr);
    index_block.TransferTo(it);

    return it;
  }

  size_t ApproximateMemoryUsage() const override {
    size_t usage = ApproximateIndexBlockMemoryUsage();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<HashIndexReader*>(this));
#else
    if (prefix_index_) {
      usage += prefix_index_->ApproximateMemoryUsage();
    }
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }

 private:
  HashIndexReader(const BlockBasedTable* t, CachableEntry<Block>&& index_block)
      : IndexReaderCommon(t, std::move(index_block)) {}

  std::unique_ptr<BlockPrefixIndex> prefix_index_;
};
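
// Record a block cache hit: bump perf context and per-level counters, then
// attribute the per-block-type tickers either to the caller's GetContext
// (for point lookups, aggregated later) or directly to the statistics object.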
void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
    GetContext* get_context,
    size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  PERF_COUNTER_ADD(block_cache_hit_count, 1);
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
      static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_hit;
    get_context->get_context_stats_.num_cache_bytes_read += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_HIT);
    RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
      }
      break;

    case BlockType::kCompressionDictionary:
      // TODO: introduce perf counter for compression dictionary hit count
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
      }
      break;

    case BlockType::kIndex:
      PERF_COUNTER_ADD(block_cache_index_hit_count, 1);

      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_hit;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
    GetContext* get_context) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce aggregate (not per-level) block cache miss count
  PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
      static_cast<uint32_t>(rep_->level));

  if (get_context) {
    ++get_context->get_context_stats_.num_cache_miss;
  } else {
    RecordTick(statistics, BLOCK_CACHE_MISS);
  }

  // TODO: introduce perf counters for misses per block type
  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_miss;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
      }
      break;
  }
}

void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type,
    GetContext* get_context,
    size_t usage) const {
  Statistics* const statistics = rep_->ioptions.statistics;

  // TODO: introduce perf counters for block cache insertions
  if (get_context) {
    ++get_context->get_context_stats_.num_cache_add;
    get_context->get_context_stats_.num_cache_bytes_write += usage;
  } else {
    RecordTick(statistics, BLOCK_CACHE_ADD);
    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
  }

  switch (block_type) {
    case BlockType::kFilter:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_filter_add;
        get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
        RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
      }
      break;

    case BlockType::kCompressionDictionary:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_compression_dict_add;
        get_context->get_context_stats_
            .num_cache_compression_dict_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
        RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
            usage);
      }
      break;

    case BlockType::kIndex:
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_index_add;
        get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
        RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
      }
      break;

    default:
      // TODO: introduce dedicated tickers/statistics/counters
      // for range tombstones
      if (get_context) {
        ++get_context->get_context_stats_.num_cache_data_add;
        get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
      } else {
        RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
        RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
      }
      break;
  }
}
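
// Look up a block in the block cache by its key and update the hit/miss
// metrics for the given block type.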
Cache::Handle* BlockBasedTable::GetEntryFromCache(
    Cache* block_cache, const Slice& key, BlockType block_type,
    GetContext* get_context) const {
  auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics);

  if (cache_handle != nullptr) {
    UpdateCacheHitMetrics(block_type, get_context,
        block_cache->GetUsage(cache_handle));
  } else {
    UpdateCacheMissMetrics(block_type, get_context);
  }

  return cache_handle;
}

// Helper function to set up the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
  assert(kMaxCacheKeyPrefixSize >= 10);
  rep->cache_key_prefix_size = 0;
  rep->compressed_cache_key_prefix_size = 0;
  if (rep->table_options.block_cache != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
  }
  if (rep->table_options.persistent_cache != nullptr) {
    GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(),
        &rep->persistent_cache_key_prefix[0],
        &rep->persistent_cache_key_prefix_size);
  }
  if (rep->table_options.block_cache_compressed != nullptr) {
    GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
        rep->file->file(), &rep->compressed_cache_key_prefix[0],
        &rep->compressed_cache_key_prefix_size);
  }
}
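
// Derive a cache key prefix from the file's unique id; if the file cannot
// supply one (size 0), fall back to a fresh id obtained from the cache.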
void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
    char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

void BlockBasedTable::GenerateCachePrefix(Cache* cc, FSWritableFile* file,
    char* buffer, size_t* size) {
  // generate an id from the file
  *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);

  // If the prefix wasn't generated or was too long,
  // create one from the cache.
  if (cc != nullptr && *size == 0) {
    char* end = EncodeVarint64(buffer, cc->NewId());
    *size = static_cast<size_t>(end - buffer);
  }
}

namespace {

// Return true if the `user_prop_name` property in table_properties has a
// `true` value, or if the property is not present at all (for backward
// compatibility).
bool IsFeatureSupported(const TableProperties& table_properties,
    const std::string& user_prop_name, Logger* info_log) {
  auto& props = table_properties.user_collected_properties;
  auto pos = props.find(user_prop_name);
  // Older versions don't have this value set. Skip this check.
  if (pos != props.end()) {
    if (pos->second == kPropFalse) {
      return false;
    } else if (pos->second != kPropTrue) {
      ROCKS_LOG_WARN(info_log, "Property %s has invalid value %s",
          user_prop_name.c_str(), pos->second.c_str());
    }
  }
  return true;
}

  905. // Caller has to ensure seqno is not nullptr.
  906. Status GetGlobalSequenceNumber(const TableProperties& table_properties,
  907. SequenceNumber largest_seqno,
  908. SequenceNumber* seqno) {
  909. const auto& props = table_properties.user_collected_properties;
  910. const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
  911. const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
  912. *seqno = kDisableGlobalSequenceNumber;
  913. if (version_pos == props.end()) {
  914. if (seqno_pos != props.end()) {
  915. std::array<char, 200> msg_buf;
  916. // This is not an external sst file, global_seqno is not supported.
  917. snprintf(
  918. msg_buf.data(), msg_buf.max_size(),
  919. "A non-external sst file have global seqno property with value %s",
  920. seqno_pos->second.c_str());
  921. return Status::Corruption(msg_buf.data());
  922. }
  923. return Status::OK();
  924. }
  925. uint32_t version = DecodeFixed32(version_pos->second.c_str());
  926. if (version < 2) {
  927. if (seqno_pos != props.end() || version != 1) {
  928. std::array<char, 200> msg_buf;
  929. // This is a v1 external sst file, global_seqno is not supported.
  930. snprintf(msg_buf.data(), msg_buf.max_size(),
  931. "An external sst file with version %u have global seqno "
  932. "property with value %s",
  933. version, seqno_pos->second.c_str());
  934. return Status::Corruption(msg_buf.data());
  935. }
  936. return Status::OK();
  937. }
  938. // Since we have a plan to deprecate global_seqno, we do not return failure
  939. // if seqno_pos == props.end(). We rely on version_pos to detect whether the
  940. // SST is external.
  941. SequenceNumber global_seqno(0);
  942. if (seqno_pos != props.end()) {
  943. global_seqno = DecodeFixed64(seqno_pos->second.c_str());
  944. }
945. // SstTableReader opens the table reader with kMaxSequenceNumber as largest_seqno
  946. // to denote it is unknown.
  947. if (largest_seqno < kMaxSequenceNumber) {
  948. if (global_seqno == 0) {
  949. global_seqno = largest_seqno;
  950. }
  951. if (global_seqno != largest_seqno) {
  952. std::array<char, 200> msg_buf;
  953. snprintf(
  954. msg_buf.data(), msg_buf.max_size(),
  955. "An external sst file with version %u have global seqno property "
  956. "with value %s, while largest seqno in the file is %llu",
  957. version, seqno_pos->second.c_str(),
  958. static_cast<unsigned long long>(largest_seqno));
  959. return Status::Corruption(msg_buf.data());
  960. }
  961. }
  962. *seqno = global_seqno;
  963. if (global_seqno > kMaxSequenceNumber) {
  964. std::array<char, 200> msg_buf;
  965. snprintf(msg_buf.data(), msg_buf.max_size(),
  966. "An external sst file with version %u have global seqno property "
  967. "with value %llu, which is greater than kMaxSequenceNumber",
  968. version, static_cast<unsigned long long>(global_seqno));
  969. return Status::Corruption(msg_buf.data());
  970. }
  971. return Status::OK();
  972. }
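// NOTE (illustrative, added by the editor; not in the original source) --
// a summary of the cases handled above:
//   no kVersion property           -> not an external file; seqno stays
//                                     kDisableGlobalSequenceNumber
//   kVersion < 2 (v1 external)     -> global seqno unsupported; stays disabled
//   kVersion >= 2                  -> seqno taken from the property, or from
//                                     largest_seqno when the stored value is 0
//                                     and largest_seqno is known
// Any inconsistent combination returns Status::Corruption.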
  973. } // namespace
  974. Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix,
  975. size_t cache_key_prefix_size,
  976. const BlockHandle& handle, char* cache_key) {
  977. assert(cache_key != nullptr);
  978. assert(cache_key_prefix_size != 0);
  979. assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
  980. memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
  981. char* end =
  982. EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
  983. return Slice(cache_key, static_cast<size_t>(end - cache_key));
  984. }
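// NOTE (illustrative, added by the editor; not in the original source): the
// resulting cache key layout is
//   [cache_key_prefix : <= kMaxCacheKeyPrefixSize bytes][varint64(handle.offset())]
// so two blocks of the same file share the prefix and differ only in the
// encoded block offset.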
  985. Status BlockBasedTable::Open(
  986. const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
  987. const BlockBasedTableOptions& table_options,
  988. const InternalKeyComparator& internal_comparator,
  989. std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
  990. std::unique_ptr<TableReader>* table_reader,
  991. const SliceTransform* prefix_extractor,
  992. const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
  993. const int level, const bool immortal_table,
  994. const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats,
  995. BlockCacheTracer* const block_cache_tracer) {
  996. table_reader->reset();
  997. Status s;
  998. Footer footer;
  999. std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
  1000. // prefetch both index and filters, down to all partitions
  1001. const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
  1002. const bool preload_all = !table_options.cache_index_and_filter_blocks;
  1003. if (!ioptions.allow_mmap_reads) {
  1004. s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all,
  1005. preload_all, &prefetch_buffer);
  1006. } else {
  1007. // Should not prefetch for mmap mode.
  1008. prefetch_buffer.reset(new FilePrefetchBuffer(
  1009. nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  1010. }
  1011. // Read in the following order:
  1012. // 1. Footer
  1013. // 2. [metaindex block]
  1014. // 3. [meta block: properties]
  1015. // 4. [meta block: range deletion tombstone]
  1016. // 5. [meta block: compression dictionary]
  1017. // 6. [meta block: index]
  1018. // 7. [meta block: filter]
  1019. s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer,
  1020. kBlockBasedTableMagicNumber);
  1021. if (!s.ok()) {
  1022. return s;
  1023. }
  1024. if (!BlockBasedTableSupportedVersion(footer.version())) {
  1025. return Status::Corruption(
  1026. "Unknown Footer version. Maybe this file was created with newer "
  1027. "version of RocksDB?");
  1028. }
  1029. // We've successfully read the footer. We are ready to serve requests.
1030. // Better not mutate rep_ after creation, e.g., the internal_prefix_transform
  1031. // raw pointer will be used to create HashIndexReader, whose reset may
  1032. // access a dangling pointer.
  1033. BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  1034. Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
  1035. internal_comparator, skip_filters, level,
  1036. immortal_table);
  1037. rep->file = std::move(file);
  1038. rep->footer = footer;
  1039. rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
  1040. // We need to wrap data with internal_prefix_transform to make sure it can
  1041. // handle prefix correctly.
  1042. if (prefix_extractor != nullptr) {
  1043. rep->internal_prefix_transform.reset(
  1044. new InternalKeySliceTransform(prefix_extractor));
  1045. }
  1046. SetupCacheKeyPrefix(rep);
  1047. std::unique_ptr<BlockBasedTable> new_table(
  1048. new BlockBasedTable(rep, block_cache_tracer));
  1049. // page cache options
  1050. rep->persistent_cache_options =
  1051. PersistentCacheOptions(rep->table_options.persistent_cache,
  1052. std::string(rep->persistent_cache_key_prefix,
  1053. rep->persistent_cache_key_prefix_size),
  1054. rep->ioptions.statistics);
  1055. // Meta-blocks are not dictionary compressed. Explicitly set the dictionary
  1056. // handle to null, otherwise it may be seen as uninitialized during the below
  1057. // meta-block reads.
  1058. rep->compression_dict_handle = BlockHandle::NullBlockHandle();
  1059. // Read metaindex
  1060. std::unique_ptr<Block> metaindex;
  1061. std::unique_ptr<InternalIterator> metaindex_iter;
  1062. s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex,
  1063. &metaindex_iter);
  1064. if (!s.ok()) {
  1065. return s;
  1066. }
  1067. // Populates table_properties and some fields that depend on it,
  1068. // such as index_type.
  1069. s = new_table->ReadPropertiesBlock(prefetch_buffer.get(),
  1070. metaindex_iter.get(), largest_seqno);
  1071. if (!s.ok()) {
  1072. return s;
  1073. }
  1074. s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(),
  1075. internal_comparator, &lookup_context);
  1076. if (!s.ok()) {
  1077. return s;
  1078. }
  1079. s = new_table->PrefetchIndexAndFilterBlocks(
  1080. prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
  1081. prefetch_all, table_options, level, &lookup_context);
  1082. if (s.ok()) {
  1083. // Update tail prefetch stats
  1084. assert(prefetch_buffer.get() != nullptr);
  1085. if (tail_prefetch_stats != nullptr) {
  1086. assert(prefetch_buffer->min_offset_read() < file_size);
  1087. tail_prefetch_stats->RecordEffectiveSize(
  1088. static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
  1089. }
  1090. *table_reader = std::move(new_table);
  1091. }
  1092. return s;
  1093. }
  1094. Status BlockBasedTable::PrefetchTail(
  1095. RandomAccessFileReader* file, uint64_t file_size,
  1096. TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
  1097. const bool preload_all,
  1098. std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
  1099. size_t tail_prefetch_size = 0;
  1100. if (tail_prefetch_stats != nullptr) {
  1101. // Multiple threads may get a 0 (no history) when running in parallel,
  1102. // but it will get cleared after the first of them finishes.
  1103. tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
  1104. }
  1105. if (tail_prefetch_size == 0) {
1106. // Before reading the footer, read ahead backwards to prefetch data. Do more
  1107. // readahead if we're going to read index/filter.
  1108. // TODO: This may incorrectly select small readahead in case partitioned
  1109. // index/filter is enabled and top-level partition pinning is enabled.
  1110. // That's because we need to issue readahead before we read the properties,
  1111. // at which point we don't yet know the index type.
  1112. tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
  1113. }
  1114. size_t prefetch_off;
  1115. size_t prefetch_len;
  1116. if (file_size < tail_prefetch_size) {
  1117. prefetch_off = 0;
  1118. prefetch_len = static_cast<size_t>(file_size);
  1119. } else {
  1120. prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
  1121. prefetch_len = tail_prefetch_size;
  1122. }
  1123. TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
  1124. &tail_prefetch_size);
  1125. Status s;
  1126. // TODO should not have this special logic in the future.
  1127. if (!file->use_direct_io()) {
  1128. prefetch_buffer->reset(new FilePrefetchBuffer(
  1129. nullptr, 0, 0, false /* enable */, true /* track_min_offset */));
  1130. s = file->Prefetch(prefetch_off, prefetch_len);
  1131. } else {
  1132. prefetch_buffer->reset(new FilePrefetchBuffer(
  1133. nullptr, 0, 0, true /* enable */, true /* track_min_offset */));
  1134. s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len);
  1135. }
  1136. return s;
  1137. }
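// NOTE (illustrative, added by the editor; not in the original source): with
// no prefetch history, the tail prefetch size defaults to 512 KB when the
// index/filter blocks will also be read up front (prefetch_all || preload_all)
// and 4 KB otherwise (enough for the footer and typically the metaindex);
// later opens use TailPrefetchStats to size the readahead from what earlier
// opens actually needed.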
  1138. Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len,
  1139. uint32_t expected) {
  1140. Status s;
  1141. uint32_t actual = 0;
  1142. switch (type) {
  1143. case kNoChecksum:
  1144. break;
  1145. case kCRC32c:
  1146. expected = crc32c::Unmask(expected);
  1147. actual = crc32c::Value(buf, len);
  1148. break;
  1149. case kxxHash:
  1150. actual = XXH32(buf, static_cast<int>(len), 0);
  1151. break;
  1152. case kxxHash64:
  1153. actual = static_cast<uint32_t>(XXH64(buf, static_cast<int>(len), 0) &
  1154. uint64_t{0xffffffff});
  1155. break;
  1156. default:
  1157. s = Status::Corruption("unknown checksum type");
  1158. }
  1159. if (s.ok() && actual != expected) {
  1160. s = Status::Corruption("properties block checksum mismatched");
  1161. }
  1162. return s;
  1163. }
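// NOTE (illustrative, added by the editor; not in the original source): each
// block on disk is followed by a 5-byte trailer: 1 compression-type byte plus
// a 4-byte checksum that covers the block contents and the type byte. That is
// why callers pass the block size plus one as `len` and read the expected
// value from that same offset. For kCRC32c the stored value is additionally
// masked, hence the crc32c::Unmask() above.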
  1164. Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno(
  1165. FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value,
  1166. TableProperties** table_properties) {
  1167. assert(table_properties != nullptr);
  1168. // If this is an external SST file ingested with write_global_seqno set to
  1169. // true, then we expect the checksum mismatch because checksum was written
  1170. // by SstFileWriter, but its global seqno in the properties block may have
  1171. // been changed during ingestion. In this case, we read the properties
  1172. // block, copy it to a memory buffer, change the global seqno to its
  1173. // original value, i.e. 0, and verify the checksum again.
  1174. BlockHandle props_block_handle;
  1175. CacheAllocationPtr tmp_buf;
  1176. Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer,
  1177. rep_->footer, rep_->ioptions, table_properties,
  1178. false /* verify_checksum */, &props_block_handle,
  1179. &tmp_buf, false /* compression_type_missing */,
  1180. nullptr /* memory_allocator */);
  1181. if (s.ok() && tmp_buf) {
  1182. const auto seqno_pos_iter =
  1183. (*table_properties)
  1184. ->properties_offsets.find(
  1185. ExternalSstFilePropertyNames::kGlobalSeqno);
  1186. size_t block_size = static_cast<size_t>(props_block_handle.size());
  1187. if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) {
  1188. uint64_t global_seqno_offset = seqno_pos_iter->second;
  1189. EncodeFixed64(
  1190. tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0);
  1191. }
  1192. uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1);
  1193. s = ROCKSDB_NAMESPACE::VerifyChecksum(rep_->footer.checksum(),
  1194. tmp_buf.get(), block_size + 1, value);
  1195. }
  1196. return s;
  1197. }
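// NOTE (illustrative, added by the editor; not in the original source): the
// retry above works because ingestion with write_global_seqno only rewrites
// the 8-byte global seqno field in place and leaves the rest of the
// properties block (and its original checksum) intact. Restoring that field
// to 0 in the in-memory copy reproduces the bytes SstFileWriter originally
// checksummed, so a clean verification confirms the mismatch was caused
// solely by the seqno update.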
  1198. Status BlockBasedTable::ReadPropertiesBlock(
  1199. FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
  1200. const SequenceNumber largest_seqno) {
  1201. bool found_properties_block = true;
  1202. Status s;
  1203. s = SeekToPropertiesBlock(meta_iter, &found_properties_block);
  1204. if (!s.ok()) {
  1205. ROCKS_LOG_WARN(rep_->ioptions.info_log,
  1206. "Error when seeking to properties block from file: %s",
  1207. s.ToString().c_str());
  1208. } else if (found_properties_block) {
  1209. s = meta_iter->status();
  1210. TableProperties* table_properties = nullptr;
  1211. if (s.ok()) {
  1212. s = ReadProperties(
  1213. meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer,
  1214. rep_->ioptions, &table_properties, true /* verify_checksum */,
  1215. nullptr /* ret_block_handle */, nullptr /* ret_block_contents */,
  1216. false /* compression_type_missing */, nullptr /* memory_allocator */);
  1217. }
  1218. if (s.IsCorruption()) {
  1219. s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(),
  1220. &table_properties);
  1221. }
  1222. std::unique_ptr<TableProperties> props_guard;
  1223. if (table_properties != nullptr) {
  1224. props_guard.reset(table_properties);
  1225. }
  1226. if (!s.ok()) {
  1227. ROCKS_LOG_WARN(rep_->ioptions.info_log,
  1228. "Encountered error while reading data from properties "
  1229. "block %s",
  1230. s.ToString().c_str());
  1231. } else {
  1232. assert(table_properties != nullptr);
  1233. rep_->table_properties.reset(props_guard.release());
  1234. rep_->blocks_maybe_compressed =
  1235. rep_->table_properties->compression_name !=
  1236. CompressionTypeToString(kNoCompression);
  1237. rep_->blocks_definitely_zstd_compressed =
  1238. (rep_->table_properties->compression_name ==
  1239. CompressionTypeToString(kZSTD) ||
  1240. rep_->table_properties->compression_name ==
  1241. CompressionTypeToString(kZSTDNotFinalCompression));
  1242. }
  1243. } else {
  1244. ROCKS_LOG_ERROR(rep_->ioptions.info_log,
  1245. "Cannot find Properties block from file.");
  1246. }
  1247. #ifndef ROCKSDB_LITE
  1248. if (rep_->table_properties) {
  1249. ParseSliceTransform(rep_->table_properties->prefix_extractor_name,
  1250. &(rep_->table_prefix_extractor));
  1251. }
  1252. #endif // ROCKSDB_LITE
  1253. // Read the table properties, if provided.
  1254. if (rep_->table_properties) {
  1255. rep_->whole_key_filtering &=
  1256. IsFeatureSupported(*(rep_->table_properties),
  1257. BlockBasedTablePropertyNames::kWholeKeyFiltering,
  1258. rep_->ioptions.info_log);
  1259. rep_->prefix_filtering &=
  1260. IsFeatureSupported(*(rep_->table_properties),
  1261. BlockBasedTablePropertyNames::kPrefixFiltering,
  1262. rep_->ioptions.info_log);
  1263. rep_->index_key_includes_seq =
  1264. rep_->table_properties->index_key_is_user_key == 0;
  1265. rep_->index_value_is_full =
  1266. rep_->table_properties->index_value_is_delta_encoded == 0;
  1267. // Update index_type with the true type.
  1268. // If table properties don't contain index type, we assume that the table
  1269. // is in very old format and has kBinarySearch index type.
  1270. auto& props = rep_->table_properties->user_collected_properties;
  1271. auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
  1272. if (pos != props.end()) {
  1273. rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
  1274. DecodeFixed32(pos->second.c_str()));
  1275. }
  1276. rep_->index_has_first_key =
  1277. rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
  1278. s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
  1279. &(rep_->global_seqno));
  1280. if (!s.ok()) {
  1281. ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str());
  1282. }
  1283. }
  1284. return s;
  1285. }
  1286. Status BlockBasedTable::ReadRangeDelBlock(
  1287. FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
  1288. const InternalKeyComparator& internal_comparator,
  1289. BlockCacheLookupContext* lookup_context) {
  1290. Status s;
  1291. bool found_range_del_block;
  1292. BlockHandle range_del_handle;
  1293. s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle);
  1294. if (!s.ok()) {
  1295. ROCKS_LOG_WARN(
  1296. rep_->ioptions.info_log,
  1297. "Error when seeking to range delete tombstones block from file: %s",
  1298. s.ToString().c_str());
  1299. } else if (found_range_del_block && !range_del_handle.IsNull()) {
  1300. ReadOptions read_options;
  1301. std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
  1302. read_options, range_del_handle,
  1303. /*input_iter=*/nullptr, BlockType::kRangeDeletion,
  1304. /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
  1305. assert(iter != nullptr);
  1306. s = iter->status();
  1307. if (!s.ok()) {
  1308. ROCKS_LOG_WARN(
  1309. rep_->ioptions.info_log,
  1310. "Encountered error while reading data from range del block %s",
  1311. s.ToString().c_str());
  1312. } else {
  1313. rep_->fragmented_range_dels =
  1314. std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
  1315. internal_comparator);
  1316. }
  1317. }
  1318. return s;
  1319. }
  1320. Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
  1321. FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
  1322. BlockBasedTable* new_table, bool prefetch_all,
  1323. const BlockBasedTableOptions& table_options, const int level,
  1324. BlockCacheLookupContext* lookup_context) {
  1325. Status s;
  1326. // Find filter handle and filter type
  1327. if (rep_->filter_policy) {
  1328. for (auto filter_type :
  1329. {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
  1330. Rep::FilterType::kBlockFilter}) {
  1331. std::string prefix;
  1332. switch (filter_type) {
  1333. case Rep::FilterType::kFullFilter:
  1334. prefix = kFullFilterBlockPrefix;
  1335. break;
  1336. case Rep::FilterType::kPartitionedFilter:
  1337. prefix = kPartitionedFilterBlockPrefix;
  1338. break;
  1339. case Rep::FilterType::kBlockFilter:
  1340. prefix = kFilterBlockPrefix;
  1341. break;
  1342. default:
  1343. assert(0);
  1344. }
  1345. std::string filter_block_key = prefix;
  1346. filter_block_key.append(rep_->filter_policy->Name());
  1347. if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
  1348. .ok()) {
  1349. rep_->filter_type = filter_type;
  1350. break;
  1351. }
  1352. }
  1353. }
  1354. // Find compression dictionary handle
  1355. bool found_compression_dict = false;
  1356. s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
  1357. &rep_->compression_dict_handle);
  1358. if (!s.ok()) {
  1359. return s;
  1360. }
  1361. BlockBasedTableOptions::IndexType index_type = rep_->index_type;
  1362. const bool use_cache = table_options.cache_index_and_filter_blocks;
  1363. // pin both index and filters, down to all partitions
  1364. const bool pin_all =
  1365. rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
  1366. // prefetch the first level of index
  1367. const bool prefetch_index =
  1368. prefetch_all ||
  1369. (table_options.pin_top_level_index_and_filter &&
  1370. index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  1371. // pin the first level of index
  1372. const bool pin_index =
  1373. pin_all || (table_options.pin_top_level_index_and_filter &&
  1374. index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
  1375. std::unique_ptr<IndexReader> index_reader;
  1376. s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
  1377. prefetch_index, pin_index, lookup_context,
  1378. &index_reader);
  1379. if (!s.ok()) {
  1380. return s;
  1381. }
  1382. rep_->index_reader = std::move(index_reader);
1383. // The partitions of a partitioned index are always stored in the cache. They
1384. // hence follow the configuration for pin and prefetch regardless of
1385. // the value of cache_index_and_filter_blocks.
  1386. if (prefetch_all) {
  1387. rep_->index_reader->CacheDependencies(pin_all);
  1388. }
  1389. // prefetch the first level of filter
  1390. const bool prefetch_filter =
  1391. prefetch_all ||
  1392. (table_options.pin_top_level_index_and_filter &&
  1393. rep_->filter_type == Rep::FilterType::kPartitionedFilter);
1394. // Partitioned filters cannot be enabled without partitioned indexes
  1395. assert(!prefetch_filter || prefetch_index);
  1396. // pin the first level of filter
  1397. const bool pin_filter =
  1398. pin_all || (table_options.pin_top_level_index_and_filter &&
  1399. rep_->filter_type == Rep::FilterType::kPartitionedFilter);
  1400. if (rep_->filter_policy) {
  1401. auto filter = new_table->CreateFilterBlockReader(
  1402. prefetch_buffer, use_cache, prefetch_filter, pin_filter,
  1403. lookup_context);
  1404. if (filter) {
1405. // Refer to the comment above about partitioned indexes always being cached
  1406. if (prefetch_all) {
  1407. filter->CacheDependencies(pin_all);
  1408. }
  1409. rep_->filter = std::move(filter);
  1410. }
  1411. }
  1412. if (!rep_->compression_dict_handle.IsNull()) {
  1413. std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
  1414. s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
  1415. prefetch_all, pin_all, lookup_context,
  1416. &uncompression_dict_reader);
  1417. if (!s.ok()) {
  1418. return s;
  1419. }
  1420. rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
  1421. }
  1422. assert(s.ok());
  1423. return s;
  1424. }
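// NOTE (illustrative, added by the editor; not in the original source) --
// a summary of the pin/prefetch decisions above:
//   prefetch_all = prefetch_index_and_filter_in_cache || level == 0
//   pin_all      = pin_l0_filter_and_index_blocks_in_cache && level == 0
// In addition, the top-level index/filter partitions are prefetched and/or
// pinned when pin_top_level_index_and_filter is set and the index/filter is
// partitioned.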
  1425. void BlockBasedTable::SetupForCompaction() {
  1426. switch (rep_->ioptions.access_hint_on_compaction_start) {
  1427. case Options::NONE:
  1428. break;
  1429. case Options::NORMAL:
  1430. rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
  1431. break;
  1432. case Options::SEQUENTIAL:
  1433. rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
  1434. break;
  1435. case Options::WILLNEED:
  1436. rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
  1437. break;
  1438. default:
  1439. assert(false);
  1440. }
  1441. }
  1442. std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
  1443. const {
  1444. return rep_->table_properties;
  1445. }
  1446. size_t BlockBasedTable::ApproximateMemoryUsage() const {
  1447. size_t usage = 0;
  1448. if (rep_->filter) {
  1449. usage += rep_->filter->ApproximateMemoryUsage();
  1450. }
  1451. if (rep_->index_reader) {
  1452. usage += rep_->index_reader->ApproximateMemoryUsage();
  1453. }
  1454. if (rep_->uncompression_dict_reader) {
  1455. usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
  1456. }
  1457. return usage;
  1458. }
1459. // Load the metaindex block from the file. On success, return the
1460. // loaded metaindex block and its iterator through the metaindex_block
1461. // and iter out-parameters.
  1462. Status BlockBasedTable::ReadMetaIndexBlock(
  1463. FilePrefetchBuffer* prefetch_buffer,
  1464. std::unique_ptr<Block>* metaindex_block,
  1465. std::unique_ptr<InternalIterator>* iter) {
  1466. // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  1467. // it is an empty block.
  1468. std::unique_ptr<Block> metaindex;
  1469. Status s = ReadBlockFromFile(
  1470. rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(),
  1471. rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
  1472. true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
  1473. UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
  1474. kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
  1475. GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
  1476. rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */);
  1477. if (!s.ok()) {
  1478. ROCKS_LOG_ERROR(rep_->ioptions.info_log,
  1479. "Encountered error while reading data from properties"
  1480. " block %s",
  1481. s.ToString().c_str());
  1482. return s;
  1483. }
  1484. *metaindex_block = std::move(metaindex);
  1485. // meta block uses bytewise comparator.
  1486. iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(),
  1487. BytewiseComparator()));
  1488. return Status::OK();
  1489. }
  1490. template <typename TBlocklike>
  1491. Status BlockBasedTable::GetDataBlockFromCache(
  1492. const Slice& block_cache_key, const Slice& compressed_block_cache_key,
  1493. Cache* block_cache, Cache* block_cache_compressed,
  1494. const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
  1495. const UncompressionDict& uncompression_dict, BlockType block_type,
  1496. GetContext* get_context) const {
  1497. const size_t read_amp_bytes_per_bit =
  1498. block_type == BlockType::kData
  1499. ? rep_->table_options.read_amp_bytes_per_bit
  1500. : 0;
  1501. assert(block);
  1502. assert(block->IsEmpty());
  1503. Status s;
  1504. BlockContents* compressed_block = nullptr;
  1505. Cache::Handle* block_cache_compressed_handle = nullptr;
  1506. // Lookup uncompressed cache first
  1507. if (block_cache != nullptr) {
  1508. auto cache_handle = GetEntryFromCache(block_cache, block_cache_key,
  1509. block_type, get_context);
  1510. if (cache_handle != nullptr) {
  1511. block->SetCachedValue(
  1512. reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
  1513. block_cache, cache_handle);
  1514. return s;
  1515. }
  1516. }
  1517. // If not found, search from the compressed block cache.
  1518. assert(block->IsEmpty());
  1519. if (block_cache_compressed == nullptr) {
  1520. return s;
  1521. }
  1522. assert(!compressed_block_cache_key.empty());
  1523. block_cache_compressed_handle =
  1524. block_cache_compressed->Lookup(compressed_block_cache_key);
  1525. Statistics* statistics = rep_->ioptions.statistics;
1526. // If we found it in the compressed cache, uncompress it and insert it into
1527. // the uncompressed cache.
  1528. if (block_cache_compressed_handle == nullptr) {
  1529. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
  1530. return s;
  1531. }
  1532. // found compressed block
  1533. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
  1534. compressed_block = reinterpret_cast<BlockContents*>(
  1535. block_cache_compressed->Value(block_cache_compressed_handle));
  1536. CompressionType compression_type = compressed_block->get_compression_type();
  1537. assert(compression_type != kNoCompression);
  1538. // Retrieve the uncompressed contents into a new buffer
  1539. BlockContents contents;
  1540. UncompressionContext context(compression_type);
  1541. UncompressionInfo info(context, uncompression_dict, compression_type);
  1542. s = UncompressBlockContents(
  1543. info, compressed_block->data.data(), compressed_block->data.size(),
  1544. &contents, rep_->table_options.format_version, rep_->ioptions,
  1545. GetMemoryAllocator(rep_->table_options));
  1546. // Insert uncompressed block into block cache
  1547. if (s.ok()) {
  1548. std::unique_ptr<TBlocklike> block_holder(
  1549. BlocklikeTraits<TBlocklike>::Create(
  1550. std::move(contents), rep_->get_global_seqno(block_type),
  1551. read_amp_bytes_per_bit, statistics,
  1552. rep_->blocks_definitely_zstd_compressed,
  1553. rep_->table_options.filter_policy.get())); // uncompressed block
  1554. if (block_cache != nullptr && block_holder->own_bytes() &&
  1555. read_options.fill_cache) {
  1556. size_t charge = block_holder->ApproximateMemoryUsage();
  1557. Cache::Handle* cache_handle = nullptr;
  1558. s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
  1559. &DeleteCachedEntry<TBlocklike>, &cache_handle);
  1560. if (s.ok()) {
  1561. assert(cache_handle != nullptr);
  1562. block->SetCachedValue(block_holder.release(), block_cache,
  1563. cache_handle);
  1564. UpdateCacheInsertionMetrics(block_type, get_context, charge);
  1565. } else {
  1566. RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
  1567. }
  1568. } else {
  1569. block->SetOwnedValue(block_holder.release());
  1570. }
  1571. }
  1572. // Release hold on compressed cache entry
  1573. block_cache_compressed->Release(block_cache_compressed_handle);
  1574. return s;
  1575. }
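// NOTE (illustrative, added by the editor; not in the original source): the
// lookup order in the function above is
//   1. uncompressed block cache (hit -> return the cached block as-is)
//   2. compressed block cache (hit -> uncompress, and if fill_cache is set,
//      insert the uncompressed copy into the uncompressed cache)
// A miss in both caches returns an empty entry with an OK status; the caller
// decides whether to read the block from the file.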
  1576. template <typename TBlocklike>
  1577. Status BlockBasedTable::PutDataBlockToCache(
  1578. const Slice& block_cache_key, const Slice& compressed_block_cache_key,
  1579. Cache* block_cache, Cache* block_cache_compressed,
  1580. CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
  1581. CompressionType raw_block_comp_type,
  1582. const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
  1583. MemoryAllocator* memory_allocator, BlockType block_type,
  1584. GetContext* get_context) const {
  1585. const ImmutableCFOptions& ioptions = rep_->ioptions;
  1586. const uint32_t format_version = rep_->table_options.format_version;
  1587. const size_t read_amp_bytes_per_bit =
  1588. block_type == BlockType::kData
  1589. ? rep_->table_options.read_amp_bytes_per_bit
  1590. : 0;
  1591. const Cache::Priority priority =
  1592. rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
  1593. (block_type == BlockType::kFilter ||
  1594. block_type == BlockType::kCompressionDictionary ||
  1595. block_type == BlockType::kIndex)
  1596. ? Cache::Priority::HIGH
  1597. : Cache::Priority::LOW;
  1598. assert(cached_block);
  1599. assert(cached_block->IsEmpty());
  1600. Status s;
  1601. Statistics* statistics = ioptions.statistics;
  1602. std::unique_ptr<TBlocklike> block_holder;
  1603. if (raw_block_comp_type != kNoCompression) {
  1604. // Retrieve the uncompressed contents into a new buffer
  1605. BlockContents uncompressed_block_contents;
  1606. UncompressionContext context(raw_block_comp_type);
  1607. UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
  1608. s = UncompressBlockContents(info, raw_block_contents->data.data(),
  1609. raw_block_contents->data.size(),
  1610. &uncompressed_block_contents, format_version,
  1611. ioptions, memory_allocator);
  1612. if (!s.ok()) {
  1613. return s;
  1614. }
  1615. block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
  1616. std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit,
  1617. statistics, rep_->blocks_definitely_zstd_compressed,
  1618. rep_->table_options.filter_policy.get()));
  1619. } else {
  1620. block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
  1621. std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit,
  1622. statistics, rep_->blocks_definitely_zstd_compressed,
  1623. rep_->table_options.filter_policy.get()));
  1624. }
  1625. // Insert compressed block into compressed block cache.
  1626. // Release the hold on the compressed cache entry immediately.
  1627. if (block_cache_compressed != nullptr &&
  1628. raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
  1629. raw_block_contents->own_bytes()) {
  1630. #ifndef NDEBUG
  1631. assert(raw_block_contents->is_raw_block);
  1632. #endif // NDEBUG
  1633. // We cannot directly put raw_block_contents because this could point to
1634. // an object on the stack.
  1635. BlockContents* block_cont_for_comp_cache =
  1636. new BlockContents(std::move(*raw_block_contents));
  1637. s = block_cache_compressed->Insert(
  1638. compressed_block_cache_key, block_cont_for_comp_cache,
  1639. block_cont_for_comp_cache->ApproximateMemoryUsage(),
  1640. &DeleteCachedEntry<BlockContents>);
  1641. if (s.ok()) {
1642. // The cache now owns the block; the code below must not delete it.
  1643. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
  1644. } else {
  1645. RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
  1646. delete block_cont_for_comp_cache;
  1647. }
  1648. }
  1649. // insert into uncompressed block cache
  1650. if (block_cache != nullptr && block_holder->own_bytes()) {
  1651. size_t charge = block_holder->ApproximateMemoryUsage();
  1652. Cache::Handle* cache_handle = nullptr;
  1653. s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
  1654. &DeleteCachedEntry<TBlocklike>, &cache_handle,
  1655. priority);
  1656. if (s.ok()) {
  1657. assert(cache_handle != nullptr);
  1658. cached_block->SetCachedValue(block_holder.release(), block_cache,
  1659. cache_handle);
  1660. UpdateCacheInsertionMetrics(block_type, get_context, charge);
  1661. } else {
  1662. RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
  1663. }
  1664. } else {
  1665. cached_block->SetOwnedValue(block_holder.release());
  1666. }
  1667. return s;
  1668. }
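// NOTE (illustrative, added by the editor; not in the original source): the
// inserts above use Cache::Priority::HIGH only for filter, index and
// compression-dictionary blocks, and only when
// cache_index_and_filter_blocks_with_high_priority is enabled; data blocks
// are always inserted with LOW priority.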
  1669. std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
  1670. FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
  1671. bool pin, BlockCacheLookupContext* lookup_context) {
  1672. auto& rep = rep_;
  1673. auto filter_type = rep->filter_type;
  1674. if (filter_type == Rep::FilterType::kNoFilter) {
  1675. return std::unique_ptr<FilterBlockReader>();
  1676. }
  1677. assert(rep->filter_policy);
  1678. switch (filter_type) {
  1679. case Rep::FilterType::kPartitionedFilter:
  1680. return PartitionedFilterBlockReader::Create(
  1681. this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
  1682. case Rep::FilterType::kBlockFilter:
  1683. return BlockBasedFilterBlockReader::Create(
  1684. this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
  1685. case Rep::FilterType::kFullFilter:
  1686. return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
  1687. prefetch, pin, lookup_context);
  1688. default:
  1689. // filter_type is either kNoFilter (exited the function at the first if),
  1690. // or it must be covered in this switch block
  1691. assert(false);
  1692. return std::unique_ptr<FilterBlockReader>();
  1693. }
  1694. }
  1695. // disable_prefix_seek should be set to true when prefix_extractor found in SST
  1696. // differs from the one in mutable_cf_options and index type is HashBasedIndex
  1697. InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
  1698. const ReadOptions& read_options, bool disable_prefix_seek,
  1699. IndexBlockIter* input_iter, GetContext* get_context,
  1700. BlockCacheLookupContext* lookup_context) const {
  1701. assert(rep_ != nullptr);
  1702. assert(rep_->index_reader != nullptr);
  1703. // We don't return pinned data from index blocks, so no need
  1704. // to set `block_contents_pinned`.
  1705. return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
  1706. input_iter, get_context,
  1707. lookup_context);
  1708. }
  1709. // Convert an index iterator value (i.e., an encoded BlockHandle)
  1710. // into an iterator over the contents of the corresponding block.
1711. // If input_iter is null, allocate and return a new iterator.
1712. // If input_iter is not null, update it in place and return it.
  1713. template <typename TBlockIter>
  1714. TBlockIter* BlockBasedTable::NewDataBlockIterator(
  1715. const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
  1716. BlockType block_type, GetContext* get_context,
  1717. BlockCacheLookupContext* lookup_context, Status s,
  1718. FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
  1719. PERF_TIMER_GUARD(new_table_block_iter_nanos);
  1720. TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  1721. if (!s.ok()) {
  1722. iter->Invalidate(s);
  1723. return iter;
  1724. }
  1725. CachableEntry<UncompressionDict> uncompression_dict;
  1726. if (rep_->uncompression_dict_reader) {
  1727. const bool no_io = (ro.read_tier == kBlockCacheTier);
  1728. s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
  1729. prefetch_buffer, no_io, get_context, lookup_context,
  1730. &uncompression_dict);
  1731. if (!s.ok()) {
  1732. iter->Invalidate(s);
  1733. return iter;
  1734. }
  1735. }
  1736. const UncompressionDict& dict = uncompression_dict.GetValue()
  1737. ? *uncompression_dict.GetValue()
  1738. : UncompressionDict::GetEmptyDict();
  1739. CachableEntry<Block> block;
  1740. s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
  1741. get_context, lookup_context, for_compaction,
  1742. /* use_cache */ true);
  1743. if (!s.ok()) {
  1744. assert(block.IsEmpty());
  1745. iter->Invalidate(s);
  1746. return iter;
  1747. }
  1748. assert(block.GetValue() != nullptr);
  1749. // Block contents are pinned and it is still pinned after the iterator
  1750. // is destroyed as long as cleanup functions are moved to another object,
  1751. // when:
  1752. // 1. block cache handle is set to be released in cleanup function, or
  1753. // 2. it's pointing to immortal source. If own_bytes is true then we are
  1754. // not reading data from the original source, whether immortal or not.
  1755. // Otherwise, the block is pinned iff the source is immortal.
  1756. const bool block_contents_pinned =
  1757. block.IsCached() ||
  1758. (!block.GetValue()->own_bytes() && rep_->immortal_table);
  1759. iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
  1760. block_contents_pinned);
  1761. if (!block.IsCached()) {
  1762. if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
  1763. // insert a dummy record to block cache to track the memory usage
  1764. Cache* const block_cache = rep_->table_options.block_cache.get();
  1765. Cache::Handle* cache_handle = nullptr;
  1766. // There are two other types of cache keys: 1) SST cache key added in
  1767. // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
  1768. // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
1769. // from the SST cache key (31 bytes), and use a non-zero prefix to
  1770. // differentiate from `write_buffer_manager`
  1771. const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
  1772. char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
  1773. // Prefix: use rep_->cache_key_prefix padded by 0s
  1774. memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
  1775. assert(rep_->cache_key_prefix_size != 0);
  1776. assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
  1777. memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
  1778. char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
  1779. next_cache_key_id_++);
  1780. assert(end - cache_key <=
  1781. static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
  1782. const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
  1783. s = block_cache->Insert(unique_key, nullptr,
  1784. block.GetValue()->ApproximateMemoryUsage(),
  1785. nullptr, &cache_handle);
  1786. if (s.ok()) {
  1787. assert(cache_handle != nullptr);
  1788. iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
  1789. cache_handle);
  1790. }
  1791. }
  1792. } else {
  1793. iter->SetCacheHandle(block.GetCacheHandle());
  1794. }
  1795. block.TransferTo(iter);
  1796. return iter;
  1797. }
  1798. template <>
  1799. DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
  1800. const Rep* rep, Block* block, DataBlockIter* input_iter,
  1801. bool block_contents_pinned) {
  1802. return block->NewDataIterator(
  1803. &rep->internal_comparator, rep->internal_comparator.user_comparator(),
  1804. input_iter, rep->ioptions.statistics, block_contents_pinned);
  1805. }
  1806. template <>
  1807. IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
  1808. const Rep* rep, Block* block, IndexBlockIter* input_iter,
  1809. bool block_contents_pinned) {
  1810. return block->NewIndexIterator(
  1811. &rep->internal_comparator, rep->internal_comparator.user_comparator(),
  1812. input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
  1813. rep->index_has_first_key, rep->index_key_includes_seq,
  1814. rep->index_value_is_full, block_contents_pinned);
  1815. }
1816. // Convert an uncompressed data block (i.e., a CachableEntry<Block>)
1817. // into an iterator over the contents of the corresponding block.
1818. // If input_iter is null, allocate and return a new iterator.
1819. // If input_iter is not null, update it in place and return it.
  1820. template <typename TBlockIter>
  1821. TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
  1822. CachableEntry<Block>& block,
  1823. TBlockIter* input_iter,
  1824. Status s) const {
  1825. PERF_TIMER_GUARD(new_table_block_iter_nanos);
  1826. TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
  1827. if (!s.ok()) {
  1828. iter->Invalidate(s);
  1829. return iter;
  1830. }
  1831. assert(block.GetValue() != nullptr);
  1832. // Block contents are pinned and it is still pinned after the iterator
  1833. // is destroyed as long as cleanup functions are moved to another object,
  1834. // when:
  1835. // 1. block cache handle is set to be released in cleanup function, or
  1836. // 2. it's pointing to immortal source. If own_bytes is true then we are
  1837. // not reading data from the original source, whether immortal or not.
  1838. // Otherwise, the block is pinned iff the source is immortal.
  1839. const bool block_contents_pinned =
  1840. block.IsCached() ||
  1841. (!block.GetValue()->own_bytes() && rep_->immortal_table);
  1842. iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), iter,
  1843. block_contents_pinned);
  1844. if (!block.IsCached()) {
  1845. if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) {
  1846. // insert a dummy record to block cache to track the memory usage
  1847. Cache* const block_cache = rep_->table_options.block_cache.get();
  1848. Cache::Handle* cache_handle = nullptr;
  1849. // There are two other types of cache keys: 1) SST cache key added in
  1850. // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in
  1851. // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate
1852. // from the SST cache key (31 bytes), and use a non-zero prefix to
  1853. // differentiate from `write_buffer_manager`
  1854. const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
  1855. char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
  1856. // Prefix: use rep_->cache_key_prefix padded by 0s
  1857. memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
  1858. assert(rep_->cache_key_prefix_size != 0);
  1859. assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
  1860. memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
  1861. char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
  1862. next_cache_key_id_++);
  1863. assert(end - cache_key <=
  1864. static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
  1865. const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
  1866. s = block_cache->Insert(unique_key, nullptr,
  1867. block.GetValue()->ApproximateMemoryUsage(),
  1868. nullptr, &cache_handle);
  1869. if (s.ok()) {
  1870. assert(cache_handle != nullptr);
  1871. iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
  1872. cache_handle);
  1873. }
  1874. }
  1875. } else {
  1876. iter->SetCacheHandle(block.GetCacheHandle());
  1877. }
  1878. block.TransferTo(iter);
  1879. return iter;
  1880. }
  1881. // If contents is nullptr, this function looks up the block caches for the
1882. // data block referenced by handle, and reads the block from disk if necessary.
  1883. // If contents is non-null, it skips the cache lookup and disk read, since
  1884. // the caller has already read it. In both cases, if ro.fill_cache is true,
  1885. // it inserts the block into the block cache.
  1886. template <typename TBlocklike>
  1887. Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
  1888. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  1889. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  1890. CachableEntry<TBlocklike>* block_entry, BlockType block_type,
  1891. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  1892. BlockContents* contents) const {
  1893. assert(block_entry != nullptr);
  1894. const bool no_io = (ro.read_tier == kBlockCacheTier);
  1895. Cache* block_cache = rep_->table_options.block_cache.get();
1896. // No point in caching compressed blocks if the table never goes away
  1897. Cache* block_cache_compressed =
  1898. rep_->immortal_table ? nullptr
  1899. : rep_->table_options.block_cache_compressed.get();
  1900. // First, try to get the block from the cache
  1901. //
  1902. // If either block cache is enabled, we'll try to read from it.
  1903. Status s;
  1904. char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  1905. char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  1906. Slice key /* key to the block cache */;
  1907. Slice ckey /* key to the compressed block cache */;
  1908. bool is_cache_hit = false;
  1909. if (block_cache != nullptr || block_cache_compressed != nullptr) {
  1910. // create key for block cache
  1911. if (block_cache != nullptr) {
  1912. key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
  1913. handle, cache_key);
  1914. }
  1915. if (block_cache_compressed != nullptr) {
  1916. ckey = GetCacheKey(rep_->compressed_cache_key_prefix,
  1917. rep_->compressed_cache_key_prefix_size, handle,
  1918. compressed_cache_key);
  1919. }
  1920. if (!contents) {
  1921. s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
  1922. ro, block_entry, uncompression_dict, block_type,
  1923. get_context);
  1924. if (block_entry->GetValue()) {
  1925. // TODO(haoyu): Differentiate cache hit on uncompressed block cache and
  1926. // compressed block cache.
  1927. is_cache_hit = true;
  1928. }
  1929. }
  1930. // Can't find the block from the cache. If I/O is allowed, read from the
  1931. // file.
  1932. if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) {
  1933. Statistics* statistics = rep_->ioptions.statistics;
  1934. const bool maybe_compressed =
  1935. block_type != BlockType::kFilter &&
  1936. block_type != BlockType::kCompressionDictionary &&
  1937. rep_->blocks_maybe_compressed;
  1938. const bool do_uncompress = maybe_compressed && !block_cache_compressed;
  1939. CompressionType raw_block_comp_type;
  1940. BlockContents raw_block_contents;
  1941. if (!contents) {
  1942. StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
  1943. BlockFetcher block_fetcher(
  1944. rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
  1945. &raw_block_contents, rep_->ioptions, do_uncompress,
  1946. maybe_compressed, block_type, uncompression_dict,
  1947. rep_->persistent_cache_options,
  1948. GetMemoryAllocator(rep_->table_options),
  1949. GetMemoryAllocatorForCompressedBlock(rep_->table_options));
  1950. s = block_fetcher.ReadBlockContents();
  1951. raw_block_comp_type = block_fetcher.get_compression_type();
  1952. contents = &raw_block_contents;
  1953. } else {
  1954. raw_block_comp_type = contents->get_compression_type();
  1955. }
  1956. if (s.ok()) {
  1957. SequenceNumber seq_no = rep_->get_global_seqno(block_type);
  1958. // If filling cache is allowed and a cache is configured, try to put the
  1959. // block to the cache.
  1960. s = PutDataBlockToCache(
  1961. key, ckey, block_cache, block_cache_compressed, block_entry,
  1962. contents, raw_block_comp_type, uncompression_dict, seq_no,
  1963. GetMemoryAllocator(rep_->table_options), block_type, get_context);
  1964. }
  1965. }
  1966. }
  1967. // Fill lookup_context.
  1968. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
  1969. lookup_context) {
  1970. size_t usage = 0;
  1971. uint64_t nkeys = 0;
  1972. if (block_entry->GetValue()) {
  1973. // Approximate the number of keys in the block using restarts.
  1974. nkeys =
  1975. rep_->table_options.block_restart_interval *
  1976. BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
  1977. usage = block_entry->GetValue()->ApproximateMemoryUsage();
  1978. }
  1979. TraceType trace_block_type = TraceType::kTraceMax;
  1980. switch (block_type) {
  1981. case BlockType::kData:
  1982. trace_block_type = TraceType::kBlockTraceDataBlock;
  1983. break;
  1984. case BlockType::kFilter:
  1985. trace_block_type = TraceType::kBlockTraceFilterBlock;
  1986. break;
  1987. case BlockType::kCompressionDictionary:
  1988. trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
  1989. break;
  1990. case BlockType::kRangeDeletion:
  1991. trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
  1992. break;
  1993. case BlockType::kIndex:
  1994. trace_block_type = TraceType::kBlockTraceIndexBlock;
  1995. break;
  1996. default:
  1997. // This cannot happen.
  1998. assert(false);
  1999. break;
  2000. }
  2001. bool no_insert = no_io || !ro.fill_cache;
  2002. if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
  2003. trace_block_type, lookup_context->caller)) {
  2004. // Defer logging the access to Get() and MultiGet() to trace additional
  2005. // information, e.g., referenced_key_exist_in_block.
  2006. // Make a copy of the block key here since it will be logged later.
  2007. lookup_context->FillLookupContext(
  2008. is_cache_hit, no_insert, trace_block_type,
  2009. /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
  2010. } else {
  2011. // Avoid making copy of block_key and cf_name when constructing the access
  2012. // record.
  2013. BlockCacheTraceRecord access_record(
  2014. rep_->ioptions.env->NowMicros(),
  2015. /*block_key=*/"", trace_block_type,
  2016. /*block_size=*/usage, rep_->cf_id_for_tracing(),
  2017. /*cf_name=*/"", rep_->level_for_tracing(),
  2018. rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
  2019. no_insert, lookup_context->get_id,
  2020. lookup_context->get_from_user_specified_snapshot,
  2021. /*referenced_key=*/"");
  2022. block_cache_tracer_->WriteBlockAccess(access_record, key,
  2023. rep_->cf_name_for_tracing(),
  2024. lookup_context->referenced_key);
  2025. }
  2026. }
  2027. assert(s.ok() || block_entry->GetValue() == nullptr);
  2028. return s;
  2029. }
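// NOTE (illustrative, added by the editor; not in the original source): when
// block cache tracing is enabled, the access record above is either filled
// into lookup_context (for Get/MultiGet on data blocks, so extra fields such
// as referenced_key can be added later) or written out immediately for all
// other block types and callers.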
  2030. // This function reads multiple data blocks from disk using Env::MultiRead()
  2031. // and optionally inserts them into the block cache. It uses the scratch
  2032. // buffer provided by the caller, which is contiguous. If scratch is a nullptr
  2033. // it allocates a separate buffer for each block. Typically, if the blocks
  2034. // need to be uncompressed and there is no compressed block cache, callers
  2035. // can allocate a temporary scratch buffer in order to minimize memory
  2036. // allocations.
2037. // If options.fill_cache is true, it inserts the blocks into the cache. If it's
  2038. // false and scratch is non-null and the blocks are uncompressed, it copies
  2039. // the buffers to heap. In any case, the CachableEntry<Block> returned will
  2040. // own the data bytes.
  2041. // If compression is enabled and also there is no compressed block cache,
  2042. // the adjacent blocks are read out in one IO (combined read)
  2043. // batch - A MultiGetRange with only those keys with unique data blocks not
  2044. // found in cache
2045. // handles - A vector of block handles. Some of them may be null handles.
  2046. // scratch - An optional contiguous buffer to read compressed blocks into
  2047. void BlockBasedTable::RetrieveMultipleBlocks(
  2048. const ReadOptions& options, const MultiGetRange* batch,
  2049. const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
  2050. autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
  2051. autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>* results,
  2052. char* scratch, const UncompressionDict& uncompression_dict) const {
  2053. RandomAccessFileReader* file = rep_->file.get();
  2054. const Footer& footer = rep_->footer;
  2055. const ImmutableCFOptions& ioptions = rep_->ioptions;
  2056. SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData);
  2057. size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
  2058. MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
  2059. if (file->use_direct_io() || ioptions.allow_mmap_reads) {
  2060. size_t idx_in_batch = 0;
  2061. for (auto mget_iter = batch->begin(); mget_iter != batch->end();
  2062. ++mget_iter, ++idx_in_batch) {
  2063. BlockCacheLookupContext lookup_data_block_context(
  2064. TableReaderCaller::kUserMultiGet);
  2065. const BlockHandle& handle = (*handles)[idx_in_batch];
  2066. if (handle.IsNull()) {
  2067. continue;
  2068. }
  2069. (*statuses)[idx_in_batch] =
  2070. RetrieveBlock(nullptr, options, handle, uncompression_dict,
  2071. &(*results)[idx_in_batch], BlockType::kData,
  2072. mget_iter->get_context, &lookup_data_block_context,
  2073. /* for_compaction */ false, /* use_cache */ true);
  2074. }
  2075. return;
  2076. }
  2077. autovector<FSReadRequest, MultiGetContext::MAX_BATCH_SIZE> read_reqs;
  2078. size_t buf_offset = 0;
  2079. size_t idx_in_batch = 0;
  2080. uint64_t prev_offset = 0;
  2081. size_t prev_len = 0;
  2082. autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_idx_for_block;
  2083. autovector<size_t, MultiGetContext::MAX_BATCH_SIZE> req_offset_for_block;
  2084. for (auto mget_iter = batch->begin(); mget_iter != batch->end();
  2085. ++mget_iter, ++idx_in_batch) {
  2086. const BlockHandle& handle = (*handles)[idx_in_batch];
  2087. if (handle.IsNull()) {
  2088. continue;
  2089. }
  2090. size_t prev_end = static_cast<size_t>(prev_offset) + prev_len;
2091. // If the current block is adjacent to the previous one and, at the same time,
2092. // compression is enabled and there is no compressed cache, we combine
2093. // the two block reads into one.
  2094. if (scratch != nullptr && prev_end == handle.offset()) {
  2095. req_offset_for_block.emplace_back(prev_len);
  2096. prev_len += block_size(handle);
  2097. } else {
2098. // No compression, or the current block and the previous one are not adjacent:
  2099. // Step 1, create a new request for previous blocks
  2100. if (prev_len != 0) {
  2101. FSReadRequest req;
  2102. req.offset = prev_offset;
  2103. req.len = prev_len;
  2104. if (scratch == nullptr) {
  2105. req.scratch = new char[req.len];
  2106. } else {
  2107. req.scratch = scratch + buf_offset;
  2108. buf_offset += req.len;
  2109. }
  2110. req.status = IOStatus::OK();
  2111. read_reqs.emplace_back(req);
  2112. }
2113. // Step 2, remember the previous block info
  2114. prev_offset = handle.offset();
  2115. prev_len = block_size(handle);
  2116. req_offset_for_block.emplace_back(0);
  2117. }
  2118. req_idx_for_block.emplace_back(read_reqs.size());
  2119. }
  2120. // Handle the last block and process the pending last request
  2121. if (prev_len != 0) {
  2122. FSReadRequest req;
  2123. req.offset = prev_offset;
  2124. req.len = prev_len;
  2125. if (scratch == nullptr) {
  2126. req.scratch = new char[req.len];
  2127. } else {
  2128. req.scratch = scratch + buf_offset;
  2129. }
  2130. req.status = IOStatus::OK();
  2131. read_reqs.emplace_back(req);
  2132. }
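// NOTE (illustrative, added by the editor; not in the original source): at
// this point read_reqs holds one FSReadRequest per run of adjacent blocks
// (when scratch != nullptr) or one per block otherwise, and
// req_idx_for_block / req_offset_for_block map each non-null handle to its
// request and to its byte offset within that request.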
  2133. file->MultiRead(&read_reqs[0], read_reqs.size());
  2134. idx_in_batch = 0;
  2135. size_t valid_batch_idx = 0;
  2136. for (auto mget_iter = batch->begin(); mget_iter != batch->end();
  2137. ++mget_iter, ++idx_in_batch) {
  2138. const BlockHandle& handle = (*handles)[idx_in_batch];
  2139. if (handle.IsNull()) {
  2140. continue;
  2141. }
  2142. assert(valid_batch_idx < req_idx_for_block.size());
  2143. assert(valid_batch_idx < req_offset_for_block.size());
  2144. assert(req_idx_for_block[valid_batch_idx] < read_reqs.size());
  2145. size_t& req_idx = req_idx_for_block[valid_batch_idx];
  2146. size_t& req_offset = req_offset_for_block[valid_batch_idx];
  2147. valid_batch_idx++;
  2148. FSReadRequest& req = read_reqs[req_idx];
  2149. Status s = req.status;
  2150. if (s.ok()) {
  2151. if (req.result.size() != req.len) {
  2152. s = Status::Corruption(
  2153. "truncated block read from " + rep_->file->file_name() +
  2154. " offset " + ToString(handle.offset()) + ", expected " +
  2155. ToString(req.len) + " bytes, got " + ToString(req.result.size()));
  2156. }
  2157. }
  2158. BlockContents raw_block_contents;
  2159. size_t cur_read_end = req_offset + block_size(handle);
  2160. if (cur_read_end > req.result.size()) {
  2161. s = Status::Corruption(
  2162. "truncated block read from " + rep_->file->file_name() + " offset " +
  2163. ToString(handle.offset()) + ", expected " + ToString(req.len) +
  2164. " bytes, got " + ToString(req.result.size()));
  2165. }
  2166. bool blocks_share_read_buffer = (req.result.size() != block_size(handle));
  2167. if (s.ok()) {
  2168. if (scratch == nullptr && !blocks_share_read_buffer) {
  2169. // We allocated a buffer for this block. Give ownership of it to
  2170. // BlockContents so it can free the memory
  2171. assert(req.result.data() == req.scratch);
  2172. std::unique_ptr<char[]> raw_block(req.scratch + req_offset);
  2173. raw_block_contents = BlockContents(std::move(raw_block), handle.size());
  2174. } else {
2175. // We used the scratch buffer, which is shared by the blocks.
2176. // raw_block_contents does not own the memory.
  2177. raw_block_contents =
  2178. BlockContents(Slice(req.scratch + req_offset, handle.size()));
  2179. }
  2180. #ifndef NDEBUG
  2181. raw_block_contents.is_raw_block = true;
  2182. #endif
  2183. if (options.verify_checksums) {
  2184. PERF_TIMER_GUARD(block_checksum_time);
  2185. const char* data = req.result.data();
  2186. uint32_t expected =
  2187. DecodeFixed32(data + req_offset + handle.size() + 1);
2188. // Since the scratch buffer might be shared, the offset of the data
2189. // block in the buffer might not be 0. req.result.data() only points to
2190. // the beginning of each read request, so we need to add the offset of
2191. // the block within the request. The checksum sits in the block trailer,
2192. // right after the 1-byte compression type, at offset handle.size() + 1.
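// Layout of each block within the shared buffer (for reference):
//   [block data: handle.size() bytes][type: 1 byte][checksum: 4 bytes]
// so the stored checksum begins at req_offset + handle.size() + 1.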
  2193. s = ROCKSDB_NAMESPACE::VerifyChecksum(footer.checksum(),
  2194. req.result.data() + req_offset,
  2195. handle.size() + 1, expected);
  2196. TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
  2197. }
  2198. }
  2199. if (s.ok()) {
2200. // This handles a rare case: compression is enabled and there is no
2201. // compressed cache (so combined reads are used and scratch != nullptr).
2202. // At the same time, some blocks are actually not compressed, because
2203. // their compression space saving is smaller than the threshold. In
2204. // this case, if the block shares the scratch memory, we need to copy it
2205. // to the heap so that it can be added to the regular block cache.
  2206. CompressionType compression_type =
  2207. raw_block_contents.get_compression_type();
  2208. if (scratch != nullptr && compression_type == kNoCompression) {
  2209. Slice raw = Slice(req.scratch + req_offset, block_size(handle));
  2210. raw_block_contents = BlockContents(
  2211. CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw),
  2212. handle.size());
  2213. #ifndef NDEBUG
  2214. raw_block_contents.is_raw_block = true;
  2215. #endif
  2216. }
  2217. }
  2218. if (s.ok()) {
  2219. if (options.fill_cache) {
  2220. BlockCacheLookupContext lookup_data_block_context(
  2221. TableReaderCaller::kUserMultiGet);
  2222. CachableEntry<Block>* block_entry = &(*results)[idx_in_batch];
  2223. // MaybeReadBlockAndLoadToCache will insert into the block caches if
  2224. // necessary. Since we're passing the raw block contents, it will
  2225. // avoid looking up the block cache
  2226. s = MaybeReadBlockAndLoadToCache(
  2227. nullptr, options, handle, uncompression_dict, block_entry,
  2228. BlockType::kData, mget_iter->get_context,
  2229. &lookup_data_block_context, &raw_block_contents);
  2230. // block_entry value could be null if no block cache is present, i.e
  2231. // BlockBasedTableOptions::no_block_cache is true and no compressed
  2232. // block cache is configured. In that case, fall
  2233. // through and set up the block explicitly
  2234. if (block_entry->GetValue() != nullptr) {
  2235. continue;
  2236. }
  2237. }
  2238. CompressionType compression_type =
  2239. raw_block_contents.get_compression_type();
  2240. BlockContents contents;
  2241. if (compression_type != kNoCompression) {
  2242. UncompressionContext context(compression_type);
  2243. UncompressionInfo info(context, uncompression_dict, compression_type);
  2244. s = UncompressBlockContents(info, req.result.data() + req_offset,
  2245. handle.size(), &contents, footer.version(),
  2246. rep_->ioptions, memory_allocator);
  2247. } else {
2248. // There are two cases here: 1) the caller uses the scratch buffer; 2) we
2249. // use the request buffer. If the scratch buffer is used, we ensure that
2250. // all raw blocks are copied to the heap as single blocks. If the scratch
2251. // buffer is not used, there is also no combined read, so the raw
2252. // block can be used directly.
  2253. contents = std::move(raw_block_contents);
  2254. }
  2255. if (s.ok()) {
  2256. (*results)[idx_in_batch].SetOwnedValue(
  2257. new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit,
  2258. ioptions.statistics));
  2259. }
  2260. }
  2261. (*statuses)[idx_in_batch] = s;
  2262. }
  2263. }
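// Retrieves a single block, preferring the block cache when use_cache is
// true and falling back to a file read (ReadBlockFromFile) otherwise. With
// ReadOptions::read_tier == kBlockCacheTier, a cache miss returns
// Status::Incomplete instead of performing I/O.
// Rough usage sketch (annotation; argument values are illustrative only):
//   CachableEntry<Block> entry;
//   Status s = RetrieveBlock(/*prefetch_buffer=*/nullptr, read_options, handle,
//                            uncompression_dict, &entry, BlockType::kData,
//                            get_context, lookup_context,
//                            /*for_compaction=*/false, /*use_cache=*/true);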
  2264. template <typename TBlocklike>
  2265. Status BlockBasedTable::RetrieveBlock(
  2266. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  2267. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  2268. CachableEntry<TBlocklike>* block_entry, BlockType block_type,
  2269. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  2270. bool for_compaction, bool use_cache) const {
  2271. assert(block_entry);
  2272. assert(block_entry->IsEmpty());
  2273. Status s;
  2274. if (use_cache) {
  2275. s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle,
  2276. uncompression_dict, block_entry,
  2277. block_type, get_context, lookup_context,
  2278. /*contents=*/nullptr);
  2279. if (!s.ok()) {
  2280. return s;
  2281. }
  2282. if (block_entry->GetValue() != nullptr) {
  2283. assert(s.ok());
  2284. return s;
  2285. }
  2286. }
  2287. assert(block_entry->IsEmpty());
  2288. const bool no_io = ro.read_tier == kBlockCacheTier;
  2289. if (no_io) {
  2290. return Status::Incomplete("no blocking io");
  2291. }
  2292. const bool maybe_compressed =
  2293. block_type != BlockType::kFilter &&
  2294. block_type != BlockType::kCompressionDictionary &&
  2295. rep_->blocks_maybe_compressed;
  2296. const bool do_uncompress = maybe_compressed;
  2297. std::unique_ptr<TBlocklike> block;
  2298. {
  2299. StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics,
  2300. READ_BLOCK_GET_MICROS);
  2301. s = ReadBlockFromFile(
  2302. rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
  2303. rep_->ioptions, do_uncompress, maybe_compressed, block_type,
  2304. uncompression_dict, rep_->persistent_cache_options,
  2305. rep_->get_global_seqno(block_type),
  2306. block_type == BlockType::kData
  2307. ? rep_->table_options.read_amp_bytes_per_bit
  2308. : 0,
  2309. GetMemoryAllocator(rep_->table_options), for_compaction,
  2310. rep_->blocks_definitely_zstd_compressed,
  2311. rep_->table_options.filter_policy.get());
  2312. }
  2313. if (!s.ok()) {
  2314. return s;
  2315. }
  2316. block_entry->SetOwnedValue(block.release());
  2317. assert(s.ok());
  2318. return s;
  2319. }
  2320. // Explicitly instantiate templates for both "blocklike" types we use.
  2321. // This makes it possible to keep the template definitions in the .cc file.
  2322. template Status BlockBasedTable::RetrieveBlock<BlockContents>(
  2323. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  2324. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  2325. CachableEntry<BlockContents>* block_entry, BlockType block_type,
  2326. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  2327. bool for_compaction, bool use_cache) const;
  2328. template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
  2329. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  2330. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  2331. CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
  2332. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  2333. bool for_compaction, bool use_cache) const;
  2334. template Status BlockBasedTable::RetrieveBlock<Block>(
  2335. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  2336. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  2337. CachableEntry<Block>* block_entry, BlockType block_type,
  2338. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  2339. bool for_compaction, bool use_cache) const;
  2340. template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
  2341. FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
  2342. const BlockHandle& handle, const UncompressionDict& uncompression_dict,
  2343. CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
  2344. GetContext* get_context, BlockCacheLookupContext* lookup_context,
  2345. bool for_compaction, bool use_cache) const;
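// State shared by iterators over a partitioned index: given a partition's
// BlockHandle, NewSecondaryIterator() below returns an iterator over the
// pinned partition block found in block_map_, or an empty iterator if the
// partition is not present in the map.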
  2346. BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
  2347. const BlockBasedTable* table,
  2348. std::unordered_map<uint64_t, CachableEntry<Block>>* block_map)
  2349. : table_(table), block_map_(block_map) {}
  2350. InternalIteratorBase<IndexValue>*
  2351. BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
  2352. const BlockHandle& handle) {
  2353. // Return a block iterator on the index partition
  2354. auto block = block_map_->find(handle.offset());
  2355. // This is a possible scenario since block cache might not have had space
  2356. // for the partition
  2357. if (block != block_map_->end()) {
  2358. const Rep* rep = table_->get_rep();
  2359. assert(rep);
  2360. Statistics* kNullStats = nullptr;
  2361. // We don't return pinned data from index blocks, so no need
  2362. // to set `block_contents_pinned`.
  2363. return block->second.GetValue()->NewIndexIterator(
  2364. &rep->internal_comparator, rep->internal_comparator.user_comparator(),
  2365. nullptr, kNullStats, true, rep->index_has_first_key,
  2366. rep->index_key_includes_seq, rep->index_value_is_full);
  2367. }
  2368. // Create an empty iterator
  2369. return new IndexBlockIter();
  2370. }
  2371. // This will be broken if the user specifies an unusual implementation
  2372. // of Options.comparator, or if the user specifies an unusual
  2373. // definition of prefixes in BlockBasedTableOptions.filter_policy.
  2374. // In particular, we require the following three properties:
  2375. //
  2376. // 1) key.starts_with(prefix(key))
  2377. // 2) Compare(prefix(key), key) <= 0.
  2378. // 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
  2379. //
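// For example (illustrative): with a bytewise comparator and a fixed-length
// prefix extractor of length 3, prefix("apple") == "app", which satisfies all
// three properties: "apple" starts with "app", "app" <= "apple", and prefix
// ordering follows key ordering.
//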
  2380. // Otherwise, this method guarantees no I/O will be incurred.
  2381. //
  2382. // REQUIRES: this method shouldn't be called while the DB lock is held.
  2383. bool BlockBasedTable::PrefixMayMatch(
  2384. const Slice& internal_key, const ReadOptions& read_options,
  2385. const SliceTransform* options_prefix_extractor,
  2386. const bool need_upper_bound_check,
  2387. BlockCacheLookupContext* lookup_context) const {
  2388. if (!rep_->filter_policy) {
  2389. return true;
  2390. }
  2391. const SliceTransform* prefix_extractor;
  2392. if (rep_->table_prefix_extractor == nullptr) {
  2393. if (need_upper_bound_check) {
  2394. return true;
  2395. }
  2396. prefix_extractor = options_prefix_extractor;
  2397. } else {
  2398. prefix_extractor = rep_->table_prefix_extractor.get();
  2399. }
  2400. auto user_key = ExtractUserKey(internal_key);
  2401. if (!prefix_extractor->InDomain(user_key)) {
  2402. return true;
  2403. }
  2404. bool may_match = true;
  2405. Status s;
2406. // First, try to check with the full filter
  2407. FilterBlockReader* const filter = rep_->filter.get();
  2408. bool filter_checked = true;
  2409. if (filter != nullptr) {
  2410. if (!filter->IsBlockBased()) {
  2411. const Slice* const const_ikey_ptr = &internal_key;
  2412. may_match = filter->RangeMayExist(
  2413. read_options.iterate_upper_bound, user_key, prefix_extractor,
  2414. rep_->internal_comparator.user_comparator(), const_ikey_ptr,
  2415. &filter_checked, need_upper_bound_check, lookup_context);
  2416. } else {
  2417. // if prefix_extractor changed for block based filter, skip filter
  2418. if (need_upper_bound_check) {
  2419. return true;
  2420. }
  2421. auto prefix = prefix_extractor->Transform(user_key);
  2422. InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
  2423. auto internal_prefix = internal_key_prefix.Encode();
  2424. // To prevent any io operation in this method, we set `read_tier` to make
  2425. // sure we always read index or filter only when they have already been
  2426. // loaded to memory.
  2427. ReadOptions no_io_read_options;
  2428. no_io_read_options.read_tier = kBlockCacheTier;
2429. // Then, try to find it within each block
  2430. // we already know prefix_extractor and prefix_extractor_name must match
  2431. // because `CheckPrefixMayMatch` first checks `check_filter_ == true`
  2432. std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
  2433. no_io_read_options,
  2434. /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
  2435. /*get_context=*/nullptr, lookup_context));
  2436. iiter->Seek(internal_prefix);
  2437. if (!iiter->Valid()) {
  2438. // we're past end of file
  2439. // if it's incomplete, it means that we avoided I/O
  2440. // and we're not really sure that we're past the end
  2441. // of the file
  2442. may_match = iiter->status().IsIncomplete();
  2443. } else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
  2444. : iiter->key())
  2445. .starts_with(ExtractUserKey(internal_prefix))) {
  2446. // we need to check for this subtle case because our only
  2447. // guarantee is that "the key is a string >= last key in that data
  2448. // block" according to the doc/table_format.txt spec.
  2449. //
  2450. // Suppose iiter->key() starts with the desired prefix; it is not
  2451. // necessarily the case that the corresponding data block will
  2452. // contain the prefix, since iiter->key() need not be in the
  2453. // block. However, the next data block may contain the prefix, so
  2454. // we return true to play it safe.
  2455. may_match = true;
  2456. } else if (filter->IsBlockBased()) {
  2457. // iiter->key() does NOT start with the desired prefix. Because
  2458. // Seek() finds the first key that is >= the seek target, this
  2459. // means that iiter->key() > prefix. Thus, any data blocks coming
  2460. // after the data block corresponding to iiter->key() cannot
  2461. // possibly contain the key. Thus, the corresponding data block
2462. // is the only one that could potentially contain the prefix.
  2463. BlockHandle handle = iiter->value().handle;
  2464. may_match = filter->PrefixMayMatch(
  2465. prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
  2466. /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
  2467. }
  2468. }
  2469. }
  2470. if (filter_checked) {
  2471. Statistics* statistics = rep_->ioptions.statistics;
  2472. RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
  2473. if (!may_match) {
  2474. RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
  2475. }
  2476. }
  2477. return may_match;
  2478. }
  2479. template <class TBlockIter, typename TValue>
  2480. void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
  2481. SeekImpl(&target);
  2482. }
  2483. template <class TBlockIter, typename TValue>
  2484. void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
  2485. SeekImpl(nullptr);
  2486. }
  2487. template <class TBlockIter, typename TValue>
  2488. void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
  2489. const Slice* target) {
  2490. is_out_of_bound_ = false;
  2491. is_at_first_key_from_index_ = false;
  2492. if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) {
  2493. ResetDataIter();
  2494. return;
  2495. }
  2496. bool need_seek_index = true;
  2497. if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
  2498. // Reseek.
  2499. prev_block_offset_ = index_iter_->value().handle.offset();
  2500. if (target) {
  2501. // We can avoid an index seek if:
  2502. // 1. The new seek key is larger than the current key
  2503. // 2. The new seek key is within the upper bound of the block
  2504. // Since we don't necessarily know the internal key for either
  2505. // the current key or the upper bound, we check user keys and
  2506. // exclude the equality case. Considering internal keys can
2507. // improve handling of the boundary cases, but it would complicate the
  2508. // code.
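// Illustrative example (annotation): if the iterator currently points at
// user key "k12" and the current block's index entry (its upper boundary)
// has user key "k20", then a reseek to "k15" satisfies both conditions and
// we can seek within the same block without touching the index.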
  2509. if (user_comparator_.Compare(ExtractUserKey(*target),
  2510. block_iter_.user_key()) > 0 &&
  2511. user_comparator_.Compare(ExtractUserKey(*target),
  2512. index_iter_->user_key()) < 0) {
  2513. need_seek_index = false;
  2514. }
  2515. }
  2516. }
  2517. if (need_seek_index) {
  2518. if (target) {
  2519. index_iter_->Seek(*target);
  2520. } else {
  2521. index_iter_->SeekToFirst();
  2522. }
  2523. if (!index_iter_->Valid()) {
  2524. ResetDataIter();
  2525. return;
  2526. }
  2527. }
  2528. IndexValue v = index_iter_->value();
  2529. const bool same_block = block_iter_points_to_real_block_ &&
  2530. v.handle.offset() == prev_block_offset_;
  2531. // TODO(kolmike): Remove the != kBlockCacheTier condition.
  2532. if (!v.first_internal_key.empty() && !same_block &&
  2533. (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
  2534. read_options_.read_tier != kBlockCacheTier) {
  2535. // Index contains the first key of the block, and it's >= target.
  2536. // We can defer reading the block.
  2537. is_at_first_key_from_index_ = true;
  2538. // ResetDataIter() will invalidate block_iter_. Thus, there is no need to
  2539. // call CheckDataBlockWithinUpperBound() to check for iterate_upper_bound
  2540. // as that will be done later when the data block is actually read.
  2541. ResetDataIter();
  2542. } else {
  2543. // Need to use the data block.
  2544. if (!same_block) {
  2545. InitDataBlock();
  2546. } else {
  2547. // When the user does a reseek, the iterate_upper_bound might have
  2548. // changed. CheckDataBlockWithinUpperBound() needs to be called
  2549. // explicitly if the reseek ends up in the same data block.
  2550. // If the reseek ends up in a different block, InitDataBlock() will do
  2551. // the iterator upper bound check.
  2552. CheckDataBlockWithinUpperBound();
  2553. }
  2554. if (target) {
  2555. block_iter_.Seek(*target);
  2556. } else {
  2557. block_iter_.SeekToFirst();
  2558. }
  2559. FindKeyForward();
  2560. }
  2561. CheckOutOfBound();
  2562. if (target) {
  2563. assert(!Valid() || ((block_type_ == BlockType::kIndex &&
  2564. !table_->get_rep()->index_key_includes_seq)
  2565. ? (user_comparator_.Compare(ExtractUserKey(*target),
  2566. key()) <= 0)
  2567. : (icomp_.Compare(*target, key()) <= 0)));
  2568. }
  2569. }
  2570. template <class TBlockIter, typename TValue>
  2571. void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
  2572. const Slice& target) {
  2573. is_out_of_bound_ = false;
  2574. is_at_first_key_from_index_ = false;
2575. // For now, totally disable prefix seek in auto prefix mode because we
2576. // don't have the logic for it yet.
  2577. if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) {
  2578. ResetDataIter();
  2579. return;
  2580. }
  2581. SavePrevIndexValue();
2582. // Call Seek() rather than SeekForPrev() in the index block, because the
2583. // target data block will likely contain the position for `target`, the
2584. // same as Seek(), rather than the one before it.
  2585. // For example, if we have three data blocks, each containing two keys:
  2586. // [2, 4] [6, 8] [10, 12]
  2587. // (the keys in the index block would be [4, 8, 12])
  2588. // and the user calls SeekForPrev(7), we need to go to the second block,
  2589. // just like if they call Seek(7).
2590. // The only case where the block is different is when they seek to a position
2591. // on the boundary. For example, if they SeekForPrev(5), we should go to the
2592. // first block, rather than the second. However, we don't have the information
2593. // to distinguish the two unless we read the second block. In this case, we'll
2594. // end up reading two blocks.
  2595. index_iter_->Seek(target);
  2596. if (!index_iter_->Valid()) {
  2597. auto seek_status = index_iter_->status();
  2598. // Check for IO error
  2599. if (!seek_status.IsNotFound() && !seek_status.ok()) {
  2600. ResetDataIter();
  2601. return;
  2602. }
  2603. // With prefix index, Seek() returns NotFound if the prefix doesn't exist
  2604. if (seek_status.IsNotFound()) {
  2605. // Any key less than the target is fine for prefix seek
  2606. ResetDataIter();
  2607. return;
  2608. } else {
  2609. index_iter_->SeekToLast();
  2610. }
  2611. // Check for IO error
  2612. if (!index_iter_->Valid()) {
  2613. ResetDataIter();
  2614. return;
  2615. }
  2616. }
  2617. InitDataBlock();
  2618. block_iter_.SeekForPrev(target);
  2619. FindKeyBackward();
  2620. CheckDataBlockWithinUpperBound();
  2621. assert(!block_iter_.Valid() ||
  2622. icomp_.Compare(target, block_iter_.key()) >= 0);
  2623. }
  2624. template <class TBlockIter, typename TValue>
  2625. void BlockBasedTableIterator<TBlockIter, TValue>::SeekToLast() {
  2626. is_out_of_bound_ = false;
  2627. is_at_first_key_from_index_ = false;
  2628. SavePrevIndexValue();
  2629. index_iter_->SeekToLast();
  2630. if (!index_iter_->Valid()) {
  2631. ResetDataIter();
  2632. return;
  2633. }
  2634. InitDataBlock();
  2635. block_iter_.SeekToLast();
  2636. FindKeyBackward();
  2637. CheckDataBlockWithinUpperBound();
  2638. }
  2639. template <class TBlockIter, typename TValue>
  2640. void BlockBasedTableIterator<TBlockIter, TValue>::Next() {
  2641. if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
  2642. return;
  2643. }
  2644. assert(block_iter_points_to_real_block_);
  2645. block_iter_.Next();
  2646. FindKeyForward();
  2647. CheckOutOfBound();
  2648. }
  2649. template <class TBlockIter, typename TValue>
  2650. bool BlockBasedTableIterator<TBlockIter, TValue>::NextAndGetResult(
  2651. IterateResult* result) {
  2652. Next();
  2653. bool is_valid = Valid();
  2654. if (is_valid) {
  2655. result->key = key();
  2656. result->may_be_out_of_upper_bound = MayBeOutOfUpperBound();
  2657. }
  2658. return is_valid;
  2659. }
  2660. template <class TBlockIter, typename TValue>
  2661. void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
  2662. if (is_at_first_key_from_index_) {
  2663. is_at_first_key_from_index_ = false;
  2664. index_iter_->Prev();
  2665. if (!index_iter_->Valid()) {
  2666. return;
  2667. }
  2668. InitDataBlock();
  2669. block_iter_.SeekToLast();
  2670. } else {
  2671. assert(block_iter_points_to_real_block_);
  2672. block_iter_.Prev();
  2673. }
  2674. FindKeyBackward();
  2675. }
  2676. template <class TBlockIter, typename TValue>
  2677. void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
  2678. BlockHandle data_block_handle = index_iter_->value().handle;
  2679. if (!block_iter_points_to_real_block_ ||
  2680. data_block_handle.offset() != prev_block_offset_ ||
  2681. // if previous attempt of reading the block missed cache, try again
  2682. block_iter_.status().IsIncomplete()) {
  2683. if (block_iter_points_to_real_block_) {
  2684. ResetDataIter();
  2685. }
  2686. auto* rep = table_->get_rep();
  2687. // Prefetch additional data for range scans (iterators). Enabled only for
  2688. // user reads.
  2689. // Implicit auto readahead:
  2690. // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
  2691. // Explicit user requested readahead:
  2692. // Enabled from the very first IO when ReadOptions.readahead_size is set.
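// Rough behavior of the implicit path below (annotation): after
// kMinNumFileReadsToStartAutoReadahead sequential reads, buffered I/O starts
// prefetching readahead_size_ bytes and doubles that amount each time the
// readahead_limit_ is exceeded, capped at kMaxAutoReadaheadSize; direct I/O
// instead delegates to a FilePrefetchBuffer with the same bounds.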
  2693. if (lookup_context_.caller != TableReaderCaller::kCompaction) {
  2694. if (read_options_.readahead_size == 0) {
  2695. // Implicit auto readahead
  2696. num_file_reads_++;
  2697. if (num_file_reads_ >
  2698. BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
  2699. if (!rep->file->use_direct_io() &&
  2700. (data_block_handle.offset() +
  2701. static_cast<size_t>(block_size(data_block_handle)) >
  2702. readahead_limit_)) {
  2703. // Buffered I/O
  2704. // Discarding the return status of Prefetch calls intentionally, as
  2705. // we can fallback to reading from disk if Prefetch fails.
  2706. rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
  2707. readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
  2708. readahead_size_);
  2709. // Keep exponentially increasing readahead size until
  2710. // kMaxAutoReadaheadSize.
  2711. readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
  2712. readahead_size_ * 2);
  2713. } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
  2714. // Direct I/O
  2715. // Let FilePrefetchBuffer take care of the readahead.
  2716. rep->CreateFilePrefetchBuffer(
  2717. BlockBasedTable::kInitAutoReadaheadSize,
  2718. BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
  2719. }
  2720. }
  2721. } else if (!prefetch_buffer_) {
  2722. // Explicit user requested readahead
  2723. // The actual condition is:
  2724. // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
  2725. rep->CreateFilePrefetchBuffer(read_options_.readahead_size,
  2726. read_options_.readahead_size,
  2727. &prefetch_buffer_);
  2728. }
  2729. } else if (!prefetch_buffer_) {
  2730. rep->CreateFilePrefetchBuffer(compaction_readahead_size_,
  2731. compaction_readahead_size_,
  2732. &prefetch_buffer_);
  2733. }
  2734. Status s;
  2735. table_->NewDataBlockIterator<TBlockIter>(
  2736. read_options_, data_block_handle, &block_iter_, block_type_,
  2737. /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
  2738. /*for_compaction=*/lookup_context_.caller ==
  2739. TableReaderCaller::kCompaction);
  2740. block_iter_points_to_real_block_ = true;
  2741. CheckDataBlockWithinUpperBound();
  2742. }
  2743. }
  2744. template <class TBlockIter, typename TValue>
  2745. bool BlockBasedTableIterator<TBlockIter, TValue>::MaterializeCurrentBlock() {
  2746. assert(is_at_first_key_from_index_);
  2747. assert(!block_iter_points_to_real_block_);
  2748. assert(index_iter_->Valid());
  2749. is_at_first_key_from_index_ = false;
  2750. InitDataBlock();
  2751. assert(block_iter_points_to_real_block_);
  2752. block_iter_.SeekToFirst();
  2753. if (!block_iter_.Valid() ||
  2754. icomp_.Compare(block_iter_.key(),
  2755. index_iter_->value().first_internal_key) != 0) {
  2756. // Uh oh.
  2757. block_iter_.Invalidate(Status::Corruption(
  2758. "first key in index doesn't match first key in block"));
  2759. return false;
  2760. }
  2761. return true;
  2762. }
  2763. template <class TBlockIter, typename TValue>
  2764. void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyForward() {
  2765. // This method's code is kept short to make it likely to be inlined.
  2766. assert(!is_out_of_bound_);
  2767. assert(block_iter_points_to_real_block_);
  2768. if (!block_iter_.Valid()) {
  2769. // This is the only call site of FindBlockForward(), but it's extracted into
  2770. // a separate method to keep FindKeyForward() short and likely to be
  2771. // inlined. When transitioning to a different block, we call
  2772. // FindBlockForward(), which is much longer and is probably not inlined.
  2773. FindBlockForward();
  2774. } else {
  2775. // This is the fast path that avoids a function call.
  2776. }
  2777. }
  2778. template <class TBlockIter, typename TValue>
  2779. void BlockBasedTableIterator<TBlockIter, TValue>::FindBlockForward() {
2780. // TODO: the while loop is inherited from two-level-iterator. We don't know
2781. // whether a block can be empty, so we keep the loop instead of an "if".
  2782. do {
  2783. if (!block_iter_.status().ok()) {
  2784. return;
  2785. }
  2786. // Whether next data block is out of upper bound, if there is one.
  2787. const bool next_block_is_out_of_bound =
  2788. read_options_.iterate_upper_bound != nullptr &&
  2789. block_iter_points_to_real_block_ && !data_block_within_upper_bound_;
  2790. assert(!next_block_is_out_of_bound ||
  2791. user_comparator_.Compare(*read_options_.iterate_upper_bound,
  2792. index_iter_->user_key()) <= 0);
  2793. ResetDataIter();
  2794. index_iter_->Next();
  2795. if (next_block_is_out_of_bound) {
  2796. // The next block is out of bound. No need to read it.
  2797. TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
  2798. // We need to make sure this is not the last data block before setting
  2799. // is_out_of_bound_, since the index key for the last data block can be
2800. // larger than the smallest key of the next file on the same level.
  2801. if (index_iter_->Valid()) {
  2802. is_out_of_bound_ = true;
  2803. }
  2804. return;
  2805. }
  2806. if (!index_iter_->Valid()) {
  2807. return;
  2808. }
  2809. IndexValue v = index_iter_->value();
  2810. // TODO(kolmike): Remove the != kBlockCacheTier condition.
  2811. if (!v.first_internal_key.empty() &&
  2812. read_options_.read_tier != kBlockCacheTier) {
  2813. // Index contains the first key of the block. Defer reading the block.
  2814. is_at_first_key_from_index_ = true;
  2815. return;
  2816. }
  2817. InitDataBlock();
  2818. block_iter_.SeekToFirst();
  2819. } while (!block_iter_.Valid());
  2820. }
  2821. template <class TBlockIter, typename TValue>
  2822. void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() {
  2823. while (!block_iter_.Valid()) {
  2824. if (!block_iter_.status().ok()) {
  2825. return;
  2826. }
  2827. ResetDataIter();
  2828. index_iter_->Prev();
  2829. if (index_iter_->Valid()) {
  2830. InitDataBlock();
  2831. block_iter_.SeekToLast();
  2832. } else {
  2833. return;
  2834. }
  2835. }
2836. // We could have checked the lower bound here too, but we opt not to do it
2837. // for code simplicity.
  2838. }
  2839. template <class TBlockIter, typename TValue>
  2840. void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
  2841. if (read_options_.iterate_upper_bound != nullptr && Valid()) {
  2842. is_out_of_bound_ = user_comparator_.Compare(
  2843. *read_options_.iterate_upper_bound, user_key()) <= 0;
  2844. }
  2845. }
  2846. template <class TBlockIter, typename TValue>
  2847. void BlockBasedTableIterator<TBlockIter,
  2848. TValue>::CheckDataBlockWithinUpperBound() {
  2849. if (read_options_.iterate_upper_bound != nullptr &&
  2850. block_iter_points_to_real_block_) {
  2851. data_block_within_upper_bound_ =
  2852. (user_comparator_.Compare(*read_options_.iterate_upper_bound,
  2853. index_iter_->user_key()) > 0);
  2854. }
  2855. }
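// Creates a new two-level iterator over this table: an index iterator on top
// of per-block data iterators. When an Arena is supplied, the iterator is
// placement-new'ed into the arena (so its lifetime is tied to the arena);
// otherwise it is heap-allocated and owned by the caller.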
  2856. InternalIterator* BlockBasedTable::NewIterator(
  2857. const ReadOptions& read_options, const SliceTransform* prefix_extractor,
  2858. Arena* arena, bool skip_filters, TableReaderCaller caller,
  2859. size_t compaction_readahead_size) {
  2860. BlockCacheLookupContext lookup_context{caller};
  2861. bool need_upper_bound_check =
  2862. read_options.auto_prefix_mode ||
  2863. PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor);
  2864. if (arena == nullptr) {
  2865. return new BlockBasedTableIterator<DataBlockIter>(
  2866. this, read_options, rep_->internal_comparator,
  2867. NewIndexIterator(
  2868. read_options,
  2869. need_upper_bound_check &&
  2870. rep_->index_type == BlockBasedTableOptions::kHashSearch,
  2871. /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
  2872. !skip_filters && !read_options.total_order_seek &&
  2873. prefix_extractor != nullptr,
  2874. need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
  2875. compaction_readahead_size);
  2876. } else {
  2877. auto* mem =
  2878. arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
  2879. return new (mem) BlockBasedTableIterator<DataBlockIter>(
  2880. this, read_options, rep_->internal_comparator,
  2881. NewIndexIterator(
  2882. read_options,
  2883. need_upper_bound_check &&
  2884. rep_->index_type == BlockBasedTableOptions::kHashSearch,
  2885. /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context),
  2886. !skip_filters && !read_options.total_order_seek &&
  2887. prefix_extractor != nullptr,
  2888. need_upper_bound_check, prefix_extractor, BlockType::kData, caller,
  2889. compaction_readahead_size);
  2890. }
  2891. }
  2892. FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
  2893. const ReadOptions& read_options) {
  2894. if (rep_->fragmented_range_dels == nullptr) {
  2895. return nullptr;
  2896. }
  2897. SequenceNumber snapshot = kMaxSequenceNumber;
  2898. if (read_options.snapshot != nullptr) {
  2899. snapshot = read_options.snapshot->GetSequenceNumber();
  2900. }
  2901. return new FragmentedRangeTombstoneIterator(
  2902. rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
  2903. }
  2904. bool BlockBasedTable::FullFilterKeyMayMatch(
  2905. const ReadOptions& read_options, FilterBlockReader* filter,
  2906. const Slice& internal_key, const bool no_io,
  2907. const SliceTransform* prefix_extractor, GetContext* get_context,
  2908. BlockCacheLookupContext* lookup_context) const {
  2909. if (filter == nullptr || filter->IsBlockBased()) {
  2910. return true;
  2911. }
  2912. Slice user_key = ExtractUserKey(internal_key);
  2913. const Slice* const const_ikey_ptr = &internal_key;
  2914. bool may_match = true;
  2915. if (rep_->whole_key_filtering) {
  2916. size_t ts_sz =
  2917. rep_->internal_comparator.user_comparator()->timestamp_size();
  2918. Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
  2919. may_match =
  2920. filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
  2921. no_io, const_ikey_ptr, get_context, lookup_context);
  2922. } else if (!read_options.total_order_seek && prefix_extractor &&
  2923. rep_->table_properties->prefix_extractor_name.compare(
  2924. prefix_extractor->Name()) == 0 &&
  2925. prefix_extractor->InDomain(user_key) &&
  2926. !filter->PrefixMayMatch(prefix_extractor->Transform(user_key),
  2927. prefix_extractor, kNotValid, no_io,
  2928. const_ikey_ptr, get_context,
  2929. lookup_context)) {
  2930. may_match = false;
  2931. }
  2932. if (may_match) {
  2933. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE);
  2934. PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
  2935. }
  2936. return may_match;
  2937. }
  2938. void BlockBasedTable::FullFilterKeysMayMatch(
  2939. const ReadOptions& read_options, FilterBlockReader* filter,
  2940. MultiGetRange* range, const bool no_io,
  2941. const SliceTransform* prefix_extractor,
  2942. BlockCacheLookupContext* lookup_context) const {
  2943. if (filter == nullptr || filter->IsBlockBased()) {
  2944. return;
  2945. }
  2946. if (rep_->whole_key_filtering) {
  2947. filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
  2948. lookup_context);
  2949. } else if (!read_options.total_order_seek && prefix_extractor &&
  2950. rep_->table_properties->prefix_extractor_name.compare(
  2951. prefix_extractor->Name()) == 0) {
  2952. filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
  2953. lookup_context);
  2954. }
  2955. }
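// Point lookup. Roughly (annotation, simplified): first consult the full
// filter (if present), then walk the index and, for each candidate data
// block, seek within the block and feed entries to GetContext::SaveValue()
// until it reports completion:
//   filter check -> index Seek(key) -> data block SeekForGet(key)
//     -> SaveValue() loop -> stop when done or index exhausted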
  2956. Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
  2957. GetContext* get_context,
  2958. const SliceTransform* prefix_extractor,
  2959. bool skip_filters) {
  2960. assert(key.size() >= 8); // key must be internal key
  2961. assert(get_context != nullptr);
  2962. Status s;
  2963. const bool no_io = read_options.read_tier == kBlockCacheTier;
  2964. FilterBlockReader* const filter =
  2965. !skip_filters ? rep_->filter.get() : nullptr;
2966. // First check the full filter.
2967. // If the full filter is not useful, then go into each block.
  2968. uint64_t tracing_get_id = get_context->get_tracing_get_id();
  2969. BlockCacheLookupContext lookup_context{
  2970. TableReaderCaller::kUserGet, tracing_get_id,
  2971. /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
  2972. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
  2973. // Trace the key since it contains both user key and sequence number.
  2974. lookup_context.referenced_key = key.ToString();
  2975. lookup_context.get_from_user_specified_snapshot =
  2976. read_options.snapshot != nullptr;
  2977. }
  2978. const bool may_match =
  2979. FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor,
  2980. get_context, &lookup_context);
  2981. if (!may_match) {
  2982. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
  2983. PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
  2984. } else {
  2985. IndexBlockIter iiter_on_stack;
  2986. // if prefix_extractor found in block differs from options, disable
  2987. // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
  2988. bool need_upper_bound_check = false;
  2989. if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
  2990. need_upper_bound_check = PrefixExtractorChanged(
  2991. rep_->table_properties.get(), prefix_extractor);
  2992. }
  2993. auto iiter =
  2994. NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
  2995. get_context, &lookup_context);
  2996. std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  2997. if (iiter != &iiter_on_stack) {
  2998. iiter_unique_ptr.reset(iiter);
  2999. }
  3000. size_t ts_sz =
  3001. rep_->internal_comparator.user_comparator()->timestamp_size();
3002. bool matched = false; // if such user key matched a key in SST
  3003. bool done = false;
  3004. for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
  3005. IndexValue v = iiter->value();
  3006. bool not_exist_in_filter =
  3007. filter != nullptr && filter->IsBlockBased() == true &&
  3008. !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
  3009. prefix_extractor, v.handle.offset(), no_io,
  3010. /*const_ikey_ptr=*/nullptr, get_context,
  3011. &lookup_context);
  3012. if (not_exist_in_filter) {
  3013. // Not found
  3014. // TODO: think about interaction with Merge. If a user key cannot
  3015. // cross one data block, we should be fine.
  3016. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
  3017. PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
  3018. break;
  3019. }
  3020. if (!v.first_internal_key.empty() && !skip_filters &&
  3021. UserComparatorWrapper(rep_->internal_comparator.user_comparator())
  3022. .Compare(ExtractUserKey(key),
  3023. ExtractUserKey(v.first_internal_key)) < 0) {
  3024. // The requested key falls between highest key in previous block and
  3025. // lowest key in current block.
  3026. break;
  3027. }
  3028. BlockCacheLookupContext lookup_data_block_context{
  3029. TableReaderCaller::kUserGet, tracing_get_id,
  3030. /*get_from_user_specified_snapshot=*/read_options.snapshot !=
  3031. nullptr};
  3032. bool does_referenced_key_exist = false;
  3033. DataBlockIter biter;
  3034. uint64_t referenced_data_size = 0;
  3035. NewDataBlockIterator<DataBlockIter>(
  3036. read_options, v.handle, &biter, BlockType::kData, get_context,
  3037. &lookup_data_block_context,
  3038. /*s=*/Status(), /*prefetch_buffer*/ nullptr);
  3039. if (no_io && biter.status().IsIncomplete()) {
  3040. // couldn't get block from block_cache
  3041. // Update Saver.state to Found because we are only looking for
  3042. // whether we can guarantee the key is not there when "no_io" is set
  3043. get_context->MarkKeyMayExist();
  3044. break;
  3045. }
  3046. if (!biter.status().ok()) {
  3047. s = biter.status();
  3048. break;
  3049. }
  3050. bool may_exist = biter.SeekForGet(key);
  3051. // If user-specified timestamp is supported, we cannot end the search
  3052. // just because hash index lookup indicates the key+ts does not exist.
  3053. if (!may_exist && ts_sz == 0) {
3054. // HashSeek cannot find the key in this block and the iter is not at
3055. // the end of the block, i.e. the key cannot be in the following blocks
  3056. // either. In this case, the seek_key cannot be found, so we break
  3057. // from the top level for-loop.
  3058. done = true;
  3059. } else {
  3060. // Call the *saver function on each entry/block until it returns false
  3061. for (; biter.Valid(); biter.Next()) {
  3062. ParsedInternalKey parsed_key;
  3063. if (!ParseInternalKey(biter.key(), &parsed_key)) {
  3064. s = Status::Corruption(Slice());
  3065. }
  3066. if (!get_context->SaveValue(
  3067. parsed_key, biter.value(), &matched,
  3068. biter.IsValuePinned() ? &biter : nullptr)) {
  3069. if (get_context->State() == GetContext::GetState::kFound) {
  3070. does_referenced_key_exist = true;
  3071. referenced_data_size = biter.key().size() + biter.value().size();
  3072. }
  3073. done = true;
  3074. break;
  3075. }
  3076. }
  3077. s = biter.status();
  3078. }
  3079. // Write the block cache access record.
  3080. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
  3081. // Avoid making copy of block_key, cf_name, and referenced_key when
  3082. // constructing the access record.
  3083. Slice referenced_key;
  3084. if (does_referenced_key_exist) {
  3085. referenced_key = biter.key();
  3086. } else {
  3087. referenced_key = key;
  3088. }
  3089. BlockCacheTraceRecord access_record(
  3090. rep_->ioptions.env->NowMicros(),
  3091. /*block_key=*/"", lookup_data_block_context.block_type,
  3092. lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
  3093. /*cf_name=*/"", rep_->level_for_tracing(),
  3094. rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
  3095. lookup_data_block_context.is_cache_hit,
  3096. lookup_data_block_context.no_insert,
  3097. lookup_data_block_context.get_id,
  3098. lookup_data_block_context.get_from_user_specified_snapshot,
  3099. /*referenced_key=*/"", referenced_data_size,
  3100. lookup_data_block_context.num_keys_in_block,
  3101. does_referenced_key_exist);
  3102. block_cache_tracer_->WriteBlockAccess(
  3103. access_record, lookup_data_block_context.block_key,
  3104. rep_->cf_name_for_tracing(), referenced_key);
  3105. }
  3106. if (done) {
  3107. // Avoid the extra Next which is expensive in two-level indexes
  3108. break;
  3109. }
  3110. }
  3111. if (matched && filter != nullptr && !filter->IsBlockBased()) {
  3112. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
  3113. PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
  3114. rep_->level);
  3115. }
  3116. if (s.ok() && !iiter->status().IsNotFound()) {
  3117. s = iiter->status();
  3118. }
  3119. }
  3120. return s;
  3121. }
  3122. using MultiGetRange = MultiGetContext::Range;
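// Batched lookup. Roughly (annotation): apply the filter to the whole batch,
// then for each remaining key seek the index and probe the block cache; the
// blocks still missing are fetched together via RetrieveMultipleBlocks()
// (which may coalesce adjacent reads into a single MultiRead request), and
// finally each key is searched in its block(s) much as in Get().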
  3123. void BlockBasedTable::MultiGet(const ReadOptions& read_options,
  3124. const MultiGetRange* mget_range,
  3125. const SliceTransform* prefix_extractor,
  3126. bool skip_filters) {
  3127. FilterBlockReader* const filter =
  3128. !skip_filters ? rep_->filter.get() : nullptr;
  3129. MultiGetRange sst_file_range(*mget_range, mget_range->begin(),
  3130. mget_range->end());
3131. // First check the full filter.
3132. // If the full filter is not useful, then go into each block.
  3133. const bool no_io = read_options.read_tier == kBlockCacheTier;
  3134. uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
  3135. if (!sst_file_range.empty() && sst_file_range.begin()->get_context) {
  3136. tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id();
  3137. }
  3138. BlockCacheLookupContext lookup_context{
  3139. TableReaderCaller::kUserMultiGet, tracing_mget_id,
  3140. /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
  3141. FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io,
  3142. prefix_extractor, &lookup_context);
  3143. if (skip_filters || !sst_file_range.empty()) {
  3144. IndexBlockIter iiter_on_stack;
  3145. // if prefix_extractor found in block differs from options, disable
  3146. // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
  3147. bool need_upper_bound_check = false;
  3148. if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
  3149. need_upper_bound_check = PrefixExtractorChanged(
  3150. rep_->table_properties.get(), prefix_extractor);
  3151. }
  3152. auto iiter =
  3153. NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
  3154. sst_file_range.begin()->get_context, &lookup_context);
  3155. std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  3156. if (iiter != &iiter_on_stack) {
  3157. iiter_unique_ptr.reset(iiter);
  3158. }
  3159. uint64_t offset = std::numeric_limits<uint64_t>::max();
  3160. autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE> block_handles;
  3161. autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE> results;
  3162. autovector<Status, MultiGetContext::MAX_BATCH_SIZE> statuses;
  3163. char stack_buf[kMultiGetReadStackBufSize];
  3164. std::unique_ptr<char[]> block_buf;
  3165. {
  3166. MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
  3167. sst_file_range.end());
  3168. CachableEntry<UncompressionDict> uncompression_dict;
  3169. Status uncompression_dict_status;
  3170. if (rep_->uncompression_dict_reader) {
  3171. uncompression_dict_status =
  3172. rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
  3173. nullptr /* prefetch_buffer */, no_io,
  3174. sst_file_range.begin()->get_context, &lookup_context,
  3175. &uncompression_dict);
  3176. }
  3177. const UncompressionDict& dict = uncompression_dict.GetValue()
  3178. ? *uncompression_dict.GetValue()
  3179. : UncompressionDict::GetEmptyDict();
  3180. size_t total_len = 0;
  3181. ReadOptions ro = read_options;
  3182. ro.read_tier = kBlockCacheTier;
  3183. for (auto miter = data_block_range.begin();
  3184. miter != data_block_range.end(); ++miter) {
  3185. const Slice& key = miter->ikey;
  3186. iiter->Seek(miter->ikey);
  3187. IndexValue v;
  3188. if (iiter->Valid()) {
  3189. v = iiter->value();
  3190. }
  3191. if (!iiter->Valid() ||
  3192. (!v.first_internal_key.empty() && !skip_filters &&
  3193. UserComparatorWrapper(rep_->internal_comparator.user_comparator())
  3194. .Compare(ExtractUserKey(key),
  3195. ExtractUserKey(v.first_internal_key)) < 0)) {
  3196. // The requested key falls between highest key in previous block and
  3197. // lowest key in current block.
  3198. *(miter->s) = iiter->status();
  3199. data_block_range.SkipKey(miter);
  3200. sst_file_range.SkipKey(miter);
  3201. continue;
  3202. }
  3203. if (!uncompression_dict_status.ok()) {
  3204. *(miter->s) = uncompression_dict_status;
  3205. data_block_range.SkipKey(miter);
  3206. sst_file_range.SkipKey(miter);
  3207. continue;
  3208. }
  3209. statuses.emplace_back();
  3210. results.emplace_back();
  3211. if (v.handle.offset() == offset) {
  3212. // We're going to reuse the block for this key later on. No need to
  3213. // look it up now. Place a null handle
  3214. block_handles.emplace_back(BlockHandle::NullBlockHandle());
  3215. continue;
  3216. }
  3217. // Lookup the cache for the given data block referenced by an index
  3218. // iterator value (i.e BlockHandle). If it exists in the cache,
  3219. // initialize block to the contents of the data block.
  3220. offset = v.handle.offset();
  3221. BlockHandle handle = v.handle;
  3222. BlockCacheLookupContext lookup_data_block_context(
  3223. TableReaderCaller::kUserMultiGet);
  3224. Status s = RetrieveBlock(
  3225. nullptr, ro, handle, dict, &(results.back()), BlockType::kData,
  3226. miter->get_context, &lookup_data_block_context,
  3227. /* for_compaction */ false, /* use_cache */ true);
  3228. if (s.IsIncomplete()) {
  3229. s = Status::OK();
  3230. }
  3231. if (s.ok() && !results.back().IsEmpty()) {
  3232. // Found it in the cache. Add NULL handle to indicate there is
  3233. // nothing to read from disk
  3234. block_handles.emplace_back(BlockHandle::NullBlockHandle());
  3235. } else {
  3236. block_handles.emplace_back(handle);
  3237. total_len += block_size(handle);
  3238. }
  3239. }
  3240. if (total_len) {
  3241. char* scratch = nullptr;
  3242. // If the blocks need to be uncompressed and we don't need the
  3243. // compressed blocks, then we can use a contiguous block of
  3244. // memory to read in all the blocks as it will be temporary
  3245. // storage
  3246. // 1. If blocks are compressed and compressed block cache is there,
  3247. // alloc heap bufs
  3248. // 2. If blocks are uncompressed, alloc heap bufs
  3249. // 3. If blocks are compressed and no compressed block cache, use
  3250. // stack buf
  3251. if (rep_->table_options.block_cache_compressed == nullptr &&
  3252. rep_->blocks_maybe_compressed) {
  3253. if (total_len <= kMultiGetReadStackBufSize) {
  3254. scratch = stack_buf;
  3255. } else {
  3256. scratch = new char[total_len];
  3257. block_buf.reset(scratch);
  3258. }
  3259. }
  3260. RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles,
  3261. &statuses, &results, scratch, dict);
  3262. }
  3263. }
  3264. DataBlockIter first_biter;
  3265. DataBlockIter next_biter;
  3266. size_t idx_in_batch = 0;
  3267. for (auto miter = sst_file_range.begin(); miter != sst_file_range.end();
  3268. ++miter) {
  3269. Status s;
  3270. GetContext* get_context = miter->get_context;
  3271. const Slice& key = miter->ikey;
  3272. bool matched = false; // if such user key matched a key in SST
  3273. bool done = false;
  3274. bool first_block = true;
  3275. do {
  3276. DataBlockIter* biter = nullptr;
  3277. bool reusing_block = true;
  3278. uint64_t referenced_data_size = 0;
  3279. bool does_referenced_key_exist = false;
  3280. BlockCacheLookupContext lookup_data_block_context(
  3281. TableReaderCaller::kUserMultiGet, tracing_mget_id,
  3282. /*get_from_user_specified_snapshot=*/read_options.snapshot !=
  3283. nullptr);
  3284. if (first_block) {
  3285. if (!block_handles[idx_in_batch].IsNull() ||
  3286. !results[idx_in_batch].IsEmpty()) {
  3287. first_biter.Invalidate(Status::OK());
  3288. NewDataBlockIterator<DataBlockIter>(
  3289. read_options, results[idx_in_batch], &first_biter,
  3290. statuses[idx_in_batch]);
  3291. reusing_block = false;
  3292. }
  3293. biter = &first_biter;
  3294. idx_in_batch++;
  3295. } else {
  3296. IndexValue v = iiter->value();
  3297. if (!v.first_internal_key.empty() && !skip_filters &&
  3298. UserComparatorWrapper(rep_->internal_comparator.user_comparator())
  3299. .Compare(ExtractUserKey(key),
  3300. ExtractUserKey(v.first_internal_key)) < 0) {
  3301. // The requested key falls between highest key in previous block and
  3302. // lowest key in current block.
  3303. break;
  3304. }
  3305. next_biter.Invalidate(Status::OK());
  3306. NewDataBlockIterator<DataBlockIter>(
  3307. read_options, iiter->value().handle, &next_biter,
  3308. BlockType::kData, get_context, &lookup_data_block_context,
  3309. Status(), nullptr);
  3310. biter = &next_biter;
  3311. reusing_block = false;
  3312. }
  3313. if (read_options.read_tier == kBlockCacheTier &&
  3314. biter->status().IsIncomplete()) {
  3315. // couldn't get block from block_cache
  3316. // Update Saver.state to Found because we are only looking for
  3317. // whether we can guarantee the key is not there when "no_io" is set
  3318. get_context->MarkKeyMayExist();
  3319. break;
  3320. }
  3321. if (!biter->status().ok()) {
  3322. s = biter->status();
  3323. break;
  3324. }
  3325. bool may_exist = biter->SeekForGet(key);
  3326. if (!may_exist) {
3327. // HashSeek cannot find the key in this block and the iter is not at
3328. // the end of the block, i.e. the key cannot be in the following blocks
  3329. // either. In this case, the seek_key cannot be found, so we break
  3330. // from the top level for-loop.
  3331. break;
  3332. }
  3333. // Call the *saver function on each entry/block until it returns false
  3334. for (; biter->Valid(); biter->Next()) {
  3335. ParsedInternalKey parsed_key;
  3336. Cleanable dummy;
  3337. Cleanable* value_pinner = nullptr;
  3338. if (!ParseInternalKey(biter->key(), &parsed_key)) {
  3339. s = Status::Corruption(Slice());
  3340. }
  3341. if (biter->IsValuePinned()) {
  3342. if (reusing_block) {
  3343. Cache* block_cache = rep_->table_options.block_cache.get();
  3344. assert(biter->cache_handle() != nullptr);
  3345. block_cache->Ref(biter->cache_handle());
  3346. dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache,
  3347. biter->cache_handle());
  3348. value_pinner = &dummy;
  3349. } else {
  3350. value_pinner = biter;
  3351. }
  3352. }
  3353. if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
  3354. value_pinner)) {
  3355. if (get_context->State() == GetContext::GetState::kFound) {
  3356. does_referenced_key_exist = true;
  3357. referenced_data_size =
  3358. biter->key().size() + biter->value().size();
  3359. }
  3360. done = true;
  3361. break;
  3362. }
  3363. s = biter->status();
  3364. }
  3365. // Write the block cache access.
  3366. if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
  3367. // Avoid making copy of block_key, cf_name, and referenced_key when
  3368. // constructing the access record.
  3369. Slice referenced_key;
  3370. if (does_referenced_key_exist) {
  3371. referenced_key = biter->key();
  3372. } else {
  3373. referenced_key = key;
  3374. }
  3375. BlockCacheTraceRecord access_record(
  3376. rep_->ioptions.env->NowMicros(),
  3377. /*block_key=*/"", lookup_data_block_context.block_type,
  3378. lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
  3379. /*cf_name=*/"", rep_->level_for_tracing(),
  3380. rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
  3381. lookup_data_block_context.is_cache_hit,
  3382. lookup_data_block_context.no_insert,
  3383. lookup_data_block_context.get_id,
  3384. lookup_data_block_context.get_from_user_specified_snapshot,
  3385. /*referenced_key=*/"", referenced_data_size,
  3386. lookup_data_block_context.num_keys_in_block,
  3387. does_referenced_key_exist);
  3388. block_cache_tracer_->WriteBlockAccess(
  3389. access_record, lookup_data_block_context.block_key,
  3390. rep_->cf_name_for_tracing(), referenced_key);
  3391. }
  3392. s = biter->status();
  3393. if (done) {
  3394. // Avoid the extra Next which is expensive in two-level indexes
  3395. break;
  3396. }
  3397. if (first_block) {
  3398. iiter->Seek(key);
  3399. }
  3400. first_block = false;
  3401. iiter->Next();
  3402. } while (iiter->Valid());
  3403. if (matched && filter != nullptr && !filter->IsBlockBased()) {
  3404. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE);
  3405. PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
  3406. rep_->level);
  3407. }
  3408. if (s.ok()) {
  3409. s = iiter->status();
  3410. }
  3411. *(miter->s) = s;
  3412. }
  3413. }
  3414. }
  3415. Status BlockBasedTable::Prefetch(const Slice* const begin,
  3416. const Slice* const end) {
  3417. auto& comparator = rep_->internal_comparator;
  3418. UserComparatorWrapper user_comparator(comparator.user_comparator());
  3419. // pre-condition
  3420. if (begin && end && comparator.Compare(*begin, *end) > 0) {
  3421. return Status::InvalidArgument(*begin, *end);
  3422. }
  3423. BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  3424. IndexBlockIter iiter_on_stack;
  3425. auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
  3426. &iiter_on_stack, /*get_context=*/nullptr,
  3427. &lookup_context);
  3428. std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  3429. if (iiter != &iiter_on_stack) {
  3430. iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
  3431. }
  3432. if (!iiter->status().ok()) {
  3433. // error opening index iterator
  3434. return iiter->status();
  3435. }
3436. // indicates if we are on the last page that needs to be pre-fetched
  3437. bool prefetching_boundary_page = false;
  3438. for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
  3439. iiter->Next()) {
  3440. BlockHandle block_handle = iiter->value().handle;
  3441. const bool is_user_key = !rep_->index_key_includes_seq;
  3442. if (end &&
  3443. ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
  3444. (is_user_key &&
  3445. user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
  3446. if (prefetching_boundary_page) {
  3447. break;
  3448. }
  3449. // The index entry represents the last key in the data block.
3450. // We should load this page into memory as well, but no further ones
  3451. prefetching_boundary_page = true;
  3452. }
  3453. // Load the block specified by the block_handle into the block cache
  3454. DataBlockIter biter;
  3455. NewDataBlockIterator<DataBlockIter>(
  3456. ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
  3457. /*get_context=*/nullptr, &lookup_context, Status(),
  3458. /*prefetch_buffer=*/nullptr);
  3459. if (!biter.status().ok()) {
  3460. // there was an unexpected error while pre-fetching
  3461. return biter.status();
  3462. }
  3463. }
  3464. return Status::OK();
  3465. }
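// Verifies block checksums for the whole table: first the meta blocks listed
// in the metaindex, then every data block reachable from the index.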
Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
                                       TableReaderCaller caller) {
  Status s;
  // Check Meta blocks
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex,
                         &metaindex_iter);
  if (s.ok()) {
    s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
    if (!s.ok()) {
      return s;
    }
  } else {
    return s;
  }
  // Check Data blocks
  IndexBlockIter iiter_on_stack;
  BlockCacheLookupContext context{caller};
  InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
      read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
      /*get_context=*/nullptr, &context);
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  if (iiter != &iiter_on_stack) {
    iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
  }
  if (!iiter->status().ok()) {
    // error opening index iterator
    return iiter->status();
  }
  s = VerifyChecksumInBlocks(read_options, iiter);
  return s;
}

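// Re-reads every data block referenced by index_iter through BlockFetcher;
// checksum validation happens as part of ReadBlockContents(), so any
// corruption surfaces in the returned Status.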
Status BlockBasedTable::VerifyChecksumInBlocks(
    const ReadOptions& read_options,
    InternalIteratorBase<IndexValue>* index_iter) {
  Status s;
  // We are scanning the whole file, so there is no need to exponentially
  // increase the readahead buffer size.
  size_t readahead_size = (read_options.readahead_size != 0)
                              ? read_options.readahead_size
                              : kMaxAutoReadaheadSize;
  // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
  // needed there.
  FilePrefetchBuffer prefetch_buffer(
      rep_->file.get(), readahead_size /* readahead_size */,
      readahead_size /* max_readahead_size */,
      !rep_->ioptions.allow_mmap_reads /* enable */);

  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle = index_iter->value().handle;
    BlockContents contents;
    BlockFetcher block_fetcher(
        rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
        &contents, rep_->ioptions, false /* decompress */,
        false /*maybe_compressed*/, BlockType::kData,
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

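// Maps a metaindex key (meta block name) to its BlockType. Unknown names are
// unexpected: they trip the assertion in debug builds and yield kInvalid.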
BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
    const Slice& meta_block_name) {
  if (meta_block_name.starts_with(kFilterBlockPrefix) ||
      meta_block_name.starts_with(kFullFilterBlockPrefix) ||
      meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
    return BlockType::kFilter;
  }

  if (meta_block_name == kPropertiesBlock) {
    return BlockType::kProperties;
  }

  if (meta_block_name == kCompressionDictBlock) {
    return BlockType::kCompressionDictionary;
  }

  if (meta_block_name == kRangeDelBlock) {
    return BlockType::kRangeDeletion;
  }

  if (meta_block_name == kHashIndexPrefixesBlock) {
    return BlockType::kHashIndexPrefixes;
  }

  if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
    return BlockType::kHashIndexMetadata;
  }

  assert(false);
  return BlockType::kInvalid;
}

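// Iterates the metaindex and re-reads each meta block to validate its
// checksum. A properties block that comes back as corrupt gets one retry via
// TryReadPropertiesWithGlobalSeqno before the failure is reported.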
Status BlockBasedTable::VerifyChecksumInMetaBlocks(
    InternalIteratorBase<Slice>* index_iter) {
  Status s;
  for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
    s = index_iter->status();
    if (!s.ok()) {
      break;
    }
    BlockHandle handle;
    Slice input = index_iter->value();
    s = handle.DecodeFrom(&input);
    BlockContents contents;
    const Slice meta_block_name = index_iter->key();
    BlockFetcher block_fetcher(
        rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
        ReadOptions(), handle, &contents, rep_->ioptions,
        false /* decompress */, false /*maybe_compressed*/,
        GetBlockTypeForMetaBlockByName(meta_block_name),
        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
    s = block_fetcher.ReadBlockContents();
    if (s.IsCorruption() && meta_block_name == kPropertiesBlock) {
      TableProperties* table_properties;
      s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */,
                                           index_iter->value(),
                                           &table_properties);
      delete table_properties;
    }
    if (!s.ok()) {
      break;
    }
  }
  return s;
}

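// Test-only helpers: report whether a given block (or the data block that a
// given key maps to) is currently resident in the block cache.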
bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
  assert(rep_ != nullptr);

  Cache* const cache = rep_->table_options.block_cache.get();
  if (cache == nullptr) {
    return false;
  }

  char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  Slice cache_key =
      GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle,
                  cache_key_storage);

  Cache::Handle* const cache_handle = cache->Lookup(cache_key);
  if (cache_handle == nullptr) {
    return false;
  }

  cache->Release(cache_handle);

  return true;
}

bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
      /*get_context=*/nullptr, /*lookup_context=*/nullptr));
  iiter->Seek(key);
  assert(iiter->Valid());

  return TEST_BlockInCache(iiter->value().handle);
}

// REQUIRES: The following fields of rep_ should have already been populated:
//  1. file
//  2. index_handle
//  3. options
//  4. internal_comparator
//  5. index_type
Status BlockBasedTable::CreateIndexReader(
    FilePrefetchBuffer* prefetch_buffer,
    InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch,
    bool pin, BlockCacheLookupContext* lookup_context,
    std::unique_ptr<IndexReader>* index_reader) {
  // kHashSearch requires non-empty prefix_extractor but bypass checking
  // prefix_extractor here since we have no access to MutableCFOptions.
  // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator.
  // If prefix_extractor does not match prefix_extractor_name from table
  // properties, turn off Hash Index by setting total_order_seek to true

  switch (rep_->index_type) {
    case BlockBasedTableOptions::kTwoLevelIndexSearch: {
      return PartitionIndexReader::Create(this, prefetch_buffer, use_cache,
                                          prefetch, pin, lookup_context,
                                          index_reader);
    }
    case BlockBasedTableOptions::kBinarySearch:
      FALLTHROUGH_INTENDED;
    case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
      return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                             prefetch, pin, lookup_context,
                                             index_reader);
    }
    case BlockBasedTableOptions::kHashSearch: {
      std::unique_ptr<Block> metaindex_guard;
      std::unique_ptr<InternalIterator> metaindex_iter_guard;
      auto meta_index_iter = preloaded_meta_index_iter;
      bool should_fallback = false;
      if (rep_->internal_prefix_transform.get() == nullptr) {
        ROCKS_LOG_WARN(rep_->ioptions.info_log,
                       "No prefix extractor passed in. Fall back to binary"
                       " search index.");
        should_fallback = true;
      } else if (meta_index_iter == nullptr) {
        auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard,
                                    &metaindex_iter_guard);
        if (!s.ok()) {
          // We simply fall back to binary search in case there is any
          // problem with prefix hash index loading.
          ROCKS_LOG_WARN(rep_->ioptions.info_log,
                         "Unable to read the metaindex block."
                         " Fall back to binary search index.");
          should_fallback = true;
        }
        meta_index_iter = metaindex_iter_guard.get();
      }

      if (should_fallback) {
        return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache,
                                               prefetch, pin, lookup_context,
                                               index_reader);
      } else {
        return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter,
                                       use_cache, prefetch, pin, lookup_context,
                                       index_reader);
      }
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + ToString(rep_->index_type);
      return Status::InvalidArgument(error_message.c_str());
    }
  }
}

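// Returns the approximate file offset of the entry the index iterator is
// positioned at. If the iterator is exhausted, falls back to the data size
// from the table properties, or to the metaindex handle's offset when the
// properties are unavailable.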
uint64_t BlockBasedTable::ApproximateOffsetOf(
    const InternalIteratorBase<IndexValue>& index_iter) const {
  uint64_t result = 0;
  if (index_iter.Valid()) {
    BlockHandle handle = index_iter.value().handle;
    result = handle.offset();
  } else {
    // The iterator is past the last key in the file. If table_properties is
    // not available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
    }
    // table_properties is not present in the table.
    if (result == 0) {
      result = rep_->footer.metaindex_handle().offset();
    }
  }
  return result;
}

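// Approximates the file offset of `key` by seeking the index and taking the
// offset of the data block the key would fall into.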
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
                                              TableReaderCaller caller) {
  BlockCacheLookupContext context(caller);
  IndexBlockIter iiter_on_stack;
  ReadOptions ro;
  ro.total_order_seek = true;
  auto index_iter =
      NewIndexIterator(ro, /*disable_prefix_seek=*/true,
                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
                       /*lookup_context=*/&context);
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  if (index_iter != &iiter_on_stack) {
    iiter_unique_ptr.reset(index_iter);
  }

  index_iter->Seek(key);
  return ApproximateOffsetOf(*index_iter);
}

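// Approximates the number of file bytes covered by the key range
// [start, end] as the difference between the two keys' approximate offsets.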
uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
                                          TableReaderCaller caller) {
  assert(rep_->internal_comparator.Compare(start, end) <= 0);

  BlockCacheLookupContext context(caller);
  IndexBlockIter iiter_on_stack;
  ReadOptions ro;
  ro.total_order_seek = true;
  auto index_iter =
      NewIndexIterator(ro, /*disable_prefix_seek=*/true,
                       /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
                       /*lookup_context=*/&context);
  std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
  if (index_iter != &iiter_on_stack) {
    iiter_unique_ptr.reset(index_iter);
  }

  index_iter->Seek(start);
  uint64_t start_offset = ApproximateOffsetOf(*index_iter);
  index_iter->Seek(end);
  uint64_t end_offset = ApproximateOffsetOf(*index_iter);

  assert(end_offset >= start_offset);
  return end_offset - start_offset;
}

bool BlockBasedTable::TEST_FilterBlockInCache() const {
  assert(rep_ != nullptr);
  return TEST_BlockInCache(rep_->filter_handle);
}

bool BlockBasedTable::TEST_IndexBlockInCache() const {
  assert(rep_ != nullptr);
  return TEST_BlockInCache(rep_->footer.index_handle());
}

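// Materializes every data block as a vector of (key, value) string pairs,
// appending one KVPairBlock per data block; blocks that fail to read are
// skipped rather than failing the whole call.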
Status BlockBasedTable::GetKVPairsFromDataBlocks(
    std::vector<KVPairBlock>* kv_pair_blocks) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));

  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    // Cannot read Index Block
    return s;
  }

  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value().handle,
        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();
    if (!s.ok()) {
      // Error reading the block - Skipped
      continue;
    }

    KVPairBlock kv_pair_block;
    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        // Error reading the block - Skipped
        break;
      }
      const Slice& key = datablock_iter->key();
      const Slice& value = datablock_iter->value();
      std::string key_copy = std::string(key.data(), key.size());
      std::string value_copy = std::string(value.data(), value.size());

      kv_pair_block.push_back(
          std::make_pair(std::move(key_copy), std::move(value_copy)));
    }
    kv_pair_blocks->push_back(std::move(kv_pair_block));
  }
  return Status::OK();
}

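// Writes a human-readable dump of the table to out_file: footer, metaindex,
// table properties, filter, index, compression dictionary, range deletions,
// and finally the data blocks.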
Status BlockBasedTable::DumpTable(WritableFile* out_file) {
  // Output Footer
  out_file->Append(
      "Footer Details:\n"
      "--------------------------------------\n"
      " ");
  out_file->Append(rep_->footer.ToString().c_str());
  out_file->Append("\n");

  // Output MetaIndex
  out_file->Append(
      "Metaindex Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<Block> metaindex;
  std::unique_ptr<InternalIterator> metaindex_iter;
  Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex,
                                &metaindex_iter);
  if (s.ok()) {
    for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
         metaindex_iter->Next()) {
      s = metaindex_iter->status();
      if (!s.ok()) {
        return s;
      }
      if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kPropertiesBlock) {
        out_file->Append(" Properties block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (metaindex_iter->key() ==
                 ROCKSDB_NAMESPACE::kCompressionDictBlock) {
        out_file->Append(" Compression dictionary block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (strstr(metaindex_iter->key().ToString().c_str(),
                        "filter.rocksdb.") != nullptr) {
        out_file->Append(" Filter block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      } else if (metaindex_iter->key() == ROCKSDB_NAMESPACE::kRangeDelBlock) {
        out_file->Append(" Range deletion block handle: ");
        out_file->Append(metaindex_iter->value().ToString(true).c_str());
        out_file->Append("\n");
      }
    }
    out_file->Append("\n");
  } else {
    return s;
  }

  // Output TableProperties
  const ROCKSDB_NAMESPACE::TableProperties* table_properties;
  table_properties = rep_->table_properties.get();

  if (table_properties != nullptr) {
    out_file->Append(
        "Table Properties:\n"
        "--------------------------------------\n"
        " ");
    out_file->Append(table_properties->ToString("\n ", ": ").c_str());
    out_file->Append("\n");
  }

  if (rep_->filter) {
    out_file->Append(
        "Filter Details:\n"
        "--------------------------------------\n"
        " ");
    out_file->Append(rep_->filter->ToString().c_str());
    out_file->Append("\n");
  }

  // Output Index block
  s = DumpIndexBlock(out_file);
  if (!s.ok()) {
    return s;
  }

  // Output compression dictionary
  if (rep_->uncompression_dict_reader) {
    CachableEntry<UncompressionDict> uncompression_dict;
    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
        nullptr /* prefetch_buffer */, false /* no_io */,
        nullptr /* get_context */, nullptr /* lookup_context */,
        &uncompression_dict);
    if (!s.ok()) {
      return s;
    }

    assert(uncompression_dict.GetValue());

    const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
    out_file->Append(
        "Compression Dictionary:\n"
        "--------------------------------------\n");
    out_file->Append(" size (bytes): ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(raw_dict.size()));
    out_file->Append("\n\n");
    out_file->Append(" HEX ");
    out_file->Append(raw_dict.ToString(true).c_str());
    out_file->Append("\n\n");
  }

  // Output range deletions block
  auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
  if (range_del_iter != nullptr) {
    range_del_iter->SeekToFirst();
    if (range_del_iter->Valid()) {
      out_file->Append(
          "Range deletions:\n"
          "--------------------------------------\n"
          " ");
      for (; range_del_iter->Valid(); range_del_iter->Next()) {
        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file);
      }
      out_file->Append("\n");
    }
    delete range_del_iter;
  }

  // Output Data blocks
  s = DumpDataBlocks(out_file);

  return s;
}

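// Dumps each index entry: the hex and ASCII form of its (user) key plus the
// data block handle it points to.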
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
  out_file->Append(
      "Index Details:\n"
      "--------------------------------------\n");
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  out_file->Append(" Block key hex dump: Data block handle\n");
  out_file->Append(" Block key ascii\n\n");
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }
    Slice key = blockhandles_iter->key();
    Slice user_key;
    InternalKey ikey;
    if (!rep_->index_key_includes_seq) {
      user_key = key;
    } else {
      ikey.DecodeFrom(key);
      user_key = ikey.user_key();
    }

    out_file->Append(" HEX ");
    out_file->Append(user_key.ToString(true).c_str());
    out_file->Append(": ");
    out_file->Append(blockhandles_iter->value()
                         .ToString(true, rep_->index_has_first_key)
                         .c_str());
    out_file->Append("\n");

    std::string str_key = user_key.ToString();
    std::string res_key("");
    char cspace = ' ';
    for (size_t i = 0; i < str_key.size(); i++) {
      res_key.append(&str_key[i], 1);
      res_key.append(1, cspace);
    }
    out_file->Append(" ASCII ");
    out_file->Append(res_key.c_str());
    out_file->Append("\n ------\n");
  }
  out_file->Append("\n");
  return Status::OK();
}

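// Dumps every data block's key/value pairs, followed by a summary reporting
// the block count and the min/max/avg data block sizes.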
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
  std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
      NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
                       /*input_iter=*/nullptr, /*get_context=*/nullptr,
                       /*lookup_context=*/nullptr));
  Status s = blockhandles_iter->status();
  if (!s.ok()) {
    out_file->Append("Can not read Index Block \n\n");
    return s;
  }

  uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
  uint64_t datablock_size_max = 0;
  uint64_t datablock_size_sum = 0;

  size_t block_id = 1;
  for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
       block_id++, blockhandles_iter->Next()) {
    s = blockhandles_iter->status();
    if (!s.ok()) {
      break;
    }

    BlockHandle bh = blockhandles_iter->value().handle;
    uint64_t datablock_size = bh.size();
    datablock_size_min = std::min(datablock_size_min, datablock_size);
    datablock_size_max = std::max(datablock_size_max, datablock_size);
    datablock_size_sum += datablock_size;

    out_file->Append("Data Block # ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(block_id));
    out_file->Append(" @ ");
    out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str());
    out_file->Append("\n");
    out_file->Append("--------------------------------------\n");

    std::unique_ptr<InternalIterator> datablock_iter;
    datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
        ReadOptions(), blockhandles_iter->value().handle,
        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
        /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
        /*prefetch_buffer=*/nullptr));
    s = datablock_iter->status();
    if (!s.ok()) {
      out_file->Append("Error reading the block - Skipped \n\n");
      continue;
    }

    for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
         datablock_iter->Next()) {
      s = datablock_iter->status();
      if (!s.ok()) {
        out_file->Append("Error reading the block - Skipped \n");
        break;
      }
      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file);
    }
    out_file->Append("\n");
  }

  uint64_t num_datablocks = block_id - 1;
  if (num_datablocks) {
    double datablock_size_avg =
        static_cast<double>(datablock_size_sum) / num_datablocks;
    out_file->Append("Data Block Summary:\n");
    out_file->Append("--------------------------------------");
    out_file->Append("\n # data blocks: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(num_datablocks));
    out_file->Append("\n min data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_min));
    out_file->Append("\n max data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_max));
    out_file->Append("\n avg data block size: ");
    out_file->Append(ROCKSDB_NAMESPACE::ToString(datablock_size_avg));
    out_file->Append("\n");
  }
  return Status::OK();
}

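// Helper for the dump routines: prints one key/value pair in HEX and ASCII,
// escaping embedded NUL bytes in the ASCII rendering.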
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
                                   WritableFile* out_file) {
  InternalKey ikey;
  ikey.DecodeFrom(key);

  out_file->Append(" HEX ");
  out_file->Append(ikey.user_key().ToString(true).c_str());
  out_file->Append(": ");
  out_file->Append(value.ToString(true).c_str());
  out_file->Append("\n");

  std::string str_key = ikey.user_key().ToString();
  std::string str_value = value.ToString();
  std::string res_key(""), res_value("");
  char cspace = ' ';
  for (size_t i = 0; i < str_key.size(); i++) {
    if (str_key[i] == '\0') {
      res_key.append("\\0", 2);
    } else {
      res_key.append(&str_key[i], 1);
    }
    res_key.append(1, cspace);
  }
  for (size_t i = 0; i < str_value.size(); i++) {
    if (str_value[i] == '\0') {
      res_value.append("\\0", 2);
    } else {
      res_value.append(&str_value[i], 1);
    }
    res_value.append(1, cspace);
  }

  out_file->Append(" ASCII ");
  out_file->Append(res_key.c_str());
  out_file->Append(": ");
  out_file->Append(res_value.c_str());
  out_file->Append("\n ------\n");
}

}  // namespace ROCKSDB_NAMESPACE