version_set.cc 290 KB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/version_set.h"

#include <algorithm>
#include <array>
#include <cinttypes>
#include <cstdio>
#include <list>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "db/blob/blob_fetcher.h"
#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_reader.h"
#include "db/blob/blob_log_format.h"
#include "db/blob/blob_source.h"
#include "db/compaction/compaction.h"
#include "db/compaction/file_pri.h"
#include "db/dbformat.h"
#include "db/internal_stats.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/manifest_ops.h"
#include "db/memtable.h"
#include "db/merge_context.h"
#include "db/merge_helper.h"
#include "db/pinned_iterators_manager.h"
#include "db/table_cache.h"
#include "db/version_builder.h"
#include "db/version_edit.h"
#include "db/version_edit_handler.h"
#include "db/wide/wide_columns_helper.h"
#include "file/file_util.h"
#include "table/compaction_merging_iterator.h"
#if USE_COROUTINES
#include "folly/coro/BlockingWait.h"
#include "folly/coro/Collect.h"
#endif
#include "file/filename.h"
#include "file/random_access_file_reader.h"
#include "file/read_write_util.h"
#include "file/writable_file_writer.h"
#include "logging/logging.h"
#include "monitoring/file_read_sample.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/persistent_stats_history.h"
#include "options/options_helper.h"
#include "rocksdb/env.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/merging_iterator.h"
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/plain/plain_table_factory.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/coro_utils.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/user_comparator_wrapper.h"

// Generate the regular and coroutine versions of some methods by
// including version_set_sync_and_async.h twice
// Macros in the header will expand differently based on whether
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
// clang-format off
#define WITHOUT_COROUTINES
#include "db/version_set_sync_and_async.h"
#undef WITHOUT_COROUTINES
#define WITH_COROUTINES
#include "db/version_set_sync_and_async.h"
#undef WITH_COROUTINES
// clang-format on
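// Illustrative note (not part of the original file): the include-twice
// arrangement above is a macro-driven code-generation pattern. A minimal
// sketch of the idea, using hypothetical names, looks like this:
//
//   // do_work_sync_and_async.h -- intentionally has no include guard
//   #if defined(WITH_COROUTINES)
//   #define DO_WORK_FN DoWorkCoroutine
//   #else
//   #define DO_WORK_FN DoWorkSync
//   #endif
//   Status DO_WORK_FN(int arg) {
//     // ... one shared body, written only once ...
//   }
//   #undef DO_WORK_FN
//
// Each of the two inclusions of "db/version_set_sync_and_async.h" above
// expands the same method bodies, once as plain synchronous code and once
// as coroutine code, so the two variants cannot drift apart.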
namespace ROCKSDB_NAMESPACE {

namespace {

using ScanOptionsMap = std::unordered_map<size_t, MultiScanArgs>;

// Find File in LevelFilesBrief data structure
// Within an index range defined by left and right
int FindFileInRange(const InternalKeyComparator& icmp,
                    const LevelFilesBrief& file_level, const Slice& key,
                    uint32_t left, uint32_t right) {
  auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
    return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
  };
  const auto& b = file_level.files;
  return static_cast<int>(std::lower_bound(b + left, b + right, key, cmp) - b);
}
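// Illustrative sketch (hypothetical helper, not used elsewhere in this
// file): the same std::lower_bound idiom as FindFileInRange, reduced to a
// plain vector of largest keys under bytewise ordering. It returns the
// index of the first file whose largest key is >= the lookup key, or
// largest_keys.size() if every file ends before the key.
[[maybe_unused]] size_t ExampleFindFileByLargestKey(
    const std::vector<std::string>& largest_keys, const std::string& key) {
  // Files on a sorted level are disjoint and ordered by key range, so only
  // the file found here can possibly contain `key`.
  auto it = std::lower_bound(largest_keys.begin(), largest_keys.end(), key);
  return static_cast<size_t>(it - largest_keys.begin());
}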
Status OverlapWithIterator(const Comparator* ucmp,
                           const Slice& smallest_user_key,
                           const Slice& largest_user_key,
                           InternalIterator* iter, bool* overlap) {
  InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
                          kValueTypeForSeek);
  iter->Seek(range_start.Encode());
  if (!iter->status().ok()) {
    return iter->status();
  }

  *overlap = false;
  if (iter->Valid()) {
    ParsedInternalKey seek_result;
    Status s = ParseInternalKey(iter->key(), &seek_result,
                                false /* log_err_key */);  // TODO
    if (!s.ok()) {
      return s;
    }

    if (ucmp->CompareWithoutTimestamp(seek_result.user_key,
                                      largest_user_key) <= 0) {
      *overlap = true;
    }
  }
  return iter->status();
}
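// Illustrative sketch (hypothetical helper, not used elsewhere in this
// file): the overlap test performed by OverlapWithIterator, reduced to
// plain bytewise-ordered user keys. Seeking to smallest_user_key lands on
// max(first_key, smallest_user_key); the ranges overlap iff that entry both
// exists (is <= last_key) and does not pass largest_user_key, which reduces
// to the usual closed-interval intersection test below.
[[maybe_unused]] bool ExampleUserKeyRangesOverlap(
    const std::string& first_key, const std::string& last_key,
    const std::string& smallest_user_key,
    const std::string& largest_user_key) {
  return first_key <= largest_user_key && smallest_user_key <= last_key;
}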
// Class to help choose the next file to search for the particular key.
// Searches and returns files level by level.
// We can search level-by-level since entries never hop across
// levels. Therefore we are guaranteed that if we find data
// in a smaller level, later levels are irrelevant (unless we
// are MergeInProgress).
class FilePicker {
 public:
  FilePicker(const Slice& user_key, const Slice& ikey,
             autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
             FileIndexer* file_indexer, const Comparator* user_comparator,
             const InternalKeyComparator* internal_comparator)
      : num_levels_(num_levels),
        curr_level_(static_cast<unsigned int>(-1)),
        returned_file_level_(static_cast<unsigned int>(-1)),
        hit_file_level_(static_cast<unsigned int>(-1)),
        search_left_bound_(0),
        search_right_bound_(FileIndexer::kLevelMaxIndex),
        level_files_brief_(file_levels),
        is_hit_file_last_in_level_(false),
        curr_file_level_(nullptr),
        user_key_(user_key),
        ikey_(ikey),
        file_indexer_(file_indexer),
        user_comparator_(user_comparator),
        internal_comparator_(internal_comparator) {
    // Setup member variables to search first level.
    search_ended_ = !PrepareNextLevel();
    if (!search_ended_) {
      // Prefetch Level 0 table data to avoid cache miss if possible.
      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
        if (r) {
          r->Prepare(ikey);
        }
      }
    }
  }

  int GetCurrentLevel() const { return curr_level_; }

  FdWithKeyRange* GetNextFile() {
    while (!search_ended_) {  // Loops over different levels.
      while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
        // Loops over all files in current level.
        FdWithKeyRange* f =
            &curr_file_level_->files[curr_index_in_curr_level_];
        hit_file_level_ = curr_level_;
        is_hit_file_last_in_level_ =
            curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
        int cmp_largest = -1;

        // Do key range filtering of files and/or fractional cascading if:
        // (1) not all the files are in level 0, or
        // (2) there are more than 3 current level files
        // If there are only 3 or fewer current level files in the system, we
        // skip the key range filtering. In this case, more likely, the system
        // is highly tuned to minimize the number of tables queried by each
        // query, so it is unlikely that key range filtering is more efficient
        // than querying the files.
        if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
          // Check if key is within a file's range. If the search left bound
          // and right bound point to the same file, we are sure the key
          // falls in range.
          assert(curr_level_ == 0 ||
                 curr_index_in_curr_level_ == start_index_in_curr_level_ ||
                 user_comparator_->CompareWithoutTimestamp(
                     user_key_, ExtractUserKey(f->smallest_key)) <= 0);

          int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
              user_key_, ExtractUserKey(f->smallest_key));
          if (cmp_smallest >= 0) {
            cmp_largest = user_comparator_->CompareWithoutTimestamp(
                user_key_, ExtractUserKey(f->largest_key));
          }

          // Setup file search bound for the next level based on the
          // comparison results
          if (curr_level_ > 0) {
            file_indexer_->GetNextLevelIndex(
                curr_level_, curr_index_in_curr_level_, cmp_smallest,
                cmp_largest, &search_left_bound_, &search_right_bound_);
          }
          // Key falls out of current file's range
          if (cmp_smallest < 0 || cmp_largest > 0) {
            if (curr_level_ == 0) {
              ++curr_index_in_curr_level_;
              continue;
            } else {
              // Search next level.
              break;
            }
          }
        }

        returned_file_level_ = curr_level_;
        if (curr_level_ > 0 && cmp_largest < 0) {
          // No more files to search in this level.
          search_ended_ = !PrepareNextLevel();
        } else {
          ++curr_index_in_curr_level_;
        }
        return f;
      }
      // Start searching next level.
      search_ended_ = !PrepareNextLevel();
    }
    // Search ended.
    return nullptr;
  }
  // getter for current file level
  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
  unsigned int GetHitFileLevel() { return hit_file_level_; }

  // Returns true if the most recent "hit file" (i.e., one returned by
  // GetNextFile()) is at the last index in its level.
  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }

 private:
  unsigned int num_levels_;
  unsigned int curr_level_;
  unsigned int returned_file_level_;
  unsigned int hit_file_level_;
  int32_t search_left_bound_;
  int32_t search_right_bound_;
  autovector<LevelFilesBrief>* level_files_brief_;
  bool search_ended_;
  bool is_hit_file_last_in_level_;
  LevelFilesBrief* curr_file_level_;
  unsigned int curr_index_in_curr_level_;
  unsigned int start_index_in_curr_level_;
  Slice user_key_;
  Slice ikey_;
  FileIndexer* file_indexer_;
  const Comparator* user_comparator_;
  const InternalKeyComparator* internal_comparator_;

  // Setup local variables to search next level.
  // Returns false if there are no more levels to search.
  bool PrepareNextLevel() {
    curr_level_++;
    while (curr_level_ < num_levels_) {
      curr_file_level_ = &(*level_files_brief_)[curr_level_];
      if (curr_file_level_->num_files == 0) {
        // When current level is empty, the search bound generated from upper
        // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
        // also empty.
        assert(search_left_bound_ == 0);
        assert(search_right_bound_ == -1 ||
               search_right_bound_ == FileIndexer::kLevelMaxIndex);
        // Since current level is empty, it will need to search all files in
        // the next level
        search_left_bound_ = 0;
        search_right_bound_ = FileIndexer::kLevelMaxIndex;
        curr_level_++;
        continue;
      }

      // Some files may overlap each other. We find
      // all files that overlap user_key and process them in order from
      // newest to oldest. In the context of merge-operator, this can occur at
      // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
      // are always compacted into a single entry).
      int32_t start_index;
      if (curr_level_ == 0) {
        // On Level-0, we read through all files to check for overlap.
        start_index = 0;
      } else {
        // On Level-n (n>=1), files are sorted. Binary search to find the
        // earliest file whose largest key >= ikey. Search left bound and
        // right bound are used to narrow the range.
        if (search_left_bound_ <= search_right_bound_) {
          if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
            search_right_bound_ =
                static_cast<int32_t>(curr_file_level_->num_files) - 1;
          }
          // `search_right_bound_` is an inclusive upper-bound, but since it
          // was determined based on user key, it is still possible the lookup
          // key falls to the right of `search_right_bound_`'s corresponding
          // file. So, pass a limit one higher, which allows us to detect this
          // case.
          start_index =
              FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
                              static_cast<uint32_t>(search_left_bound_),
                              static_cast<uint32_t>(search_right_bound_) + 1);
          if (start_index == search_right_bound_ + 1) {
            // `ikey_` comes after `search_right_bound_`. The lookup key does
            // not exist on this level, so let's skip this level and do a full
            // binary search on the next level.
            search_left_bound_ = 0;
            search_right_bound_ = FileIndexer::kLevelMaxIndex;
            curr_level_++;
            continue;
          }
        } else {
          // search_left_bound > search_right_bound, key does not exist in
          // this level. Since no comparison is done in this level, it will
          // need to search all files in the next level.
          search_left_bound_ = 0;
          search_right_bound_ = FileIndexer::kLevelMaxIndex;
          curr_level_++;
          continue;
        }
      }
      start_index_in_curr_level_ = start_index;
      curr_index_in_curr_level_ = start_index;
      return true;
    }
    // curr_level_ = num_levels_. So, no more levels to search.
    return false;
  }
};
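// Illustrative sketch (hypothetical types and helper, not used elsewhere in
// this file): the per-level candidate selection that FilePicker performs,
// reduced to bytewise-ordered std::string keys. On level 0, files may
// overlap, so every file whose range covers the key is a candidate and must
// be consulted from newest to oldest; on levels >= 1, files are sorted and
// disjoint, so at most one file -- the first whose largest key is >= the
// lookup key -- can contain it.
struct ExampleFileRange {
  std::string smallest;
  std::string largest;
};
[[maybe_unused]] std::vector<size_t> ExampleCandidateFiles(
    const std::vector<ExampleFileRange>& level_files, int level,
    const std::string& user_key) {
  std::vector<size_t> candidates;
  if (level == 0) {
    for (size_t i = 0; i < level_files.size(); ++i) {
      if (level_files[i].smallest <= user_key &&
          user_key <= level_files[i].largest) {
        candidates.push_back(i);
      }
    }
    return candidates;
  }
  // Binary search for the earliest file whose largest key is >= user_key,
  // mirroring FindFileInRange above.
  auto it = std::lower_bound(
      level_files.begin(), level_files.end(), user_key,
      [](const ExampleFileRange& f, const std::string& k) {
        return f.largest < k;
      });
  if (it != level_files.end() && it->smallest <= user_key) {
    candidates.push_back(static_cast<size_t>(it - level_files.begin()));
  }
  return candidates;
}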
}  // anonymous namespace

class FilePickerMultiGet {
 private:
  struct FilePickerContext;

 public:
  FilePickerMultiGet(MultiGetRange* range,
                     autovector<LevelFilesBrief>* file_levels,
                     unsigned int num_levels, FileIndexer* file_indexer,
                     const Comparator* user_comparator,
                     const InternalKeyComparator* internal_comparator)
      : num_levels_(num_levels),
        curr_level_(static_cast<unsigned int>(-1)),
        returned_file_level_(static_cast<unsigned int>(-1)),
        hit_file_level_(static_cast<unsigned int>(-1)),
        range_(*range, range->begin(), range->end()),
        maybe_repeat_key_(false),
        current_level_range_(*range, range->begin(), range->end()),
        current_file_range_(*range, range->begin(), range->end()),
        batch_iter_(range->begin()),
        batch_iter_prev_(range->begin()),
        upper_key_(range->begin()),
        level_files_brief_(file_levels),
        is_hit_file_last_in_level_(false),
        curr_file_level_(nullptr),
        file_indexer_(file_indexer),
        user_comparator_(user_comparator),
        internal_comparator_(internal_comparator),
        hit_file_(nullptr) {
    for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
      fp_ctx_array_[iter.index()] =
          FilePickerContext(0, FileIndexer::kLevelMaxIndex);
    }

    // Setup member variables to search first level.
    search_ended_ = !PrepareNextLevel();
    if (!search_ended_) {
      // REVISIT
      // Prefetch Level 0 table data to avoid cache miss if possible.
      // As of now, only PlainTableReader and CuckooTableReader do any
      // prefetching. This may not be necessary anymore once we implement
      // batching in those table readers
      for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
        auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
        if (r) {
          for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
            r->Prepare(iter->ikey);
          }
        }
      }
    }
  }

  FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other)
      : num_levels_(other.num_levels_),
        curr_level_(other.curr_level_),
        returned_file_level_(other.returned_file_level_),
        hit_file_level_(other.hit_file_level_),
        fp_ctx_array_(other.fp_ctx_array_),
        range_(*range, range->begin(), range->end()),
        maybe_repeat_key_(false),
        current_level_range_(*range, range->begin(), range->end()),
        current_file_range_(*range, range->begin(), range->end()),
        batch_iter_(range->begin()),
        batch_iter_prev_(range->begin()),
        upper_key_(range->begin()),
        level_files_brief_(other.level_files_brief_),
        is_hit_file_last_in_level_(false),
        curr_file_level_(other.curr_file_level_),
        file_indexer_(other.file_indexer_),
        user_comparator_(other.user_comparator_),
        internal_comparator_(other.internal_comparator_),
        hit_file_(nullptr) {
    PrepareNextLevelForSearch();
  }

  int GetCurrentLevel() const { return curr_level_; }

  void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }

  FdWithKeyRange* GetNextFileInLevel() {
    if (batch_iter_ == current_level_range_.end() || search_ended_) {
      hit_file_ = nullptr;
      return nullptr;
    } else {
      if (maybe_repeat_key_) {
        maybe_repeat_key_ = false;
        // Check if we found the final value for the last key in the
        // previous lookup range. If we did, then there's no need to look
        // any further for that key, so advance batch_iter_. Else, keep
        // batch_iter_ positioned on that key so we look it up again in
        // the next file
        // For L0, always advance the key because we will look in the next
        // file regardless for all keys not found yet
        if (current_level_range_.CheckKeyDone(batch_iter_) ||
            curr_level_ == 0) {
          batch_iter_ = upper_key_;
        }
      }
      // batch_iter_prev_ will become the start key for the next file
      // lookup
      batch_iter_prev_ = batch_iter_;
    }

    MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
                                  current_level_range_.end());
    size_t curr_file_index =
        (batch_iter_ != current_level_range_.end())
            ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
            : curr_file_level_->num_files;
    FdWithKeyRange* f;
    bool is_last_key_in_file;
    if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
                                    &is_last_key_in_file)) {
      hit_file_ = nullptr;
      return nullptr;
    } else {
      if (is_last_key_in_file) {
        // Since cmp_largest is 0, batch_iter_ still points to the last key
        // that falls in this file, instead of the next one. Increment
        // the file index for all keys between batch_iter_ and upper_key_
        auto tmp_iter = batch_iter_;
        while (tmp_iter != upper_key_) {
          ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
          ++tmp_iter;
        }
        maybe_repeat_key_ = true;
      }
      // Set the range for this file
      current_file_range_ =
          MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
      returned_file_level_ = curr_level_;
      hit_file_level_ = curr_level_;
      is_hit_file_last_in_level_ =
          curr_file_index == curr_file_level_->num_files - 1;
      hit_file_ = f;
      return f;
    }
  }

  // getter for current file level
  // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
  unsigned int GetHitFileLevel() { return hit_file_level_; }

  FdWithKeyRange* GetHitFile() { return hit_file_; }

  // Returns true if the most recent "hit file" (i.e., one returned by
  // GetNextFile()) is at the last index in its level.
  bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }

  bool KeyMaySpanNextFile() { return maybe_repeat_key_; }

  bool IsSearchEnded() { return search_ended_; }

  const MultiGetRange& CurrentFileRange() { return current_file_range_; }

  bool RemainingOverlapInLevel() {
    return !current_level_range_.Suffix(current_file_range_).empty();
  }

  MultiGetRange& GetRange() { return range_; }

  void ReplaceRange(const MultiGetRange& other) {
    assert(hit_file_ == nullptr);
    range_ = other;
    current_level_range_ = other;
  }

  FilePickerMultiGet(FilePickerMultiGet&& other)
      : num_levels_(other.num_levels_),
        curr_level_(other.curr_level_),
        returned_file_level_(other.returned_file_level_),
        hit_file_level_(other.hit_file_level_),
        fp_ctx_array_(std::move(other.fp_ctx_array_)),
        range_(std::move(other.range_)),
        maybe_repeat_key_(other.maybe_repeat_key_),
        current_level_range_(std::move(other.current_level_range_)),
        current_file_range_(std::move(other.current_file_range_)),
        batch_iter_(other.batch_iter_, &current_level_range_),
        batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
        upper_key_(other.upper_key_, &current_level_range_),
        level_files_brief_(other.level_files_brief_),
        search_ended_(other.search_ended_),
        is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
        curr_file_level_(other.curr_file_level_),
        file_indexer_(other.file_indexer_),
        user_comparator_(other.user_comparator_),
        internal_comparator_(other.internal_comparator_),
        hit_file_(other.hit_file_) {}
  499. private:
  500. unsigned int num_levels_;
  501. unsigned int curr_level_;
  502. unsigned int returned_file_level_;
  503. unsigned int hit_file_level_;
  504. struct FilePickerContext {
  505. int32_t search_left_bound;
  506. int32_t search_right_bound;
  507. unsigned int curr_index_in_curr_level;
  508. unsigned int start_index_in_curr_level;
  509. FilePickerContext(int32_t left, int32_t right)
  510. : search_left_bound(left),
  511. search_right_bound(right),
  512. curr_index_in_curr_level(0),
  513. start_index_in_curr_level(0) {}
  514. FilePickerContext() = default;
  515. };
  516. std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
  517. MultiGetRange range_;
  518. bool maybe_repeat_key_;
  519. MultiGetRange current_level_range_;
  520. MultiGetRange current_file_range_;
  521. // Iterator to iterate through the keys in a MultiGet batch, that gets reset
  522. // at the beginning of each level. Each call to GetNextFile() will position
  523. // batch_iter_ at or right after the last key that was found in the returned
  524. // SST file
  525. MultiGetRange::Iterator batch_iter_;
  526. // An iterator that records the previous position of batch_iter_, i.e last
  527. // key found in the previous SST file, in order to serve as the start of
  528. // the batch key range for the next SST file
  529. MultiGetRange::Iterator batch_iter_prev_;
  530. MultiGetRange::Iterator upper_key_;
  531. autovector<LevelFilesBrief>* level_files_brief_;
  532. bool search_ended_;
  533. bool is_hit_file_last_in_level_;
  534. LevelFilesBrief* curr_file_level_;
  535. FileIndexer* file_indexer_;
  536. const Comparator* user_comparator_;
  537. const InternalKeyComparator* internal_comparator_;
  538. FdWithKeyRange* hit_file_;
  539. // Iterates through files in the current level until it finds a file that
  540. // contains at least one key from the MultiGet batch
  541. bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
  542. size_t* file_index, FdWithKeyRange** fd,
  543. bool* is_last_key_in_file) {
  544. size_t curr_file_index = *file_index;
  545. FdWithKeyRange* f = nullptr;
  546. bool file_hit = false;
  547. int cmp_largest = -1;
  548. int cmp_smallest = -1;
  549. if (curr_file_index >= curr_file_level_->num_files) {
  550. // In the unlikely case the next key is a duplicate of the current key,
  551. // and the current key is the last in the level and the internal key
  552. // was not found, we need to skip lookup for the remaining keys and
  553. // reset the search bounds
  554. if (batch_iter_ != current_level_range_.end()) {
  555. #ifndef NDEBUG
  556. if (curr_level_ < num_levels_ + 1) {
  557. if ((*level_files_brief_)[curr_level_].num_files == 0) {
  558. struct FilePickerContext& fp_ctx =
  559. fp_ctx_array_[batch_iter_.index()];
  560. assert(fp_ctx.search_left_bound == 0);
  561. assert(fp_ctx.search_right_bound == -1 ||
  562. fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
  563. }
  564. }
  565. #endif // NDBEUG
  566. ++batch_iter_;
  567. for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
  568. struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
  569. fp_ctx.search_left_bound = 0;
  570. fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
  571. }
  572. }
  573. return false;
  574. }
  575. // Loops over keys in the MultiGet batch until it finds a file with
  576. // atleast one of the keys. Then it keeps moving forward until the
  577. // last key in the batch that falls in that file
  578. while (batch_iter_ != current_level_range_.end() &&
  579. (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
  580. curr_file_index ||
  581. !file_hit)) {
  582. struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
  583. f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
  584. Slice& user_key = batch_iter_->ukey_without_ts;
  585. // Do key range filtering of files or/and fractional cascading if:
  586. // (1) not all the files are in level 0, or
  587. // (2) there are more than 3 current level files
  588. // If there are only 3 or less current level files in the system, we
  589. // skip the key range filtering. In this case, more likely, the system
  590. // is highly tuned to minimize number of tables queried by each query,
  591. // so it is unlikely that key range filtering is more efficient than
  592. // querying the files.
  593. if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
  594. // Check if key is within a file's range. If search left bound and
  595. // right bound point to the same find, we are sure key falls in
  596. // range.
  597. cmp_smallest = user_comparator_->CompareWithoutTimestamp(
  598. user_key, false, ExtractUserKey(f->smallest_key), true);
  599. assert(curr_level_ == 0 ||
  600. fp_ctx.curr_index_in_curr_level ==
  601. fp_ctx.start_index_in_curr_level ||
  602. cmp_smallest <= 0);
  603. if (cmp_smallest >= 0) {
  604. cmp_largest = user_comparator_->CompareWithoutTimestamp(
  605. user_key, false, ExtractUserKey(f->largest_key), true);
  606. } else {
  607. cmp_largest = -1;
  608. }
  609. // Setup file search bound for the next level based on the
  610. // comparison results
  611. if (curr_level_ > 0) {
  612. file_indexer_->GetNextLevelIndex(
  613. curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
  614. cmp_largest, &fp_ctx.search_left_bound,
  615. &fp_ctx.search_right_bound);
  616. }
  617. // Key falls out of current file's range
  618. if (cmp_smallest < 0 || cmp_largest > 0) {
  619. next_file_range->SkipKey(batch_iter_);
  620. } else {
  621. file_hit = true;
  622. }
  623. } else {
  624. file_hit = true;
  625. }
  626. if (cmp_largest == 0) {
// cmp_largest is 0, which means the next key will not be in this
// file, so stop looking further. However, it's possible there are
// duplicates in the batch, so find the upper bound for the batch
// in this file (upper_key_) by skipping past the duplicates. We
// leave batch_iter_ as is since we may have to pick up from there
// for the next file, if this file has a merge value rather than a
// final value.
  634. upper_key_ = batch_iter_;
  635. ++upper_key_;
  636. while (upper_key_ != current_level_range_.end() &&
  637. user_comparator_->CompareWithoutTimestamp(
  638. batch_iter_->ukey_without_ts, false,
  639. upper_key_->ukey_without_ts, false) == 0) {
  640. if (curr_level_ > 0) {
  641. struct FilePickerContext& ctx = fp_ctx_array_[upper_key_.index()];
  642. file_indexer_->GetNextLevelIndex(
  643. curr_level_, ctx.curr_index_in_curr_level, cmp_smallest,
  644. cmp_largest, &ctx.search_left_bound, &ctx.search_right_bound);
  645. }
  646. ++upper_key_;
  647. }
  648. break;
  649. } else {
  650. if (curr_level_ == 0) {
  651. // We need to look through all files in level 0
  652. ++fp_ctx.curr_index_in_curr_level;
  653. }
  654. ++batch_iter_;
  655. }
  656. if (!file_hit) {
  657. curr_file_index =
  658. (batch_iter_ != current_level_range_.end())
  659. ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
  660. : curr_file_level_->num_files;
  661. }
  662. }
  663. *fd = f;
  664. *file_index = curr_file_index;
  665. *is_last_key_in_file = cmp_largest == 0;
  666. if (!*is_last_key_in_file) {
// If the largest key in the batch overlapping the file is not the
// largest key in the file, upper_key_ would not have been updated, so
// update it here.
  670. upper_key_ = batch_iter_;
  671. }
  672. return file_hit;
  673. }
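// Illustrative caller loop (a hedged sketch under assumed names, not part
// of the original source): the MultiGet path repeatedly asks this helper
// for the next file that covers at least one remaining key, roughly:
//
//   FdWithKeyRange* fd = nullptr;
//   size_t file_index = 0;
//   bool is_last_key_in_file = false;
//   while (GetNextFileInLevelWithKeys(&file_picker_range, &file_index, &fd,
//                                     &is_last_key_in_file)) {
//     // Probe `fd` for the keys in [batch_iter_prev_, upper_key_); then
//     // either advance to the next file in this level or, once the level
//     // is exhausted, fall through to PrepareNextLevel() below.
//   }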
  674. // Setup local variables to search next level.
  675. // Returns false if there are no more levels to search.
  676. bool PrepareNextLevel() {
  677. if (curr_level_ == 0) {
  678. MultiGetRange::Iterator mget_iter = current_level_range_.begin();
  679. if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
  680. curr_file_level_->num_files) {
  681. batch_iter_prev_ = current_level_range_.begin();
  682. upper_key_ = batch_iter_ = current_level_range_.begin();
  683. return true;
  684. }
  685. }
  686. curr_level_++;
  687. // Reset key range to saved value
  688. while (curr_level_ < num_levels_) {
  689. bool level_contains_keys = false;
  690. curr_file_level_ = &(*level_files_brief_)[curr_level_];
  691. if (curr_file_level_->num_files == 0) {
// When the current level is empty, the search bound generated from the
// upper level must be [0, -1], or [0, FileIndexer::kLevelMaxIndex] if the
// upper level is also empty.
  695. for (auto mget_iter = current_level_range_.begin();
  696. mget_iter != current_level_range_.end(); ++mget_iter) {
  697. struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
  698. assert(fp_ctx.search_left_bound == 0);
  699. assert(fp_ctx.search_right_bound == -1 ||
  700. fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
// Since the current level is empty, the search will need to cover all
// files in the next level.
  703. fp_ctx.search_left_bound = 0;
  704. fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
  705. }
  706. // Skip all subsequent empty levels
  707. do {
  708. ++curr_level_;
  709. } while ((curr_level_ < num_levels_) &&
  710. (*level_files_brief_)[curr_level_].num_files == 0);
  711. continue;
  712. }
// Some files may overlap each other. We find all files that overlap
// user_key and process them in order from newest to oldest. In the
// context of the merge operator, this can occur at any level. Otherwise,
// it only occurs at Level-0 (since Put/Deletes are always compacted into
// a single entry).
  718. int32_t start_index = -1;
  719. current_level_range_ =
  720. MultiGetRange(range_, range_.begin(), range_.end());
  721. for (auto mget_iter = current_level_range_.begin();
  722. mget_iter != current_level_range_.end(); ++mget_iter) {
  723. struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
  724. if (curr_level_ == 0) {
  725. // On Level-0, we read through all files to check for overlap.
  726. start_index = 0;
  727. level_contains_keys = true;
  728. } else {
// On Level-n (n >= 1), files are sorted. Binary search to find the
// earliest file whose largest key >= ikey. The search left bound and
// right bound are used to narrow the range.
  732. if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
  733. if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
  734. fp_ctx.search_right_bound =
  735. static_cast<int32_t>(curr_file_level_->num_files) - 1;
  736. }
// `search_right_bound_` is an inclusive upper bound, but since it
// was determined based on the user key, it is still possible the lookup
// key falls to the right of `search_right_bound_`'s corresponding
// file. So, pass a limit one higher, which allows us to detect this
// case.
  742. Slice& ikey = mget_iter->ikey;
  743. start_index = FindFileInRange(
  744. *internal_comparator_, *curr_file_level_, ikey,
  745. static_cast<uint32_t>(fp_ctx.search_left_bound),
  746. static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
  747. if (start_index == fp_ctx.search_right_bound + 1) {
// `ikey_` comes after `search_right_bound_`. The lookup key does
// not exist on this level, so let's skip this level and do a full
// binary search on the next level.
  751. fp_ctx.search_left_bound = 0;
  752. fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
  753. current_level_range_.SkipKey(mget_iter);
  754. continue;
  755. } else {
  756. level_contains_keys = true;
  757. }
  758. } else {
// search_left_bound > search_right_bound: the key does not exist in
// this level. Since no comparison is done in this level, the search
// will need to cover all files in the next level.
  762. fp_ctx.search_left_bound = 0;
  763. fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
  764. current_level_range_.SkipKey(mget_iter);
  765. continue;
  766. }
  767. }
  768. assert(start_index >= 0);
  769. assert(start_index < static_cast<int32_t>(curr_file_level_->num_files));
  770. fp_ctx.start_index_in_curr_level = start_index;
  771. fp_ctx.curr_index_in_curr_level = start_index;
  772. }
  773. if (level_contains_keys) {
  774. batch_iter_prev_ = current_level_range_.begin();
  775. upper_key_ = batch_iter_ = current_level_range_.begin();
  776. return true;
  777. }
  778. curr_level_++;
  779. }
  780. // curr_level_ = num_levels_. So, no more levels to search.
  781. return false;
  782. }
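// Worked example of the search-bound narrowing above (hedged, not part of
// the original source): suppose level N holds files [a..e][f..k][l..p] and
// a lookup key "g" falls in the middle file but is not found there. The
// FileIndexer::GetNextLevelIndex() call in the level-N pass then records a
// [search_left_bound, search_right_bound] window for level N+1 covering
// only the files that can still contain "g", so the FindFileInRange() call
// above scans a narrowed range instead of the whole level. A window with
// search_left_bound > search_right_bound means no file in this level can
// contain the key, and the level is skipped with a full-range search set
// up for the level below.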
  783. };
  784. VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
  785. Version::~Version() {
  786. assert(refs_ == 0);
  787. // Remove from linked list
  788. prev_->next_ = next_;
  789. next_->prev_ = prev_;
  790. // Drop references to files
  791. for (int level = 0; level < storage_info_.num_levels_; level++) {
  792. for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
  793. FileMetaData* f = storage_info_.files_[level][i];
  794. assert(f->refs > 0);
  795. f->refs--;
  796. if (f->refs <= 0) {
  797. assert(cfd_ != nullptr);
// When not in the process of closing the DB, we'll have a superversion
// to get the current mutable options from.
  800. auto* sv = cfd_->GetSuperVersion();
  801. uint32_t path_id = f->fd.GetPathId();
  802. assert(path_id < cfd_->ioptions().cf_paths.size());
  803. vset_->obsolete_files_.emplace_back(
  804. f, cfd_->ioptions().cf_paths[path_id].path,
  805. sv ? sv->mutable_cf_options.uncache_aggressiveness : 0,
  806. cfd_->GetFileMetadataCacheReservationManager());
  807. }
  808. }
  809. }
  810. }
  811. int FindFile(const InternalKeyComparator& icmp,
  812. const LevelFilesBrief& file_level, const Slice& key) {
  813. return FindFileInRange(icmp, file_level, key, 0,
  814. static_cast<uint32_t>(file_level.num_files));
  815. }
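// Illustrative use (a hedged sketch, not part of the original source):
// FindFile() returns the index of the earliest file whose largest key is
// >= `key`, or file_level.num_files if no such file exists, so a caller
// that wants a definite overlap typically adds a smallest-key check:
//
//   size_t idx = FindFile(icmp, file_level, ikey);
//   if (idx < file_level.num_files &&
//       icmp.user_comparator()->Compare(
//           ExtractUserKey(ikey),
//           ExtractUserKey(file_level.files[idx].smallest_key)) >= 0) {
//     // ikey may be present in files[idx]; otherwise it falls in a gap
//   }
//
// `ikey` here is an assumed internal lookup key built by the caller.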
  816. void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
  817. const std::vector<FileMetaData*>& files,
  818. Arena* arena) {
  819. assert(file_level);
  820. assert(arena);
  821. size_t num = files.size();
  822. file_level->num_files = num;
  823. char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
  824. file_level->files = new (mem) FdWithKeyRange[num];
  825. for (size_t i = 0; i < num; i++) {
  826. Slice smallest_key = files[i]->smallest.Encode();
  827. Slice largest_key = files[i]->largest.Encode();
  828. // Copy key slice to sequential memory
  829. size_t smallest_size = smallest_key.size();
  830. size_t largest_size = largest_key.size();
  831. mem = arena->AllocateAligned(smallest_size + largest_size);
  832. memcpy(mem, smallest_key.data(), smallest_size);
  833. memcpy(mem + smallest_size, largest_key.data(), largest_size);
  834. FdWithKeyRange& f = file_level->files[i];
  835. f.fd = files[i]->fd;
  836. f.file_metadata = files[i];
  837. f.smallest_key = Slice(mem, smallest_size);
  838. f.largest_key = Slice(mem + smallest_size, largest_size);
  839. }
  840. }
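// Layout note (hedged, not part of the original source): for each file the
// smallest and largest internal keys are copied back-to-back into a single
// arena allocation, so f.smallest_key and f.largest_key are slices into
// the same contiguous buffer. The arena owns both the FdWithKeyRange array
// and the key bytes, so nothing built here is freed individually.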
  841. static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
  842. const FdWithKeyRange* f) {
  843. // nullptr user_key occurs before all keys and is therefore never after *f
  844. return (user_key != nullptr &&
  845. ucmp->CompareWithoutTimestamp(*user_key,
  846. ExtractUserKey(f->largest_key)) > 0);
  847. }
  848. static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
  849. const FdWithKeyRange* f) {
  850. // nullptr user_key occurs after all keys and is therefore never before *f
  851. return (user_key != nullptr &&
  852. ucmp->CompareWithoutTimestamp(*user_key,
  853. ExtractUserKey(f->smallest_key)) < 0);
  854. }
  855. bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
  856. bool disjoint_sorted_files,
  857. const LevelFilesBrief& file_level,
  858. const Slice* smallest_user_key,
  859. const Slice* largest_user_key) {
  860. const Comparator* ucmp = icmp.user_comparator();
  861. if (!disjoint_sorted_files) {
  862. // Need to check against all files
  863. for (size_t i = 0; i < file_level.num_files; i++) {
  864. const FdWithKeyRange* f = &(file_level.files[i]);
  865. if (AfterFile(ucmp, smallest_user_key, f) ||
  866. BeforeFile(ucmp, largest_user_key, f)) {
  867. // No overlap
  868. } else {
  869. return true; // Overlap
  870. }
  871. }
  872. return false;
  873. }
  874. // Binary search over file list
  875. uint32_t index = 0;
  876. if (smallest_user_key != nullptr) {
  877. // Find the leftmost possible internal key for smallest_user_key
  878. InternalKey small;
  879. small.SetMinPossibleForUserKey(*smallest_user_key);
  880. index = FindFile(icmp, file_level, small.Encode());
  881. }
  882. if (index >= file_level.num_files) {
  883. // beginning of range is after all files, so no overlap.
  884. return false;
  885. }
  886. return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
  887. }
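// Worked example (hedged, not part of the original source): for a sorted,
// disjoint level holding files [a..c] and [f..h], the query range [d, e]
// binary-searches to the [f..h] file and is then rejected by the
// BeforeFile() check, so SomeFileOverlapsRange() returns false; the range
// [b, g] lands on [a..c] and returns true. When disjoint_sorted_files is
// false (level 0), every file is checked linearly instead.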
  888. namespace {
  889. class LevelIterator final : public InternalIterator {
  890. public:
// NOTE: many of the const& parameters are saved in this object (so they
// must outlive this object).
  893. LevelIterator(
  894. TableCache* table_cache, const ReadOptions& read_options,
  895. const FileOptions& file_options, const InternalKeyComparator& icomparator,
  896. const LevelFilesBrief* flevel, const MutableCFOptions& mutable_cf_options,
  897. bool should_sample, HistogramImpl* file_read_hist,
  898. TableReaderCaller caller, bool skip_filters, int level,
  899. RangeDelAggregator* range_del_agg,
  900. const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
  901. nullptr,
  902. bool allow_unprepared_value = false,
  903. std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ =
  904. nullptr)
  905. : table_cache_(table_cache),
  906. read_options_(read_options),
  907. file_options_(file_options),
  908. icomparator_(icomparator),
  909. user_comparator_(icomparator.user_comparator()),
  910. flevel_(flevel),
  911. mutable_cf_options_(mutable_cf_options),
  912. prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
  913. file_read_hist_(file_read_hist),
  914. caller_(caller),
  915. file_index_(flevel_->num_files),
  916. range_del_agg_(range_del_agg),
  917. pinned_iters_mgr_(nullptr),
  918. compaction_boundaries_(compaction_boundaries),
  919. range_tombstone_iter_(nullptr),
  920. read_seq_(read_options.snapshot
  921. ? read_options.snapshot->GetSequenceNumber()
  922. : kMaxSequenceNumber),
  923. level_(level),
  924. should_sample_(should_sample),
  925. skip_filters_(skip_filters),
  926. allow_unprepared_value_(allow_unprepared_value),
  927. is_next_read_sequential_(false),
  928. to_return_sentinel_(false),
  929. scan_opts_(nullptr) {
  930. // Empty level is not supported.
  931. assert(flevel_ != nullptr && flevel_->num_files > 0);
  932. if (range_tombstone_iter_ptr_) {
  933. *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
  934. }
  935. }
  936. ~LevelIterator() override { delete file_iter_.Set(nullptr); }
// Seek to the first file with a key >= target.
// If range_tombstone_iter_ is not nullptr, then we pretend that file
// boundaries are fake keys (sentinel keys). These keys are used to keep
// range tombstones alive even when all point keys in an SST file are
// exhausted. These sentinel keys will be skipped by the merging iterator.
  942. void Seek(const Slice& target) override;
  943. void SeekForPrev(const Slice& target) override;
  944. void SeekToFirst() override;
  945. void SeekToLast() override;
  946. void Next() final override;
  947. bool NextAndGetResult(IterateResult* result) override;
  948. void Prev() override;
// In addition to the valid and invalid states (!file_iter_.Valid() and
// status.ok()), a third state of the iterator is when !file_iter_.Valid()
// and to_return_sentinel_. This means we are at the end of a file, and a
// sentinel key (the file boundary that we pretend is a key) is to be
// returned next. file_iter_.Valid() and to_return_sentinel_ should not
// both be true.
  954. bool Valid() const override {
  955. assert(!(file_iter_.Valid() && to_return_sentinel_));
  956. return file_iter_.Valid() || to_return_sentinel_;
  957. }
  958. Slice key() const override {
  959. assert(Valid());
  960. if (to_return_sentinel_) {
// The sentinel should be returned after file_iter_ reaches the end of
// the file.
  963. assert(!file_iter_.Valid());
  964. return sentinel_;
  965. }
  966. return file_iter_.key();
  967. }
  968. Slice value() const override {
  969. assert(Valid());
  970. assert(!to_return_sentinel_);
  971. return file_iter_.value();
  972. }
  973. uint64_t write_unix_time() const override {
  974. assert(Valid());
  975. return file_iter_.write_unix_time();
  976. }
  977. Status status() const override {
  978. return file_iter_.iter() ? file_iter_.status() : Status::OK();
  979. }
  980. bool PrepareValue() override { return file_iter_.PrepareValue(); }
  981. inline bool MayBeOutOfLowerBound() override {
  982. assert(Valid());
  983. return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
  984. }
  985. inline IterBoundCheck UpperBoundCheckResult() override {
  986. if (Valid()) {
  987. return file_iter_.UpperBoundCheckResult();
  988. } else {
  989. return IterBoundCheck::kUnknown;
  990. }
  991. }
  992. void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
  993. pinned_iters_mgr_ = pinned_iters_mgr;
  994. if (file_iter_.iter()) {
  995. file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
  996. }
  997. }
  998. bool IsKeyPinned() const override {
  999. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
  1000. file_iter_.iter() && file_iter_.IsKeyPinned();
  1001. }
  1002. bool IsValuePinned() const override {
  1003. return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
  1004. file_iter_.iter() && file_iter_.IsValuePinned();
  1005. }
  1006. bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; }
  1007. void SetRangeDelReadSeqno(SequenceNumber read_seq) override {
  1008. read_seq_ = read_seq;
  1009. }
  1010. inline bool FileHasMultiScanArg(size_t file_index) {
  1011. if (file_to_scan_opts_.get()) {
  1012. auto it = file_to_scan_opts_->find(file_index);
  1013. if (it != file_to_scan_opts_->end()) {
  1014. return !it->second.empty();
  1015. }
  1016. }
  1017. return false;
  1018. }
  1019. MultiScanArgs& GetMultiScanArgForFile(size_t file_index) {
  1020. auto multi_scan_args_it = file_to_scan_opts_->find(file_index);
  1021. if (multi_scan_args_it == file_to_scan_opts_->end()) {
  1022. auto ret = file_to_scan_opts_->emplace(
  1023. file_index, MultiScanArgs(user_comparator_.user_comparator()));
  1024. multi_scan_args_it = ret.first;
  1025. assert(ret.second);
  1026. }
  1027. return multi_scan_args_it->second;
  1028. }
  1029. void Prepare(const MultiScanArgs* so) override {
// We assume here that scan_opts is sorted such that
// scan_opts[0].range.start < scan_opts[1].range.start, and that the
// ranges are non-overlapping.
  1032. if (so == nullptr) {
  1033. return;
  1034. }
  1035. scan_opts_ = so;
  1036. // Verify comparator is consistent
  1037. assert(so->GetComparator() == user_comparator_.user_comparator());
  1038. file_to_scan_opts_ = std::make_unique<ScanOptionsMap>();
  1039. for (size_t k = 0; k < scan_opts_->size(); k++) {
  1040. const ScanOptions& opt = scan_opts_->GetScanRanges().at(k);
  1041. auto start = opt.range.start;
  1042. auto end = opt.range.limit;
  1043. if (!start.has_value()) {
  1044. continue;
  1045. }
// We can handle this case in the future, but for now let's skip it.
  1047. if (!end.has_value()) {
  1048. continue;
  1049. }
  1050. const size_t timestamp_size =
  1051. user_comparator_.user_comparator()->timestamp_size();
  1052. InternalKey istart, iend;
  1053. if (timestamp_size == 0) {
  1054. istart =
  1055. InternalKey(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
  1056. // end key is exclusive for multiscan
  1057. iend = InternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
  1058. } else {
  1059. std::string start_key_with_ts, end_key_with_ts;
  1060. AppendKeyWithMaxTimestamp(&start_key_with_ts, start.value(),
  1061. timestamp_size);
  1062. AppendKeyWithMaxTimestamp(&end_key_with_ts, end.value(),
  1063. timestamp_size);
  1064. istart = InternalKey(start_key_with_ts, kMaxSequenceNumber,
  1065. kValueTypeForSeek);
  1066. // end key is exclusive for multiscan
  1067. iend =
  1068. InternalKey(end_key_with_ts, kMaxSequenceNumber, kValueTypeForSeek);
  1069. }
// TODO: This needs to be optimized; right now we iterate twice, which
// we don't need to. We can do this in N rather than 2N.
  1072. size_t fstart = FindFile(icomparator_, *flevel_, istart.Encode());
  1073. size_t fend = FindFile(icomparator_, *flevel_, iend.Encode());
  1074. // We need to check the relevant cases
  1075. // Cases:
  1076. // 1. [ S E ]
  1077. // 2. [ S ] [ E ]
  1078. // 3. [ S ] ...... [ E ]
  1079. for (auto i = fstart; i <= fend; i++) {
  1080. if (i < flevel_->num_files) {
// FindFile only compares against the largest_key, so we need this
// additional check to ensure the scan range overlaps the file.
  1083. if (icomparator_.InternalKeyComparator::Compare(
  1084. iend.Encode(), flevel_->files[i].smallest_key) < 0) {
  1085. continue;
  1086. }
  1087. auto const metadata = flevel_->files[i].file_metadata;
  1088. if (metadata->FileIsStandAloneRangeTombstone()) {
  1089. // Skip stand alone range deletion files.
  1090. continue;
  1091. }
  1092. auto& args = GetMultiScanArgForFile(i);
  1093. args.insert(start.value(), end.value(), opt.property_bag);
  1094. }
  1095. }
  1096. }
  1097. // Propagate multiscan configs
  1098. for (auto& file_to_arg : *file_to_scan_opts_) {
  1099. file_to_arg.second.CopyConfigFrom(*so);
  1100. }
  1101. }
  1102. private:
  1103. // Return true if at least one invalid file is seen and skipped.
  1104. bool SkipEmptyFileForward();
  1105. void SkipEmptyFileBackward();
  1106. void SetFileIterator(InternalIterator* iter);
  1107. void InitFileIterator(size_t new_file_index);
  1108. const Slice& file_smallest_key(size_t file_index) {
  1109. assert(file_index < flevel_->num_files);
  1110. return flevel_->files[file_index].smallest_key;
  1111. }
  1112. const Slice& file_largest_key(size_t file_index) {
  1113. assert(file_index < flevel_->num_files);
  1114. return flevel_->files[file_index].largest_key;
  1115. }
  1116. bool KeyReachedUpperBound(const Slice& internal_key) {
  1117. return read_options_.iterate_upper_bound != nullptr &&
  1118. user_comparator_.CompareWithoutTimestamp(
  1119. ExtractUserKey(internal_key), /*a_has_ts=*/true,
  1120. *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
  1121. }
  1122. void ClearRangeTombstoneIter() {
  1123. if (range_tombstone_iter_) {
  1124. range_tombstone_iter_->reset();
  1125. }
  1126. }
  1127. // Move file_iter_ to the file at file_index_.
// range_tombstone_iter_ is updated with a range tombstone iterator
// into the new file. The old range tombstone iterator is cleared.
  1130. InternalIterator* NewFileIterator() {
  1131. assert(file_index_ < flevel_->num_files);
  1132. auto file_meta = flevel_->files[file_index_];
  1133. if (should_sample_) {
  1134. sample_file_read_inc(file_meta.file_metadata);
  1135. }
  1136. const InternalKey* smallest_compaction_key = nullptr;
  1137. const InternalKey* largest_compaction_key = nullptr;
  1138. if (compaction_boundaries_ != nullptr) {
  1139. smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
  1140. largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
  1141. }
  1142. CheckMayBeOutOfLowerBound();
  1143. ClearRangeTombstoneIter();
  1144. return table_cache_->NewIterator(
  1145. read_options_, file_options_, icomparator_, *file_meta.file_metadata,
  1146. range_del_agg_, mutable_cf_options_,
  1147. nullptr /* don't need reference to table */, file_read_hist_, caller_,
  1148. /*arena=*/nullptr, skip_filters_, level_,
  1149. /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
  1150. largest_compaction_key, allow_unprepared_value_, &read_seq_,
  1151. range_tombstone_iter_);
  1152. }
// Check whether the current file is fully within iterate_lower_bound.
//
// Note that MyRocks may update the iterate bounds between seeks. To work
// around this, we need to check and update may_be_out_of_lower_bound_
// accordingly.
  1157. void CheckMayBeOutOfLowerBound() {
  1158. if (read_options_.iterate_lower_bound != nullptr &&
  1159. file_index_ < flevel_->num_files) {
  1160. may_be_out_of_lower_bound_ =
  1161. user_comparator_.CompareWithoutTimestamp(
  1162. ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true,
  1163. *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0;
  1164. }
  1165. }
  1166. #ifndef NDEBUG
  1167. bool OverlapRange(const ScanOptions& opts);
  1168. #endif
  1169. TableCache* table_cache_;
  1170. const ReadOptions& read_options_;
  1171. const FileOptions& file_options_;
  1172. const InternalKeyComparator& icomparator_;
  1173. const UserComparatorWrapper user_comparator_;
  1174. const LevelFilesBrief* flevel_;
  1175. mutable FileDescriptor current_value_;
  1176. const MutableCFOptions& mutable_cf_options_;
  1177. const SliceTransform* prefix_extractor_;
  1178. HistogramImpl* file_read_hist_;
  1179. TableReaderCaller caller_;
  1180. size_t file_index_;
  1181. RangeDelAggregator* range_del_agg_;
  1182. IteratorWrapper file_iter_; // May be nullptr
  1183. PinnedIteratorsManager* pinned_iters_mgr_;
// To be propagated to RangeDelAggregator in order to safely truncate
// range tombstones.
  1186. const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
// This is set when this level iterator is used under a merging iterator
// that processes range tombstones. range_tombstone_iter_ points to where
// the merging iterator stores the range tombstone iterator for this level.
// When this level iterator moves to a new SST file, it updates the range
// tombstones accordingly through this pointer, so the merging iterator
// always has access to the current SST file's range tombstones.
//
// The level iterator treats file boundaries as fake keys (sentinel keys)
// to keep range tombstones alive if needed and to make the upper level,
// i.e. the merging iterator, aware of file changes (when the level
// iterator moves to a new SST file, there is some bookkeeping work that
// needs to be done on the merging iterator's end).
//
// *range_tombstone_iter_ points to the range tombstones of the current
// SST file.
  1201. std::unique_ptr<TruncatedRangeDelIterator>* range_tombstone_iter_;
  1202. // The sentinel key to be returned
  1203. Slice sentinel_;
  1204. SequenceNumber read_seq_;
  1205. int level_;
  1206. bool should_sample_;
  1207. bool skip_filters_;
  1208. bool allow_unprepared_value_;
  1209. bool may_be_out_of_lower_bound_ = true;
  1210. bool is_next_read_sequential_;
// Set in Seek() when a prefix seek reaches the end of the current file
// and the next file has a different prefix. SkipEmptyFileForward()
// will not move to the next file when this flag is set.
  1214. bool prefix_exhausted_ = false;
  1215. // Whether next/prev key is a sentinel key.
  1216. bool to_return_sentinel_ = false;
  1217. const MultiScanArgs* scan_opts_ = nullptr;
// Our stored scan options, keyed by file index.
  1219. std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
// Sets the flag for whether we should return the sentinel key next.
// The condition for returning the sentinel is reaching the end of the
// current file_iter_: !Valid() && status().ok().
  1223. void TrySetDeleteRangeSentinel(const Slice& boundary_key);
  1224. void ClearSentinel() { to_return_sentinel_ = false; }
  1225. };
  1226. void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
  1227. assert(range_tombstone_iter_);
  1228. if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
  1229. file_iter_.status().ok()) {
  1230. to_return_sentinel_ = true;
  1231. sentinel_ = boundary_key;
  1232. }
  1233. }
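// Illustration (hedged, not part of the original source): if an SST file
// spans ["d".."m"] and its point keys are exhausted while a range
// tombstone ["a", "z") from the same file is still needed, the file's
// boundary key is surfaced as the sentinel so that the merging iterator
// keeps this level (and its range tombstone iterator) in its heap until
// the boundary key is passed.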
  1234. void LevelIterator::Seek(const Slice& target) {
  1235. prefix_exhausted_ = false;
  1236. ClearSentinel();
// Check whether the seek key falls within the current file.
  1238. bool need_to_reseek = true;
  1239. if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
  1240. const FdWithKeyRange& cur_file = flevel_->files[file_index_];
  1241. if (icomparator_.InternalKeyComparator::Compare(
  1242. target, cur_file.largest_key) <= 0 &&
  1243. icomparator_.InternalKeyComparator::Compare(
  1244. target, cur_file.smallest_key) >= 0) {
  1245. need_to_reseek = false;
  1246. assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
  1247. file_index_);
  1248. }
  1249. }
  1250. if (need_to_reseek) {
  1251. TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
  1252. size_t new_file_index = FindFile(icomparator_, *flevel_, target);
  1253. InitFileIterator(new_file_index);
  1254. }
  1255. if (file_iter_.iter() != nullptr) {
  1256. if (scan_opts_) {
// At this point, we only know that the seek target is < largest_key
// in the file. We need to check whether there is actual overlap.
  1259. const FdWithKeyRange& cur_file = flevel_->files[file_index_];
  1260. if (KeyReachedUpperBound(cur_file.smallest_key)) {
  1261. return;
  1262. }
  1263. }
  1264. file_iter_.Seek(target);
// Status::TryAgain indicates that an asynchronous request for retrieval
// of data blocks has been submitted. So we should return at this point,
// and Seek should be called again to retrieve the requested block and
// execute the remaining code.
  1269. if (file_iter_.status() == Status::TryAgain()) {
  1270. return;
  1271. }
  1272. if (!file_iter_.Valid() && file_iter_.status().ok() &&
  1273. prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
  1274. !read_options_.auto_prefix_mode &&
  1275. file_index_ < flevel_->num_files - 1) {
  1276. size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
  1277. Slice target_user_key_without_ts =
  1278. ExtractUserKeyAndStripTimestamp(target, ts_sz);
  1279. Slice next_file_first_user_key_without_ts =
  1280. ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
  1281. ts_sz);
  1282. if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
  1283. (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) ||
  1284. prefix_extractor_->Transform(target_user_key_without_ts)
  1285. .compare(prefix_extractor_->Transform(
  1286. next_file_first_user_key_without_ts)) != 0)) {
// SkipEmptyFileForward() will not advance to the next file when this
// flag is set, for the reason detailed below.
//
// The file we initially positioned to has no keys under the target
// prefix, and the next file's smallest key has a different prefix than
// the target. When doing a prefix iterator seek, once keys for one
// prefix have been exhausted, the iterator may jump to any key that is
// larger. Here we enforce a stricter contract than that, in order to
// make it easier for higher layers (merging and DB iterator) to reason
// about correctness:
// 1. Within the prefix, the result should be accurate.
// 2. If keys for the prefix are exhausted, the iterator is either
//    positioned at the next key after the prefix, or made invalid.
// A side benefit is that this invalidates the iterator earlier, so the
// upper level merging iterator can merge fewer child iterators.
//
// The flag is cleared in Seek*() calls. There is no need to clear the
// flag in Prev() since Prev() will not be called when the flag is set,
// for the reasons explained below. If range_tombstone_iter_ is nullptr,
// then there is no file boundary sentinel key. Since !file_iter_.Valid()
// from the if condition above, this level iterator is !Valid(), so
// Prev() will not be called. If range_tombstone_iter_ is not nullptr,
// there are two cases, depending on whether this level iterator reaches
// the top of the heap in the merging iterator (the upper layer).
// If so, the merging iterator will see the sentinel key, call
// NextAndGetResult(), and the call to NextAndGetResult() will skip the
// sentinel key and make this level iterator invalid. If not, then it
// could be because the upper layer is done before any method of this
// level iterator is called, or another Seek*() call is invoked. Either
// way, Prev() is never called before Seek*().
// The flag should not be cleared at the beginning of
// Next/NextAndGetResult() since it is used in SkipEmptyFileForward(),
// which is called in Next/NextAndGetResult().
  1322. prefix_exhausted_ = true;
  1323. }
  1324. }
  1325. if (range_tombstone_iter_) {
  1326. TrySetDeleteRangeSentinel(file_largest_key(file_index_));
  1327. }
  1328. }
  1329. SkipEmptyFileForward();
  1330. CheckMayBeOutOfLowerBound();
  1331. }
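// Worked example of the prefix contract above (hedged, not part of the
// original source): with a fixed two-byte prefix extractor, files
// ["aa1".."aa9"] and ["bb1".."bb9"], Seek("aa95") positions at the end of
// the first file. Because the next file's smallest key "bb1" has a
// different prefix than the target, prefix_exhausted_ is set and
// SkipEmptyFileForward() leaves the iterator invalid instead of surfacing
// "bb1", which matches contract (2) above.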
  1332. void LevelIterator::SeekForPrev(const Slice& target) {
  1333. prefix_exhausted_ = false;
  1334. ClearSentinel();
  1335. size_t new_file_index = FindFile(icomparator_, *flevel_, target);
  1336. // Seek beyond this level's smallest key
  1337. if (new_file_index == 0 &&
  1338. icomparator_.Compare(target, file_smallest_key(0)) < 0) {
  1339. SetFileIterator(nullptr);
  1340. ClearRangeTombstoneIter();
  1341. CheckMayBeOutOfLowerBound();
  1342. return;
  1343. }
  1344. if (new_file_index >= flevel_->num_files) {
  1345. new_file_index = flevel_->num_files - 1;
  1346. }
  1347. InitFileIterator(new_file_index);
  1348. if (file_iter_.iter() != nullptr) {
  1349. file_iter_.SeekForPrev(target);
  1350. if (range_tombstone_iter_ &&
  1351. icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
// In the SeekForPrev() case, it is possible that the target is less than
// the file's lower boundary, since the largest key is used to determine
// the file index (FindFile()). When the target is less than the file's
// lower boundary, the sentinel key should not be set, so that
// SeekForPrev() does not result in a key larger than the target. This is
// correct in that there is no need to keep the range tombstones in this
// file alive, as they only cover keys starting from the file's lower
// boundary, which is after `target`.
  1359. TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
  1360. }
  1361. SkipEmptyFileBackward();
  1362. }
  1363. CheckMayBeOutOfLowerBound();
  1364. }
  1365. void LevelIterator::SeekToFirst() {
  1366. prefix_exhausted_ = false;
  1367. ClearSentinel();
  1368. InitFileIterator(0);
  1369. if (file_iter_.iter() != nullptr) {
  1370. file_iter_.SeekToFirst();
  1371. if (range_tombstone_iter_) {
// We do this in SeekToFirst() and SeekToLast() since
// we could have an empty file with only range tombstones.
  1374. TrySetDeleteRangeSentinel(file_largest_key(file_index_));
  1375. }
  1376. }
  1377. SkipEmptyFileForward();
  1378. CheckMayBeOutOfLowerBound();
  1379. }
  1380. void LevelIterator::SeekToLast() {
  1381. prefix_exhausted_ = false;
  1382. ClearSentinel();
  1383. InitFileIterator(flevel_->num_files - 1);
  1384. if (file_iter_.iter() != nullptr) {
  1385. file_iter_.SeekToLast();
  1386. if (range_tombstone_iter_) {
  1387. TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
  1388. }
  1389. }
  1390. SkipEmptyFileBackward();
  1391. CheckMayBeOutOfLowerBound();
  1392. }
  1393. void LevelIterator::Next() {
  1394. assert(Valid());
  1395. if (to_return_sentinel_) {
  1396. // file_iter_ is at EOF already when to_return_sentinel_
  1397. ClearSentinel();
  1398. } else {
  1399. file_iter_.Next();
  1400. if (range_tombstone_iter_) {
  1401. TrySetDeleteRangeSentinel(file_largest_key(file_index_));
  1402. }
  1403. }
  1404. SkipEmptyFileForward();
  1405. }
  1406. bool LevelIterator::NextAndGetResult(IterateResult* result) {
  1407. assert(Valid());
  1408. // file_iter_ is at EOF already when to_return_sentinel_
  1409. bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
  1410. if (!is_valid) {
  1411. if (to_return_sentinel_) {
  1412. ClearSentinel();
  1413. } else if (range_tombstone_iter_) {
  1414. TrySetDeleteRangeSentinel(file_largest_key(file_index_));
  1415. }
  1416. is_next_read_sequential_ = true;
  1417. SkipEmptyFileForward();
  1418. is_next_read_sequential_ = false;
  1419. is_valid = Valid();
  1420. if (is_valid) {
// This could be set in TrySetDeleteRangeSentinel() or
// SkipEmptyFileForward() above.
  1423. if (to_return_sentinel_) {
  1424. result->key = sentinel_;
  1425. result->bound_check_result = IterBoundCheck::kUnknown;
  1426. result->value_prepared = true;
  1427. } else {
  1428. result->key = key();
  1429. result->bound_check_result = file_iter_.UpperBoundCheckResult();
// Ideally, we should return the real file_iter_.value_prepared, but that
// information is not available here. This may cause an extra
// PrepareValue() call for the first key of a file.
  1433. result->value_prepared = !allow_unprepared_value_;
  1434. }
  1435. }
  1436. }
  1437. return is_valid;
  1438. }
  1439. void LevelIterator::Prev() {
  1440. assert(Valid());
  1441. if (to_return_sentinel_) {
  1442. ClearSentinel();
  1443. } else {
  1444. file_iter_.Prev();
  1445. if (range_tombstone_iter_) {
  1446. TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
  1447. }
  1448. }
  1449. SkipEmptyFileBackward();
  1450. }
  1451. bool LevelIterator::SkipEmptyFileForward() {
  1452. bool seen_empty_file = false;
  1453. // Pause at sentinel key
  1454. while (!to_return_sentinel_ &&
  1455. (file_iter_.iter() == nullptr ||
  1456. (!file_iter_.Valid() && file_iter_.status().ok() &&
  1457. file_iter_.iter()->UpperBoundCheckResult() !=
  1458. IterBoundCheck::kOutOfBound))) {
  1459. seen_empty_file = true;
  1460. // Move to next file
  1461. if (file_index_ >= flevel_->num_files - 1 ||
  1462. KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
  1463. prefix_exhausted_) {
  1464. SetFileIterator(nullptr);
  1465. ClearRangeTombstoneIter();
  1466. break;
  1467. }
  1468. // may init a new *range_tombstone_iter
  1469. InitFileIterator(file_index_ + 1);
  1470. // We moved to a new SST file
// Seek range_tombstone_iter_ to reset its !Valid() default state.
// We do not need to call range_tombstone_iter_.Seek* in
// LevelIterator::Seek*, since when the merging iterator calls
// LevelIterator::Seek*, it should also call Seek* on the corresponding
// range tombstone iterator.
  1476. if (file_iter_.iter() != nullptr) {
// If we are doing prepared scan options, then we should seek to the
// start key specified by the scan options.
  1479. if (scan_opts_ && FileHasMultiScanArg(file_index_)) {
  1480. const ScanOptions& opts =
  1481. GetMultiScanArgForFile(file_index_).GetScanRanges().front();
  1482. if (opts.range.start.has_value()) {
  1483. InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
  1484. kValueTypeForSeek);
  1485. file_iter_.Seek(target.Encode());
  1486. }
  1487. } else {
  1488. file_iter_.SeekToFirst();
  1489. }
  1490. if (range_tombstone_iter_) {
  1491. if (*range_tombstone_iter_) {
  1492. (*range_tombstone_iter_)->SeekToFirst();
  1493. }
  1494. TrySetDeleteRangeSentinel(file_largest_key(file_index_));
  1495. }
  1496. }
  1497. }
  1498. return seen_empty_file;
  1499. }
  1500. void LevelIterator::SkipEmptyFileBackward() {
  1501. // Pause at sentinel key
  1502. while (!to_return_sentinel_ &&
  1503. (file_iter_.iter() == nullptr ||
  1504. (!file_iter_.Valid() && file_iter_.status().ok()))) {
  1505. // Move to previous file
  1506. if (file_index_ == 0) {
  1507. // Already the first file
  1508. SetFileIterator(nullptr);
  1509. ClearRangeTombstoneIter();
  1510. return;
  1511. }
  1512. InitFileIterator(file_index_ - 1);
  1513. // We moved to a new SST file
  1514. // Seek range_tombstone_iter_ to reset its !Valid() default state.
  1515. if (file_iter_.iter() != nullptr) {
  1516. file_iter_.SeekToLast();
  1517. if (range_tombstone_iter_) {
  1518. if (*range_tombstone_iter_) {
  1519. (*range_tombstone_iter_)->SeekToLast();
  1520. }
  1521. TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
  1522. if (to_return_sentinel_) {
  1523. break;
  1524. }
  1525. }
  1526. }
  1527. }
  1528. }
  1529. #ifndef NDEBUG
  1530. bool LevelIterator::OverlapRange(const ScanOptions& opts) {
  1531. return (user_comparator_.CompareWithoutTimestamp(
  1532. opts.range.start.value(), /*a_has_ts=*/false,
  1533. ExtractUserKey(flevel_->files[file_index_].largest_key),
  1534. /*b_has_ts=*/true) <= 0 &&
  1535. user_comparator_.CompareWithoutTimestamp(
  1536. opts.range.limit.value(), /*a_has_ts=*/false,
  1537. ExtractUserKey(flevel_->files[file_index_].smallest_key),
  1538. /*b_has_ts=*/true) > 0);
  1539. }
  1540. #endif
  1541. void LevelIterator::SetFileIterator(InternalIterator* iter) {
  1542. if (pinned_iters_mgr_ && iter) {
  1543. iter->SetPinnedItersMgr(pinned_iters_mgr_);
  1544. }
  1545. InternalIterator* old_iter = file_iter_.Set(iter);
  1546. if (iter && scan_opts_) {
  1547. if (FileHasMultiScanArg(file_index_)) {
  1548. const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
  1549. assert(OverlapRange(*new_opts.GetScanRanges().begin()) &&
  1550. OverlapRange(*new_opts.GetScanRanges().rbegin()));
  1551. file_iter_.Prepare(&new_opts);
  1552. }
  1553. }
  1554. // Update the read pattern for PrefetchBuffer.
  1555. if (is_next_read_sequential_) {
  1556. file_iter_.UpdateReadaheadState(old_iter);
  1557. }
  1558. if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
  1559. pinned_iters_mgr_->PinIterator(old_iter);
  1560. } else {
  1561. delete old_iter;
  1562. }
  1563. }
  1564. void LevelIterator::InitFileIterator(size_t new_file_index) {
  1565. if (new_file_index >= flevel_->num_files) {
  1566. file_index_ = new_file_index;
  1567. SetFileIterator(nullptr);
  1568. ClearRangeTombstoneIter();
  1569. return;
  1570. } else {
// If the file iterator returned an Incomplete status, we retry if the
// user seeks to the same file, as this time we may go to a different
// data block that is cached in the block cache.
//
  1575. if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
  1576. new_file_index == file_index_) {
// file_iter_ is already constructed with this iterator, so
// there is no need to change anything.
  1579. } else {
  1580. file_index_ = new_file_index;
  1581. InternalIterator* iter = NewFileIterator();
  1582. SetFileIterator(iter);
  1583. }
  1584. }
  1585. }
  1586. } // anonymous namespace
  1587. Status Version::GetTableProperties(const ReadOptions& read_options,
  1588. std::shared_ptr<const TableProperties>* tp,
  1589. const FileMetaData* file_meta,
  1590. const std::string* fname) const {
  1591. auto* table_cache = cfd_->table_cache();
  1592. const auto& ioptions = cfd_->ioptions();
  1593. Status s = table_cache->GetTableProperties(
  1594. file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp,
  1595. mutable_cf_options_, true /* no io */);
  1596. if (s.ok()) {
  1597. return s;
  1598. }
// We only ignore the `Incomplete` error type, since it is by design that
// we disallow reading the table when it is not in the table cache.
  1601. if (!s.IsIncomplete()) {
  1602. return s;
  1603. }
// 2. The table is not present in the table cache; we'll read the table
// properties directly from the properties block in the file.
  1606. std::unique_ptr<FSRandomAccessFile> file;
  1607. std::string file_name;
  1608. if (fname != nullptr) {
  1609. file_name = *fname;
  1610. } else {
  1611. file_name = TableFileName(ioptions.cf_paths, file_meta->fd.GetNumber(),
  1612. file_meta->fd.GetPathId());
  1613. }
  1614. s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
  1615. nullptr);
  1616. if (!s.ok()) {
  1617. return s;
  1618. }
// By setting the magic number to kNullTableMagicNumber, we can bypass
// the magic number check in the footer.
  1621. std::unique_ptr<RandomAccessFileReader> file_reader(
  1622. new RandomAccessFileReader(
  1623. std::move(file), file_name, ioptions.clock /* clock */, io_tracer_,
  1624. ioptions.stats /* stats */,
  1625. Histograms::SST_READ_MICROS /* hist_type */,
  1626. nullptr /* file_read_hist */, nullptr /* rate_limiter */,
  1627. ioptions.listeners));
  1628. std::unique_ptr<TableProperties> props;
  1629. s = ReadTableProperties(
  1630. file_reader.get(), file_meta->fd.GetFileSize(),
  1631. Footer::kNullTableMagicNumber /* table's magic number */, ioptions,
  1632. read_options, &props);
  1633. if (!s.ok()) {
  1634. return s;
  1635. }
  1636. *tp = std::move(props);
  1637. RecordTick(ioptions.stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
  1638. return s;
  1639. }
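// Example use (a hedged sketch with assumed variables, not part of the
// original source):
//
//   std::shared_ptr<const TableProperties> tp;
//   Status s = version->GetTableProperties(ReadOptions(), &tp, file_meta,
//                                          /*fname=*/nullptr);
//   if (s.ok()) {
//     // tp was served from the table cache, or read directly from the
//     // SST file's properties block as a fallback.
//     uint64_t num_entries = tp->num_entries;
//   }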
  1640. Status Version::GetPropertiesOfAllTables(
  1641. const ReadOptions& read_options, TablePropertiesCollection* props) const {
  1642. Status s;
  1643. for (int level = 0; level < storage_info_.num_levels_; level++) {
  1644. s = GetPropertiesOfAllTables(read_options, props, level);
  1645. if (!s.ok()) {
  1646. return s;
  1647. }
  1648. }
  1649. return Status::OK();
  1650. }
  1651. Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
  1652. std::string* out_str) {
  1653. if (max_entries_to_print <= 0) {
  1654. return Status::OK();
  1655. }
  1656. int num_entries_left = max_entries_to_print;
  1657. std::stringstream ss;
  1658. // TODO: plumb Env::IOActivity, Env::IOPriority
  1659. const ReadOptions read_options;
  1660. for (int level = 0; level < storage_info_.num_levels_; level++) {
  1661. for (const auto& file_meta : storage_info_.files_[level]) {
  1662. auto fname =
  1663. TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(),
  1664. file_meta->fd.GetPathId());
  1665. ss << "=== file : " << fname << " ===\n";
  1666. TableCache* table_cache = cfd_->table_cache();
  1667. std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
  1668. Status s = table_cache->GetRangeTombstoneIterator(
  1669. read_options, cfd_->internal_comparator(), *file_meta,
  1670. mutable_cf_options_, &tombstone_iter);
  1671. if (!s.ok()) {
  1672. return s;
  1673. }
  1674. if (tombstone_iter) {
  1675. tombstone_iter->SeekToFirst();
  1676. // TODO: print timestamp
  1677. while (tombstone_iter->Valid() && num_entries_left > 0) {
  1678. ss << "start: " << tombstone_iter->start_key().ToString(true)
  1679. << " end: " << tombstone_iter->end_key().ToString(true)
  1680. << " seq: " << tombstone_iter->seq() << '\n';
  1681. tombstone_iter->Next();
  1682. num_entries_left--;
  1683. }
  1684. if (num_entries_left <= 0) {
  1685. break;
  1686. }
  1687. }
  1688. }
  1689. if (num_entries_left <= 0) {
  1690. break;
  1691. }
  1692. }
  1693. assert(num_entries_left >= 0);
  1694. if (num_entries_left <= 0) {
  1695. ss << "(results may not be complete)\n";
  1696. }
  1697. *out_str = ss.str();
  1698. return Status::OK();
  1699. }
  1700. Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
  1701. TablePropertiesCollection* props,
  1702. int level) const {
  1703. for (const auto& file_meta : storage_info_.files_[level]) {
  1704. auto fname =
  1705. TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(),
  1706. file_meta->fd.GetPathId());
// 1. If the table is already present in the table cache, load the table
// properties from there.
  1709. std::shared_ptr<const TableProperties> table_properties;
  1710. Status s =
  1711. GetTableProperties(read_options, &table_properties, file_meta, &fname);
  1712. if (s.ok()) {
  1713. props->insert({fname, table_properties});
  1714. } else {
  1715. return s;
  1716. }
  1717. }
  1718. return Status::OK();
  1719. }
  1720. Status Version::GetPropertiesOfTablesInRange(
  1721. const ReadOptions& read_options, const autovector<UserKeyRange>& ranges,
  1722. TablePropertiesCollection* props) const {
  1723. for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
  1724. for (const auto& range : ranges) {
  1725. // Convert user_key into a corresponding internal key.
  1726. InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
  1727. InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
  1728. std::vector<FileMetaData*> files;
  1729. storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
  1730. false);
  1731. for (const auto& file_meta : files) {
  1732. auto fname =
  1733. TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(),
  1734. file_meta->fd.GetPathId());
  1735. if (props->count(fname) == 0) {
// 1. If the table is already present in the table cache, load the table
// properties from there.
  1738. std::shared_ptr<const TableProperties> table_properties;
  1739. Status s = GetTableProperties(read_options, &table_properties,
  1740. file_meta, &fname);
  1741. if (s.ok()) {
  1742. props->insert({fname, table_properties});
  1743. } else {
  1744. return s;
  1745. }
  1746. }
  1747. }
  1748. }
  1749. }
  1750. return Status::OK();
  1751. }
  1752. Status Version::GetPropertiesOfTablesByLevel(
  1753. const ReadOptions& read_options,
  1754. std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
  1755. const {
  1756. Status s;
  1757. props_by_level->reserve(storage_info_.num_levels_);
  1758. for (int level = 0; level < storage_info_.num_levels_; level++) {
  1759. props_by_level->push_back(std::make_unique<TablePropertiesCollection>());
  1760. s = GetPropertiesOfAllTables(read_options, props_by_level->back().get(),
  1761. level);
  1762. if (!s.ok()) {
  1763. return s;
  1764. }
  1765. }
  1766. return Status::OK();
  1767. }
  1768. Status Version::GetAggregatedTableProperties(
  1769. const ReadOptions& read_options, std::shared_ptr<const TableProperties>* tp,
  1770. int level) {
  1771. TablePropertiesCollection props;
  1772. Status s;
  1773. if (level < 0) {
  1774. s = GetPropertiesOfAllTables(read_options, &props);
  1775. } else {
  1776. s = GetPropertiesOfAllTables(read_options, &props, level);
  1777. }
  1778. if (!s.ok()) {
  1779. return s;
  1780. }
  1781. auto* new_tp = new TableProperties();
  1782. for (const auto& item : props) {
  1783. new_tp->Add(*item.second);
  1784. }
  1785. tp->reset(new_tp);
  1786. return Status::OK();
  1787. }
  1788. size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) {
  1789. size_t total_usage = 0;
  1790. for (auto& file_level : storage_info_.level_files_brief_) {
  1791. for (size_t i = 0; i < file_level.num_files; i++) {
  1792. total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
  1793. file_options_, read_options, cfd_->internal_comparator(),
  1794. *file_level.files[i].file_metadata, mutable_cf_options_);
  1795. }
  1796. }
  1797. return total_usage;
  1798. }
  1799. void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
  1800. assert(cf_meta);
  1801. assert(cfd_);
  1802. cf_meta->name = cfd_->GetName();
  1803. cf_meta->size = 0;
  1804. cf_meta->file_count = 0;
  1805. cf_meta->levels.clear();
  1806. cf_meta->blob_file_size = 0;
  1807. cf_meta->blob_file_count = 0;
  1808. cf_meta->blob_files.clear();
  1809. const auto& ioptions = cfd_->ioptions();
  1810. auto* vstorage = storage_info();
  1811. for (int level = 0; level < cfd_->NumberLevels(); level++) {
  1812. uint64_t level_size = 0;
  1813. cf_meta->file_count += vstorage->LevelFiles(level).size();
  1814. std::vector<SstFileMetaData> files;
  1815. for (const auto& file : vstorage->LevelFiles(level)) {
  1816. uint32_t path_id = file->fd.GetPathId();
  1817. std::string file_path;
  1818. if (path_id < ioptions.cf_paths.size()) {
  1819. file_path = ioptions.cf_paths[path_id].path;
  1820. } else {
  1821. assert(!ioptions.cf_paths.empty());
  1822. file_path = ioptions.cf_paths.back().path;
  1823. }
  1824. const uint64_t file_number = file->fd.GetNumber();
  1825. files.emplace_back(
  1826. MakeTableFileName("", file_number), file_number, file_path,
  1827. file->fd.GetFileSize(), file->fd.smallest_seqno,
  1828. file->fd.largest_seqno, file->smallest.user_key().ToString(),
  1829. file->largest.user_key().ToString(),
  1830. file->stats.num_reads_sampled.load(std::memory_order_relaxed),
  1831. file->being_compacted, file->temperature,
  1832. file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
  1833. file->TryGetFileCreationTime(), file->epoch_number,
  1834. file->file_checksum, file->file_checksum_func_name);
  1835. files.back().num_entries = file->num_entries;
  1836. files.back().num_deletions = file->num_deletions;
  1837. files.back().smallest = file->smallest.Encode().ToString();
  1838. files.back().largest = file->largest.Encode().ToString();
  1839. level_size += file->fd.GetFileSize();
  1840. }
  1841. cf_meta->levels.emplace_back(level, level_size, std::move(files));
  1842. cf_meta->size += level_size;
  1843. }
  1844. for (const auto& meta : vstorage->GetBlobFiles()) {
  1845. assert(meta);
  1846. cf_meta->blob_files.emplace_back(
  1847. meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
  1848. ioptions.cf_paths.front().path, meta->GetBlobFileSize(),
  1849. meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
  1850. meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
  1851. meta->GetChecksumMethod(), meta->GetChecksumValue());
  1852. ++cf_meta->blob_file_count;
  1853. cf_meta->blob_file_size += meta->GetBlobFileSize();
  1854. }
  1855. }
  1856. uint64_t Version::GetSstFilesSize() {
  1857. uint64_t sst_files_size = 0;
  1858. for (int level = 0; level < storage_info_.num_levels_; level++) {
  1859. for (const auto& file_meta : storage_info_.LevelFiles(level)) {
  1860. sst_files_size += file_meta->fd.GetFileSize();
  1861. }
  1862. }
  1863. return sst_files_size;
  1864. }
  1865. void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key,
  1866. Slice* largest_user_key) {
  1867. smallest_user_key->clear();
  1868. largest_user_key->clear();
  1869. bool initialized = false;
  1870. const Comparator* ucmp = storage_info_.user_comparator_;
  1871. for (int level = 0; level < cfd_->NumberLevels(); level++) {
  1872. if (storage_info_.LevelFiles(level).size() == 0) {
  1873. continue;
  1874. }
  1875. if (level == 0) {
  1876. // we need to consider all files on level 0
  1877. for (const auto& file : storage_info_.LevelFiles(level)) {
  1878. const Slice& start_user_key = file->smallest.user_key();
  1879. if (!initialized ||
  1880. ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
  1881. *smallest_user_key = start_user_key;
  1882. }
  1883. const Slice& end_user_key = file->largest.user_key();
  1884. if (!initialized ||
  1885. ucmp->Compare(end_user_key, *largest_user_key) > 0) {
  1886. *largest_user_key = end_user_key;
  1887. }
  1888. initialized = true;
  1889. }
  1890. } else {
  1891. // we only need to consider the first and last file
  1892. const Slice& start_user_key =
  1893. storage_info_.LevelFiles(level)[0]->smallest.user_key();
  1894. if (!initialized ||
  1895. ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
  1896. *smallest_user_key = start_user_key;
  1897. }
  1898. const Slice& end_user_key =
  1899. storage_info_.LevelFiles(level).back()->largest.user_key();
  1900. if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
  1901. *largest_user_key = end_user_key;
  1902. }
  1903. initialized = true;
  1904. }
  1905. }
  1906. }
  1907. void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
  1908. uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
  1909. for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
  1910. for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
  1911. assert(meta->fd.table_reader != nullptr);
  1912. uint64_t file_creation_time = meta->TryGetFileCreationTime();
  1913. if (file_creation_time == kUnknownFileCreationTime) {
  1914. *creation_time = 0;
  1915. return;
  1916. }
  1917. if (file_creation_time < oldest_time) {
  1918. oldest_time = file_creation_time;
  1919. }
  1920. }
  1921. }
  1922. *creation_time = oldest_time;
  1923. }
  1924. InternalIterator* Version::TEST_GetLevelIterator(
  1925. const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
  1926. int level, bool allow_unprepared_value) {
  1927. auto* arena = merge_iter_builder->GetArena();
  1928. auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
  1929. std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr = nullptr;
  1930. auto level_iter = new (mem) LevelIterator(
  1931. cfd_->table_cache(), read_options, file_options_,
  1932. cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
  1933. mutable_cf_options_, should_sample_file_read(),
  1934. cfd_->internal_stats()->GetFileReadHist(level),
  1935. TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
  1936. nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
  1937. allow_unprepared_value, &tombstone_iter_ptr);
  1938. if (read_options.ignore_range_deletions) {
  1939. merge_iter_builder->AddIterator(level_iter);
  1940. } else {
  1941. merge_iter_builder->AddPointAndTombstoneIterator(
  1942. level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
  1943. }
  1944. return level_iter;
  1945. }
  1946. uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
  1947. // Estimation will be inaccurate when:
  1948. // (1) there exist merge keys
  1949. // (2) keys are directly overwritten
  1950. // (3) deletion on non-existing keys
  1951. // (4) low number of samples
  1952. if (current_num_samples_ == 0) {
  1953. return 0;
  1954. }
  1955. if (current_num_non_deletions_ <= current_num_deletions_) {
  1956. return 0;
  1957. }
  1958. uint64_t est = current_num_non_deletions_ - current_num_deletions_;
  1959. uint64_t file_count = 0;
  1960. for (int level = 0; level < num_levels_; ++level) {
  1961. file_count += files_[level].size();
  1962. }
  1963. if (current_num_samples_ < file_count) {
  1964. assert(current_num_samples_ != 0);
  1965. assert(est != 0);
  1966. double multiplier = static_cast<double>(file_count) / current_num_samples_;
  1967. double maximum_multiplier =
  1968. static_cast<double>(std::numeric_limits<uint64_t>::max()) / est;
    // If the product could overflow uint64_t, return the maximum value.
  1970. if (multiplier >= maximum_multiplier) {
  1971. return std::numeric_limits<uint64_t>::max();
  1972. }
  1973. return static_cast<uint64_t>(est * multiplier);
  1974. } else {
  1975. return est;
  1976. }
  1977. }
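// Worked example of the extrapolation above (hypothetical numbers): with
// current_num_non_deletions_ = 1,200,000 and current_num_deletions_ =
// 200,000, est = 1,000,000. If only 10 of 40 files have been sampled,
// multiplier = 40 / 10 = 4.0, so the estimate is ~4,000,000 active keys.
// If the product would overflow uint64_t, the maximum value is returned.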
  1978. double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
  1979. int level) const {
  1980. assert(level < num_levels_);
  1981. uint64_t sum_file_size_bytes = 0;
  1982. uint64_t sum_data_size_bytes = 0;
  1983. for (auto* file_meta : files_[level]) {
  1984. auto raw_size = file_meta->raw_key_size + file_meta->raw_value_size;
  1985. // Check if the table property is properly initialized. It might not be
  1986. // because in `UpdateAccumulatedStats` we limit the maximum number of
  1987. // properties to read once.
  1988. if (raw_size > 0) {
  1989. sum_file_size_bytes += file_meta->fd.GetFileSize();
  1990. sum_data_size_bytes += raw_size;
  1991. }
  1992. }
  1993. if (sum_file_size_bytes == 0) {
  1994. return -1.0;
  1995. }
  1996. return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
  1997. }
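// Illustrative example (hypothetical sizes): if the files on a level with
// initialized table properties hold 128 MB of raw key + value data in
// 32 MB of SST files, the estimated ratio is 128 / 32 = 4.0. A return
// value of -1.0 means no file on the level had its properties loaded yet.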
  1998. void Version::AddIterators(const ReadOptions& read_options,
  1999. const FileOptions& soptions,
  2000. MergeIteratorBuilder* merge_iter_builder,
  2001. bool allow_unprepared_value) {
  2002. assert(storage_info_.finalized_);
  2003. for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
  2004. AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
  2005. allow_unprepared_value);
  2006. }
  2007. }
  2008. void Version::AddIteratorsForLevel(const ReadOptions& read_options,
  2009. const FileOptions& soptions,
  2010. MergeIteratorBuilder* merge_iter_builder,
  2011. int level, bool allow_unprepared_value) {
  2012. assert(storage_info_.finalized_);
  2013. if (level >= storage_info_.num_non_empty_levels()) {
  2014. // This is an empty level
  2015. return;
  2016. } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
  2017. // No files in this level
  2018. return;
  2019. }
  2020. bool should_sample = should_sample_file_read();
  2021. auto* arena = merge_iter_builder->GetArena();
  2022. if (level == 0) {
  2023. // Merge all level zero files together since they may overlap
  2024. std::unique_ptr<TruncatedRangeDelIterator> tombstone_iter = nullptr;
  2025. for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
  2026. const auto& file = storage_info_.LevelFilesBrief(0).files[i];
  2027. auto table_iter = cfd_->table_cache()->NewIterator(
  2028. read_options, soptions, cfd_->internal_comparator(),
  2029. *file.file_metadata, /*range_del_agg=*/nullptr, mutable_cf_options_,
  2030. nullptr, cfd_->internal_stats()->GetFileReadHist(0),
  2031. TableReaderCaller::kUserIterator, arena,
  2032. /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
  2033. /*smallest_compaction_key=*/nullptr,
  2034. /*largest_compaction_key=*/nullptr, allow_unprepared_value,
  2035. /*range_del_read_seqno=*/nullptr, &tombstone_iter);
  2036. if (read_options.ignore_range_deletions) {
  2037. merge_iter_builder->AddIterator(table_iter);
  2038. } else {
  2039. merge_iter_builder->AddPointAndTombstoneIterator(
  2040. table_iter, std::move(tombstone_iter));
  2041. }
  2042. }
  2043. if (should_sample) {
      // Count one sample for each L0 file. This is done per iterator
      // creation rather than per Seek(), while files in other levels are
      // recorded per seek. If users execute one range query per iterator,
      // there may be some discrepancy here.
  2048. for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
  2049. sample_file_read_inc(meta);
  2050. }
  2051. }
  2052. } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
  2053. // For levels > 0, we can use a concatenating iterator that sequentially
  2054. // walks through the non-overlapping files in the level, opening them
  2055. // lazily.
  2056. auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
  2057. std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr = nullptr;
  2058. auto level_iter = new (mem) LevelIterator(
  2059. cfd_->table_cache(), read_options, soptions,
  2060. cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
  2061. mutable_cf_options_, should_sample_file_read(),
  2062. cfd_->internal_stats()->GetFileReadHist(level),
  2063. TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
  2064. /*range_del_agg=*/nullptr,
  2065. /*compaction_boundaries=*/nullptr, allow_unprepared_value,
  2066. &tombstone_iter_ptr);
  2067. if (read_options.ignore_range_deletions) {
  2068. merge_iter_builder->AddIterator(level_iter);
  2069. } else {
  2070. merge_iter_builder->AddPointAndTombstoneIterator(
  2071. level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
  2072. }
  2073. }
  2074. }
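// Sketch of the resulting iterator tree (illustrative): because L0 files
// may overlap, each L0 file contributes its own table iterator, while each
// level >= 1 contributes a single lazily-opening LevelIterator. With four
// L0 files and non-empty L1 and L2, the merging iterator builder receives
// 4 + 2 = 6 point iterators (plus range tombstone iterators unless
// read_options.ignore_range_deletions is set).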
  2075. Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
  2076. const FileOptions& file_options,
  2077. const Slice& smallest_user_key,
  2078. const Slice& largest_user_key,
  2079. int level, bool* overlap) {
  2080. assert(storage_info_.finalized_);
  2081. auto icmp = cfd_->internal_comparator();
  2082. auto ucmp = icmp.user_comparator();
  2083. Arena arena;
  2084. Status status;
  2085. ReadRangeDelAggregator range_del_agg(&icmp,
  2086. kMaxSequenceNumber /* upper_bound */);
  2087. *overlap = false;
  2088. if (level == 0) {
  2089. for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
  2090. const auto file = &storage_info_.LevelFilesBrief(0).files[i];
  2091. if (AfterFile(ucmp, &smallest_user_key, file) ||
  2092. BeforeFile(ucmp, &largest_user_key, file)) {
  2093. continue;
  2094. }
  2095. ScopedArenaPtr<InternalIterator> iter(cfd_->table_cache()->NewIterator(
  2096. read_options, file_options, cfd_->internal_comparator(),
  2097. *file->file_metadata, &range_del_agg, mutable_cf_options_, nullptr,
  2098. cfd_->internal_stats()->GetFileReadHist(0),
  2099. TableReaderCaller::kUserIterator, &arena,
  2100. /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
  2101. /*smallest_compaction_key=*/nullptr,
  2102. /*largest_compaction_key=*/nullptr,
  2103. /*allow_unprepared_value=*/false));
  2104. status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
  2105. iter.get(), overlap);
  2106. if (!status.ok() || *overlap) {
  2107. break;
  2108. }
  2109. }
  2110. } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
  2111. auto mem = arena.AllocateAligned(sizeof(LevelIterator));
  2112. ScopedArenaPtr<InternalIterator> iter(new (mem) LevelIterator(
  2113. cfd_->table_cache(), read_options, file_options,
  2114. cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
  2115. mutable_cf_options_, should_sample_file_read(),
  2116. cfd_->internal_stats()->GetFileReadHist(level),
  2117. TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
  2118. &range_del_agg, nullptr, false));
  2119. status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
  2120. iter.get(), overlap);
  2121. }
  2122. if (status.ok() && *overlap == false &&
  2123. range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
  2124. *overlap = true;
  2125. }
  2126. return status;
  2127. }
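// Usage sketch (illustrative, assuming a Version* v and default options):
//   bool overlap = false;
//   Status s = v->OverlapWithLevelIterator(ReadOptions(), FileOptions(),
//                                          Slice("k1"), Slice("k9"),
//                                          /*level=*/2, &overlap);
// On success, overlap is true if any point key or range tombstone on the
// level falls within [k1, k9].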
  2128. VersionStorageInfo::VersionStorageInfo(
  2129. const InternalKeyComparator* internal_comparator,
  2130. const Comparator* user_comparator, int levels,
  2131. CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
  2132. bool _force_consistency_checks,
  2133. EpochNumberRequirement epoch_number_requirement, SystemClock* clock,
  2134. uint32_t bottommost_file_compaction_delay,
  2135. OffpeakTimeOption offpeak_time_option)
  2136. : internal_comparator_(internal_comparator),
  2137. user_comparator_(user_comparator),
  2138. // cfd is nullptr if Version is dummy
  2139. num_levels_(levels),
  2140. num_non_empty_levels_(0),
  2141. file_indexer_(user_comparator),
  2142. compaction_style_(compaction_style),
  2143. files_(new std::vector<FileMetaData*>[num_levels_]),
  2144. base_level_(num_levels_ == 1 ? -1 : 1),
  2145. lowest_unnecessary_level_(-1),
  2146. level_multiplier_(0.0),
  2147. files_by_compaction_pri_(num_levels_),
  2148. level0_non_overlapping_(false),
  2149. next_file_to_compact_by_size_(num_levels_),
  2150. compaction_score_(num_levels_),
  2151. compaction_level_(num_levels_),
  2152. l0_delay_trigger_count_(0),
  2153. compact_cursor_(num_levels_),
  2154. accumulated_file_size_(0),
  2155. accumulated_raw_key_size_(0),
  2156. accumulated_raw_value_size_(0),
  2157. accumulated_num_non_deletions_(0),
  2158. accumulated_num_deletions_(0),
  2159. current_num_non_deletions_(0),
  2160. current_num_deletions_(0),
  2161. current_num_samples_(0),
  2162. estimated_compaction_needed_bytes_(0),
  2163. clock_(clock),
  2164. bottommost_file_compaction_delay_(bottommost_file_compaction_delay),
  2165. finalized_(false),
  2166. force_consistency_checks_(_force_consistency_checks),
  2167. epoch_number_requirement_(epoch_number_requirement),
  2168. offpeak_time_option_(std::move(offpeak_time_option)) {
  2169. if (ref_vstorage != nullptr) {
  2170. accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
  2171. accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
  2172. accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
  2173. accumulated_num_non_deletions_ =
  2174. ref_vstorage->accumulated_num_non_deletions_;
  2175. accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
  2176. current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
  2177. current_num_deletions_ = ref_vstorage->current_num_deletions_;
  2178. current_num_samples_ = ref_vstorage->current_num_samples_;
  2179. oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
  2180. compact_cursor_ = ref_vstorage->compact_cursor_;
  2181. compact_cursor_.resize(num_levels_);
  2182. }
  2183. }
  2184. Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
  2185. const FileOptions& file_opt,
  2186. const MutableCFOptions& mutable_cf_options,
  2187. const std::shared_ptr<IOTracer>& io_tracer,
  2188. uint64_t version_number,
  2189. EpochNumberRequirement epoch_number_requirement)
  2190. : env_(vset->env_),
  2191. clock_(vset->clock_),
  2192. cfd_(column_family_data),
  2193. info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions().logger),
  2194. db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions().stats),
  2195. table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
  2196. blob_source_(cfd_ ? cfd_->blob_source() : nullptr),
  2197. merge_operator_(
  2198. (cfd_ == nullptr) ? nullptr : cfd_->ioptions().merge_operator.get()),
  2199. storage_info_(
  2200. (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
  2201. (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
  2202. cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
  2203. cfd_ == nullptr ? kCompactionStyleLevel
  2204. : cfd_->ioptions().compaction_style,
  2205. (cfd_ == nullptr || cfd_->current() == nullptr)
  2206. ? nullptr
  2207. : cfd_->current()->storage_info(),
  2208. cfd_ == nullptr ? false : cfd_->ioptions().force_consistency_checks,
  2209. epoch_number_requirement,
  2210. cfd_ == nullptr ? nullptr : cfd_->ioptions().clock,
  2211. cfd_ == nullptr ? 0
  2212. : mutable_cf_options.bottommost_file_compaction_delay,
  2213. vset->offpeak_time_option()),
  2214. vset_(vset),
  2215. next_(this),
  2216. prev_(this),
  2217. refs_(0),
  2218. file_options_(file_opt),
  2219. mutable_cf_options_(mutable_cf_options),
  2220. max_file_size_for_l0_meta_pin_(
  2221. MaxFileSizeForL0MetaPin(mutable_cf_options_)),
  2222. version_number_(version_number),
  2223. io_tracer_(io_tracer),
  2224. use_async_io_(false) {
  2225. if (CheckFSFeatureSupport(env_->GetFileSystem().get(),
  2226. FSSupportedOps::kAsyncIO)) {
  2227. use_async_io_ = true;
  2228. }
  2229. }
  2230. Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
  2231. const Slice& blob_index_slice,
  2232. FilePrefetchBuffer* prefetch_buffer,
  2233. PinnableSlice* value, uint64_t* bytes_read) const {
  2234. BlobIndex blob_index;
  2235. {
  2236. Status s = blob_index.DecodeFrom(blob_index_slice);
  2237. if (!s.ok()) {
  2238. return s;
  2239. }
  2240. }
  2241. return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
  2242. bytes_read);
  2243. }
  2244. Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
  2245. const BlobIndex& blob_index,
  2246. FilePrefetchBuffer* prefetch_buffer,
  2247. PinnableSlice* value, uint64_t* bytes_read) const {
  2248. assert(value);
  2249. if (blob_index.HasTTL() || blob_index.IsInlined()) {
  2250. return Status::Corruption("Unexpected TTL/inlined blob index");
  2251. }
  2252. const uint64_t blob_file_number = blob_index.file_number();
  2253. auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
  2254. if (!blob_file_meta) {
  2255. return Status::Corruption("Invalid blob file number");
  2256. }
  2257. assert(blob_source_);
  2258. value->Reset();
  2259. const Status s = blob_source_->GetBlob(
  2260. read_options, user_key, blob_file_number, blob_index.offset(),
  2261. blob_file_meta->GetBlobFileSize(), blob_index.size(),
  2262. blob_index.compression(), prefetch_buffer, value, bytes_read);
  2263. return s;
  2264. }
  2265. void Version::MultiGetBlob(
  2266. const ReadOptions& read_options, MultiGetRange& range,
  2267. std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
  2268. assert(!blob_ctxs.empty());
  2269. autovector<BlobFileReadRequests> blob_reqs;
  2270. for (auto& ctx : blob_ctxs) {
  2271. const auto file_number = ctx.first;
  2272. const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);
  2273. autovector<BlobReadRequest> blob_reqs_in_file;
  2274. BlobReadContexts& blobs_in_file = ctx.second;
  2275. for (auto& blob : blobs_in_file) {
  2276. const BlobIndex& blob_index = blob.blob_index;
  2277. const KeyContext* const key_context = blob.key_context;
  2278. assert(key_context);
  2279. assert(key_context->get_context);
  2280. assert(key_context->s);
  2281. if (key_context->value) {
  2282. key_context->value->Reset();
  2283. } else {
  2284. assert(key_context->columns);
  2285. key_context->columns->Reset();
  2286. }
  2287. if (!blob_file_meta) {
  2288. *key_context->s = Status::Corruption("Invalid blob file number");
  2289. continue;
  2290. }
  2291. if (blob_index.HasTTL() || blob_index.IsInlined()) {
  2292. *key_context->s =
  2293. Status::Corruption("Unexpected TTL/inlined blob index");
  2294. continue;
  2295. }
  2296. blob_reqs_in_file.emplace_back(
  2297. key_context->get_context->ukey_to_get_blob_value(),
  2298. blob_index.offset(), blob_index.size(), blob_index.compression(),
  2299. &blob.result, key_context->s);
  2300. }
  2301. if (blob_reqs_in_file.size() > 0) {
  2302. const auto file_size = blob_file_meta->GetBlobFileSize();
  2303. blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
  2304. }
  2305. }
  2306. if (blob_reqs.size() > 0) {
  2307. blob_source_->MultiGetBlob(read_options, blob_reqs,
  2308. /*bytes_read=*/nullptr);
  2309. }
  2310. for (auto& ctx : blob_ctxs) {
  2311. BlobReadContexts& blobs_in_file = ctx.second;
  2312. for (auto& blob : blobs_in_file) {
  2313. const KeyContext* const key_context = blob.key_context;
  2314. assert(key_context);
  2315. assert(key_context->get_context);
  2316. assert(key_context->s);
  2317. if (key_context->s->ok()) {
  2318. if (key_context->value) {
  2319. *key_context->value = std::move(blob.result);
  2320. range.AddValueSize(key_context->value->size());
  2321. } else {
  2322. assert(key_context->columns);
  2323. key_context->columns->SetPlainValue(std::move(blob.result));
  2324. range.AddValueSize(key_context->columns->serialized_size());
  2325. }
  2326. if (range.GetValueSize() > read_options.value_size_soft_limit) {
  2327. *key_context->s = Status::Aborted();
  2328. }
  2329. } else if (key_context->s->IsIncomplete()) {
  2330. // read_options.read_tier == kBlockCacheTier
  2331. // Cannot read blob(s): no disk I/O allowed
  2332. auto& get_context = *(key_context->get_context);
  2333. get_context.MarkKeyMayExist();
  2334. }
  2335. }
  2336. }
  2337. }
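// Illustrative flow (hypothetical keys): blob_ctxs groups the blob read
// contexts by blob file number, e.g. {7: [k1, k3], 9: [k2]}. The loop above
// builds one BlobFileReadRequests batch per file (two batches here), so
// blob_source_->MultiGetBlob() touches each blob file once, and the results
// are then copied back into the per-key value/columns slots.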
  2338. void Version::Get(const ReadOptions& read_options, const LookupKey& k,
  2339. PinnableSlice* value, PinnableWideColumns* columns,
  2340. std::string* timestamp, Status* status,
  2341. MergeContext* merge_context,
  2342. SequenceNumber* max_covering_tombstone_seq,
  2343. PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
  2344. bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
  2345. bool* is_blob, bool do_merge) {
  2346. Slice ikey = k.internal_key();
  2347. Slice user_key = k.user_key();
  2348. assert(status->ok() || status->IsMergeInProgress());
  2349. if (key_exists != nullptr) {
  2350. // will falsify below if not found
  2351. *key_exists = true;
  2352. }
  2353. uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
  2354. if (vset_ && vset_->block_cache_tracer_ &&
  2355. vset_->block_cache_tracer_->is_tracing_enabled()) {
  2356. tracing_get_id = vset_->block_cache_tracer_->NextGetId();
  2357. }
  2358. // Note: the old StackableDB-based BlobDB passes in
  2359. // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we
  2360. // need to provide it here.
  2361. bool is_blob_index = false;
  2362. bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
  2363. BlobFetcher blob_fetcher(this, read_options);
  2364. assert(pinned_iters_mgr);
  2365. GetContext get_context(
  2366. user_comparator(), merge_operator_, info_log_, db_statistics_,
  2367. status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
  2368. do_merge ? value : nullptr, do_merge ? columns : nullptr,
  2369. do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
  2370. max_covering_tombstone_seq, clock_, seq,
  2371. merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
  2372. tracing_get_id, &blob_fetcher);
  2373. // Pin blocks that we read to hold merge operands
  2374. if (merge_operator_) {
  2375. pinned_iters_mgr->StartPinning();
  2376. }
  2377. FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
  2378. storage_info_.num_non_empty_levels_,
  2379. &storage_info_.file_indexer_, user_comparator(),
  2380. internal_comparator());
  2381. FdWithKeyRange* f = fp.GetNextFile();
  2382. while (f != nullptr) {
  2383. if (*max_covering_tombstone_seq > 0) {
  2384. // The remaining files we look at will only contain covered keys, so we
  2385. // stop here.
  2386. break;
  2387. }
  2388. if (get_context.sample()) {
  2389. sample_file_read_inc(f->file_metadata);
  2390. }
  2391. bool timer_enabled =
  2392. GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
  2393. get_perf_context()->per_level_perf_context_enabled;
  2394. StopWatchNano timer(clock_, timer_enabled /* auto_start */);
  2395. *status = table_cache_->Get(
  2396. read_options, *internal_comparator(), *f->file_metadata, ikey,
  2397. &get_context, mutable_cf_options_,
  2398. cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
  2399. IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
  2400. fp.IsHitFileLastInLevel()),
  2401. fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
  2402. // TODO: examine the behavior for corrupted key
  2403. if (timer_enabled) {
  2404. PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
  2405. fp.GetHitFileLevel());
  2406. }
  2407. if (!status->ok()) {
  2408. if (db_statistics_ != nullptr) {
  2409. get_context.ReportCounters();
  2410. }
  2411. return;
  2412. }
  2413. // report the counters before returning
  2414. if (get_context.State() != GetContext::kNotFound &&
  2415. get_context.State() != GetContext::kMerge &&
  2416. db_statistics_ != nullptr) {
  2417. get_context.ReportCounters();
  2418. }
  2419. switch (get_context.State()) {
  2420. case GetContext::kNotFound:
  2421. // Keep searching in other files
  2422. break;
  2423. case GetContext::kMerge:
  2424. // TODO: update per-level perfcontext user_key_return_count for kMerge
  2425. break;
  2426. case GetContext::kFound:
  2427. if (fp.GetHitFileLevel() == 0) {
  2428. RecordTick(db_statistics_, GET_HIT_L0);
  2429. } else if (fp.GetHitFileLevel() == 1) {
  2430. RecordTick(db_statistics_, GET_HIT_L1);
  2431. } else if (fp.GetHitFileLevel() >= 2) {
  2432. RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
  2433. }
  2434. PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
  2435. fp.GetHitFileLevel());
  2436. if (is_blob_index && do_merge && (value || columns)) {
  2437. Slice blob_index =
  2438. value ? *value
  2439. : WideColumnsHelper::GetDefaultColumn(columns->columns());
  2440. TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
  2441. &blob_index);
  2442. constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
  2443. PinnableSlice result;
  2444. constexpr uint64_t* bytes_read = nullptr;
  2445. *status = GetBlob(read_options, get_context.ukey_to_get_blob_value(),
  2446. blob_index, prefetch_buffer, &result, bytes_read);
  2447. if (!status->ok()) {
  2448. if (status->IsIncomplete()) {
  2449. get_context.MarkKeyMayExist();
  2450. }
  2451. return;
  2452. }
  2453. if (value) {
  2454. *value = std::move(result);
  2455. } else {
  2456. assert(columns);
  2457. columns->SetPlainValue(std::move(result));
  2458. }
  2459. }
  2460. return;
  2461. case GetContext::kDeleted:
  2462. // Use empty error message for speed
  2463. *status = Status::NotFound();
  2464. return;
  2465. case GetContext::kCorrupt:
  2466. *status = Status::Corruption("corrupted key for ", user_key);
  2467. return;
  2468. case GetContext::kUnexpectedBlobIndex:
  2469. ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
  2470. *status = Status::NotSupported(
  2471. "Encounter unexpected blob index. Please open DB with "
  2472. "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
  2473. return;
  2474. case GetContext::kMergeOperatorFailed:
  2475. *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed);
  2476. return;
  2477. }
  2478. f = fp.GetNextFile();
  2479. }
  2480. if (db_statistics_ != nullptr) {
  2481. get_context.ReportCounters();
  2482. }
  2483. if (GetContext::kMerge == get_context.State()) {
  2484. if (!do_merge) {
  2485. *status = Status::OK();
  2486. return;
  2487. }
  2488. if (!merge_operator_) {
  2489. *status = Status::InvalidArgument(
  2490. "merge_operator is not properly initialized.");
  2491. return;
  2492. }
    // Merge operands are in the saver and we hit the beginning of the key
    // history; do a final merge of nullptr and the operands.
  2495. if (value || columns) {
  2496. // `op_failure_scope` (an output parameter) is not provided (set to
  2497. // nullptr) since a failure must be propagated regardless of its value.
  2498. *status = MergeHelper::TimedFullMerge(
  2499. merge_operator_, user_key, MergeHelper::kNoBaseValue,
  2500. merge_context->GetOperands(), info_log_, db_statistics_, clock_,
  2501. /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
  2502. value ? value->GetSelf() : nullptr, columns);
  2503. if (status->ok()) {
  2504. if (LIKELY(value != nullptr)) {
  2505. value->PinSelf();
  2506. }
  2507. }
  2508. }
  2509. } else {
  2510. if (key_exists != nullptr) {
  2511. *key_exists = false;
  2512. }
  2513. *status = Status::NotFound(); // Use an empty error message for speed
  2514. }
  2515. }
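// Search-order sketch (illustrative): FilePicker visits newer data first,
// i.e. L0 files from newest to oldest, then each deeper level in turn. A
// hit in L0 (GET_HIT_L0) therefore shadows any older value of the same
// user key in L1+, and the loop above stops at the first definitive state
// such as kFound, kDeleted, or kCorrupt, while kNotFound and kMerge keep
// the search going to the next file.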
  2516. void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  2517. ReadCallback* callback) {
  2518. PinnedIteratorsManager pinned_iters_mgr;
  2519. // Pin blocks that we read to hold merge operands
  2520. if (merge_operator_) {
  2521. pinned_iters_mgr.StartPinning();
  2522. }
  2523. uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
  2524. if (vset_ && vset_->block_cache_tracer_ &&
  2525. vset_->block_cache_tracer_->is_tracing_enabled()) {
  2526. tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
  2527. }
  2528. // Even though we know the batch size won't be > MAX_BATCH_SIZE,
  2529. // use autovector in order to avoid unnecessary construction of GetContext
  2530. // objects, which is expensive
  2531. autovector<GetContext, 16> get_ctx;
  2532. BlobFetcher blob_fetcher(this, read_options);
  2533. for (auto iter = range->begin(); iter != range->end(); ++iter) {
  2534. assert(iter->s->ok() || iter->s->IsMergeInProgress());
  2535. get_ctx.emplace_back(
  2536. user_comparator(), merge_operator_, info_log_, db_statistics_,
  2537. iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
  2538. iter->ukey_with_ts, iter->value, iter->columns, iter->timestamp,
  2539. nullptr, &(iter->merge_context), true,
  2540. &iter->max_covering_tombstone_seq, clock_, nullptr,
  2541. merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
  2542. &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
  2543. // MergeInProgress status, if set, has been transferred to the get_context
  2544. // state, so we set status to ok here. From now on, the iter status will
  2545. // be used for IO errors, and get_context state will be used for any
  2546. // key level errors
  2547. *(iter->s) = Status::OK();
  2548. }
  2549. int get_ctx_index = 0;
  2550. for (auto iter = range->begin(); iter != range->end();
  2551. ++iter, get_ctx_index++) {
  2552. iter->get_context = &(get_ctx[get_ctx_index]);
  2553. }
  2554. Status s;
  2555. // blob_file => [[blob_idx, it], ...]
  2556. std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
  2557. MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
  2558. #if USE_COROUTINES
  2559. if (read_options.async_io && read_options.optimize_multiget_for_io &&
  2560. using_coroutines() && use_async_io_) {
  2561. s = MultiGetAsync(read_options, range, &blob_ctxs);
  2562. } else
  2563. #endif // USE_COROUTINES
  2564. {
  2565. MultiGetRange file_picker_range(*range, range->begin(), range->end());
  2566. FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_,
  2567. storage_info_.num_non_empty_levels_,
  2568. &storage_info_.file_indexer_, user_comparator(),
  2569. internal_comparator());
  2570. FdWithKeyRange* f = fp.GetNextFileInLevel();
  2571. uint64_t num_index_read = 0;
  2572. uint64_t num_filter_read = 0;
  2573. uint64_t num_sst_read = 0;
  2574. uint64_t num_level_read = 0;
  2575. int prev_level = -1;
  2576. while (!fp.IsSearchEnded()) {
  2577. // This will be set to true later if we actually look up in a file in L0.
  2578. // For per level stats purposes, an L0 file is treated as a level
  2579. bool dump_stats_for_l0_file = false;
  2580. // Avoid using the coroutine version if we're looking in a L0 file, since
  2581. // L0 files won't be parallelized anyway. The regular synchronous version
  2582. // is faster.
  2583. if (!read_options.async_io || !using_coroutines() || !use_async_io_ ||
  2584. fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
  2585. if (f) {
  2586. bool skip_filters =
  2587. IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
  2588. fp.IsHitFileLastInLevel());
  2589. // Call MultiGetFromSST for looking up a single file
  2590. s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
  2591. fp.GetHitFileLevel(), skip_filters,
  2592. /*skip_range_deletions=*/false, f, blob_ctxs,
  2593. /*table_handle=*/nullptr, num_filter_read,
  2594. num_index_read, num_sst_read);
  2595. if (fp.GetHitFileLevel() == 0) {
  2596. dump_stats_for_l0_file = true;
  2597. }
  2598. }
  2599. if (s.ok()) {
  2600. f = fp.GetNextFileInLevel();
  2601. }
  2602. #if USE_COROUTINES
  2603. } else {
  2604. std::vector<folly::coro::Task<Status>> mget_tasks;
  2605. while (f != nullptr) {
  2606. MultiGetRange file_range = fp.CurrentFileRange();
  2607. TableCache::TypedHandle* table_handle = nullptr;
  2608. bool skip_filters =
  2609. IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
  2610. fp.IsHitFileLastInLevel());
  2611. bool skip_range_deletions = false;
  2612. if (!skip_filters) {
  2613. Status status = table_cache_->MultiGetFilter(
  2614. read_options, *internal_comparator(), *f->file_metadata,
  2615. mutable_cf_options_,
  2616. cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
  2617. fp.GetHitFileLevel(), &file_range, &table_handle);
  2618. skip_range_deletions = true;
  2619. if (status.ok()) {
  2620. skip_filters = true;
  2621. } else if (!status.IsNotSupported()) {
  2622. s = status;
  2623. }
  2624. }
  2625. if (!s.ok()) {
  2626. break;
  2627. }
  2628. if (!file_range.empty()) {
  2629. mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
  2630. read_options, file_range, fp.GetHitFileLevel(), skip_filters,
  2631. skip_range_deletions, f, blob_ctxs, table_handle,
  2632. num_filter_read, num_index_read, num_sst_read));
  2633. }
  2634. if (fp.KeyMaySpanNextFile()) {
  2635. break;
  2636. }
  2637. f = fp.GetNextFileInLevel();
  2638. }
  2639. if (mget_tasks.size() > 0) {
  2640. RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
  2641. mget_tasks.size());
  2642. // Collect all results so far
  2643. std::vector<Status> statuses =
  2644. folly::coro::blockingWait(co_withExecutor(
  2645. &range->context()->executor(),
  2646. folly::coro::collectAllRange(std::move(mget_tasks))));
  2647. if (s.ok()) {
  2648. for (Status stat : statuses) {
  2649. if (!stat.ok()) {
  2650. s = std::move(stat);
  2651. break;
  2652. }
  2653. }
  2654. }
  2655. if (s.ok() && fp.KeyMaySpanNextFile()) {
  2656. f = fp.GetNextFileInLevel();
  2657. }
  2658. }
  2659. #endif // USE_COROUTINES
  2660. }
  2661. // If bad status or we found final result for all the keys
  2662. if (!s.ok() || file_picker_range.empty()) {
  2663. break;
  2664. }
  2665. if (!f) {
  2666. // Reached the end of this level. Prepare the next level
  2667. fp.PrepareNextLevelForSearch();
  2668. if (!fp.IsSearchEnded()) {
          // It's possible there is no overlap on this level and f is nullptr
  2670. f = fp.GetNextFileInLevel();
  2671. }
  2672. if (dump_stats_for_l0_file ||
  2673. (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
  2674. // Dump the stats if the search has moved to the next level and
  2675. // reset for next level.
  2676. if (num_filter_read + num_index_read) {
  2677. RecordInHistogram(db_statistics_,
  2678. NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
  2679. num_index_read + num_filter_read);
  2680. }
  2681. if (num_sst_read) {
  2682. RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
  2683. num_sst_read);
  2684. num_level_read++;
  2685. }
  2686. num_filter_read = 0;
  2687. num_index_read = 0;
  2688. num_sst_read = 0;
  2689. }
  2690. prev_level = fp.GetHitFileLevel();
  2691. }
  2692. }
  2693. // Dump stats for most recent level
  2694. if (num_filter_read + num_index_read) {
  2695. RecordInHistogram(db_statistics_,
  2696. NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
  2697. num_index_read + num_filter_read);
  2698. }
  2699. if (num_sst_read) {
  2700. RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
  2701. num_level_read++;
  2702. }
  2703. if (num_level_read) {
  2704. RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
  2705. num_level_read);
  2706. }
  2707. }
  2708. if (!blob_ctxs.empty()) {
  2709. MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
  2710. }
  2711. // Process any left over keys
  2712. for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
  2713. GetContext& get_context = *iter->get_context;
  2714. Status* status = iter->s;
  2715. Slice user_key = iter->lkey->user_key();
  2716. if (db_statistics_ != nullptr) {
  2717. get_context.ReportCounters();
  2718. }
  2719. if (GetContext::kMerge == get_context.State()) {
  2720. if (!merge_operator_) {
  2721. *status = Status::InvalidArgument(
  2722. "merge_operator is not properly initialized.");
  2723. range->MarkKeyDone(iter);
  2724. continue;
  2725. }
      // Merge operands are in the saver and we hit the beginning of the key
      // history; do a final merge of nullptr and the operands.
  2728. // `op_failure_scope` (an output parameter) is not provided (set to
  2729. // nullptr) since a failure must be propagated regardless of its value.
  2730. *status = MergeHelper::TimedFullMerge(
  2731. merge_operator_, user_key, MergeHelper::kNoBaseValue,
  2732. iter->merge_context.GetOperands(), info_log_, db_statistics_, clock_,
  2733. /* update_num_ops_stats */ true, /* op_failure_scope */ nullptr,
  2734. iter->value ? iter->value->GetSelf() : nullptr, iter->columns);
  2735. if (LIKELY(iter->value != nullptr)) {
  2736. iter->value->PinSelf();
  2737. range->AddValueSize(iter->value->size());
  2738. } else {
  2739. assert(iter->columns);
  2740. range->AddValueSize(iter->columns->serialized_size());
  2741. }
  2742. range->MarkKeyDone(iter);
  2743. if (range->GetValueSize() > read_options.value_size_soft_limit) {
  2744. s = Status::Aborted();
  2745. break;
  2746. }
  2747. } else {
  2748. range->MarkKeyDone(iter);
  2749. *status = Status::NotFound(); // Use an empty error message for speed
  2750. }
  2751. }
  2752. for (auto iter = range->begin(); iter != range->end(); ++iter) {
  2753. range->MarkKeyDone(iter);
  2754. *(iter->s) = s;
  2755. }
  2756. }
  2757. #ifdef USE_COROUTINES
  2758. Status Version::ProcessBatch(
  2759. const ReadOptions& read_options, FilePickerMultiGet* batch,
  2760. std::vector<folly::coro::Task<Status>>& mget_tasks,
  2761. std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
  2762. autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
  2763. std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
  2764. std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
  2765. mget_stats) {
  2766. FilePickerMultiGet& fp = *batch;
  2767. MultiGetRange range = fp.GetRange();
  2768. // Initialize a new empty range. Any keys that are not in this level will
  2769. // eventually become part of the new range.
  2770. MultiGetRange leftover(range, range.begin(), range.begin());
  2771. FdWithKeyRange* f = nullptr;
  2772. Status s;
  2773. f = fp.GetNextFileInLevel();
  2774. while (!f) {
  2775. fp.PrepareNextLevelForSearch();
  2776. if (!fp.IsSearchEnded()) {
  2777. f = fp.GetNextFileInLevel();
  2778. } else {
  2779. break;
  2780. }
  2781. }
  2782. while (f) {
  2783. MultiGetRange file_range = fp.CurrentFileRange();
  2784. TableCache::TypedHandle* table_handle = nullptr;
  2785. bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
  2786. fp.IsHitFileLastInLevel());
  2787. bool skip_range_deletions = false;
  2788. if (!skip_filters) {
  2789. Status status = table_cache_->MultiGetFilter(
  2790. read_options, *internal_comparator(), *f->file_metadata,
  2791. mutable_cf_options_,
  2792. cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
  2793. fp.GetHitFileLevel(), &file_range, &table_handle);
  2794. if (status.ok()) {
  2795. skip_filters = true;
  2796. skip_range_deletions = true;
  2797. } else if (!status.IsNotSupported()) {
  2798. s = status;
  2799. }
  2800. }
  2801. if (!s.ok()) {
  2802. break;
  2803. }
  2804. // At this point, file_range contains any keys that are likely in this
  2805. // file. It may have false positives, but that's ok since higher level
  2806. // lookups for the key are dependent on this lookup anyway.
  2807. // Add the complement of file_range to leftover. That's the set of keys
  2808. // definitely not in this level.
  2809. // Subtract the complement of file_range from range, since they will be
  2810. // processed in a separate batch in parallel.
  2811. leftover += ~file_range;
  2812. range -= ~file_range;
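    // Illustrative example (hypothetical keys): if the batch holds
    // {k1, k2, k3} and the filter/boundary checks show that only {k1, k3}
    // can be in this level, then {k2} is added to leftover and removed from
    // range, so it can be re-batched and searched further in parallel with
    // the lookups queued for {k1, k3}.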
  2813. if (!file_range.empty()) {
  2814. int level = fp.GetHitFileLevel();
  2815. auto stat = mget_stats.find(level);
  2816. if (stat == mget_stats.end()) {
  2817. auto entry = mget_stats.insert({level, {0, 0, 0}});
  2818. assert(entry.second);
  2819. stat = entry.first;
  2820. }
  2821. if (waiting.empty() && to_process.empty() &&
  2822. !fp.RemainingOverlapInLevel() && leftover.empty() &&
  2823. mget_tasks.empty()) {
  2824. // All keys are in one SST file, so take the fast path
  2825. s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
  2826. skip_filters, skip_range_deletions, f, *blob_ctxs,
  2827. table_handle, std::get<0>(stat->second),
  2828. std::get<1>(stat->second),
  2829. std::get<2>(stat->second));
  2830. } else {
  2831. mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
  2832. read_options, file_range, fp.GetHitFileLevel(), skip_filters,
  2833. skip_range_deletions, f, *blob_ctxs, table_handle,
  2834. std::get<0>(stat->second), std::get<1>(stat->second),
  2835. std::get<2>(stat->second)));
  2836. ++num_tasks_queued;
  2837. }
  2838. }
  2839. if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
  2840. break;
  2841. }
  2842. f = fp.GetNextFileInLevel();
  2843. }
  2844. // Split the current batch only if some keys are likely in this level and
  2845. // some are not. Only split if we're done with this level, i.e f is null.
  2846. // Otherwise, it means there are more files in this level to look at.
  2847. if (s.ok() && !f && !leftover.empty() && !range.empty()) {
  2848. fp.ReplaceRange(range);
  2849. batches.emplace_back(&leftover, fp);
  2850. to_process.emplace_back(batches.size() - 1);
  2851. }
  // 1. If f is non-null, that means we might not be done with this level.
  //    This can happen if one of the keys is the last key in the file, i.e.,
  //    fp.KeyMaySpanNextFile() is true.
  // 2. If range is empty, then we're done with this range and there is no
  //    need to prepare the next level.
  // 3. If some tasks were queued for this range, then the next level will be
  //    prepared after executing those tasks.
  2859. if (!f && !range.empty() && !num_tasks_queued) {
  2860. fp.PrepareNextLevelForSearch();
  2861. }
  2862. return s;
  2863. }
  2864. Status Version::MultiGetAsync(
  2865. const ReadOptions& options, MultiGetRange* range,
  2866. std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
  2867. autovector<FilePickerMultiGet, 4> batches;
  2868. std::deque<size_t> waiting;
  2869. std::deque<size_t> to_process;
  2870. Status s;
  2871. std::vector<folly::coro::Task<Status>> mget_tasks;
  2872. std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
  2873. // Create the initial batch with the input range
  2874. batches.emplace_back(range, &storage_info_.level_files_brief_,
  2875. storage_info_.num_non_empty_levels_,
  2876. &storage_info_.file_indexer_, user_comparator(),
  2877. internal_comparator());
  2878. to_process.emplace_back(0);
  2879. while (!to_process.empty()) {
  2880. // As we process a batch, it may get split into two. So reserve space for
  2881. // an additional batch in the autovector in order to prevent later moves
  2882. // of elements in ProcessBatch().
  2883. batches.reserve(batches.size() + 1);
  2884. size_t idx = to_process.front();
  2885. FilePickerMultiGet* batch = &batches.at(idx);
  2886. unsigned int num_tasks_queued = 0;
  2887. to_process.pop_front();
  2888. if (batch->IsSearchEnded() || batch->GetRange().empty()) {
      // If to_process is empty, i.e., no more batches to look at, then we
      // need to schedule the enqueued coroutines and wait for them.
      // Otherwise, we skip this batch and move to the next one in
      // to_process.
  2892. if (!to_process.empty()) {
  2893. continue;
  2894. }
  2895. } else {
  2896. // Look through one level. This may split the batch and enqueue it to
  2897. // to_process
  2898. s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
  2899. to_process, num_tasks_queued, mget_stats);
  2900. // If ProcessBatch didn't enqueue any coroutine tasks, it means all
  2901. // keys were filtered out. So put the batch back in to_process to
  2902. // lookup in the next level
  2903. if (!num_tasks_queued && !batch->IsSearchEnded()) {
  2904. // Put this back in the processing queue
  2905. to_process.emplace_back(idx);
  2906. } else if (num_tasks_queued) {
  2907. waiting.emplace_back(idx);
  2908. }
  2909. }
  2910. // If ProcessBatch() returned an error, then schedule the enqueued
  2911. // coroutines and wait for them, then abort the MultiGet.
  2912. if (to_process.empty() || !s.ok()) {
  2913. if (mget_tasks.size() > 0) {
  2914. assert(waiting.size());
  2915. RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
  2916. // Collect all results so far
  2917. std::vector<Status> statuses =
  2918. folly::coro::blockingWait(co_withExecutor(
  2919. &range->context()->executor(),
  2920. folly::coro::collectAllRange(std::move(mget_tasks))));
  2921. mget_tasks.clear();
  2922. if (s.ok()) {
  2923. for (Status stat : statuses) {
  2924. if (!stat.ok()) {
  2925. s = std::move(stat);
  2926. break;
  2927. }
  2928. }
  2929. }
  2930. if (!s.ok()) {
  2931. break;
  2932. }
  2933. for (size_t wait_idx : waiting) {
  2934. FilePickerMultiGet& fp = batches.at(wait_idx);
  2935. // 1. If fp.GetHitFile() is non-null, then there could be more
  2936. // overlap in this level. So skip preparing next level.
  2937. // 2. If fp.GetRange() is empty, then this batch is completed
  2938. // and no need to prepare the next level.
  2939. if (!fp.GetHitFile() && !fp.GetRange().empty()) {
  2940. fp.PrepareNextLevelForSearch();
  2941. }
  2942. }
  2943. to_process.swap(waiting);
  2944. } else {
  2945. assert(!s.ok() || waiting.size() == 0);
  2946. }
  2947. }
  2948. if (!s.ok()) {
  2949. break;
  2950. }
  2951. }
  2952. uint64_t num_levels = 0;
  2953. for (auto& stat : mget_stats) {
  2954. if (stat.first == 0) {
  2955. num_levels += std::get<2>(stat.second);
  2956. } else {
  2957. num_levels++;
  2958. }
  2959. uint64_t num_meta_reads =
  2960. std::get<0>(stat.second) + std::get<1>(stat.second);
  2961. uint64_t num_sst_reads = std::get<2>(stat.second);
  2962. if (num_meta_reads > 0) {
  2963. RecordInHistogram(db_statistics_,
  2964. NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
  2965. num_meta_reads);
  2966. }
  2967. if (num_sst_reads > 0) {
  2968. RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
  2969. }
  2970. }
  2971. if (num_levels > 0) {
  2972. RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
  2973. }
  2974. return s;
  2975. }
  2976. #endif
  2977. bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
  2978. // Reaching the bottom level implies misses at all upper levels, so we'll
  2979. // skip checking the filters when we predict a hit.
  2980. return cfd_->ioptions().optimize_filters_for_hits &&
  2981. (level > 0 || is_file_last_in_level) &&
  2982. level == storage_info_.num_non_empty_levels() - 1;
  2983. }
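// Illustrative example: with optimize_filters_for_hits enabled and five
// non-empty levels (0..4), filters are only skipped for reads that reach
// level 4. If level 0 is the only non-empty level, filters are skipped just
// for the last (oldest) L0 file probed. This matches the comment above:
// once the read has missed all upper levels, the bottommost filter check is
// skipped on the assumption that the key will be found there.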
  2984. void VersionStorageInfo::GenerateLevelFilesBrief() {
  2985. level_files_brief_.resize(num_non_empty_levels_);
  2986. for (int level = 0; level < num_non_empty_levels_; level++) {
  2987. DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
  2988. &arena_);
  2989. }
  2990. }
  2991. void VersionStorageInfo::PrepareForVersionAppend(
  2992. const ImmutableOptions& immutable_options,
  2993. const MutableCFOptions& mutable_cf_options) {
  2994. ComputeCompensatedSizes();
  2995. UpdateNumNonEmptyLevels();
  2996. CalculateBaseBytes(immutable_options, mutable_cf_options);
  2997. UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
  2998. GenerateFileIndexer();
  2999. GenerateLevelFilesBrief();
  3000. GenerateLevel0NonOverlapping();
  3001. GenerateBottommostFiles();
  3002. GenerateFileLocationIndex();
  3003. }
  3004. void Version::PrepareAppend(const ReadOptions& read_options,
  3005. bool update_stats) {
  3006. TEST_SYNC_POINT_CALLBACK(
  3007. "Version::PrepareAppend:forced_check",
  3008. static_cast<void*>(&storage_info_.force_consistency_checks_));
  3009. if (update_stats) {
  3010. UpdateAccumulatedStats(read_options);
  3011. }
  3012. storage_info_.PrepareForVersionAppend(cfd_->ioptions(), mutable_cf_options_);
  3013. }
  3014. bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options,
  3015. FileMetaData* file_meta) {
  3016. if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) {
  3017. return false;
  3018. }
  3019. std::shared_ptr<const TableProperties> tp;
  3020. Status s = GetTableProperties(read_options, &tp, file_meta);
  3021. file_meta->init_stats_from_file = true;
  3022. if (!s.ok()) {
  3023. ROCKS_LOG_ERROR(vset_->db_options_->info_log,
  3024. "Unable to load table properties for file %" PRIu64
  3025. " --- %s\n",
  3026. file_meta->fd.GetNumber(), s.ToString().c_str());
  3027. return false;
  3028. }
  3029. if (tp.get() == nullptr) {
  3030. return false;
  3031. }
  3032. file_meta->num_entries = tp->num_entries;
  3033. file_meta->num_deletions = tp->num_deletions;
  3034. file_meta->raw_value_size = tp->raw_value_size;
  3035. file_meta->raw_key_size = tp->raw_key_size;
  3036. file_meta->num_range_deletions = tp->num_range_deletions;
  3037. // Ensure new invariants on old files
  3038. file_meta->num_deletions =
  3039. std::max(tp->num_deletions, tp->num_range_deletions);
  3040. file_meta->num_entries = std::max(tp->num_entries, tp->num_deletions);
  3041. return true;
  3042. }
  3043. void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
  3044. TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
  3045. nullptr);
  3046. assert(file_meta->init_stats_from_file);
  3047. accumulated_file_size_ += file_meta->fd.GetFileSize();
  3048. accumulated_raw_key_size_ += file_meta->raw_key_size;
  3049. accumulated_raw_value_size_ += file_meta->raw_value_size;
  3050. assert(file_meta->num_entries >= file_meta->num_deletions);
  3051. accumulated_num_non_deletions_ +=
  3052. file_meta->num_entries - file_meta->num_deletions;
  3053. accumulated_num_deletions_ += file_meta->num_deletions;
  3054. current_num_non_deletions_ +=
  3055. file_meta->num_entries - file_meta->num_deletions;
  3056. current_num_deletions_ += file_meta->num_deletions;
  3057. current_num_samples_++;
  3058. }
  3059. void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
  3060. if (file_meta->init_stats_from_file) {
  3061. current_num_non_deletions_ -=
  3062. file_meta->num_entries - file_meta->num_deletions;
  3063. current_num_deletions_ -= file_meta->num_deletions;
  3064. current_num_samples_--;
  3065. }
  3066. }
  3067. void Version::UpdateAccumulatedStats(const ReadOptions& read_options) {
  3068. // maximum number of table properties loaded from files.
  3069. const int kMaxInitCount = 20;
  3070. int init_count = 0;
  // Here, only the first kMaxInitCount files that haven't been initialized
  // from file will be updated with num_deletions.
  // The motivation is to cap the maximum I/O per Version creation.
  // The reason for choosing files from lower levels instead of higher levels
  // is that this design propagates the initialization from lower levels to
  // higher levels: when the num_deletions of lower-level files are updated,
  // the lower-level files get accurate compensated_file_size, so lower-level
  // to higher-level compactions are triggered, which in turn create
  // higher-level files whose num_deletions will be updated here.
  3081. for (int level = 0;
  3082. level < storage_info_.num_levels_ && init_count < kMaxInitCount;
  3083. ++level) {
  3084. for (auto* file_meta : storage_info_.files_[level]) {
  3085. if (MaybeInitializeFileMetaData(read_options, file_meta)) {
  3086. // each FileMeta will be initialized only once.
  3087. storage_info_.UpdateAccumulatedStats(file_meta);
  3088. // when option "max_open_files" is -1, all the file metadata has
  3089. // already been read, so MaybeInitializeFileMetaData() won't incur
  3090. // any I/O cost. "max_open_files=-1" means that the table cache passed
  3091. // to the VersionSet and then to the ColumnFamilySet has a size of
  3092. // TableCache::kInfiniteCapacity
  3093. if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
  3094. TableCache::kInfiniteCapacity) {
  3095. continue;
  3096. }
  3097. if (++init_count >= kMaxInitCount) {
  3098. break;
  3099. }
  3100. }
  3101. }
  3102. }
  // In case all sampled files contain only deletion entries, we load the
  // table properties of a file in a higher level to initialize that value.
  3106. for (int level = storage_info_.num_levels_ - 1;
  3107. storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
  3108. for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
  3109. storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
  3110. if (MaybeInitializeFileMetaData(read_options,
  3111. storage_info_.files_[level][i])) {
  3112. storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
  3113. }
  3114. }
  3115. }
  3116. }
  3117. void VersionStorageInfo::ComputeCompensatedSizes() {
  3118. static const int kDeletionWeightOnCompaction = 2;
  3119. uint64_t average_value_size = GetAverageValueSize();
  3120. // compute the compensated size
  3121. for (int level = 0; level < num_levels_; level++) {
  3122. for (auto* file_meta : files_[level]) {
      // Here we only compute compensated_file_size for those file_meta
      // whose compensated_file_size is uninitialized (== 0). This is true
      // only for files that have just been created and that no other thread
      // has access to yet. That's why we can safely mutate
      // compensated_file_size.
      if (file_meta->compensated_file_size == 0) {
        file_meta->compensated_file_size = file_meta->fd.GetFileSize();
        // We only boost the size of deletion entries of a file when the
        // number of deletion entries is greater than the number of
        // non-deletion entries in the file. The motivation here is that in
        // a stable workload, the number of deletion entries should be
        // roughly equal to the number of non-deletion entries. If we
        // compensate the size of deletion entries in a stable workload, the
        // deletion compensation logic might introduce an unwanted effect
        // that changes the shape of the LSM tree.
  3137. if ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 >=
  3138. file_meta->num_entries) {
  3139. file_meta->compensated_file_size +=
  3140. ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 -
  3141. file_meta->num_entries) *
  3142. average_value_size * kDeletionWeightOnCompaction;
  3143. }
  3144. file_meta->compensated_file_size +=
  3145. file_meta->compensated_range_deletion_size;
  3146. }
  3147. }
  3148. }
  3149. }
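// Worked example of the compensation above (hypothetical numbers): a file
// with num_entries = 100, num_deletions = 80 point deletions, no range
// deletions, and an average value size of 100 bytes satisfies
// (80 - 0) * 2 >= 100, so its compensated size grows by
// ((80 - 0) * 2 - 100) * 100 * kDeletionWeightOnCompaction(2) = 12,000
// bytes on top of its file size and compensated_range_deletion_size.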
  3150. int VersionStorageInfo::MaxInputLevel() const {
  3151. if (compaction_style_ == kCompactionStyleLevel) {
  3152. return num_levels() - 2;
  3153. }
  3154. return 0;
  3155. }
  3156. int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
  3157. if (allow_ingest_behind) {
  3158. assert(num_levels() > 1);
  3159. return num_levels() - 2;
  3160. }
  3161. return num_levels() - 1;
  3162. }
  3163. void VersionStorageInfo::EstimateCompactionBytesNeeded(
  3164. const MutableCFOptions& mutable_cf_options) {
  3165. // Only implemented for level-based compaction
  3166. if (compaction_style_ != kCompactionStyleLevel) {
  3167. estimated_compaction_needed_bytes_ = 0;
  3168. return;
  3169. }
  // Start from Level 0: if level 0 qualifies for compaction to level 1, we
  // estimate the size of that compaction.
  // Then we move on to the next level and see whether it qualifies for
  // compaction to the level after it. The size of a level is estimated as
  // the actual size on the level plus the input bytes from the previous
  // level, if any. If it exceeds the level's target, take the excess bytes
  // as compaction input and add the compaction size to the total.
  // We keep doing this for Level 2, 3, etc., until the last level, and
  // return the accumulated bytes.
  3179. uint64_t bytes_compact_to_next_level = 0;
  3180. uint64_t level_size = 0;
  3181. for (auto* f : files_[0]) {
  3182. level_size += f->fd.GetFileSize();
  3183. }
  3184. // Level 0
  3185. bool level0_compact_triggered = false;
  3186. if (static_cast<int>(files_[0].size()) >=
  3187. mutable_cf_options.level0_file_num_compaction_trigger ||
  3188. level_size >= mutable_cf_options.max_bytes_for_level_base) {
  3189. level0_compact_triggered = true;
  3190. estimated_compaction_needed_bytes_ = level_size;
  3191. bytes_compact_to_next_level = level_size;
  3192. } else {
  3193. estimated_compaction_needed_bytes_ = 0;
  3194. }
  3195. // Level 1 and up.
  3196. uint64_t bytes_next_level = 0;
  3197. for (int level = base_level(); level <= MaxInputLevel(); level++) {
  3198. level_size = 0;
  3199. if (bytes_next_level > 0) {
  3200. #ifndef NDEBUG
  3201. uint64_t level_size2 = 0;
  3202. for (auto* f : files_[level]) {
  3203. level_size2 += f->fd.GetFileSize();
  3204. }
  3205. assert(level_size2 == bytes_next_level);
  3206. #endif
  3207. level_size = bytes_next_level;
  3208. bytes_next_level = 0;
  3209. } else {
  3210. for (auto* f : files_[level]) {
  3211. level_size += f->fd.GetFileSize();
  3212. }
  3213. }
  3214. if (level == base_level() && level0_compact_triggered) {
  3215. // Add base level size to compaction if level0 compaction triggered.
  3216. estimated_compaction_needed_bytes_ += level_size;
  3217. }
  3218. // Add size added by previous compaction
  3219. level_size += bytes_compact_to_next_level;
  3220. bytes_compact_to_next_level = 0;
  3221. uint64_t level_target = MaxBytesForLevel(level);
  3222. if (level_size > level_target) {
  3223. bytes_compact_to_next_level = level_size - level_target;
  3224. // Estimate the actual compaction fan-out ratio as size ratio between
  3225. // the two levels.
  3226. assert(bytes_next_level == 0);
  3227. if (level + 1 < num_levels_) {
  3228. for (auto* f : files_[level + 1]) {
  3229. bytes_next_level += f->fd.GetFileSize();
  3230. }
  3231. }
  3232. if (bytes_next_level > 0) {
  3233. assert(level_size > 0);
  3234. estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
  3235. static_cast<double>(bytes_compact_to_next_level) *
  3236. (static_cast<double>(bytes_next_level) /
  3237. static_cast<double>(level_size) +
  3238. 1));
  3239. }
  3240. }
  3241. }
  3242. }
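// Single-step worked example (hypothetical sizes): suppose a level ends up
// holding 300 MB against a 256 MB target while the next level holds 900 MB.
// The 44 MB excess becomes compaction input and is assumed to rewrite
// roughly 44 * (900 / 300 + 1) = 176 MB, which is added to
// estimated_compaction_needed_bytes_; the 44 MB then carries into the next
// level's size for the following iteration.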
  3243. namespace {
  3244. uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
  3245. const MutableCFOptions& mutable_cf_options,
  3246. const std::vector<FileMetaData*>& files) {
  3247. uint32_t ttl_expired_files_count = 0;
  3248. int64_t _current_time;
  3249. auto status = ioptions.clock->GetCurrentTime(&_current_time);
  3250. if (status.ok()) {
  3251. const uint64_t current_time = static_cast<uint64_t>(_current_time);
  3252. for (FileMetaData* f : files) {
  3253. if (!f->being_compacted) {
  3254. uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
  3255. if (oldest_ancester_time != 0 &&
  3256. oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
  3257. ttl_expired_files_count++;
  3258. }
  3259. }
  3260. }
  3261. }
  3262. return ttl_expired_files_count;
  3263. }
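// Returns true if some file (not currently being compacted) is old enough,
// per compaction_options_fifo.file_temperature_age_thresholds, that its
// temperature no longer matches the target temperature for its age.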
  3264. bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions,
  3265. const MutableCFOptions& mutable_cf_options,
  3266. const std::vector<FileMetaData*>& files) {
  3267. const std::vector<FileTemperatureAge>& ages =
  3268. mutable_cf_options.compaction_options_fifo
  3269. .file_temperature_age_thresholds;
  3270. if (ages.empty()) {
  3271. return false;
  3272. }
  3273. if (files.empty()) {
  3274. return false;
  3275. }
  3276. int64_t _current_time;
  3277. auto status = ioptions.clock->GetCurrentTime(&_current_time);
  3278. const uint64_t current_time = static_cast<uint64_t>(_current_time);
  3279. // This is the same logic used in
  3280. // FIFOCompactionPicker::PickTemperatureChangeCompaction().
  3281. if (status.ok() && current_time >= ages[0].age) {
  3282. uint64_t create_time_threshold = current_time - ages[0].age;
  3283. Temperature target_temp;
  3284. assert(files.size() >= 1);
  3285. for (size_t index = files.size(); index >= 1; --index) {
  3286. FileMetaData* cur_file = files[index - 1];
  3287. FileMetaData* prev_file = index < 2 ? nullptr : files[index - 2];
  3288. if (!cur_file->being_compacted) {
  3289. uint64_t est_newest_key_time = cur_file->TryGetNewestKeyTime(prev_file);
  3290. // Newer file could have newest_key_time populated
  3291. if (est_newest_key_time == kUnknownNewestKeyTime) {
  3292. continue;
  3293. }
  3294. if (est_newest_key_time > create_time_threshold) {
  3295. return false;
  3296. }
  3297. target_temp = ages[0].temperature;
  3298. for (size_t i = 1; i < ages.size(); ++i) {
  3299. if (current_time >= ages[i].age &&
  3300. est_newest_key_time <= current_time - ages[i].age) {
  3301. target_temp = ages[i].temperature;
  3302. }
  3303. }
  3304. if (cur_file->temperature != target_temp) {
  3305. return true;
  3306. }
  3307. }
  3308. }
  3309. }
  3310. return false;
  3311. }
  3312. } // anonymous namespace
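// Computes a compaction score for every input level (L0 and L1+ are scored
// differently, as described in the comments below), sorts the (level,
// score) pairs by descending score, and then refreshes the various
// "marked for compaction" lists and the estimated pending compaction bytes.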
  3313. void VersionStorageInfo::ComputeCompactionScore(
  3314. const ImmutableOptions& immutable_options,
  3315. const MutableCFOptions& mutable_cf_options) {
  3316. double total_downcompact_bytes = 0.0;
  3317. // Historically, score is defined as actual bytes in a level divided by
  3318. // the level's target size, and 1.0 is the threshold for triggering
  3319. // compaction. Higher score means higher prioritization.
  3320. // Now we keep the compaction triggering condition, but consider more
  3321. // factors for prioritization, while still keeping the 1.0 threshold.
  3322. // In order to provide flexibility for reducing score while still
  3323. // maintaining it to be over 1.0, we scale the original score by 10x
  3324. // if it is larger than 1.0.
  3325. const double kScoreScale = 10.0;
  3326. int max_output_level =
  3327. MaxOutputLevel(immutable_options.cf_allow_ingest_behind ||
  3328. immutable_options.allow_ingest_behind);
  3329. for (int level = 0; level <= MaxInputLevel(); level++) {
  3330. double score;
  3331. if (level == 0) {
  3332. // We treat level-0 specially by bounding the number of files
  3333. // instead of number of bytes for two reasons:
  3334. //
  3335. // (1) With larger write-buffer sizes, it is nice not to do too
  3336. // many level-0 compactions.
  3337. //
  3338. // (2) The files in level-0 are merged on every read and
  3339. // therefore we wish to avoid too many files when the individual
  3340. // file size is small (perhaps because of a small write-buffer
  3341. // setting, or very high compression ratios, or lots of
  3342. // overwrites/deletions).
  3343. int num_sorted_runs = 0;
  3344. uint64_t total_size = 0;
  3345. for (auto* f : files_[level]) {
  3346. total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
  3347. if (!f->being_compacted) {
  3348. total_size += f->compensated_file_size;
  3349. num_sorted_runs++;
  3350. }
  3351. }
  3352. if (compaction_style_ == kCompactionStyleUniversal) {
  3353. // For universal compaction, we use level0 score to indicate
  3354. // compaction score for the whole DB. Adding other levels as if
  3355. // they are L0 files.
  3356. for (int i = 1; i <= max_output_level; i++) {
  3357. // It's possible that a subset of the files in a level may be in a
  3358. // compaction, due to delete triggered compaction or trivial move.
  3359. // In that case, the below check may not catch a level being
  3360. // compacted as it only checks the first file. The worst that can
  3361. // happen is a scheduled compaction thread will find nothing to do.
  3362. if (!files_[i].empty() && !files_[i][0]->being_compacted) {
  3363. num_sorted_runs++;
  3364. }
  3365. }
  3366. }
  3367. if (compaction_style_ == kCompactionStyleFIFO) {
  3368. auto max_table_files_size =
  3369. mutable_cf_options.compaction_options_fifo.max_table_files_size;
  3370. if (max_table_files_size == 0) {
3371. // avoid division by zero
  3372. max_table_files_size = 1;
  3373. }
  3374. score = static_cast<double>(total_size) / max_table_files_size;
  3375. if (score < 1 &&
  3376. mutable_cf_options.compaction_options_fifo.allow_compaction) {
  3377. score = std::max(
  3378. static_cast<double>(num_sorted_runs) /
  3379. mutable_cf_options.level0_file_num_compaction_trigger,
  3380. score);
  3381. }
  3382. if (score < 1 && mutable_cf_options.ttl > 0) {
  3383. score =
  3384. std::max(static_cast<double>(GetExpiredTtlFilesCount(
  3385. immutable_options, mutable_cf_options, files_[0])),
  3386. score);
  3387. }
  3388. if (score < 1 &&
  3389. ShouldChangeFileTemperature(immutable_options, mutable_cf_options,
  3390. files_[0])) {
  3391. // For FIFO, just need a large enough score to trigger compaction.
  3392. const double kScoreForNeedCompaction = 1.1;
  3393. score = kScoreForNeedCompaction;
  3394. }
  3395. } else {
  3396. // For universal compaction, if a user configures `max_read_amp`, then
  3397. // the score may be a false positive signal.
  3398. // `level0_file_num_compaction_trigger` is used as a trigger to check
  3399. // if there is any compaction work to do.
  3400. score = static_cast<double>(num_sorted_runs) /
  3401. mutable_cf_options.level0_file_num_compaction_trigger;
  3402. if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
3403. // Level-based compaction involves L0->L0 compactions that can lead to
3404. // oversized L0 files. Take size into account as well to avoid later
3405. // giant compactions to the base level.
3406. // If the L0 score is always too high, L0->LBase will always be
3407. // prioritized over LBase->LBase+1 compaction and LBase will grow too
3408. // large. But if the L0 score isn't high enough, L0 will accumulate and
3409. // data will not be moved to LBase fast enough. The score calculation
3410. // below takes into account L0 size vs LBase size.
  3411. if (immutable_options.level_compaction_dynamic_level_bytes) {
  3412. if (total_size >= mutable_cf_options.max_bytes_for_level_base) {
  3413. // When calculating estimated_compaction_needed_bytes, we assume
  3414. // L0 is qualified as pending compactions. We will need to make
  3415. // sure that it qualifies for compaction.
  3416. // It might be guaranteed by logic below anyway, but we are
  3417. // explicit here to make sure we don't stop writes with no
  3418. // compaction scheduled.
  3419. score = std::max(score, 1.01);
  3420. }
  3421. if (total_size > level_max_bytes_[base_level_]) {
  3422. // In this case, we compare L0 size with actual LBase size and
  3423. // make sure score is more than 1.0 (10.0 after scaled) if L0 is
  3424. // larger than LBase. Since LBase score = LBase size /
  3425. // (target size + total_downcompact_bytes) where
  3426. // total_downcompact_bytes = total_size > LBase size,
  3427. // LBase score is lower than 10.0. So L0->LBase is prioritized
  3428. // over LBase -> LBase+1.
  3429. uint64_t base_level_size = 0;
  3430. for (auto f : files_[base_level_]) {
  3431. base_level_size += f->compensated_file_size;
  3432. }
  3433. score = std::max(score, static_cast<double>(total_size) /
  3434. static_cast<double>(std::max(
  3435. base_level_size,
  3436. level_max_bytes_[base_level_])));
  3437. }
  3438. if (score > 1.0) {
  3439. score *= kScoreScale;
  3440. }
  3441. } else {
  3442. score = std::max(score,
  3443. static_cast<double>(total_size) /
  3444. mutable_cf_options.max_bytes_for_level_base);
  3445. }
  3446. }
  3447. }
  3448. } else { // level > 0
  3449. // Compute the ratio of current size to size limit.
  3450. uint64_t level_bytes_no_compacting = 0;
  3451. uint64_t level_total_bytes = 0;
  3452. for (auto f : files_[level]) {
  3453. level_total_bytes += f->fd.GetFileSize();
  3454. if (!f->being_compacted) {
  3455. level_bytes_no_compacting += f->compensated_file_size;
  3456. }
  3457. }
  3458. if (!immutable_options.level_compaction_dynamic_level_bytes) {
  3459. score = static_cast<double>(level_bytes_no_compacting) /
  3460. MaxBytesForLevel(level);
  3461. } else {
  3462. if (level_bytes_no_compacting < MaxBytesForLevel(level)) {
  3463. score = static_cast<double>(level_bytes_no_compacting) /
  3464. MaxBytesForLevel(level);
  3465. } else {
3466. // If a large amount of data is going to be compacted down to the
3467. // current level soon, we de-prioritize compaction from a level where
3468. // the incoming data would make up a large ratio. We do this by
3469. // dividing the level size not by the target level size alone, but by
3470. // the target size plus the incoming compaction bytes.
  3471. score = static_cast<double>(level_bytes_no_compacting) /
  3472. (MaxBytesForLevel(level) + total_downcompact_bytes) *
  3473. kScoreScale;
  3474. }
  3475. // Drain unnecessary levels, but with lower priority compared to
  3476. // when L0 is eligible. Only non-empty levels can be unnecessary.
3477. // If there are no unnecessary levels, lowest_unnecessary_level_ = -1.
  3478. if (level_bytes_no_compacting > 0 &&
  3479. level <= lowest_unnecessary_level_) {
  3480. score = std::max(
  3481. score, kScoreScale *
  3482. (1.001 + 0.001 * (lowest_unnecessary_level_ - level)));
  3483. }
  3484. }
  3485. if (level <= lowest_unnecessary_level_) {
  3486. total_downcompact_bytes += level_total_bytes;
  3487. } else if (level_total_bytes > MaxBytesForLevel(level)) {
  3488. total_downcompact_bytes +=
  3489. static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
  3490. }
  3491. }
  3492. compaction_level_[level] = level;
  3493. compaction_score_[level] = score;
  3494. }
3495. // Sort all the levels based on their score. Higher scores get listed
3496. // first. Use bubble sort because the number of entries is small.
  3497. for (int i = 0; i < num_levels() - 2; i++) {
  3498. for (int j = i + 1; j < num_levels() - 1; j++) {
  3499. if (compaction_score_[i] < compaction_score_[j]) {
  3500. double score = compaction_score_[i];
  3501. int level = compaction_level_[i];
  3502. compaction_score_[i] = compaction_score_[j];
  3503. compaction_level_[i] = compaction_level_[j];
  3504. compaction_score_[j] = score;
  3505. compaction_level_[j] = level;
  3506. }
  3507. }
  3508. }
  3509. ComputeFilesMarkedForCompaction(max_output_level);
  3510. ComputeBottommostFilesMarkedForCompaction(
  3511. immutable_options.cf_allow_ingest_behind ||
  3512. immutable_options.allow_ingest_behind);
  3513. ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
  3514. ComputeFilesMarkedForPeriodicCompaction(
  3515. immutable_options, mutable_cf_options.periodic_compaction_seconds,
  3516. max_output_level);
  3517. ComputeFilesMarkedForForcedBlobGC(
  3518. mutable_cf_options.blob_garbage_collection_age_cutoff,
  3519. mutable_cf_options.blob_garbage_collection_force_threshold,
  3520. mutable_cf_options.enable_blob_garbage_collection);
  3521. EstimateCompactionBytesNeeded(mutable_cf_options);
  3522. }
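// Collects files explicitly marked for compaction, skipping files that are
// already being compacted and any level at or beyond the deepest level
// (within last_level) that holds data, and tracks the smallest seqno among
// marked standalone range tombstone files.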
  3523. void VersionStorageInfo::ComputeFilesMarkedForCompaction(int last_level) {
  3524. files_marked_for_compaction_.clear();
  3525. int last_qualify_level = 0;
  3526. standalone_range_tombstone_files_mark_threshold_ = kMaxSequenceNumber;
3527. // Do not include files from the last level that holds data.
3528. // If the table properties collector suggests a file on the last level,
3529. // we should not move it to a new level.
  3530. for (int level = last_level; level >= 1; level--) {
  3531. if (!files_[level].empty()) {
  3532. last_qualify_level = level - 1;
  3533. break;
  3534. }
  3535. }
  3536. for (int level = 0; level <= last_qualify_level; level++) {
  3537. for (auto* f : files_[level]) {
  3538. if (!f->being_compacted && f->marked_for_compaction) {
  3539. files_marked_for_compaction_.emplace_back(level, f);
  3540. if (f->FileIsStandAloneRangeTombstone()) {
  3541. standalone_range_tombstone_files_mark_threshold_ =
  3542. std::min(standalone_range_tombstone_files_mark_threshold_,
  3543. f->fd.smallest_seqno);
  3544. }
  3545. }
  3546. }
  3547. }
  3548. }
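// For level compaction with a non-zero TTL, collects files (outside the
// last level and not being compacted) whose known oldest ancestor time is
// older than the TTL.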
  3549. void VersionStorageInfo::ComputeExpiredTtlFiles(
  3550. const ImmutableOptions& ioptions, const uint64_t ttl) {
  3551. expired_ttl_files_.clear();
  3552. if (ttl == 0 || compaction_style_ != CompactionStyle::kCompactionStyleLevel) {
  3553. return;
  3554. }
  3555. int64_t _current_time;
  3556. auto status = ioptions.clock->GetCurrentTime(&_current_time);
  3557. if (!status.ok()) {
  3558. return;
  3559. }
  3560. const uint64_t current_time = static_cast<uint64_t>(_current_time);
  3561. for (int level = 0; level < num_levels() - 1; level++) {
  3562. for (FileMetaData* f : files_[level]) {
  3563. if (!f->being_compacted) {
  3564. uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
  3565. if (oldest_ancester_time > 0 &&
  3566. oldest_ancester_time < (current_time - ttl)) {
  3567. expired_ttl_files_.emplace_back(level, f);
  3568. }
  3569. }
  3570. }
  3571. }
  3572. }
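// Collects files whose estimated modification time is older than
// periodic_compaction_seconds, extending the cutoff so that files which
// would expire before the next daily offpeak window starts are included.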
  3573. void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
  3574. const ImmutableOptions& ioptions,
  3575. const uint64_t periodic_compaction_seconds, int last_level) {
  3576. files_marked_for_periodic_compaction_.clear();
  3577. if (periodic_compaction_seconds == 0) {
  3578. return;
  3579. }
  3580. int64_t temp_current_time;
  3581. auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
  3582. if (!status.ok()) {
  3583. return;
  3584. }
  3585. const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
  3586. // If periodic_compaction_seconds is larger than current time, periodic
  3587. // compaction can't possibly be triggered.
  3588. if (periodic_compaction_seconds > current_time) {
  3589. return;
  3590. }
  3591. const uint64_t allowed_time_limit =
  3592. current_time - periodic_compaction_seconds;
3593. // Find the adjusted_allowed_time_limit such that it includes files that
3594. // are going to expire by the time the next daily offpeak starts.
  3595. const OffpeakTimeInfo offpeak_time_info =
  3596. offpeak_time_option_.GetOffpeakTimeInfo(current_time);
  3597. const uint64_t adjusted_allowed_time_limit =
  3598. allowed_time_limit +
  3599. (offpeak_time_info.is_now_offpeak
  3600. ? offpeak_time_info.seconds_till_next_offpeak_start
  3601. : 0);
  3602. for (int level = 0; level <= last_level; level++) {
  3603. for (auto f : files_[level]) {
  3604. if (!f->being_compacted) {
  3605. // Compute a file's modification time in the following order:
  3606. // 1. Use file_creation_time table property if it is > 0.
  3607. // 2. Use creation_time table property if it is > 0.
  3608. // 3. Use file's mtime metadata if the above two table properties are 0.
  3609. // Don't consider the file at all if the modification time cannot be
  3610. // correctly determined based on the above conditions.
  3611. uint64_t file_modification_time = f->TryGetFileCreationTime();
  3612. if (file_modification_time == kUnknownFileCreationTime) {
  3613. file_modification_time = f->TryGetOldestAncesterTime();
  3614. }
  3615. if (file_modification_time == kUnknownOldestAncesterTime) {
  3616. auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
  3617. f->fd.GetPathId());
  3618. status = ioptions.env->GetFileModificationTime(
  3619. file_path, &file_modification_time);
  3620. if (!status.ok()) {
  3621. ROCKS_LOG_WARN(ioptions.logger,
  3622. "Can't get file modification time: %s: %s",
  3623. file_path.c_str(), status.ToString().c_str());
  3624. continue;
  3625. }
  3626. }
  3627. if (file_modification_time > 0 &&
  3628. file_modification_time < adjusted_allowed_time_limit) {
  3629. files_marked_for_periodic_compaction_.emplace_back(level, f);
  3630. }
  3631. }
  3632. }
  3633. }
  3634. }
  3635. void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
  3636. double blob_garbage_collection_age_cutoff,
  3637. double blob_garbage_collection_force_threshold,
  3638. bool enable_blob_garbage_collection) {
  3639. files_marked_for_forced_blob_gc_.clear();
  3640. if (!(enable_blob_garbage_collection &&
  3641. blob_garbage_collection_age_cutoff > 0.0 &&
  3642. blob_garbage_collection_force_threshold < 1.0)) {
  3643. return;
  3644. }
  3645. if (blob_files_.empty()) {
  3646. return;
  3647. }
  3648. // Number of blob files eligible for GC based on age
  3649. const size_t cutoff_count = static_cast<size_t>(
  3650. blob_garbage_collection_age_cutoff * blob_files_.size());
  3651. if (!cutoff_count) {
  3652. return;
  3653. }
  3654. // Compute the sum of total and garbage bytes over the batch of blob files
  3655. // currently eligible for garbage collection based on
  3656. // blob_garbage_collection_age_cutoff, and if the garbage ratio exceeds
  3657. // blob_garbage_collection_force_threshold, schedule compaction for the
  3658. // SST files that reference the oldest batch of blob files. Here is a toy
  3659. // example. Let's assume we have three SSTs 1, 2, and 3, and four blob files
  3660. // 10, 11, 12, and 13, which correspond to the range that is eligible for GC
  3661. // and satisfy the garbage ratio threshold. Also, let's say SSTs 1 and 2 both
  3662. // rely on blob file 10 and potentially some higher-numbered ones, while SST 3
  3663. // relies on blob file 12 and potentially some higher-numbered ones. Then, the
  3664. // SST to oldest blob file mapping is as follows:
  3665. //
  3666. // SST file number Oldest blob file number
  3667. // 1 10
  3668. // 2 10
  3669. // 3 12
  3670. //
  3671. // This is what the same thing looks like from the blob files' POV. (Note that
  3672. // the linked SSTs simply denote the inverse mapping of the above.)
  3673. //
  3674. // Blob file number Linked SST set
  3675. // 10 {1, 2}
  3676. // 11 {}
  3677. // 12 {3}
  3678. // 13 {}
  3679. //
  3680. // Then, the oldest batch of blob files consists of blob files 10 and 11,
  3681. // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
  3682. const auto& oldest_meta = blob_files_.front();
  3683. assert(oldest_meta);
  3684. const auto& linked_ssts = oldest_meta->GetLinkedSsts();
  3685. assert(!linked_ssts.empty());
  3686. size_t count = 1;
  3687. uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
  3688. uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
  3689. assert(cutoff_count <= blob_files_.size());
  3690. for (; count < cutoff_count; ++count) {
  3691. const auto& meta = blob_files_[count];
  3692. assert(meta);
  3693. sum_total_blob_bytes += meta->GetTotalBlobBytes();
  3694. sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
  3695. }
  3696. if (sum_garbage_blob_bytes <
  3697. blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
  3698. return;
  3699. }
  3700. for (uint64_t sst_file_number : linked_ssts) {
  3701. const FileLocation location = GetFileLocation(sst_file_number);
  3702. assert(location.IsValid());
  3703. const int level = location.GetLevel();
  3704. assert(level >= 0);
  3705. const size_t pos = location.GetPosition();
  3706. FileMetaData* const sst_meta = files_[level][pos];
  3707. assert(sst_meta);
  3708. if (sst_meta->being_compacted) {
  3709. continue;
  3710. }
  3711. files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
  3712. }
  3713. }
  3714. namespace {
  3715. // used to sort files by size
  3716. struct Fsize {
  3717. size_t index;
  3718. FileMetaData* file;
  3719. };
3720. // Comparator used to sort files based on their size;
3721. // in normal mode, sorts by descending compensated size.
  3722. bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
  3723. return (first.file->compensated_file_size >
  3724. second.file->compensated_file_size);
  3725. }
  3726. } // anonymous namespace
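// Appends `f` to the given level's file list and takes a reference on it.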
  3727. void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
  3728. auto& level_files = files_[level];
  3729. level_files.push_back(f);
  3730. f->refs++;
  3731. }
  3732. void VersionStorageInfo::AddBlobFile(
  3733. std::shared_ptr<BlobFileMetaData> blob_file_meta) {
  3734. assert(blob_file_meta);
  3735. assert(blob_files_.empty() ||
  3736. (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
  3737. blob_file_meta->GetBlobFileNumber()));
  3738. blob_files_.emplace_back(std::move(blob_file_meta));
  3739. }
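// Returns an iterator to the first blob file whose file number is greater
// than or equal to blob_file_number; blob_files_ is kept sorted by file
// number (see AddBlobFile above).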
  3740. VersionStorageInfo::BlobFiles::const_iterator
  3741. VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
  3742. return std::lower_bound(
  3743. blob_files_.begin(), blob_files_.end(), blob_file_number,
  3744. [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
  3745. assert(lhs);
  3746. return lhs->GetBlobFileNumber() < rhs;
  3747. });
  3748. }
  3749. void VersionStorageInfo::SetFinalized() {
  3750. finalized_ = true;
  3751. #ifndef NDEBUG
  3752. if (compaction_style_ != kCompactionStyleLevel) {
  3753. // Not level based compaction.
  3754. return;
  3755. }
  3756. assert(base_level_ < 0 || num_levels() == 1 ||
  3757. (base_level_ >= 1 && base_level_ < num_levels()));
3758. // Verify that all levels between L0 and base_level (exclusive) are empty.
  3759. for (int level = 1; level < base_level(); level++) {
  3760. assert(NumLevelBytes(level) == 0);
  3761. }
  3762. uint64_t max_bytes_prev_level = 0;
  3763. for (int level = base_level(); level < num_levels() - 1; level++) {
  3764. if (LevelFiles(level).size() == 0) {
  3765. continue;
  3766. }
  3767. assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
  3768. max_bytes_prev_level = MaxBytesForLevel(level);
  3769. }
  3770. for (int level = 0; level < num_levels(); level++) {
  3771. assert(LevelFiles(level).size() == 0 ||
  3772. LevelFiles(level).size() == LevelFilesBrief(level).num_files);
  3773. if (LevelFiles(level).size() > 0) {
  3774. assert(level < num_non_empty_levels());
  3775. }
  3776. }
  3777. assert(compaction_level_.size() > 0);
  3778. assert(compaction_level_.size() == compaction_score_.size());
  3779. #endif
  3780. }
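// Recomputes num_non_empty_levels_: scanning from the deepest level
// upwards, it becomes one past the index of the deepest non-empty level
// (or 0 if all levels are empty).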
  3781. void VersionStorageInfo::UpdateNumNonEmptyLevels() {
  3782. num_non_empty_levels_ = num_levels_;
  3783. for (int i = num_levels_ - 1; i >= 0; i--) {
  3784. if (files_[i].size() != 0) {
  3785. return;
  3786. } else {
  3787. num_non_empty_levels_ = i;
  3788. }
  3789. }
  3790. }
  3791. namespace {
  3792. // Sort `temp` based on ratio of overlapping size over file size
  3793. void SortFileByOverlappingRatio(
  3794. const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
  3795. const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
  3796. int level, int num_non_empty_levels, uint64_t ttl,
  3797. std::vector<Fsize>* temp) {
  3798. std::unordered_map<uint64_t, uint64_t> file_to_order;
  3799. auto next_level_it = next_level_files.begin();
  3800. int64_t curr_time;
  3801. Status status = clock->GetCurrentTime(&curr_time);
  3802. if (!status.ok()) {
  3803. // If we can't get time, disable TTL.
  3804. ttl = 0;
  3805. }
  3806. FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
  3807. num_non_empty_levels, level);
  3808. for (auto& file : files) {
  3809. uint64_t overlapping_bytes = 0;
3810. // Skip files in the next level that fall entirely before the current file.
  3811. while (next_level_it != next_level_files.end() &&
  3812. icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
  3813. next_level_it++;
  3814. }
  3815. while (next_level_it != next_level_files.end() &&
  3816. icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
  3817. overlapping_bytes += (*next_level_it)->fd.file_size;
  3818. if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
3819. // The next-level file crosses the largest-key boundary of the current file.
  3820. break;
  3821. }
  3822. next_level_it++;
  3823. }
  3824. uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
  3825. assert(ttl_boost_score > 0);
  3826. assert(file->compensated_file_size != 0);
  3827. file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
  3828. file->compensated_file_size /
  3829. ttl_boost_score;
  3830. }
  3831. size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort
  3832. ? VersionStorageInfo::kNumberFilesToSort
  3833. : temp->size();
  3834. std::partial_sort(
  3835. temp->begin(), temp->begin() + num_to_sort, temp->end(),
  3836. [&](const Fsize& f1, const Fsize& f2) -> bool {
  3837. // If score is the same, pick file with smaller keys.
  3838. // This makes the algorithm more deterministic, and also
3839. // helps the trivial move case to have more files to
  3840. // extend.
  3841. if (f1.file->marked_for_compaction == f2.file->marked_for_compaction) {
  3842. if (file_to_order[f1.file->fd.GetNumber()] ==
  3843. file_to_order[f2.file->fd.GetNumber()]) {
  3844. return icmp.Compare(f1.file->smallest, f2.file->smallest) < 0;
  3845. }
  3846. return file_to_order[f1.file->fd.GetNumber()] <
  3847. file_to_order[f2.file->fd.GetNumber()];
  3848. } else {
  3849. return f1.file->marked_for_compaction >
  3850. f2.file->marked_for_compaction;
  3851. }
  3852. });
  3853. }
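// Orders `temp` for the kRoundRobin compaction priority: overlapping L0
// files are sorted by smallest seqno; otherwise files are rotated so that
// the first file whose smallest key is at or past the per-level compact
// cursor comes first, followed by its successors and then its predecessors.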
  3854. void SortFileByRoundRobin(const InternalKeyComparator& icmp,
  3855. std::vector<InternalKey>* compact_cursor,
  3856. bool level0_non_overlapping, int level,
  3857. std::vector<Fsize>* temp) {
  3858. if (level == 0 && !level0_non_overlapping) {
3859. // Use kOldestSmallestSeqFirst ordering when level == 0, since the
3860. // files may overlap (they are not fully sorted).
  3861. std::sort(temp->begin(), temp->end(),
  3862. [](const Fsize& f1, const Fsize& f2) -> bool {
  3863. return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno;
  3864. });
  3865. return;
  3866. }
  3867. bool should_move_files =
  3868. compact_cursor->at(level).size() > 0 && temp->size() > 1;
  3869. // The iterator points to the Fsize with smallest key larger than or equal to
  3870. // the given cursor
  3871. std::vector<Fsize>::iterator current_file_iter;
  3872. if (should_move_files) {
  3873. // Find the file of which the smallest key is larger than or equal to
  3874. // the cursor (the smallest key in the successor file of the last
  3875. // chosen file), skip this if the cursor is invalid or there is only
  3876. // one file in this level
  3877. current_file_iter = std::lower_bound(
  3878. temp->begin(), temp->end(), compact_cursor->at(level),
  3879. [&](const Fsize& f, const InternalKey& cursor) -> bool {
  3880. return icmp.Compare(cursor, f.file->smallest) > 0;
  3881. });
  3882. should_move_files =
  3883. current_file_iter != temp->end() && current_file_iter != temp->begin();
  3884. }
  3885. if (should_move_files) {
  3886. // Construct a local temporary vector
  3887. std::vector<Fsize> local_temp;
  3888. local_temp.reserve(temp->size());
  3889. // Move the selected File into the first position and its successors
  3890. // into the second, third, ..., positions
  3891. for (auto iter = current_file_iter; iter != temp->end(); iter++) {
  3892. local_temp.push_back(*iter);
  3893. }
3894. // Move the original predecessors of the selected file in a round-robin
  3895. // manner
  3896. for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
  3897. local_temp.push_back(*iter);
  3898. }
  3899. // Replace all the items in temp
  3900. for (size_t i = 0; i < local_temp.size(); i++) {
  3901. temp->at(i) = local_temp[i];
  3902. }
  3903. }
  3904. }
  3905. } // anonymous namespace
  3906. void VersionStorageInfo::UpdateFilesByCompactionPri(
  3907. const ImmutableOptions& ioptions, const MutableCFOptions& options) {
  3908. if (compaction_style_ == kCompactionStyleNone ||
  3909. compaction_style_ == kCompactionStyleFIFO ||
  3910. compaction_style_ == kCompactionStyleUniversal) {
  3911. // don't need this
  3912. return;
  3913. }
  3914. // No need to sort the highest level because it is never compacted.
  3915. for (int level = 0; level < num_levels() - 1; level++) {
  3916. const std::vector<FileMetaData*>& files = files_[level];
  3917. auto& files_by_compaction_pri = files_by_compaction_pri_[level];
  3918. assert(files_by_compaction_pri.size() == 0);
  3919. // populate a temp vector for sorting based on size
  3920. std::vector<Fsize> temp(files.size());
  3921. for (size_t i = 0; i < files.size(); i++) {
  3922. temp[i].index = i;
  3923. temp[i].file = files[i];
  3924. }
  3925. // sort the top kNumberFilesToSort based on file size
  3926. size_t num = VersionStorageInfo::kNumberFilesToSort;
  3927. if (num > temp.size()) {
  3928. num = temp.size();
  3929. }
  3930. switch (ioptions.compaction_pri) {
  3931. case kByCompensatedSize:
  3932. std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
  3933. CompareCompensatedSizeDescending);
  3934. break;
  3935. case kOldestLargestSeqFirst:
  3936. std::sort(temp.begin(), temp.end(),
  3937. [](const Fsize& f1, const Fsize& f2) -> bool {
  3938. return f1.file->fd.largest_seqno <
  3939. f2.file->fd.largest_seqno;
  3940. });
  3941. break;
  3942. case kOldestSmallestSeqFirst:
  3943. std::sort(temp.begin(), temp.end(),
  3944. [](const Fsize& f1, const Fsize& f2) -> bool {
  3945. return f1.file->fd.smallest_seqno <
  3946. f2.file->fd.smallest_seqno;
  3947. });
  3948. break;
  3949. case kMinOverlappingRatio:
  3950. SortFileByOverlappingRatio(*internal_comparator_, files_[level],
  3951. files_[level + 1], ioptions.clock, level,
  3952. num_non_empty_levels_, options.ttl, &temp);
  3953. break;
  3954. case kRoundRobin:
  3955. SortFileByRoundRobin(*internal_comparator_, &compact_cursor_,
  3956. level0_non_overlapping_, level, &temp);
  3957. break;
  3958. default:
  3959. assert(false);
  3960. }
  3961. assert(temp.size() == files.size());
  3962. // initialize files_by_compaction_pri_
  3963. for (size_t i = 0; i < temp.size(); i++) {
  3964. files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
  3965. }
  3966. next_file_to_compact_by_size_[level] = 0;
  3967. assert(files_[level].size() == files_by_compaction_pri_[level].size());
  3968. }
  3969. }
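// Sets level0_non_overlapping_ by sorting a copy of the L0 files by
// smallest key and checking whether any adjacent pair of key ranges
// overlaps.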
  3970. void VersionStorageInfo::GenerateLevel0NonOverlapping() {
  3971. assert(!finalized_);
  3972. level0_non_overlapping_ = true;
  3973. if (level_files_brief_.size() == 0) {
  3974. return;
  3975. }
  3976. // A copy of L0 files sorted by smallest key
  3977. std::vector<FdWithKeyRange> level0_sorted_file(
  3978. level_files_brief_[0].files,
  3979. level_files_brief_[0].files + level_files_brief_[0].num_files);
  3980. std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
  3981. [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
  3982. return (internal_comparator_->Compare(f1.smallest_key,
  3983. f2.smallest_key) < 0);
  3984. });
  3985. for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
  3986. FdWithKeyRange& f = level0_sorted_file[i];
  3987. FdWithKeyRange& prev = level0_sorted_file[i - 1];
  3988. if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
  3989. level0_non_overlapping_ = false;
  3990. break;
  3991. }
  3992. }
  3993. }
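// Records as bottommost the (level, file) pairs whose key range cannot
// overlap any data in a later sorted run (per RangeMightExistAfterSortedRun).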
  3994. void VersionStorageInfo::GenerateBottommostFiles() {
  3995. assert(!finalized_);
  3996. assert(bottommost_files_.empty());
  3997. for (size_t level = 0; level < level_files_brief_.size(); ++level) {
  3998. for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
  3999. ++file_idx) {
  4000. const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
  4001. int l0_file_idx;
  4002. if (level == 0) {
  4003. l0_file_idx = static_cast<int>(file_idx);
  4004. } else {
  4005. l0_file_idx = -1;
  4006. }
  4007. Slice smallest_user_key = ExtractUserKey(f.smallest_key);
  4008. Slice largest_user_key = ExtractUserKey(f.largest_key);
  4009. if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
  4010. static_cast<int>(level),
  4011. l0_file_idx)) {
  4012. bottommost_files_.emplace_back(static_cast<int>(level),
  4013. f.file_metadata);
  4014. }
  4015. }
  4016. }
  4017. }
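// Builds file_locations_, mapping each SST file number to its (level,
// position) within files_.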
  4018. void VersionStorageInfo::GenerateFileLocationIndex() {
  4019. size_t num_files = 0;
  4020. for (int level = 0; level < num_levels_; ++level) {
  4021. num_files += files_[level].size();
  4022. }
  4023. file_locations_.reserve(num_files);
  4024. for (int level = 0; level < num_levels_; ++level) {
  4025. for (size_t pos = 0; pos < files_[level].size(); ++pos) {
  4026. const FileMetaData* const meta = files_[level][pos];
  4027. assert(meta);
  4028. const uint64_t file_number = meta->fd.GetNumber();
  4029. assert(file_locations_.find(file_number) == file_locations_.end());
  4030. file_locations_.emplace(file_number, FileLocation(level, pos));
  4031. }
  4032. }
  4033. }
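// Advances the oldest snapshot seqno and, if it moved past the current
// mark threshold, recomputes which bottommost files are marked for
// compaction.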
  4034. void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum,
  4035. bool allow_ingest_behind) {
  4036. assert(seqnum >= oldest_snapshot_seqnum_);
  4037. oldest_snapshot_seqnum_ = seqnum;
  4038. if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
  4039. ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind);
  4040. }
  4041. }
  4042. void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction(
  4043. bool allow_ingest_behind) {
  4044. bottommost_files_marked_for_compaction_.clear();
  4045. bottommost_files_mark_threshold_ = kMaxSequenceNumber;
  4046. if (allow_ingest_behind) {
  4047. return;
  4048. }
  4049. // If a file's creation time is larger than creation_time_ub,
  4050. // it is too new to be marked for compaction.
  4051. int64_t creation_time_ub = 0;
  4052. bool needs_delay = bottommost_file_compaction_delay_ > 0;
  4053. if (needs_delay) {
  4054. int64_t current_time = 0;
  4055. clock_->GetCurrentTime(&current_time).PermitUncheckedError();
  4056. // Note that if GetCurrentTime() fails, current_time will be 0.
  4057. // We will treat it as is and treat all files as too new.
  4058. // The subtraction will not underflow since
  4059. // bottommost_file_compaction_delay_ is of type uint32_t.
  4060. creation_time_ub =
  4061. current_time - static_cast<int64_t>(bottommost_file_compaction_delay_);
  4062. }
  4063. for (auto& level_and_file : bottommost_files_) {
  4064. if (!level_and_file.second->being_compacted &&
  4065. level_and_file.second->fd.largest_seqno != 0) {
  4066. // largest_seqno might be nonzero due to containing the final key in an
  4067. // earlier compaction, whose seqnum we didn't zero out.
  4068. if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
  4069. if (!needs_delay) {
  4070. bottommost_files_marked_for_compaction_.push_back(level_and_file);
  4071. } else if (creation_time_ub > 0) {
  4072. int64_t creation_time = static_cast<int64_t>(
  4073. level_and_file.second->TryGetFileCreationTime());
  4074. if (creation_time == kUnknownFileCreationTime ||
  4075. creation_time <= creation_time_ub) {
  4076. bottommost_files_marked_for_compaction_.push_back(level_and_file);
  4077. } else {
  4078. // Just ignore this file for both
  4079. // bottommost_files_marked_for_compaction_ and
  4080. // bottommost_files_mark_threshold_. The next time
  4081. // this method is called, it will try this file again. The method
  4082. // is called after a new Version creation (compaction, flush, etc.),
  4083. // after a compaction is picked, and after a snapshot newer than
  4084. // bottommost_files_mark_threshold_ is released.
  4085. }
  4086. } else {
  4087. // creation_time_ub <= 0, all files are too new to be marked for
  4088. // compaction.
  4089. }
  4090. } else {
  4091. bottommost_files_mark_threshold_ =
  4092. std::min(bottommost_files_mark_threshold_,
  4093. level_and_file.second->fd.largest_seqno);
  4094. }
  4095. }
  4096. }
  4097. }
  4098. void Version::Ref() { ++refs_; }
  4099. bool Version::Unref() {
  4100. assert(refs_ >= 1);
  4101. --refs_;
  4102. if (refs_ == 0) {
  4103. delete this;
  4104. return true;
  4105. }
  4106. return false;
  4107. }
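// Returns true if some file in `level` overlaps the user key range
// [*smallest_user_key, *largest_user_key]; a null bound means unbounded.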
  4108. bool VersionStorageInfo::OverlapInLevel(int level,
  4109. const Slice* smallest_user_key,
  4110. const Slice* largest_user_key) {
  4111. if (level >= num_non_empty_levels_) {
  4112. // empty level, no overlap
  4113. return false;
  4114. }
  4115. return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
  4116. level_files_brief_[level], smallest_user_key,
  4117. largest_user_key);
  4118. }
  4119. // Store in "*inputs" all files in "level" that overlap [begin,end]
  4120. // If hint_index is specified, then it points to a file in the
  4121. // overlapping range.
4122. // If file_index is non-null, it is set to the index of one overlapping file.
  4123. void VersionStorageInfo::GetOverlappingInputs(
  4124. int level, const InternalKey* begin, const InternalKey* end,
  4125. std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
  4126. bool expand_range, const FileMetaData* starting_l0_file,
  4127. InternalKey** next_smallest) const {
  4128. if (level >= num_non_empty_levels_) {
  4129. // this level is empty, no overlapping inputs
  4130. return;
  4131. }
  4132. inputs->clear();
  4133. if (file_index) {
  4134. *file_index = -1;
  4135. }
  4136. const Comparator* user_cmp = user_comparator_;
  4137. if (level > 0) {
  4138. GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
  4139. file_index, false, next_smallest);
  4140. return;
  4141. }
  4142. if (next_smallest) {
  4143. // next_smallest key only makes sense for non-level 0, where files are
  4144. // non-overlapping
  4145. *next_smallest = nullptr;
  4146. }
  4147. Slice user_begin, user_end;
  4148. if (begin != nullptr) {
  4149. user_begin = begin->user_key();
  4150. }
  4151. if (end != nullptr) {
  4152. user_end = end->user_key();
  4153. }
4154. // `index` stores the indices of the files that still need to be checked.
  4155. std::list<size_t> index;
  4156. size_t start_index = 0;
  4157. if (starting_l0_file != nullptr) {
  4158. uint64_t starting_file_number = starting_l0_file->fd.GetNumber();
  4159. for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
  4160. if (level_files_brief_[level].files[i].fd.GetNumber() ==
  4161. starting_file_number) {
  4162. start_index = i;
  4163. break;
  4164. }
  4165. }
  4166. assert(start_index < level_files_brief_[level].num_files);
  4167. }
  4168. for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) {
  4169. index.emplace_back(i);
  4170. }
  4171. while (!index.empty()) {
  4172. bool found_overlapping_file = false;
  4173. auto iter = index.begin();
  4174. while (iter != index.end()) {
  4175. FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
  4176. const Slice file_start = ExtractUserKey(f->smallest_key);
  4177. const Slice file_limit = ExtractUserKey(f->largest_key);
  4178. if (begin != nullptr &&
  4179. user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
  4180. // "f" is completely before specified range; skip it
  4181. iter++;
  4182. } else if (end != nullptr &&
  4183. user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
  4184. // "f" is completely after specified range; skip it
  4185. iter++;
  4186. } else {
4187. // "f" overlaps the specified range
  4188. inputs->emplace_back(files_[level][*iter]);
  4189. found_overlapping_file = true;
  4190. // record the first file index.
  4191. if (file_index && *file_index == -1) {
  4192. *file_index = static_cast<int>(*iter);
  4193. }
4194. // This file overlaps; erase it so it is not checked again.
  4195. iter = index.erase(iter);
  4196. if (expand_range) {
  4197. if (begin != nullptr &&
  4198. user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
  4199. user_begin = file_start;
  4200. }
  4201. if (end != nullptr &&
  4202. user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
  4203. user_end = file_limit;
  4204. }
  4205. }
  4206. }
  4207. }
4208. // If none of the remaining files overlap, stop.
  4209. if (!found_overlapping_file) {
  4210. break;
  4211. }
  4212. }
  4213. }
  4214. // Store in "*inputs" files in "level" that within range [begin,end]
  4215. // Guarantee a "clean cut" boundary between the files in inputs
  4216. // and the surrounding files and the maxinum number of files.
  4217. // This will ensure that no parts of a key are lost during compaction.
  4218. // If hint_index is specified, then it points to a file in the range.
  4219. // The file_index returns a pointer to any file in an overlapping range.
  4220. void VersionStorageInfo::GetCleanInputsWithinInterval(
  4221. int level, const InternalKey* begin, const InternalKey* end,
  4222. std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
  4223. inputs->clear();
  4224. if (file_index) {
  4225. *file_index = -1;
  4226. }
  4227. if (level >= num_non_empty_levels_ || level == 0 ||
  4228. level_files_brief_[level].num_files == 0) {
  4229. // this level is empty, no inputs within range
  4230. // also don't support clean input interval within L0
  4231. return;
  4232. }
  4233. GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
  4234. file_index, true /* within_interval */);
  4235. }
  4236. // Store in "*inputs" all files in "level" that overlap [begin,end]
  4237. // Employ binary search to find at least one file that overlaps the
  4238. // specified range. From that file, iterate backwards and
  4239. // forwards to find all overlapping files.
4240. // if within_interval is set, then only store the maximum clean inputs
  4241. // within range [begin, end]. "clean" means there is a boundary
  4242. // between the files in "*inputs" and the surrounding files
  4243. void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
  4244. int level, const InternalKey* begin, const InternalKey* end,
  4245. std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
  4246. bool within_interval, InternalKey** next_smallest) const {
  4247. assert(level > 0);
  4248. auto user_cmp = user_comparator_;
  4249. const FdWithKeyRange* files = level_files_brief_[level].files;
  4250. const int num_files = static_cast<int>(level_files_brief_[level].num_files);
  4251. // begin to use binary search to find lower bound
  4252. // and upper bound.
  4253. int start_index = 0;
  4254. int end_index = num_files;
  4255. if (begin != nullptr) {
4256. // If within_interval is true, compare against the file's smallest key so
4257. // that std::lower_bound skips files that only partially overlap `begin`.
  4258. auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
  4259. const InternalKey* k) {
  4260. auto& file_key = within_interval ? f.file_metadata->smallest
  4261. : f.file_metadata->largest;
  4262. return sstableKeyCompare(user_cmp, file_key, *k) < 0;
  4263. };
  4264. start_index = static_cast<int>(
  4265. std::lower_bound(files,
  4266. files + (hint_index == -1 ? num_files : hint_index),
  4267. begin, cmp) -
  4268. files);
  4269. if (start_index > 0 && within_interval) {
  4270. bool is_overlapping = true;
  4271. while (is_overlapping && start_index < num_files) {
  4272. auto& pre_limit = files[start_index - 1].file_metadata->largest;
  4273. auto& cur_start = files[start_index].file_metadata->smallest;
  4274. is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
  4275. start_index += is_overlapping;
  4276. }
  4277. }
  4278. }
  4279. if (end != nullptr) {
4280. // If within_interval is true, compare against the file's largest key so
4281. // that std::upper_bound excludes files that only partially overlap `end`.
  4282. auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
  4283. const FdWithKeyRange& f) {
  4284. auto& file_key = within_interval ? f.file_metadata->largest
  4285. : f.file_metadata->smallest;
  4286. return sstableKeyCompare(user_cmp, *k, file_key) < 0;
  4287. };
  4288. end_index = static_cast<int>(
  4289. std::upper_bound(files + start_index, files + num_files, end, cmp) -
  4290. files);
  4291. if (end_index < num_files && within_interval) {
  4292. bool is_overlapping = true;
  4293. while (is_overlapping && end_index > start_index) {
  4294. auto& next_start = files[end_index].file_metadata->smallest;
  4295. auto& cur_limit = files[end_index - 1].file_metadata->largest;
  4296. is_overlapping =
  4297. sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
  4298. end_index -= is_overlapping;
  4299. }
  4300. }
  4301. }
  4302. assert(start_index <= end_index);
  4303. // If there were no overlapping files, return immediately.
  4304. if (start_index == end_index) {
  4305. if (next_smallest) {
  4306. *next_smallest = nullptr;
  4307. }
  4308. return;
  4309. }
  4310. assert(start_index < end_index);
  4311. // returns the index where an overlap is found
  4312. if (file_index) {
  4313. *file_index = start_index;
  4314. }
  4315. // insert overlapping files into vector
  4316. for (int i = start_index; i < end_index; i++) {
  4317. inputs->push_back(files_[level][i]);
  4318. }
  4319. if (next_smallest != nullptr) {
  4320. // Provide the next key outside the range covered by inputs
  4321. if (end_index < static_cast<int>(files_[level].size())) {
  4322. **next_smallest = files_[level][end_index]->smallest;
  4323. } else {
  4324. *next_smallest = nullptr;
  4325. }
  4326. }
  4327. }
  4328. uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
  4329. assert(level >= 0);
  4330. assert(level < num_levels());
  4331. return TotalFileSize(files_[level]);
  4332. }
  4333. const char* VersionStorageInfo::LevelSummary(
  4334. LevelSummaryStorage* scratch) const {
  4335. int len = 0;
  4336. if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
  4337. assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
  4338. if (level_multiplier_ != 0.0) {
  4339. len = snprintf(
  4340. scratch->buffer, sizeof(scratch->buffer),
  4341. "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
  4342. base_level_, level_multiplier_, level_max_bytes_[base_level_]);
  4343. }
  4344. }
  4345. len +=
  4346. snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
  4347. for (int i = 0; i < num_levels(); i++) {
  4348. int sz = sizeof(scratch->buffer) - len;
  4349. int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
  4350. if (ret < 0 || ret >= sz) {
  4351. break;
  4352. }
  4353. len += ret;
  4354. }
  4355. if (len > 0) {
  4356. // overwrite the last space
  4357. --len;
  4358. }
  4359. len +=
  4360. snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
  4361. "] max score %.2f, estimated pending compaction bytes %" PRIu64,
  4362. compaction_score_[0], estimated_compaction_needed_bytes_);
  4363. if (!files_marked_for_compaction_.empty()) {
  4364. snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
  4365. " (%" ROCKSDB_PRIszt " files need compaction)",
  4366. files_marked_for_compaction_.size());
  4367. }
  4368. return scratch->buffer;
  4369. }
  4370. const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
  4371. int level) const {
  4372. int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
  4373. for (const auto& f : files_[level]) {
  4374. int sz = sizeof(scratch->buffer) - len;
  4375. char sztxt[16];
  4376. AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
  4377. int ret = snprintf(scratch->buffer + len, sz,
  4378. "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
  4379. f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
  4380. static_cast<int>(f->being_compacted));
  4381. if (ret < 0 || ret >= sz) {
  4382. break;
  4383. }
  4384. len += ret;
  4385. }
  4386. // overwrite the last space (only if files_[level].size() is non-zero)
  4387. if (files_[level].size() && len > 0) {
  4388. --len;
  4389. }
  4390. snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
  4391. return scratch->buffer;
  4392. }
  4393. bool VersionStorageInfo::HasMissingEpochNumber() const {
  4394. for (int level = 0; level < num_levels_; ++level) {
  4395. for (const FileMetaData* f : files_[level]) {
  4396. if (f->epoch_number == kUnknownEpochNumber) {
  4397. return true;
  4398. }
  4399. }
  4400. }
  4401. return false;
  4402. }
  4403. uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const {
  4404. uint64_t max_epoch_number = kUnknownEpochNumber;
  4405. for (int level = 0; level < num_levels_; ++level) {
  4406. for (const FileMetaData* f : files_[level]) {
  4407. max_epoch_number = std::max(max_epoch_number, f->epoch_number);
  4408. }
  4409. }
  4410. return max_epoch_number;
  4411. }
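// Re-derives epoch numbers when some are missing (or when forced): each
// non-empty level, from the deepest level up, gets the next epoch number
// for all of its files, and then each L0 file gets its own number while
// iterating files_[0] in reverse, so deeper data receives smaller epoch
// numbers.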
  4412. void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd,
  4413. bool restart_epoch, bool force) {
  4414. if (restart_epoch) {
  4415. cfd->ResetNextEpochNumber();
  4416. bool reserve_epoch_num_for_file_ingested_behind = cfd->AllowIngestBehind();
  4417. if (reserve_epoch_num_for_file_ingested_behind) {
  4418. uint64_t reserved_epoch_number = cfd->NewEpochNumber();
  4419. assert(reserved_epoch_number ==
  4420. kReservedEpochNumberForFileIngestedBehind);
  4421. ROCKS_LOG_INFO(cfd->ioptions().info_log.get(),
  4422. "[%s]CF has reserved epoch number %" PRIu64
  4423. " for files ingested "
  4424. "behind since `Options::allow_ingest_behind` or "
  4425. "`Options::cf_allow_ingest_behind` is true",
  4426. cfd->GetName().c_str(), reserved_epoch_number);
  4427. }
  4428. }
  4429. bool missing_epoch_number = HasMissingEpochNumber();
  4430. if (missing_epoch_number || force) {
  4431. for (int level = num_levels_ - 1; level >= 1; --level) {
  4432. auto& files_at_level = files_[level];
  4433. if (files_at_level.empty()) {
  4434. continue;
  4435. }
  4436. uint64_t next_epoch_number = cfd->NewEpochNumber();
  4437. for (FileMetaData* f : files_at_level) {
  4438. f->epoch_number = next_epoch_number;
  4439. }
  4440. }
  4441. for (auto file_meta_iter = files_[0].rbegin();
  4442. file_meta_iter != files_[0].rend(); file_meta_iter++) {
  4443. FileMetaData* f = *file_meta_iter;
  4444. f->epoch_number = cfd->NewEpochNumber();
  4445. }
  4446. if (missing_epoch_number) {
  4447. assert(epoch_number_requirement_ ==
  4448. EpochNumberRequirement::kMightMissing);
  4449. ROCKS_LOG_WARN(cfd->ioptions().info_log.get(),
  4450. "[%s]CF's epoch numbers are inferred based on seqno",
  4451. cfd->GetName().c_str());
  4452. epoch_number_requirement_ = EpochNumberRequirement::kMustPresent;
  4453. }
  4454. } else {
  4455. assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent);
  4456. cfd->SetNextEpochNumber(
  4457. std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber()));
  4458. }
  4459. }
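// Returns the largest total size of next-level files overlapping any
// single file in levels 1 through num_levels() - 2.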
  4460. uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
  4461. uint64_t result = 0;
  4462. std::vector<FileMetaData*> overlaps;
  4463. for (int level = 1; level < num_levels() - 1; level++) {
  4464. for (const auto& f : files_[level]) {
  4465. GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
  4466. const uint64_t sum = TotalFileSize(overlaps);
  4467. if (sum > result) {
  4468. result = sum;
  4469. }
  4470. }
  4471. }
  4472. return result;
  4473. }
  4474. uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
  4475. // Note: the result for level zero is not really used since we set
  4476. // the level-0 compaction threshold based on number of files.
  4477. assert(level >= 0);
  4478. assert(level < static_cast<int>(level_max_bytes_.size()));
  4479. return level_max_bytes_[level];
  4480. }
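// Computes level_max_bytes_ (per-level target sizes) and base_level_.
// Without dynamic level bytes, targets grow multiplicatively from
// max_bytes_for_level_base starting at L1; with dynamic level bytes,
// targets are derived by working backwards from the size of the largest
// non-L0 level, which also determines base_level_.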
  4481. void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
  4482. const MutableCFOptions& options) {
  4483. // Special logic to set number of sorted runs.
  4484. // It is to match the previous behavior when all files are in L0.
  4485. int num_l0_count = static_cast<int>(files_[0].size());
  4486. if (compaction_style_ == kCompactionStyleUniversal) {
  4487. // For universal compaction, we use level0 score to indicate
  4488. // compaction score for the whole DB. Adding other levels as if
  4489. // they are L0 files.
  4490. for (int i = 1; i < num_levels(); i++) {
  4491. if (!files_[i].empty()) {
  4492. num_l0_count++;
  4493. }
  4494. }
  4495. }
  4496. set_l0_delay_trigger_count(num_l0_count);
  4497. level_max_bytes_.resize(ioptions.num_levels);
  4498. if (!ioptions.level_compaction_dynamic_level_bytes) {
  4499. base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
  4500. // Calculate for static bytes base case
  4501. for (int i = 0; i < ioptions.num_levels; ++i) {
  4502. if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
  4503. level_max_bytes_[i] = options.max_bytes_for_level_base;
  4504. } else if (i > 1) {
  4505. level_max_bytes_[i] = MultiplyCheckOverflow(
  4506. MultiplyCheckOverflow(level_max_bytes_[i - 1],
  4507. options.max_bytes_for_level_multiplier),
  4508. options.MaxBytesMultiplerAdditional(i - 1));
  4509. } else {
  4510. level_max_bytes_[i] = options.max_bytes_for_level_base;
  4511. }
  4512. }
  4513. } else {
  4514. assert(ioptions.compaction_style == kCompactionStyleLevel);
  4515. uint64_t max_level_size = 0;
  4516. int first_non_empty_level = -1;
  4517. // Find size of non-L0 level of most data.
4518. // Cannot use the size of the last level because it can be empty or
4519. // smaller than previous levels after compaction.
  4520. for (int i = 1; i < num_levels_; i++) {
  4521. uint64_t total_size = 0;
  4522. for (const auto& f : files_[i]) {
  4523. total_size += f->fd.GetFileSize();
  4524. }
  4525. if (total_size > 0 && first_non_empty_level == -1) {
  4526. first_non_empty_level = i;
  4527. }
  4528. if (total_size > max_level_size) {
  4529. max_level_size = total_size;
  4530. }
  4531. }
  4532. // Prefill every level's max bytes to disallow compaction from there.
  4533. for (int i = 0; i < num_levels_; i++) {
  4534. level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
  4535. }
  4536. lowest_unnecessary_level_ = -1;
  4537. if (max_level_size == 0) {
  4538. // No data for L1 and up. L0 compacts to last level directly.
  4539. // No compaction from L1+ needs to be scheduled.
  4540. base_level_ = num_levels_ - 1;
  4541. } else {
  4542. assert(first_non_empty_level >= 1);
  4543. uint64_t base_bytes_max = options.max_bytes_for_level_base;
  4544. uint64_t base_bytes_min = static_cast<uint64_t>(
  4545. base_bytes_max / options.max_bytes_for_level_multiplier);
4546. // Try to make the last level's target size be max_level_size.
  4547. uint64_t cur_level_size = max_level_size;
  4548. for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
  4549. // Round up after dividing
  4550. cur_level_size = static_cast<uint64_t>(
  4551. cur_level_size / options.max_bytes_for_level_multiplier);
  4552. if (lowest_unnecessary_level_ == -1 &&
  4553. cur_level_size <= base_bytes_min &&
  4554. (options.preclude_last_level_data_seconds == 0 ||
  4555. i < num_levels_ - 2)) {
  4556. // When per_key_placement is enabled, the proximal level is
  4557. // necessary.
  4558. lowest_unnecessary_level_ = i;
  4559. }
  4560. }
  4561. // Calculate base level and its size.
  4562. uint64_t base_level_size;
  4563. if (cur_level_size <= base_bytes_min) {
  4564. // If per_key_placement is not enabled,
  4565. // either there is only one non-empty level after level 0,
4566. // which can be less than base_bytes_min AND still necessary,
  4567. // or there is some unnecessary level.
  4568. assert(first_non_empty_level == num_levels_ - 1 ||
  4569. options.preclude_last_level_data_seconds > 0 ||
  4570. lowest_unnecessary_level_ != -1);
  4571. // Case 1. If we make target size of last level to be max_level_size,
  4572. // target size of the first non-empty level would be smaller than
4573. // base_bytes_min. We set it to base_bytes_min + 1.
  4574. base_level_size = base_bytes_min + 1U;
  4575. base_level_ = first_non_empty_level;
  4576. if (base_level_ < num_levels_ - 1) {
  4577. ROCKS_LOG_INFO(
  4578. ioptions.logger,
  4579. "More existing levels in DB than needed: all non-zero "
  4580. "levels <= level %d are unnecessary. "
  4581. "max_bytes_for_level_multiplier may not be guaranteed.",
  4582. lowest_unnecessary_level_);
  4583. }
  4584. } else {
  4585. assert(lowest_unnecessary_level_ == -1);
  4586. // Find base level (where L0 data is compacted to).
  4587. base_level_ = first_non_empty_level;
  4588. while (base_level_ > 1 && cur_level_size > base_bytes_max) {
  4589. --base_level_;
  4590. cur_level_size = static_cast<uint64_t>(
  4591. cur_level_size / options.max_bytes_for_level_multiplier);
  4592. }
  4593. if (cur_level_size > base_bytes_max) {
  4594. // Even L1 will be too large
  4595. assert(base_level_ == 1);
  4596. base_level_size = base_bytes_max;
  4597. } else {
  4598. base_level_size = std::max(static_cast<uint64_t>(1), cur_level_size);
  4599. }
  4600. }
  4601. level_multiplier_ = options.max_bytes_for_level_multiplier;
  4602. assert(base_level_size > 0);
  4603. uint64_t level_size = base_level_size;
  4604. for (int i = base_level_; i < num_levels_; i++) {
  4605. if (i > base_level_) {
  4606. level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
  4607. }
  4608. // Don't set any level below base_bytes_max. Otherwise, the LSM can
  4609. // assume an hourglass shape where L1+ sizes are smaller than L0. This
  4610. // causes compaction scoring, which depends on level sizes, to favor L1+
  4611. // at the expense of L0, which may fill up and stall.
  4612. level_max_bytes_[i] = std::max(level_size, base_bytes_max);
  4613. }
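// Continuing the illustrative numbers above with base_bytes_max = 256MB:
// cur_level_size = 1GB exceeds base_bytes_max, so base_level_ moves from 4
// to 3 and base_level_size becomes 100MB; the loop above then assigns
// L3 = 256MB (clamped up to base_bytes_max), L4 = 1GB, L5 = 10GB,
// L6 = 100GB.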
  4614. }
  4615. }
  4616. }
  4617. uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
  4618. // Estimate the live data size by adding up the size of a maximal set of
  4619. // sst files with no range overlap in same or higher level. The less
  4620. // compacted, the more optimistic (smaller) this estimate is. Also,
  4621. // for multiple sorted runs within a level, file order will matter.
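// Illustrative example: if the bottom level holds a file spanning
// ['a', 'z'] and L0 holds a newer file spanning ['c', 'f'], only the bottom
// file is counted; levels are scanned bottom-up, and the L0 file overlaps a
// file already in the map, so its size is excluded from the estimate.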
  4622. uint64_t size = 0;
  4623. auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
  4624. return internal_comparator_->Compare(*x, *y) < 0;
  4625. };
  4626. // (Ordered) map of largest keys in files being included in size estimate
  4627. std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
  4628. for (int l = num_levels_ - 1; l >= 0; l--) {
  4629. bool found_end = false;
  4630. for (auto file : files_[l]) {
4631. // Find the first file already included whose largest key is larger than
  4632. // the smallest key of `file`. If that file does not overlap with the
  4633. // current file, none of the files in the map does. If there is
  4634. // no potential overlap, we can safely insert the rest of this level
  4635. // (if the level is not 0) into the map without checking again because
  4636. // the elements in the level are sorted and non-overlapping.
  4637. auto lb = (found_end && l != 0) ? ranges.end()
  4638. : ranges.lower_bound(&file->smallest);
  4639. found_end = (lb == ranges.end());
  4640. if (found_end || internal_comparator_->Compare(
  4641. file->largest, (*lb).second->smallest) < 0) {
  4642. ranges.emplace_hint(lb, &file->largest, file);
  4643. size += file->fd.file_size;
  4644. }
  4645. }
  4646. }
  4647. // For BlobDB, the result also includes the exact value of live bytes in the
  4648. // blob files of the version.
  4649. for (const auto& meta : blob_files_) {
  4650. assert(meta);
  4651. size += meta->GetTotalBlobBytes();
  4652. size -= meta->GetGarbageBlobBytes();
  4653. }
  4654. return size;
  4655. }
  4656. bool VersionStorageInfo::RangeMightExistAfterSortedRun(
  4657. const Slice& smallest_user_key, const Slice& largest_user_key,
  4658. int last_level, int last_l0_idx) {
  4659. assert((last_l0_idx != -1) == (last_level == 0));
  4660. // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
  4661. // bottommost only if it's the oldest L0 file and there are no files on older
  4662. // levels. It'd be better to consider it bottommost if there's no overlap in
  4663. // older levels/files.
  4664. if (last_level == 0 &&
  4665. last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
  4666. return true;
  4667. }
  4668. // Checks whether there are files living beyond the `last_level`. If lower
  4669. // levels have files, it checks for overlap between [`smallest_key`,
4670. // `largest_key`] and those files. Bottommost-level optimizations can be made
  4671. // there are no files in lower levels or if there is no overlap with the files
  4672. // in the lower levels.
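// Illustrative example: with last_level == 2 and files present only in L4
// that do not overlap [smallest_user_key, largest_user_key], this function
// returns false, so bottommost-level optimizations may be applied.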
  4673. for (int level = last_level + 1; level < num_levels(); level++) {
  4674. // The range is not in the bottommost level if there are files in lower
  4675. // levels when the `last_level` is 0 or if there are files in lower levels
  4676. // which overlap with [`smallest_key`, `largest_key`].
  4677. if (files_[level].size() > 0 &&
  4678. (last_level == 0 ||
  4679. OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
  4680. return true;
  4681. }
  4682. }
  4683. return false;
  4684. }
  4685. Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint(
  4686. int level, CompactionStyleSet compaction_style_set) const {
  4687. if (!compaction_style_set.Contains(compaction_style_)) {
  4688. return Env::WLTH_NOT_SET;
  4689. }
  4690. switch (compaction_style_) {
  4691. case kCompactionStyleLevel:
  4692. if (level == 0) {
  4693. return Env::WLTH_MEDIUM;
  4694. }
  4695. // L1: medium, L2: long, ...
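// e.g., when base_level_ == 1: L1 -> WLTH_MEDIUM, L2 -> WLTH_LONG,
// L3 and beyond -> WLTH_EXTREME.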
  4696. if (level - base_level_ >= 2) {
  4697. return Env::WLTH_EXTREME;
  4698. } else if (level < base_level_) {
4699. // There is no restriction that prevents the level passed in from being
4700. // smaller than base_level_.
  4701. return Env::WLTH_MEDIUM;
  4702. }
  4703. return static_cast<Env::WriteLifeTimeHint>(
  4704. level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
  4705. case kCompactionStyleUniversal:
  4706. if (level == 0) {
  4707. return Env::WLTH_SHORT;
  4708. }
  4709. if (level == 1) {
  4710. return Env::WLTH_MEDIUM;
  4711. }
  4712. return Env::WLTH_LONG;
  4713. default:
  4714. return Env::WLTH_NOT_SET;
  4715. }
  4716. }
  4717. void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
  4718. std::vector<uint64_t>* live_blob_files) const {
  4719. assert(live_table_files);
  4720. assert(live_blob_files);
  4721. for (int level = 0; level < storage_info_.num_levels(); ++level) {
  4722. const auto& level_files = storage_info_.LevelFiles(level);
  4723. for (const auto& meta : level_files) {
  4724. assert(meta);
  4725. live_table_files->emplace_back(meta->fd.GetNumber());
  4726. }
  4727. }
  4728. const auto& blob_files = storage_info_.GetBlobFiles();
  4729. for (const auto& meta : blob_files) {
  4730. assert(meta);
  4731. live_blob_files->emplace_back(meta->GetBlobFileNumber());
  4732. }
  4733. }
  4734. void Version::RemoveLiveFiles(
  4735. std::vector<ObsoleteFileInfo>& sst_delete_candidates,
  4736. std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
  4737. for (ObsoleteFileInfo& fi : sst_delete_candidates) {
  4738. if (!fi.only_delete_metadata &&
  4739. storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) !=
  4740. VersionStorageInfo::FileLocation::Invalid()) {
  4741. fi.only_delete_metadata = true;
  4742. }
  4743. }
  4744. blob_delete_candidates.erase(
  4745. std::remove_if(
  4746. blob_delete_candidates.begin(), blob_delete_candidates.end(),
  4747. [this](ObsoleteBlobFileInfo& x) {
  4748. return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber());
  4749. }),
  4750. blob_delete_candidates.end());
  4751. }
  4752. std::string Version::DebugString(bool hex, bool print_stats) const {
  4753. std::string r;
  4754. for (int level = 0; level < storage_info_.num_levels_; level++) {
  4755. // E.g.,
  4756. // --- level 1 ---
  4757. // 17:123[1 .. 124]['a' .. 'd']
  4758. // 20:43[124 .. 128]['e' .. 'g']
  4759. //
  4760. // if print_stats=true:
  4761. // 17:123[1 .. 124]['a' .. 'd'](4096)
  4762. r.append("--- level ");
  4763. AppendNumberTo(&r, level);
  4764. r.append(" --- version# ");
  4765. AppendNumberTo(&r, version_number_);
  4766. if (storage_info_.compact_cursor_[level].Valid()) {
  4767. r.append(" --- compact_cursor: ");
  4768. r.append(storage_info_.compact_cursor_[level].DebugString(hex));
  4769. }
  4770. r.append(" ---\n");
  4771. const std::vector<FileMetaData*>& files = storage_info_.files_[level];
  4772. for (size_t i = 0; i < files.size(); i++) {
  4773. r.push_back(' ');
  4774. AppendNumberTo(&r, files[i]->fd.GetNumber());
  4775. r.push_back(':');
  4776. AppendNumberTo(&r, files[i]->fd.GetFileSize());
  4777. r.append("[");
  4778. AppendNumberTo(&r, files[i]->fd.smallest_seqno);
  4779. r.append(" .. ");
  4780. AppendNumberTo(&r, files[i]->fd.largest_seqno);
  4781. r.append("]");
  4782. r.append("[");
  4783. r.append(files[i]->smallest.DebugString(hex));
  4784. r.append(" .. ");
  4785. r.append(files[i]->largest.DebugString(hex));
  4786. r.append("]");
  4787. if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
  4788. r.append(" blob_file:");
  4789. AppendNumberTo(&r, files[i]->oldest_blob_file_number);
  4790. }
  4791. if (print_stats) {
  4792. r.append("(");
  4793. r.append(std::to_string(
  4794. files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
  4795. r.append(")");
  4796. }
  4797. r.append("\n");
  4798. }
  4799. }
  4800. const auto& blob_files = storage_info_.GetBlobFiles();
  4801. if (!blob_files.empty()) {
  4802. r.append("--- blob files --- version# ");
  4803. AppendNumberTo(&r, version_number_);
  4804. r.append(" ---\n");
  4805. for (const auto& blob_file_meta : blob_files) {
  4806. assert(blob_file_meta);
  4807. r.append(blob_file_meta->DebugString());
  4808. r.push_back('\n');
  4809. }
  4810. }
  4811. return r;
  4812. }
  4813. // this is used to batch writes to the manifest file
  4814. struct VersionSet::ManifestWriter {
  4815. Status status;
  4816. bool done;
  4817. InstrumentedCondVar cv;
  4818. ColumnFamilyData* cfd;
  4819. const autovector<VersionEdit*>& edit_list;
  4820. const std::function<void(const Status&)> manifest_write_callback;
  4821. explicit ManifestWriter(
  4822. InstrumentedMutex* mu, ColumnFamilyData* _cfd,
  4823. const autovector<VersionEdit*>& e,
  4824. const std::function<void(const Status&)>& manifest_wcb)
  4825. : done(false),
  4826. cv(mu),
  4827. cfd(_cfd),
  4828. edit_list(e),
  4829. manifest_write_callback(manifest_wcb) {}
  4830. ~ManifestWriter() { status.PermitUncheckedError(); }
  4831. bool IsAllWalEdits() const {
  4832. bool all_wal_edits = true;
  4833. for (const auto& e : edit_list) {
  4834. if (!e->IsWalManipulation()) {
  4835. all_wal_edits = false;
  4836. break;
  4837. }
  4838. }
  4839. return all_wal_edits;
  4840. }
  4841. };
  4842. Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
  4843. assert(edit);
  4844. if (edit->IsInAtomicGroup()) {
  4845. TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
  4846. if (replay_buffer_.empty()) {
  4847. replay_buffer_.resize(edit->GetRemainingEntries() + 1);
  4848. TEST_SYNC_POINT_CALLBACK(
  4849. "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
  4850. }
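// Illustrative example: for an atomic group of 3 edits, the first edit
// carries remaining_entries == 2, so the buffer is sized to 3; the group is
// complete once the edit with remaining_entries == 0 is added and
// read_edits_in_atomic_group_ reaches 3.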
  4851. read_edits_in_atomic_group_++;
  4852. if (read_edits_in_atomic_group_ + edit->GetRemainingEntries() !=
  4853. static_cast<uint32_t>(replay_buffer_.size())) {
  4854. TEST_SYNC_POINT_CALLBACK(
  4855. "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
  4856. return Status::Corruption("corrupted atomic group");
  4857. }
  4858. replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
  4859. if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
  4860. TEST_SYNC_POINT_CALLBACK(
  4861. "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
  4862. return Status::OK();
  4863. }
  4864. return Status::OK();
  4865. }
  4866. // A normal edit.
  4867. if (!replay_buffer().empty()) {
  4868. TEST_SYNC_POINT_CALLBACK(
  4869. "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
  4870. return Status::Corruption("corrupted atomic group");
  4871. }
  4872. return Status::OK();
  4873. }
  4874. bool AtomicGroupReadBuffer::IsFull() const {
  4875. return read_edits_in_atomic_group_ == replay_buffer_.size();
  4876. }
  4877. bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
  4878. void AtomicGroupReadBuffer::Clear() {
  4879. read_edits_in_atomic_group_ = 0;
  4880. replay_buffer_.clear();
  4881. }
  4882. VersionSet::VersionSet(
  4883. const std::string& dbname, const ImmutableDBOptions* _db_options,
  4884. const FileOptions& storage_options, Cache* table_cache,
  4885. WriteBufferManager* write_buffer_manager, WriteController* write_controller,
  4886. BlockCacheTracer* const block_cache_tracer,
  4887. const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
  4888. const std::string& db_session_id, const std::string& daily_offpeak_time_utc,
  4889. ErrorHandler* error_handler, bool unchanging)
  4890. : column_family_set_(new ColumnFamilySet(
  4891. dbname, _db_options, storage_options, table_cache,
  4892. write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
  4893. db_id, db_session_id)),
  4894. table_cache_(table_cache),
  4895. env_(_db_options->env),
  4896. fs_(_db_options->fs, io_tracer),
  4897. clock_(_db_options->clock),
  4898. dbname_(dbname),
  4899. db_id_(db_id),
  4900. db_options_(_db_options),
  4901. next_file_number_(2),
  4902. manifest_file_number_(0), // Filled by Recover()
  4903. options_file_number_(0),
  4904. options_file_size_(0),
  4905. pending_manifest_file_number_(0),
  4906. last_sequence_(0),
  4907. last_allocated_sequence_(0),
  4908. last_published_sequence_(0),
  4909. prev_log_number_(0),
  4910. current_version_number_(0),
  4911. manifest_file_size_(0),
  4912. file_options_(storage_options),
  4913. block_cache_tracer_(block_cache_tracer),
  4914. io_tracer_(io_tracer),
  4915. db_session_id_(db_session_id),
  4916. offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
  4917. error_handler_(error_handler),
  4918. unchanging_(unchanging),
  4919. closed_(false) {}
  4920. Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
  4921. Status s;
  4922. if (closed_ || unchanging_ || !manifest_file_number_ || !descriptor_log_) {
  4923. return s;
  4924. }
  4925. std::string manifest_file_name =
  4926. DescriptorFileName(dbname_, manifest_file_number_);
  4927. uint64_t size = 0;
  4928. IOStatus io_s = descriptor_log_->Close(WriteOptions());
  4929. descriptor_log_.reset();
  4930. TEST_SYNC_POINT("VersionSet::Close:AfterClose");
  4931. if (io_s.ok()) {
  4932. io_s = fs_->GetFileSize(manifest_file_name, IOOptions(), &size, nullptr);
  4933. }
  4934. if (!io_s.ok() || size != manifest_file_size_) {
  4935. if (io_s.ok()) {
  4936. // This means the size is not as expected. So we treat it as a
  4937. // corruption and set io_s appropriately
  4938. io_s = IOStatus::Corruption();
  4939. }
  4940. ColumnFamilyData* cfd = GetColumnFamilySet()->GetDefault();
  4941. IOErrorInfo io_error_info(io_s, FileOperationType::kVerify,
  4942. manifest_file_name, /*length=*/size,
  4943. /*offset=*/0);
  4944. for (auto& listener : cfd->ioptions().listeners) {
  4945. listener->OnIOError(io_error_info);
  4946. }
  4947. io_s.PermitUncheckedError();
  4948. io_error_info.io_status.PermitUncheckedError();
  4949. ROCKS_LOG_ERROR(db_options_->info_log,
  4950. "MANIFEST verification on Close, "
  4951. "filename %s, expected size %" PRIu64
  4952. " failed with status %s and "
  4953. "actual size %" PRIu64 "\n",
  4954. manifest_file_name.c_str(), manifest_file_size_,
  4955. io_s.ToString().c_str(), size);
  4956. VersionEdit edit;
  4957. assert(cfd);
  4958. s = LogAndApply(cfd, ReadOptions(), WriteOptions(), &edit, mu, db_dir);
  4959. }
  4960. closed_ = true;
  4961. return s;
  4962. }
  4963. VersionSet::~VersionSet() {
  4964. // Must clean up column families to make all files "obsolete"
  4965. column_family_set_.reset();
  4966. for (auto& file : obsolete_files_) {
  4967. // NOTE: DB is shutting down, so file is probably not obsolete, just
  4968. // no longer referenced by Versions in memory.
  4969. // For more context, see comment on "table_cache_->EraseUnRefEntries()"
  4970. // in DBImpl::CloseHelper().
  4971. // Using uncache_aggressiveness=0 overrides any previous marking to
  4972. // attempt to uncache the file's blocks (which after cleaning up
  4973. // column families could cause use-after-free)
  4974. TableCache::ReleaseObsolete(table_cache_, file.metadata->fd.GetNumber(),
  4975. file.metadata->table_reader_handle,
  4976. /*uncache_aggressiveness=*/0);
  4977. file.DeleteMetadata();
  4978. }
  4979. obsolete_files_.clear();
  4980. io_status_.PermitUncheckedError();
  4981. }
  4982. void VersionSet::Reset() {
  4983. if (column_family_set_) {
  4984. WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
  4985. WriteController* wc = column_family_set_->write_controller();
  4986. // db_id becomes the source of truth after DBImpl::Recover():
  4987. // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
  4988. // Note: we may not be able to recover db_id from MANIFEST if
  4989. // options.write_dbid_to_manifest is false (default).
  4990. column_family_set_.reset(new ColumnFamilySet(
  4991. dbname_, db_options_, file_options_, table_cache_, wbm, wc,
  4992. block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
  4993. }
  4994. db_id_.clear();
  4995. next_file_number_.store(2);
  4996. min_log_number_to_keep_.store(0);
  4997. manifest_file_number_ = 0;
  4998. options_file_number_ = 0;
  4999. pending_manifest_file_number_ = 0;
  5000. last_sequence_.store(0);
  5001. last_allocated_sequence_.store(0);
  5002. last_published_sequence_.store(0);
  5003. prev_log_number_ = 0;
  5004. descriptor_log_.reset();
  5005. current_version_number_ = 0;
  5006. manifest_writers_.clear();
  5007. manifest_file_size_ = 0;
  5008. obsolete_files_.clear();
  5009. obsolete_manifests_.clear();
  5010. wals_.Reset();
  5011. }
  5012. void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
  5013. Version* v) {
  5014. // compute new compaction score
  5015. v->storage_info()->ComputeCompactionScore(
  5016. column_family_data->ioptions(),
  5017. column_family_data->GetLatestMutableCFOptions());
  5018. // Mark v finalized
  5019. v->storage_info_.SetFinalized();
  5020. // Make "v" current
  5021. assert(v->refs_ == 0);
  5022. Version* current = column_family_data->current();
  5023. assert(v != current);
  5024. if (current != nullptr) {
  5025. assert(current->refs_ > 0);
  5026. current->Unref();
  5027. }
  5028. column_family_data->SetCurrent(v);
  5029. v->Ref();
  5030. // Append to linked list
  5031. v->prev_ = column_family_data->dummy_versions()->prev_;
  5032. v->next_ = column_family_data->dummy_versions();
  5033. v->prev_->next_ = v;
  5034. v->next_->prev_ = v;
  5035. }
  5036. Status VersionSet::ProcessManifestWrites(
  5037. std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
  5038. FSDirectory* dir_contains_current_file, bool new_descriptor_log,
  5039. const ColumnFamilyOptions* new_cf_options, const ReadOptions& read_options,
  5040. const WriteOptions& write_options) {
  5041. mu->AssertHeld();
  5042. assert(!writers.empty());
  5043. ManifestWriter& first_writer = writers.front();
  5044. ManifestWriter* last_writer = &first_writer;
  5045. assert(!manifest_writers_.empty());
  5046. assert(manifest_writers_.front() == &first_writer);
  5047. autovector<VersionEdit*> batch_edits;
  5048. // This vector keeps track of the corresponding user-defined timestamp size
  5049. // for `batch_edits` side by side, which is only needed for encoding a
  5050. // `VersionEdit` that adds new SST files.
5051. // Note that anytime `batch_edits` has a new element added or an existing
  5052. // element removed, `batch_edits_ts_sz` should be updated too.
  5053. autovector<std::optional<size_t>> batch_edits_ts_sz;
  5054. autovector<Version*> versions;
  5055. std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
  5056. autovector<const autovector<uint64_t>*> files_to_quarantine_if_commit_fail;
  5057. autovector<uint64_t> limbo_descriptor_log_file_number;
  5058. // Tracking `max_last_sequence` is needed to ensure we write
  5059. // `VersionEdit::last_sequence_`s in non-decreasing order according to the
  5060. // recovery code's requirement. It also allows us to defer updating
  5061. // `descriptor_last_sequence_` until the apply phase, after the log phase
  5062. // succeeds.
  5063. SequenceNumber max_last_sequence = descriptor_last_sequence_;
  5064. bool skip_manifest_write =
  5065. first_writer.edit_list.front()->IsNoManifestWriteDummy();
  5066. if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
  5067. // No group commits for column family add or drop
  5068. LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
  5069. batch_edits.push_back(first_writer.edit_list.front());
  5070. batch_edits_ts_sz.push_back(std::nullopt);
  5071. } else {
  5072. auto it = manifest_writers_.cbegin();
  5073. size_t group_start = std::numeric_limits<size_t>::max();
  5074. for (;;) {
  5075. assert(!(*it)->edit_list.front()->IsColumnFamilyManipulation());
  5076. last_writer = *it;
  5077. assert(last_writer != nullptr);
  5078. assert(last_writer->cfd != nullptr);
  5079. if (last_writer->cfd->IsDropped()) {
  5080. // If we detect a dropped CF at this point, and the corresponding
  5081. // version edits belong to an atomic group, then we need to find out
  5082. // the preceding version edits in the same atomic group, and update
  5083. // their `remaining_entries_` member variable because we are NOT going
5084. // to write the version edits of the dropped CF to the MANIFEST. If we
  5085. // don't update, then Recover can report corrupted atomic group because
  5086. // the `remaining_entries_` do not match.
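// Illustrative example: if the dropped CF contributed 2 edits to an atomic
// group of 5, `k` below ends up as 2 and the remaining_entries_ of the
// edits already batched for this group are each reduced by 2.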
  5087. if (!batch_edits.empty()) {
  5088. if (batch_edits.back()->IsInAtomicGroup() &&
  5089. batch_edits.back()->GetRemainingEntries() > 0) {
  5090. assert(group_start < batch_edits.size());
  5091. const auto& edit_list = last_writer->edit_list;
  5092. size_t k = 0;
  5093. while (k < edit_list.size()) {
  5094. if (!edit_list[k]->IsInAtomicGroup()) {
  5095. break;
  5096. } else if (edit_list[k]->GetRemainingEntries() == 0) {
  5097. ++k;
  5098. break;
  5099. }
  5100. ++k;
  5101. }
  5102. for (auto i = group_start; i < batch_edits.size(); ++i) {
  5103. assert(static_cast<uint32_t>(k) <=
  5104. batch_edits.back()->GetRemainingEntries());
  5105. batch_edits[i]->SetRemainingEntries(
  5106. batch_edits[i]->GetRemainingEntries() -
  5107. static_cast<uint32_t>(k));
  5108. }
  5109. }
  5110. }
  5111. } else {
  5112. // We do a linear search on versions because versions is small.
  5113. // TODO(yanqin) maybe consider unordered_map
  5114. Version* version = nullptr;
  5115. VersionBuilder* builder = nullptr;
  5116. for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
  5117. uint32_t cf_id = last_writer->cfd->GetID();
  5118. if (versions[i]->cfd()->GetID() == cf_id) {
  5119. version = versions[i];
  5120. assert(!builder_guards.empty() &&
  5121. builder_guards.size() == versions.size());
  5122. builder = builder_guards[i]->version_builder();
  5123. TEST_SYNC_POINT_CALLBACK(
  5124. "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
  5125. break;
  5126. }
  5127. }
  5128. if (version == nullptr) {
  5129. // WAL manipulations do not need to be applied to versions.
  5130. if (!last_writer->IsAllWalEdits()) {
  5131. version = new Version(
  5132. last_writer->cfd, this, file_options_,
  5133. last_writer->cfd ? last_writer->cfd->GetLatestMutableCFOptions()
  5134. : MutableCFOptions(*new_cf_options),
  5135. io_tracer_, current_version_number_++);
  5136. versions.push_back(version);
  5137. builder_guards.emplace_back(
  5138. new BaseReferencedVersionBuilder(last_writer->cfd));
  5139. builder = builder_guards.back()->version_builder();
  5140. }
  5141. assert(last_writer->IsAllWalEdits() || builder);
  5142. assert(last_writer->IsAllWalEdits() || version);
  5143. TEST_SYNC_POINT_CALLBACK(
  5144. "VersionSet::ProcessManifestWrites:NewVersion", version);
  5145. }
  5146. const Comparator* ucmp = last_writer->cfd->user_comparator();
  5147. assert(ucmp);
  5148. std::optional<size_t> edit_ts_sz = ucmp->timestamp_size();
  5149. for (const auto& e : last_writer->edit_list) {
  5150. if (e->IsInAtomicGroup()) {
  5151. if (batch_edits.empty() || !batch_edits.back()->IsInAtomicGroup() ||
  5152. (batch_edits.back()->IsInAtomicGroup() &&
  5153. batch_edits.back()->GetRemainingEntries() == 0)) {
  5154. group_start = batch_edits.size();
  5155. }
  5156. } else if (group_start != std::numeric_limits<size_t>::max()) {
  5157. group_start = std::numeric_limits<size_t>::max();
  5158. }
  5159. Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
  5160. &max_last_sequence, mu);
  5161. if (!s.ok()) {
  5162. // free up the allocated memory
  5163. for (auto v : versions) {
  5164. delete v;
  5165. }
  5166. // FIXME? manifest_writers_ still has requested updates
  5167. return s;
  5168. }
  5169. batch_edits.push_back(e);
  5170. batch_edits_ts_sz.push_back(edit_ts_sz);
  5171. }
  5172. }
  5173. // Loop increment/conditions
  5174. ++it;
  5175. if (it == manifest_writers_.cend()) {
  5176. break;
  5177. }
  5178. if (skip_manifest_write) {
  5179. // no grouping when skipping manifest write
  5180. break;
  5181. }
  5182. const auto* next = (*it)->edit_list.front();
  5183. if (next->IsColumnFamilyManipulation() ||
  5184. next->IsNoManifestWriteDummy()) {
  5185. // no group commits for column family add or drop
  5186. // nor for dummy skipping manifest write
  5187. break;
  5188. }
  5189. }
  5190. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
  5191. assert(!builder_guards.empty() &&
  5192. builder_guards.size() == versions.size());
  5193. auto* builder = builder_guards[i]->version_builder();
  5194. Status s = builder->SaveTo(versions[i]->storage_info());
  5195. if (!s.ok()) {
  5196. // free up the allocated memory
  5197. for (auto v : versions) {
  5198. delete v;
  5199. }
  5200. // FIXME? manifest_writers_ still has requested updates
  5201. return s;
  5202. }
  5203. }
  5204. }
  5205. #ifndef NDEBUG
  5206. // Verify that version edits of atomic groups have correct
  5207. // remaining_entries_.
  5208. size_t k = 0;
  5209. while (k < batch_edits.size()) {
  5210. while (k < batch_edits.size() && !batch_edits[k]->IsInAtomicGroup()) {
  5211. ++k;
  5212. }
  5213. if (k == batch_edits.size()) {
  5214. break;
  5215. }
  5216. size_t i = k;
  5217. while (i < batch_edits.size()) {
  5218. if (!batch_edits[i]->IsInAtomicGroup()) {
  5219. break;
  5220. }
  5221. assert(i - k + batch_edits[i]->GetRemainingEntries() ==
  5222. batch_edits[k]->GetRemainingEntries());
  5223. if (batch_edits[i]->GetRemainingEntries() == 0) {
  5224. ++i;
  5225. break;
  5226. }
  5227. ++i;
  5228. }
  5229. assert(batch_edits[i - 1]->IsInAtomicGroup());
  5230. assert(0 == batch_edits[i - 1]->GetRemainingEntries());
  5231. std::vector<VersionEdit*> tmp;
  5232. for (size_t j = k; j != i; ++j) {
  5233. tmp.emplace_back(batch_edits[j]);
  5234. }
  5235. TEST_SYNC_POINT_CALLBACK(
  5236. "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
  5237. k = i;
  5238. }
  5239. if (skip_manifest_write) {
  5240. // no grouping when skipping manifest write
  5241. assert(last_writer == &first_writer);
  5242. }
  5243. #endif // NDEBUG
  5244. assert(pending_manifest_file_number_ == 0);
  5245. if (!skip_manifest_write &&
  5246. (!descriptor_log_ ||
  5247. manifest_file_size_ > db_options_->max_manifest_file_size)) {
  5248. TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
  5249. new_descriptor_log = true;
  5250. } else {
  5251. pending_manifest_file_number_ = manifest_file_number_;
  5252. }
  5253. // Local cached copy of state variable(s). WriteCurrentStateToManifest()
  5254. // reads its content after releasing db mutex to avoid race with
  5255. // SwitchMemtable().
  5256. std::unordered_map<uint32_t, MutableCFState> curr_state;
  5257. VersionEdit wal_additions;
  5258. if (new_descriptor_log) {
  5259. pending_manifest_file_number_ = NewFileNumber();
  5260. batch_edits.back()->SetNextFile(next_file_number_.load());
5261. // If we are writing out a new snapshot, make sure to persist the max
5262. // column family.
  5263. if (column_family_set_->GetMaxColumnFamily() > 0) {
  5264. first_writer.edit_list.front()->SetMaxColumnFamily(
  5265. column_family_set_->GetMaxColumnFamily());
  5266. }
  5267. for (const auto* cfd : *column_family_set_) {
  5268. assert(curr_state.find(cfd->GetID()) == curr_state.end());
  5269. curr_state.emplace(
  5270. cfd->GetID(),
  5271. MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()));
  5272. }
  5273. for (const auto& wal : wals_.GetWals()) {
  5274. wal_additions.AddWal(wal.first, wal.second);
  5275. }
  5276. }
  5277. uint64_t new_manifest_file_size = 0;
  5278. Status s;
  5279. IOStatus io_s;
  5280. IOStatus manifest_io_status;
  5281. manifest_io_status.PermitUncheckedError();
  5282. std::unique_ptr<log::Writer> new_desc_log_ptr;
  5283. if (skip_manifest_write) {
  5284. if (s.ok()) {
  5285. constexpr bool update_stats = true;
  5286. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
  5287. // NOTE: normally called with DB mutex released, but we don't
  5288. // want to release the DB mutex in this mode of LogAndApply
  5289. versions[i]->PrepareAppend(read_options, update_stats);
  5290. }
  5291. }
  5292. } else {
  5293. FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
  5294. // DB option (in file_options_) takes precedence when not kUnknown
  5295. if (file_options_.temperature != Temperature::kUnknown) {
  5296. opt_file_opts.temperature = file_options_.temperature;
  5297. }
  5298. mu->Unlock();
  5299. TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
  5300. TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
  5301. if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
  5302. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
  5303. assert(!builder_guards.empty() &&
  5304. builder_guards.size() == versions.size());
  5305. ColumnFamilyData* cfd = versions[i]->cfd_;
  5306. s = builder_guards[i]->version_builder()->LoadTableHandlers(
  5307. cfd->internal_stats(), 1 /* max_threads */,
  5308. true /* prefetch_index_and_filter_in_cache */,
  5309. false /* is_initial_load */, versions[i]->GetMutableCFOptions(),
  5310. MaxFileSizeForL0MetaPin(versions[i]->GetMutableCFOptions()),
  5311. read_options);
  5312. if (!s.ok()) {
  5313. if (db_options_->paranoid_checks) {
  5314. break;
  5315. }
  5316. s = Status::OK();
  5317. }
  5318. }
  5319. }
  5320. log::Writer* raw_desc_log_ptr = descriptor_log_.get();
  5321. if (s.ok() && new_descriptor_log) {
  5322. // This is fine because everything inside of this block is serialized --
  5323. // only one thread can be here at the same time
  5324. // create new manifest file
  5325. ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
  5326. pending_manifest_file_number_);
  5327. std::string descriptor_fname =
  5328. DescriptorFileName(dbname_, pending_manifest_file_number_);
  5329. std::unique_ptr<FSWritableFile> descriptor_file;
  5330. io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
  5331. opt_file_opts);
  5332. if (io_s.ok()) {
  5333. descriptor_file->SetPreallocationBlockSize(
  5334. db_options_->manifest_preallocation_size);
  5335. FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
  5336. std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
  5337. std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
  5338. io_tracer_, nullptr, Histograms::HISTOGRAM_ENUM_MAX /* hist_type */,
  5339. db_options_->listeners, nullptr,
  5340. tmp_set.Contains(FileType::kDescriptorFile),
  5341. tmp_set.Contains(FileType::kDescriptorFile)));
  5342. new_desc_log_ptr.reset(
  5343. new log::Writer(std::move(file_writer), 0, false));
  5344. raw_desc_log_ptr = new_desc_log_ptr.get();
  5345. s = WriteCurrentStateToManifest(write_options, curr_state,
  5346. wal_additions, raw_desc_log_ptr, io_s);
  5347. assert(s == io_s);
  5348. }
  5349. if (!io_s.ok()) {
  5350. manifest_io_status = io_s;
  5351. s = io_s;
  5352. }
  5353. }
  5354. if (s.ok()) {
  5355. if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
  5356. constexpr bool update_stats = true;
  5357. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
  5358. versions[i]->PrepareAppend(read_options, update_stats);
  5359. }
  5360. }
  5361. // Write new records to MANIFEST log
  5362. #ifndef NDEBUG
  5363. size_t idx = 0;
  5364. #endif
  5365. assert(batch_edits.size() == batch_edits_ts_sz.size());
  5366. for (size_t bidx = 0; bidx < batch_edits.size(); bidx++) {
  5367. auto& e = batch_edits[bidx];
  5368. files_to_quarantine_if_commit_fail.push_back(
  5369. e->GetFilesToQuarantineIfCommitFail());
  5370. std::string record;
  5371. if (!e->EncodeTo(&record, batch_edits_ts_sz[bidx])) {
  5372. s = Status::Corruption("Unable to encode VersionEdit:" +
  5373. e->DebugString(true));
  5374. break;
  5375. }
  5376. TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
  5377. REDUCE_ODDS2);
  5378. #ifndef NDEBUG
  5379. if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
  5380. TEST_SYNC_POINT_CALLBACK(
  5381. "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
  5382. nullptr);
  5383. TEST_SYNC_POINT(
  5384. "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
  5385. }
  5386. ++idx;
  5387. #endif /* !NDEBUG */
  5388. io_s = raw_desc_log_ptr->AddRecord(write_options, record);
  5389. if (!io_s.ok()) {
  5390. s = io_s;
  5391. manifest_io_status = io_s;
  5392. break;
  5393. }
  5394. }
  5395. if (s.ok()) {
  5396. io_s =
  5397. SyncManifest(db_options_, write_options, raw_desc_log_ptr->file());
  5398. manifest_io_status = io_s;
  5399. TEST_SYNC_POINT_CALLBACK(
  5400. "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
  5401. }
  5402. if (!io_s.ok()) {
  5403. s = io_s;
  5404. ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
  5405. s.ToString().c_str());
  5406. }
  5407. }
  5408. // If we just created a new descriptor file, install it by writing a
  5409. // new CURRENT file that points to it.
  5410. if (s.ok()) {
  5411. assert(manifest_io_status.ok());
  5412. }
  5413. if (s.ok() && new_descriptor_log) {
  5414. io_s = SetCurrentFile(
  5415. write_options, fs_.get(), dbname_, pending_manifest_file_number_,
  5416. file_options_.temperature, dir_contains_current_file);
  5417. if (!io_s.ok()) {
  5418. s = io_s;
  5419. // Quarantine old manifest file in case new manifest file's CURRENT file
  5420. // wasn't created successfully and the old manifest is needed.
  5421. limbo_descriptor_log_file_number.push_back(manifest_file_number_);
  5422. files_to_quarantine_if_commit_fail.push_back(
  5423. &limbo_descriptor_log_file_number);
  5424. }
  5425. }
  5426. if (s.ok()) {
  5427. // find offset in manifest file where this version is stored.
  5428. new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
  5429. }
  5430. if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
  5431. TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
  5432. TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
  5433. TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
  5434. }
  5435. LogFlush(db_options_->info_log);
  5436. TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
  5437. mu->Lock();
  5438. }
  5439. if (s.ok()) {
  5440. // Apply WAL edits, DB mutex must be held.
  5441. for (auto& e : batch_edits) {
  5442. if (e->IsWalAddition()) {
  5443. s = wals_.AddWals(e->GetWalAdditions());
  5444. } else if (e->IsWalDeletion()) {
  5445. s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
  5446. }
  5447. if (!s.ok()) {
  5448. break;
  5449. }
  5450. }
  5451. }
  5452. if (!io_s.ok()) {
  5453. if (io_status_.ok()) {
  5454. io_status_ = io_s;
  5455. if (error_handler_) {
  5456. error_handler_->AddFilesToQuarantine(
  5457. files_to_quarantine_if_commit_fail);
  5458. }
  5459. }
  5460. } else if (!io_status_.ok()) {
  5461. io_status_ = io_s;
  5462. if (error_handler_) {
  5463. error_handler_->ClearFilesToQuarantine();
  5464. }
  5465. }
5466. // Append the old manifest file to the obsolete_manifests_ list to be deleted
  5467. // by PurgeObsoleteFiles later.
  5468. if (s.ok() && new_descriptor_log) {
  5469. descriptor_log_ = std::move(new_desc_log_ptr);
  5470. obsolete_manifests_.emplace_back(
  5471. DescriptorFileName("", manifest_file_number_));
  5472. }
  5473. // Install the new versions
  5474. if (s.ok()) {
  5475. if (first_writer.edit_list.front()->IsColumnFamilyAdd()) {
  5476. assert(batch_edits.size() == 1);
  5477. assert(new_cf_options != nullptr);
  5478. assert(max_last_sequence == descriptor_last_sequence_);
  5479. CreateColumnFamily(*new_cf_options, read_options,
  5480. first_writer.edit_list.front(),
  5481. /*read_only*/ false);
  5482. } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
  5483. assert(batch_edits.size() == 1);
  5484. assert(max_last_sequence == descriptor_last_sequence_);
  5485. first_writer.cfd->SetDropped();
  5486. first_writer.cfd->UnrefAndTryDelete();
  5487. } else {
  5488. // Each version in versions corresponds to a column family.
  5489. // For each column family, update its log number indicating that logs
  5490. // with number smaller than this should be ignored.
  5491. uint64_t last_min_log_number_to_keep = 0;
  5492. for (const auto& e : batch_edits) {
  5493. ColumnFamilyData* cfd = nullptr;
  5494. if (!e->IsColumnFamilyManipulation()) {
  5495. cfd = column_family_set_->GetColumnFamily(e->GetColumnFamily());
  5496. // e would not have been added to batch_edits if its corresponding
  5497. // column family is dropped.
  5498. assert(cfd);
  5499. }
  5500. if (cfd) {
  5501. if (e->HasLogNumber() && e->GetLogNumber() > cfd->GetLogNumber()) {
  5502. cfd->SetLogNumber(e->GetLogNumber());
  5503. }
  5504. if (e->HasFullHistoryTsLow()) {
  5505. cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
  5506. }
  5507. }
  5508. if (e->HasMinLogNumberToKeep()) {
  5509. last_min_log_number_to_keep =
  5510. std::max(last_min_log_number_to_keep, e->GetMinLogNumberToKeep());
  5511. }
  5512. }
  5513. if (last_min_log_number_to_keep != 0) {
  5514. MarkMinLogNumberToKeep(last_min_log_number_to_keep);
  5515. }
  5516. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
  5517. ColumnFamilyData* cfd = versions[i]->cfd_;
  5518. AppendVersion(cfd, versions[i]);
  5519. }
  5520. }
  5521. if (!skip_manifest_write) {
  5522. assert(max_last_sequence >= descriptor_last_sequence_);
  5523. descriptor_last_sequence_ = max_last_sequence;
  5524. manifest_file_number_ = pending_manifest_file_number_;
  5525. manifest_file_size_ = new_manifest_file_size;
  5526. prev_log_number_ = first_writer.edit_list.front()->GetPrevLogNumber();
  5527. }
  5528. } else {
  5529. std::string version_edits;
  5530. for (auto& e : batch_edits) {
  5531. version_edits += ("\n" + e->DebugString(true));
  5532. }
  5533. ROCKS_LOG_ERROR(db_options_->info_log,
  5534. "Error in committing version edit to MANIFEST: %s",
  5535. version_edits.c_str());
  5536. for (auto v : versions) {
  5537. delete v;
  5538. }
  5539. // If manifest append failed for whatever reason, the file could be
  5540. // corrupted. So we need to force the next version update to start a
  5541. // new manifest file.
  5542. descriptor_log_.reset();
  5543. new_desc_log_ptr.reset();
  5544. // If manifest operations failed, then we know the CURRENT file still
  5545. // points to the original MANIFEST. Therefore, we can safely delete the
  5546. // new MANIFEST.
  5547. // If manifest operations succeeded, and we are here, then it is possible
  5548. // that renaming tmp file to CURRENT failed.
  5549. //
  5550. // On local POSIX-compliant FS, the CURRENT must point to the original
  5551. // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
  5552. // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
  5553. // process not to crash and continue using the db. Any future LogAndApply()
  5554. // call will switch to a new MANIFEST and update CURRENT, still ignoring
  5555. // this one.
  5556. //
  5557. // On non-local FS, it is
  5558. // possible that the rename operation succeeded on the server (remote)
  5559. // side, but the client somehow returns a non-ok status to RocksDB. Note
  5560. // that this does not violate atomicity. Should we delete the new MANIFEST
  5561. // successfully, a subsequent recovery attempt will likely see the CURRENT
  5562. // pointing to the new MANIFEST, thus fail. We will not be able to open the
  5563. // DB again. Therefore, if manifest operations succeed, we should keep the
5564. // new MANIFEST. If the process proceeds, any future LogAndApply() call
  5565. // will switch to a new MANIFEST and update CURRENT. If user tries to
  5566. // re-open the DB,
  5567. // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
  5568. // b) CURRENT points to the original MANIFEST, and the original MANIFEST
  5569. // also exists.
  5570. if (!manifest_io_status.ok() && new_descriptor_log) {
  5571. ROCKS_LOG_INFO(db_options_->info_log,
  5572. "Deleting manifest %" PRIu64 " current manifest %" PRIu64
  5573. "\n",
  5574. pending_manifest_file_number_, manifest_file_number_);
  5575. Status manifest_del_status = env_->DeleteFile(
  5576. DescriptorFileName(dbname_, pending_manifest_file_number_));
  5577. if (!manifest_del_status.ok()) {
  5578. ROCKS_LOG_WARN(db_options_->info_log,
  5579. "Failed to delete manifest %" PRIu64 ": %s",
  5580. pending_manifest_file_number_,
  5581. manifest_del_status.ToString().c_str());
  5582. }
  5583. }
  5584. }
  5585. pending_manifest_file_number_ = 0;
  5586. #ifndef NDEBUG
5587. // This is here kind of awkwardly because there are no other consistency
  5588. // checks on `VersionSet`'s updates for the new `Version`s. We might want
  5589. // to move it to a dedicated function, or remove it if we gain enough
  5590. // confidence in `descriptor_last_sequence_`.
  5591. if (s.ok()) {
  5592. for (const auto* v : versions) {
  5593. const auto* vstorage = v->storage_info();
  5594. for (int level = 0; level < vstorage->num_levels(); ++level) {
  5595. for (const auto& file : vstorage->LevelFiles(level)) {
  5596. assert(file->fd.largest_seqno <= descriptor_last_sequence_);
  5597. }
  5598. }
  5599. }
  5600. }
  5601. #endif // NDEBUG
  5602. // wake up all the waiting writers
  5603. while (true) {
  5604. ManifestWriter* ready = manifest_writers_.front();
  5605. manifest_writers_.pop_front();
  5606. bool need_signal = true;
  5607. for (const auto& w : writers) {
  5608. if (&w == ready) {
  5609. need_signal = false;
  5610. break;
  5611. }
  5612. }
  5613. ready->status = s;
  5614. ready->done = true;
  5615. if (ready->manifest_write_callback) {
  5616. (ready->manifest_write_callback)(s);
  5617. }
  5618. if (need_signal) {
  5619. ready->cv.Signal();
  5620. }
  5621. if (ready == last_writer) {
  5622. break;
  5623. }
  5624. }
  5625. if (!manifest_writers_.empty()) {
  5626. manifest_writers_.front()->cv.Signal();
  5627. }
  5628. return s;
  5629. }
  5630. void VersionSet::WakeUpWaitingManifestWriters() {
  5631. // wake up all the waiting writers
  5632. // Notify new head of manifest write queue.
  5633. if (!manifest_writers_.empty()) {
  5634. manifest_writers_.front()->cv.Signal();
  5635. }
  5636. }
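// Illustrative usage sketch (not part of the implementation; `versions`,
// `cfd`, `edit`, `mu`, and `db_dir` are hypothetical caller-side objects):
// committing a single VersionEdit for one column family with the DB mutex
// held might look like:
//   autovector<VersionEdit*> edits;
//   edits.push_back(&edit);
//   autovector<autovector<VersionEdit*>> edit_lists;
//   edit_lists.push_back(edits);
//   autovector<ColumnFamilyData*> cfds;
//   cfds.push_back(cfd);
//   Status s = versions->LogAndApply(
//       cfds, ReadOptions(), WriteOptions(), edit_lists, &mu, db_dir,
//       /*new_descriptor_log=*/false, /*new_cf_options=*/nullptr,
//       /*manifest_wcbs=*/{}, /*pre_cb=*/{});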
  5637. // 'datas' is grammatically incorrect. We still use this notation to indicate
  5638. // that this variable represents a collection of column_family_data.
  5639. Status VersionSet::LogAndApply(
  5640. const autovector<ColumnFamilyData*>& column_family_datas,
  5641. const ReadOptions& read_options, const WriteOptions& write_options,
  5642. const autovector<autovector<VersionEdit*>>& edit_lists,
  5643. InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
  5644. bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
  5645. const std::vector<std::function<void(const Status&)>>& manifest_wcbs,
  5646. const std::function<Status()>& pre_cb) {
  5647. mu->AssertHeld();
  5648. int num_edits = 0;
  5649. for (const auto& elist : edit_lists) {
  5650. num_edits += static_cast<int>(elist.size());
  5651. }
  5652. if (num_edits == 0) {
  5653. return Status::OK();
  5654. } else if (num_edits > 1) {
  5655. #ifndef NDEBUG
  5656. for (const auto& edit_list : edit_lists) {
  5657. for (const auto& edit : edit_list) {
  5658. assert(!edit->IsColumnFamilyManipulation());
  5659. assert(!edit->IsNoManifestWriteDummy());
  5660. }
  5661. }
  5662. #endif /* ! NDEBUG */
  5663. }
  5664. int num_cfds = static_cast<int>(column_family_datas.size());
  5665. if (num_cfds == 1 && column_family_datas[0] == nullptr) {
  5666. assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
  5667. assert(edit_lists[0][0]->IsColumnFamilyAdd());
  5668. assert(new_cf_options != nullptr);
  5669. }
  5670. std::deque<ManifestWriter> writers;
  5671. if (num_cfds > 0) {
  5672. assert(static_cast<size_t>(num_cfds) == edit_lists.size());
  5673. }
  5674. for (int i = 0; i < num_cfds; ++i) {
  5675. const auto wcb =
  5676. manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
  5677. writers.emplace_back(mu, column_family_datas[i], edit_lists[i], wcb);
  5678. manifest_writers_.push_back(&writers[i]);
  5679. }
  5680. assert(!writers.empty());
  5681. ManifestWriter& first_writer = writers.front();
  5682. TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
  5683. nullptr);
  5684. while (!first_writer.done && &first_writer != manifest_writers_.front()) {
  5685. first_writer.cv.Wait();
  5686. }
  5687. if (first_writer.done) {
  5688. // All non-CF-manipulation operations can be grouped together and committed
  5689. // to MANIFEST. They should all have finished. The status code is stored in
  5690. // the first manifest writer.
  5691. #ifndef NDEBUG
  5692. for (const auto& writer : writers) {
  5693. assert(writer.done);
  5694. }
  5695. TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
  5696. #endif /* !NDEBUG */
  5697. // FIXME: One MANIFEST write failure can cause all writes to SetBGError,
  5698. // should only SetBGError once.
  5699. return first_writer.status;
  5700. }
  5701. TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndNotDone", mu);
  5702. int num_undropped_cfds = 0;
  5703. for (auto cfd : column_family_datas) {
  5704. // if cfd == nullptr, it is a column family add.
  5705. if (cfd == nullptr || !cfd->IsDropped()) {
  5706. ++num_undropped_cfds;
  5707. }
  5708. }
  5709. Status s;
  5710. if (0 == num_undropped_cfds) {
  5711. s = Status::ColumnFamilyDropped();
  5712. }
  5713. // Call pre_cb once we know we have work to do and are scheduled as the
  5714. // exclusive manifest writer (and new Version appender)
  5715. if (s.ok() && pre_cb) {
  5716. s = pre_cb();
  5717. }
  5718. if (!s.ok()) {
  5719. // Revert manifest_writers_
  5720. for (int i = 0; i != num_cfds; ++i) {
  5721. manifest_writers_.pop_front();
  5722. }
  5723. // Notify new head of manifest write queue.
  5724. if (!manifest_writers_.empty()) {
  5725. manifest_writers_.front()->cv.Signal();
  5726. }
  5727. return s;
  5728. } else {
  5729. return ProcessManifestWrites(writers, mu, dir_contains_current_file,
  5730. new_descriptor_log, new_cf_options,
  5731. read_options, write_options);
  5732. }
  5733. }
  5734. void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
  5735. SequenceNumber* max_last_sequence) {
  5736. assert(max_last_sequence != nullptr);
  5737. assert(edit->IsColumnFamilyManipulation());
  5738. edit->SetNextFile(next_file_number_.load());
  5739. assert(!edit->HasLastSequence());
  5740. edit->SetLastSequence(*max_last_sequence);
  5741. if (edit->IsColumnFamilyDrop()) {
5742. // If we drop a column family, we have to make sure to save the max column
5743. // family, so that we don't reuse an existing ID.
  5744. edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
  5745. }
  5746. }
  5747. Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
  5748. VersionBuilder* builder, VersionEdit* edit,
  5749. SequenceNumber* max_last_sequence,
  5750. InstrumentedMutex* mu) {
  5751. #ifdef NDEBUG
  5752. (void)cfd;
  5753. #endif
  5754. mu->AssertHeld();
  5755. assert(!edit->IsColumnFamilyManipulation());
  5756. assert(max_last_sequence != nullptr);
  5757. if (edit->HasLogNumber()) {
  5758. assert(edit->GetLogNumber() >= cfd->GetLogNumber());
  5759. assert(edit->GetLogNumber() < next_file_number_.load());
  5760. }
  5761. if (!edit->HasPrevLogNumber()) {
  5762. edit->SetPrevLogNumber(prev_log_number_);
  5763. }
  5764. edit->SetNextFile(next_file_number_.load());
  5765. if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
  5766. *max_last_sequence = edit->GetLastSequence();
  5767. } else {
  5768. edit->SetLastSequence(*max_last_sequence);
  5769. }
5770. // The builder can be nullptr only if the edit is a WAL manipulation.
5771. // Because WAL edits do not need to be applied to versions,
5772. // we return Status::OK() in this case.
  5773. assert(builder || edit->IsWalManipulation());
  5774. return builder ? builder->Apply(edit) : Status::OK();
  5775. }
  5776. Status VersionSet::Recover(
  5777. const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
  5778. std::string* db_id, bool no_error_if_files_missing, bool is_retry,
  5779. Status* log_status) {
  5780. const ReadOptions read_options(Env::IOActivity::kDBOpen);
  5781. // Read "CURRENT" file, which contains a pointer to the current manifest
  5782. // file
  5783. std::string manifest_path;
  5784. Status s = GetCurrentManifestPath(dbname_, fs_.get(), is_retry,
  5785. &manifest_path, &manifest_file_number_);
  5786. if (!s.ok()) {
  5787. return s;
  5788. }
  5789. ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
  5790. manifest_path.c_str());
  5791. std::unique_ptr<SequentialFileReader> manifest_file_reader;
  5792. {
  5793. std::unique_ptr<FSSequentialFile> manifest_file;
  5794. s = fs_->NewSequentialFile(manifest_path,
  5795. fs_->OptimizeForManifestRead(file_options_),
  5796. &manifest_file, nullptr);
  5797. if (!s.ok()) {
  5798. return s;
  5799. }
  5800. manifest_file_reader.reset(new SequentialFileReader(
  5801. std::move(manifest_file), manifest_path,
  5802. db_options_->log_readahead_size, io_tracer_, db_options_->listeners,
  5803. /*rate_limiter=*/nullptr, is_retry));
  5804. }
  5805. TEST_SYNC_POINT("VersionSet::Recover:StartManifestRead");
  5806. uint64_t current_manifest_file_size = 0;
  5807. uint64_t log_number = 0;
  5808. {
  5809. VersionSet::LogReporter reporter;
  5810. Status log_read_status;
  5811. reporter.status = &log_read_status;
  5812. log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
  5813. true /* checksum */, 0 /* log_number */);
  5814. VersionEditHandler handler(
  5815. read_only, column_families, const_cast<VersionSet*>(this),
  5816. /*track_found_and_missing_files=*/false, no_error_if_files_missing,
  5817. io_tracer_, read_options, /*allow_incomplete_valid_version=*/false,
  5818. EpochNumberRequirement::kMightMissing);
  5819. handler.Iterate(reader, &log_read_status);
  5820. s = handler.status();
  5821. if (s.ok()) {
  5822. log_number = handler.GetVersionEditParams().GetLogNumber();
  5823. current_manifest_file_size = reader.GetReadOffset();
  5824. assert(current_manifest_file_size != 0);
  5825. handler.GetDbId(db_id);
  5826. }
  5827. if (s.ok()) {
  5828. RecoverEpochNumbers();
  5829. }
  5830. if (log_status) {
  5831. *log_status = log_read_status;
  5832. }
  5833. }
  5834. if (s.ok()) {
  5835. manifest_file_size_ = current_manifest_file_size;
  5836. ROCKS_LOG_INFO(
  5837. db_options_->info_log,
  5838. "Recovered from manifest file:%s succeeded,"
  5839. "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
  5840. ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
  5841. ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
  5842. ",min_log_number_to_keep is %" PRIu64 "\n",
  5843. manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
  5844. last_sequence_.load(), log_number, prev_log_number_,
  5845. column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
  5846. for (auto cfd : *column_family_set_) {
  5847. if (cfd->IsDropped()) {
  5848. continue;
  5849. }
  5850. ROCKS_LOG_INFO(db_options_->info_log,
  5851. "Column family [%s] (ID %" PRIu32
  5852. "), log number is %" PRIu64 "\n",
  5853. cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
  5854. }
  5855. }
  5856. return s;
  5857. }
  5858. namespace {
  5859. class ManifestPicker {
  5860. public:
  5861. explicit ManifestPicker(const std::string& dbname,
  5862. const std::vector<std::string>& files_in_dbname);
  5863. // REQUIRES Valid() == true
  5864. std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
  5865. bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
  5866. private:
  5867. const std::string& dbname_;
5868. // MANIFEST file name(s)
  5869. std::vector<std::string> manifest_files_;
  5870. std::vector<std::string>::const_iterator manifest_file_iter_;
  5871. };
  5872. ManifestPicker::ManifestPicker(const std::string& dbname,
  5873. const std::vector<std::string>& files_in_dbname)
  5874. : dbname_(dbname) {
  5875. // populate manifest files
  5876. assert(!files_in_dbname.empty());
  5877. for (const auto& fname : files_in_dbname) {
  5878. uint64_t file_num = 0;
  5879. FileType file_type;
  5880. bool parse_ok = ParseFileName(fname, &file_num, &file_type);
  5881. if (parse_ok && file_type == kDescriptorFile) {
  5882. manifest_files_.push_back(fname);
  5883. }
  5884. }
  5885. // seek to first manifest
  5886. std::sort(manifest_files_.begin(), manifest_files_.end(),
  5887. [](const std::string& lhs, const std::string& rhs) {
  5888. uint64_t num1 = 0;
  5889. uint64_t num2 = 0;
  5890. FileType type1;
  5891. FileType type2;
  5892. bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
  5893. bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
  5894. #ifndef NDEBUG
  5895. assert(parse_ok1);
  5896. assert(parse_ok2);
  5897. #else
  5898. (void)parse_ok1;
  5899. (void)parse_ok2;
  5900. #endif
  5901. return num1 > num2;
  5902. });
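// Candidate manifests are visited newest-first; e.g., MANIFEST-000012 is
// returned by GetNextManifest() before MANIFEST-000008.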
  5903. manifest_file_iter_ = manifest_files_.begin();
  5904. }
  5905. std::string ManifestPicker::GetNextManifest(uint64_t* number,
  5906. std::string* file_name) {
  5907. assert(Valid());
  5908. std::string ret;
  5909. if (manifest_file_iter_ != manifest_files_.end()) {
  5910. ret.assign(dbname_);
  5911. if (ret.back() != kFilePathSeparator) {
  5912. ret.push_back(kFilePathSeparator);
  5913. }
  5914. ret.append(*manifest_file_iter_);
  5915. if (number) {
  5916. FileType type;
  5917. bool parse = ParseFileName(*manifest_file_iter_, number, &type);
  5918. assert(type == kDescriptorFile);
  5919. #ifndef NDEBUG
  5920. assert(parse);
  5921. #else
  5922. (void)parse;
  5923. #endif
  5924. }
  5925. if (file_name) {
  5926. *file_name = *manifest_file_iter_;
  5927. }
  5928. ++manifest_file_iter_;
  5929. }
  5930. return ret;
  5931. }
  5932. } // anonymous namespace
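
// Best-effort recovery entry point: walks the MANIFEST files found in the db
// directory from newest to oldest (via ManifestPicker above) and stops at the
// first one that recovers successfully, calling Reset() between attempts.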
Status VersionSet::TryRecover(
    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
    const std::vector<std::string>& files_in_dbname, std::string* db_id,
    bool* has_missing_table_file) {
  ManifestPicker manifest_picker(dbname_, files_in_dbname);
  if (!manifest_picker.Valid()) {
    return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
  }
  Status s;
  std::string manifest_path =
      manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
  while (!manifest_path.empty()) {
    s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
                                  db_id, has_missing_table_file);
    if (s.ok() || !manifest_picker.Valid()) {
      break;
    }
    Reset();
    manifest_path =
        manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
  }
  return s;
}
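
// Attempts a point-in-time recovery from a single MANIFEST file. Unlike
// Recover(), this uses VersionEditHandlerPointInTime with
// allow_incomplete_valid_version=true, so a manifest whose tail references
// table files that are not present is tolerated; missing files are reported
// through *has_missing_table_file instead of failing the recovery outright.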
Status VersionSet::TryRecoverFromOneManifest(
    const std::string& manifest_path,
    const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
    std::string* db_id, bool* has_missing_table_file) {
  const ReadOptions read_options(Env::IOActivity::kDBOpen);
  ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
                 manifest_path.c_str());
  std::unique_ptr<SequentialFileReader> manifest_file_reader;
  Status s;
  {
    std::unique_ptr<FSSequentialFile> manifest_file;
    s = fs_->NewSequentialFile(manifest_path,
                               fs_->OptimizeForManifestRead(file_options_),
                               &manifest_file, nullptr);
    if (!s.ok()) {
      return s;
    }
    manifest_file_reader.reset(new SequentialFileReader(
        std::move(manifest_file), manifest_path,
        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
  }
  assert(s.ok());
  VersionSet::LogReporter reporter;
  reporter.status = &s;
  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
                     /*checksum=*/true, /*log_num=*/0);
  VersionEditHandlerPointInTime handler_pit(
      read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
      read_options, /*allow_incomplete_valid_version=*/true,
      EpochNumberRequirement::kMightMissing);
  handler_pit.Iterate(reader, &s);
  handler_pit.GetDbId(db_id);
  assert(nullptr != has_missing_table_file);
  *has_missing_table_file = handler_pit.HasMissingFiles();
  s = handler_pit.status();
  if (s.ok()) {
    RecoverEpochNumbers();
  }
  return s;
}
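
// Re-derives per-file epoch numbers for every live, initialized column family.
// This is invoked after recovery paths that may have read a MANIFEST written
// before epoch numbers existed (EpochNumberRequirement::kMightMissing above).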
void VersionSet::RecoverEpochNumbers() {
  for (auto cfd : *column_family_set_) {
    if (cfd->IsDropped()) {
      continue;
    }
    assert(cfd->initialized());
    cfd->RecoverEpochNumbers();
  }
}
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
                                      const std::string& dbname,
                                      FileSystem* fs) {
  // Read "CURRENT" file, which contains a pointer to the current manifest file
  std::string manifest_path;
  uint64_t manifest_file_number;
  Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false,
                                    &manifest_path, &manifest_file_number);
  if (!s.ok()) {
    return s;
  }
  return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
}
Status VersionSet::ListColumnFamiliesFromManifest(
    const std::string& manifest_path, FileSystem* fs,
    std::vector<std::string>* column_families) {
  // TODO: plumb Env::IOActivity, Env::IOPriority
  const ReadOptions read_options;
  std::unique_ptr<SequentialFileReader> file_reader;
  Status s;
  {
    std::unique_ptr<FSSequentialFile> file;
    // these are just for performance reasons, not correctness,
    // so we're fine using the defaults
    s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
    if (!s.ok()) {
      return s;
    }
    file_reader = std::make_unique<SequentialFileReader>(
        std::move(file), manifest_path, /*io_tracer=*/nullptr);
  }
  VersionSet::LogReporter reporter;
  reporter.status = &s;
  log::Reader reader(nullptr, std::move(file_reader), &reporter,
                     true /* checksum */, 0 /* log_number */);
  ListColumnFamiliesHandler handler(read_options);
  handler.Iterate(reader, &s);
  assert(column_families);
  column_families->clear();
  if (handler.status().ok()) {
    for (const auto& iter : handler.GetColumnFamilyNames()) {
      column_families->push_back(iter.second);
    }
  }
  return handler.status();
}
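
// Rewrites the default column family's LSM shape so it fits into `new_levels`
// levels: the single non-empty level at or above (new_levels - 1) is moved to
// the new last level, and the change is persisted via LogAndApply(). Fails if
// more than one of those levels holds files. Because it builds its own
// temporary VersionSet and Recover()s it, this is intended for offline use
// (e.g. the ldb reduce_levels tooling), not for a running DB.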
Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
                                        const Options* options,
                                        const FileOptions& file_options,
                                        int new_levels) {
  if (new_levels <= 1) {
    return Status::InvalidArgument(
        "Number of levels needs to be bigger than 1");
  }
  // TODO: plumb Env::IOActivity, Env::IOPriority
  const ReadOptions read_options;
  const WriteOptions write_options;
  ImmutableDBOptions db_options(*options);
  ColumnFamilyOptions cf_options(*options);
  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
                                        options->table_cache_numshardbits));
  WriteController wc(options->delayed_write_rate);
  WriteBufferManager wb(options->db_write_buffer_size);
  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
                      nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
                      /*db_id*/ "",
                      /*db_session_id*/ "", options->daily_offpeak_time_utc,
                      /*error_handler_*/ nullptr, /*unchanging=*/false);
  Status status;

  std::vector<ColumnFamilyDescriptor> dummy;
  ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
                                          ColumnFamilyOptions(*options));
  dummy.push_back(dummy_descriptor);
  status = versions.Recover(dummy);
  if (!status.ok()) {
    return status;
  }

  Version* current_version =
      versions.GetColumnFamilySet()->GetDefault()->current();
  auto* vstorage = current_version->storage_info();
  int current_levels = vstorage->num_levels();

  if (current_levels <= new_levels) {
    return Status::OK();
  }

  // Make sure there are files on only one level from
  // (new_levels-1) to (current_levels-1)
  int first_nonempty_level = -1;
  int first_nonempty_level_filenum = 0;
  for (int i = new_levels - 1; i < current_levels; i++) {
    int file_num = vstorage->NumLevelFiles(i);
    if (file_num != 0) {
      if (first_nonempty_level < 0) {
        first_nonempty_level = i;
        first_nonempty_level_filenum = file_num;
      } else {
        char msg[255];
        snprintf(msg, sizeof(msg),
                 "Found at least two levels containing files: "
                 "[%d:%d],[%d:%d].\n",
                 first_nonempty_level, first_nonempty_level_filenum, i,
                 file_num);
        return Status::InvalidArgument(msg);
      }
    }
  }

  // we need to allocate an array with the old number of levels size to
  // avoid SIGSEGV in WriteCurrentStateToManifest()
  // however, all levels bigger or equal to new_levels will be empty
  std::vector<FileMetaData*>* new_files_list =
      new std::vector<FileMetaData*>[current_levels];
  for (int i = 0; i < new_levels - 1; i++) {
    new_files_list[i] = vstorage->LevelFiles(i);
  }

  if (first_nonempty_level > 0) {
    auto& new_last_level = new_files_list[new_levels - 1];
    new_last_level = vstorage->LevelFiles(first_nonempty_level);
    for (size_t i = 0; i < new_last_level.size(); ++i) {
      const FileMetaData* const meta = new_last_level[i];
      assert(meta);
      const uint64_t file_number = meta->fd.GetNumber();
      vstorage->file_locations_[file_number] =
          VersionStorageInfo::FileLocation(new_levels - 1, i);
    }
  }

  delete[] vstorage->files_;
  vstorage->files_ = new_files_list;
  vstorage->num_levels_ = new_levels;
  vstorage->ResizeCompactCursors(new_levels);

  VersionEdit ve;
  InstrumentedMutex dummy_mutex;
  InstrumentedMutexLock l(&dummy_mutex);
  return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
                              read_options, write_options, &ve, &dummy_mutex,
                              nullptr, true);
}
// Get the checksum information including the checksum and checksum function
// name of all SST and blob files in VersionSet. Store the information in
// FileChecksumList which contains a map from file number to its checksum info.
// If the DB is not running, make sure to call VersionSet::Recover() to load
// the file metadata from the MANIFEST into VersionSet before calling this
// function.
Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
  // Clean the previously stored checksum information if any.
  Status s;
  if (checksum_list == nullptr) {
    s = Status::InvalidArgument("checksum_list is nullptr");
    return s;
  }
  checksum_list->reset();

  for (auto cfd : *column_family_set_) {
    assert(cfd);
    if (cfd->IsDropped() || !cfd->initialized()) {
      continue;
    }
    const auto* current = cfd->current();
    assert(current);
    const auto* vstorage = current->storage_info();
    assert(vstorage);

    /* SST files */
    for (int level = 0; level < cfd->NumberLevels(); level++) {
      const auto& level_files = vstorage->LevelFiles(level);
      for (const auto& file : level_files) {
        assert(file);
        s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
                                                 file->file_checksum,
                                                 file->file_checksum_func_name);
        if (!s.ok()) {
          return s;
        }
      }
    }

    /* Blob files */
    const auto& blob_files = vstorage->GetBlobFiles();
    for (const auto& meta : blob_files) {
      assert(meta);
      std::string checksum_value = meta->GetChecksumValue();
      std::string checksum_method = meta->GetChecksumMethod();
      assert(checksum_value.empty() == checksum_method.empty());
      if (meta->GetChecksumMethod().empty()) {
        checksum_value = kUnknownFileChecksum;
        checksum_method = kUnknownFileChecksumFuncName;
      }
      s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
                                               checksum_value, checksum_method);
      if (!s.ok()) {
        return s;
      }
    }
  }
  return s;
}
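
// Debugging aid: replays the MANIFEST at `dscname` through a
// DumpManifestHandler and prints its contents (optionally verbose, hex, or
// JSON). Column family descriptors supplied by the caller are matched by
// name; any column family found in the manifest but missing from `cf_descs`
// falls back to the provided `options`.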
Status VersionSet::DumpManifest(
    Options& options, std::string& dscname, bool verbose, bool hex, bool json,
    const std::vector<ColumnFamilyDescriptor>& cf_descs) {
  assert(options.env);
  // TODO: plumb Env::IOActivity, Env::IOPriority
  const ReadOptions read_options;

  std::vector<std::string> column_families;
  Status s = ListColumnFamiliesFromManifest(
      dscname, options.env->GetFileSystem().get(), &column_families);
  if (!s.ok()) {
    return s;
  }

  // Open the specified manifest file.
  std::unique_ptr<SequentialFileReader> file_reader;
  {
    std::unique_ptr<FSSequentialFile> file;
    const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
    s = fs->NewSequentialFile(
        dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr);
    if (!s.ok()) {
      return s;
    }
    file_reader = std::make_unique<SequentialFileReader>(
        std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
  }

  std::map<std::string, const ColumnFamilyDescriptor*> cf_name_to_desc;
  for (const auto& cf_desc : cf_descs) {
    cf_name_to_desc[cf_desc.name] = &cf_desc;
  }
  std::vector<ColumnFamilyDescriptor> final_cf_descs;
  for (const auto& cf : column_families) {
    const auto iter = cf_name_to_desc.find(cf);
    if (iter != cf_name_to_desc.cend()) {
      final_cf_descs.push_back(*iter->second);
    } else {
      final_cf_descs.emplace_back(cf, options);
    }
  }

  DumpManifestHandler handler(final_cf_descs, this, io_tracer_, read_options,
                              verbose, hex, json);
  {
    VersionSet::LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(nullptr, std::move(file_reader), &reporter,
                       true /* checksum */, 0 /* log_number */);
    handler.Iterate(reader, &s);
  }

  return handler.status();
}
void VersionSet::MarkFileNumberUsed(uint64_t number) {
  // only called during recovery and repair which are single threaded, so this
  // works because there can't be concurrent calls
  if (next_file_number_.load(std::memory_order_relaxed) <= number) {
    next_file_number_.store(number + 1, std::memory_order_relaxed);
  }
}

// Called only either from ::LogAndApply which is protected by mutex or during
// recovery which is single-threaded.
void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
  if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
    min_log_number_to_keep_.store(number, std::memory_order_relaxed);
  }
}
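
// Writes a full snapshot of the current in-memory state into a freshly
// created MANIFEST: the DB id (if configured), the tracked WAL additions, a
// rolled-over WAL deletion record, and, for each live column family, its
// metadata followed by one edit listing every SST and blob file it owns.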
Status VersionSet::WriteCurrentStateToManifest(
    const WriteOptions& write_options,
    const std::unordered_map<uint32_t, MutableCFState>& curr_state,
    const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) {
  // TODO: Break up into multiple records to reduce memory usage on recovery?

  // WARNING: This method doesn't hold a mutex!!
  // This is done without DB mutex lock held, but only within single-threaded
  // LogAndApply. Column family manipulations can only happen within LogAndApply
  // (the same single thread), so we're safe to iterate.

  assert(io_s.ok());
  if (db_options_->write_dbid_to_manifest) {
    VersionEdit edit_for_db_id;
    assert(!db_id_.empty());
    edit_for_db_id.SetDBId(db_id_);
    std::string db_id_record;
    if (!edit_for_db_id.EncodeTo(&db_id_record)) {
      return Status::Corruption("Unable to Encode VersionEdit:" +
                                edit_for_db_id.DebugString(true));
    }
    io_s = log->AddRecord(write_options, db_id_record);
    if (!io_s.ok()) {
      return io_s;
    }
  }

  // Save WALs.
  if (!wal_additions.GetWalAdditions().empty()) {
    TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal",
                             const_cast<VersionEdit*>(&wal_additions));
    std::string record;
    if (!wal_additions.EncodeTo(&record)) {
      return Status::Corruption("Unable to Encode VersionEdit: " +
                                wal_additions.DebugString(true));
    }
    io_s = log->AddRecord(write_options, record);
    if (!io_s.ok()) {
      return io_s;
    }
  }

  // New manifest should rollover the WAL deletion record from previous
  // manifest. Otherwise, when an addition record of a deleted WAL gets added
  // to this new manifest later (which can happen in, e.g., SyncWAL()), this
  // new manifest creates an illusion that such WAL hasn't been deleted.
  VersionEdit wal_deletions;
  wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
  std::string wal_deletions_record;
  if (!wal_deletions.EncodeTo(&wal_deletions_record)) {
    return Status::Corruption("Unable to Encode VersionEdit: " +
                              wal_deletions.DebugString(true));
  }
  io_s = log->AddRecord(write_options, wal_deletions_record);
  if (!io_s.ok()) {
    return io_s;
  }

  for (auto cfd : *column_family_set_) {
    assert(cfd);
    if (cfd->IsDropped()) {
      continue;
    }
    assert(cfd->initialized());
    {
      // Store column family info
      VersionEdit edit;
      if (cfd->GetID() != 0) {
        // default column family is always there,
        // no need to explicitly write it
        edit.AddColumnFamily(cfd->GetName());
        edit.SetColumnFamily(cfd->GetID());
      }
      edit.SetComparatorName(
          cfd->internal_comparator().user_comparator()->Name());
      edit.SetPersistUserDefinedTimestamps(
          cfd->ioptions().persist_user_defined_timestamps);
      std::string record;
      if (!edit.EncodeTo(&record)) {
        return Status::Corruption("Unable to Encode VersionEdit:" +
                                  edit.DebugString(true));
      }
      io_s = log->AddRecord(write_options, record);
      if (!io_s.ok()) {
        return io_s;
      }
    }

    {
      // Save files
      VersionEdit edit;
      edit.SetColumnFamily(cfd->GetID());

      const auto* current = cfd->current();
      assert(current);
      const auto* vstorage = current->storage_info();
      assert(vstorage);

      for (int level = 0; level < cfd->NumberLevels(); level++) {
        const auto& level_files = vstorage->LevelFiles(level);
        for (const auto& f : level_files) {
          assert(f);
          edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
                       f->fd.GetFileSize(), f->smallest, f->largest,
                       f->fd.smallest_seqno, f->fd.largest_seqno,
                       f->marked_for_compaction, f->temperature,
                       f->oldest_blob_file_number, f->oldest_ancester_time,
                       f->file_creation_time, f->epoch_number, f->file_checksum,
                       f->file_checksum_func_name, f->unique_id,
                       f->compensated_range_deletion_size, f->tail_size,
                       f->user_defined_timestamps_persisted);
        }
      }

      edit.SetCompactCursors(vstorage->GetCompactCursors());

      const auto& blob_files = vstorage->GetBlobFiles();
      for (const auto& meta : blob_files) {
        assert(meta);
        const uint64_t blob_file_number = meta->GetBlobFileNumber();
        edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
                         meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
                         meta->GetChecksumValue());
        if (meta->GetGarbageBlobCount() > 0) {
          edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(),
                                  meta->GetGarbageBlobBytes());
        }
      }

      const auto iter = curr_state.find(cfd->GetID());
      assert(iter != curr_state.end());
      uint64_t log_number = iter->second.log_number;
      edit.SetLogNumber(log_number);

      if (cfd->GetID() == 0) {
        // min_log_number_to_keep is for the whole db, not for specific column
        // family. So it does not need to be set for every column family, just
        // need to be set once. Since default CF can never be dropped, we set
        // the min_log to the default CF here.
        uint64_t min_log = min_log_number_to_keep();
        if (min_log != 0) {
          edit.SetMinLogNumberToKeep(min_log);
        }
      }

      const std::string& full_history_ts_low = iter->second.full_history_ts_low;
      if (!full_history_ts_low.empty()) {
        edit.SetFullHistoryTsLow(full_history_ts_low);
      }

      edit.SetLastSequence(descriptor_last_sequence_);

      const Comparator* ucmp = cfd->user_comparator();
      assert(ucmp);
      std::string record;
      if (!edit.EncodeTo(&record, ucmp->timestamp_size())) {
        return Status::Corruption("Unable to Encode VersionEdit:" +
                                  edit.DebugString(true));
      }
      io_s = log->AddRecord(write_options, record);
      if (!io_s.ok()) {
        return io_s;
      }
    }
  }
  return Status::OK();
}
// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
// function is called repeatedly with consecutive pairs of slices. For example
// if the slice list is [a, b, c, d] this function is called with arguments
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
// we avoid doing binary search for the keys b and c twice and instead somehow
// maintain state of where they first appear in the files.
uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
                                     const ReadOptions& read_options,
                                     Version* v, const Slice& start,
                                     const Slice& end, int start_level,
                                     int end_level, TableReaderCaller caller) {
  const auto& icmp = v->cfd_->internal_comparator();

  // pre-condition
  assert(icmp.Compare(start, end) <= 0);

  uint64_t total_full_size = 0;
  const auto* vstorage = v->storage_info();
  const int num_non_empty_levels = vstorage->num_non_empty_levels();
  end_level = (end_level == -1) ? num_non_empty_levels
                                : std::min(end_level, num_non_empty_levels);

  if (end_level <= start_level) {
    return 0;
  }

  // Outline of the optimization that uses options.files_size_error_margin.
  // When approximating the files total size that is used to store a keys range,
  // we first sum up the sizes of the files that fully fall into the range.
  // Then we sum up the sizes of all the files that may intersect with the range
  // (this includes all files in L0 as well). Then, if total_intersecting_size
  // is smaller than total_full_size * options.files_size_error_margin - we can
  // infer that the intersecting files have a sufficiently negligible
  // contribution to the total size, and we can approximate the storage required
  // for the keys in range as just half of the intersecting_files_size.
  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
  // approximation is limited to only ~10% of the total size of files that fully
  // fall into the keys range. In such case, this helps to avoid a costly
  // process of binary searching the intersecting files that is required only
  // for a more precise calculation of the total size.
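  //
  // Illustrative numbers: with files_size_error_margin = 0.1, if the files
  // that lie entirely inside the range add up to ~1 GB while the boundary
  // (intersecting) files at all levels add up to ~50 MB, then
  // 50 MB < 0.1 * 1 GB, so the per-file binary searches are skipped and
  // 25 MB (half of the boundary files' size) is added to the estimate instead.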
  autovector<FdWithKeyRange*, 32> first_files;
  autovector<FdWithKeyRange*, 16> last_files;

  // scan all the levels
  for (int level = start_level; level < end_level; ++level) {
    const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
    if (files_brief.num_files == 0) {
      // empty level, skip exploration
      continue;
    }

    if (level == 0) {
      // level 0 files are not in sorted order, we need to iterate through
      // the list to compute the total bytes that require scanning,
      // so handle the case explicitly (similarly to first_files case)
      for (size_t i = 0; i < files_brief.num_files; i++) {
        first_files.push_back(&files_brief.files[i]);
      }
      continue;
    }

    assert(level > 0);
    assert(files_brief.num_files > 0);

    // identify the file position for start key
    const int idx_start =
        FindFileInRange(icmp, files_brief, start, 0,
                        static_cast<uint32_t>(files_brief.num_files - 1));
    assert(static_cast<size_t>(idx_start) < files_brief.num_files);

    // identify the file position for end key
    int idx_end = idx_start;
    if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
      idx_end =
          FindFileInRange(icmp, files_brief, end, idx_start,
                          static_cast<uint32_t>(files_brief.num_files - 1));
    }
    assert(idx_end >= idx_start &&
           static_cast<size_t>(idx_end) < files_brief.num_files);

    // scan all files from the starting index to the ending index
    // (inferred from the sorted order)

    // first scan all the intermediate full files (excluding first and last)
    for (int i = idx_start + 1; i < idx_end; ++i) {
      uint64_t file_size = files_brief.files[i].fd.GetFileSize();
      // The entire file falls into the range, so we can just take its size.
      assert(file_size == ApproximateSize(read_options, v, files_brief.files[i],
                                          start, end, caller));
      total_full_size += file_size;
    }

    // save the first and the last files (which may be the same file), so we
    // can scan them later.
    first_files.push_back(&files_brief.files[idx_start]);
    if (idx_start != idx_end) {
      // we need to estimate size for both files, only if they are different
      last_files.push_back(&files_brief.files[idx_end]);
    }
  }

  // The sum of all file sizes that intersect the [start, end] keys range.
  uint64_t total_intersecting_size = 0;
  for (const auto* file_ptr : first_files) {
    total_intersecting_size += file_ptr->fd.GetFileSize();
  }
  for (const auto* file_ptr : last_files) {
    total_intersecting_size += file_ptr->fd.GetFileSize();
  }

  // Now scan all the first & last files at each level, and estimate their size.
  // If the total_intersecting_size is less than X% of the total_full_size - we
  // want to approximate the result in order to avoid the costly binary search
  // inside ApproximateSize. We use half of file size as an approximation below.
  const double margin = options.files_size_error_margin;
  if (margin > 0 && total_intersecting_size <
                        static_cast<uint64_t>(total_full_size * margin)) {
    total_full_size += total_intersecting_size / 2;
  } else {
    // Estimate for all the first files (might also be last files), at each
    // level
    for (const auto file_ptr : first_files) {
      total_full_size +=
          ApproximateSize(read_options, v, *file_ptr, start, end, caller);
    }

    // Estimate for all the last files, at each level
    for (const auto file_ptr : last_files) {
      // We could use ApproximateSize here, but calling ApproximateOffsetOf
      // directly is just more efficient.
      total_full_size +=
          ApproximateOffsetOf(read_options, v, *file_ptr, end, caller);
    }
  }

  return total_full_size;
}
uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options,
                                         Version* v, const FdWithKeyRange& f,
                                         const Slice& key,
                                         TableReaderCaller caller) {
  // pre-condition
  assert(v);

  const auto& icmp = v->cfd_->internal_comparator();

  uint64_t result = 0;
  if (icmp.Compare(f.largest_key, key) <= 0) {
    // Entire file is before "key", so just add the file size
    result = f.fd.GetFileSize();
  } else if (icmp.Compare(f.smallest_key, key) > 0) {
    // Entire file is after "key", so ignore
    result = 0;
  } else {
    // "key" falls in the range for this table. Add the
    // approximate offset of "key" within the table.
    TableCache* table_cache = v->cfd_->table_cache();
    const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
    if (table_cache != nullptr) {
      result = table_cache->ApproximateOffsetOf(
          read_options, key, *f.file_metadata, caller, icmp, cf_opts);
    }
  }
  return result;
}
uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options,
                                     Version* v, const FdWithKeyRange& f,
                                     const Slice& start, const Slice& end,
                                     TableReaderCaller caller) {
  // pre-condition
  assert(v);
  const auto& icmp = v->cfd_->internal_comparator();
  assert(icmp.Compare(start, end) <= 0);

  if (icmp.Compare(f.largest_key, start) <= 0 ||
      icmp.Compare(f.smallest_key, end) > 0) {
    // Entire file is before or after the start/end keys range
    return 0;
  }

  if (icmp.Compare(f.smallest_key, start) >= 0) {
    // Start of the range is before the file start - approximate by end offset
    return ApproximateOffsetOf(read_options, v, f, end, caller);
  }

  if (icmp.Compare(f.largest_key, end) < 0) {
    // End of the range is after the file end - approximate by subtracting
    // start offset from the file size
    uint64_t start_offset =
        ApproximateOffsetOf(read_options, v, f, start, caller);
    assert(f.fd.GetFileSize() >= start_offset);
    return f.fd.GetFileSize() - start_offset;
  }

  // The interval falls entirely in the range for this file.
  TableCache* table_cache = v->cfd_->table_cache();
  if (table_cache == nullptr) {
    return 0;
  }
  const MutableCFOptions& cf_opts = v->GetMutableCFOptions();
  return table_cache->ApproximateSize(read_options, start, end,
                                      *f.file_metadata, caller, icmp, cf_opts);
}
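
// Iterates over every Version (not just the current one) of every initialized
// column family and lets each Version drop the files it still references from
// the delete-candidate lists, so that only genuinely unreferenced files remain
// scheduled for deletion.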
void VersionSet::RemoveLiveFiles(
    std::vector<ObsoleteFileInfo>& sst_delete_candidates,
    std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
  assert(column_family_set_);
  for (auto cfd : *column_family_set_) {
    assert(cfd);
    if (!cfd->initialized()) {
      continue;
    }

    auto* current = cfd->current();
    bool found_current = false;

    Version* const dummy_versions = cfd->dummy_versions();
    assert(dummy_versions);

    for (Version* v = dummy_versions->next_; v != dummy_versions;
         v = v->next_) {
      v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
      if (v == current) {
        found_current = true;
      }
    }

    if (!found_current && current != nullptr) {
      // Should never happen unless it is a bug.
      assert(false);
      current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
    }
  }
}
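
// Collects the file numbers of all SST and blob files referenced by any
// Version of any initialized column family. A first pass only counts files so
// the output vectors can be reserved once before they are filled.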
void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
                              std::vector<uint64_t>* live_blob_files) const {
  assert(live_table_files);
  assert(live_blob_files);

  // pre-calculate space requirement
  size_t total_table_files = 0;
  size_t total_blob_files = 0;

  assert(column_family_set_);
  for (auto cfd : *column_family_set_) {
    assert(cfd);
    if (!cfd->initialized()) {
      continue;
    }

    Version* const dummy_versions = cfd->dummy_versions();
    assert(dummy_versions);

    for (Version* v = dummy_versions->next_; v != dummy_versions;
         v = v->next_) {
      assert(v);

      const auto* vstorage = v->storage_info();
      assert(vstorage);

      for (int level = 0; level < vstorage->num_levels(); ++level) {
        total_table_files += vstorage->LevelFiles(level).size();
      }

      total_blob_files += vstorage->GetBlobFiles().size();
    }
  }

  // just one time extension to the right size
  live_table_files->reserve(live_table_files->size() + total_table_files);
  live_blob_files->reserve(live_blob_files->size() + total_blob_files);

  assert(column_family_set_);
  for (auto cfd : *column_family_set_) {
    assert(cfd);
    if (!cfd->initialized()) {
      continue;
    }

    auto* current = cfd->current();
    bool found_current = false;

    Version* const dummy_versions = cfd->dummy_versions();
    assert(dummy_versions);

    for (Version* v = dummy_versions->next_; v != dummy_versions;
         v = v->next_) {
      v->AddLiveFiles(live_table_files, live_blob_files);
      if (v == current) {
        found_current = true;
      }
    }

    if (!found_current && current != nullptr) {
      // Should never happen unless it is a bug.
      assert(false);
      current->AddLiveFiles(live_table_files, live_blob_files);
    }
  }
}
InternalIterator* VersionSet::MakeInputIterator(
    const ReadOptions& read_options, const Compaction* c,
    RangeDelAggregator* range_del_agg,
    const FileOptions& file_options_compactions,
    const std::optional<const Slice>& start,
    const std::optional<const Slice>& end) {
  auto cfd = c->column_family_data();
  // Level-0 files have to be merged together. For other levels,
  // we will make a concatenating iterator per level.
  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
  const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
                                              c->num_input_levels() - 1
                                        : c->num_input_levels());
  InternalIterator** list = new InternalIterator*[space];
  // First item in the pair is a pointer to range tombstones.
  // Second item is a pointer to a member of a LevelIterator,
  // that will be initialized to where CompactionMergingIterator stores
  // pointer to its range tombstones. This is used by LevelIterator
  // to update pointer to range tombstones as it traverses different SST files.
  std::vector<std::pair<std::unique_ptr<TruncatedRangeDelIterator>,
                        std::unique_ptr<TruncatedRangeDelIterator>**>>
      range_tombstones;
  size_t num = 0;
  [[maybe_unused]] size_t num_input_files = 0;
  for (size_t which = 0; which < c->num_input_levels(); which++) {
    const LevelFilesBrief* flevel = c->input_levels(which);
    num_input_files += flevel->num_files;
    if (flevel->num_files != 0) {
      if (c->level(which) == 0) {
        for (size_t i = 0; i < flevel->num_files; i++) {
          const FileMetaData& fmd = *flevel->files[i].file_metadata;
          if (start.has_value() &&
              cfd->user_comparator()->CompareWithoutTimestamp(
                  *start, fmd.largest.user_key()) > 0) {
            continue;
          }
          // We should be able to filter out the case where the end key
          // equals the end boundary, since the end key is exclusive.
          // We try to be extra safe here.
          if (end.has_value() &&
              cfd->user_comparator()->CompareWithoutTimestamp(
                  *end, fmd.smallest.user_key()) < 0) {
            continue;
          }
          std::unique_ptr<TruncatedRangeDelIterator> range_tombstone_iter =
              nullptr;
          list[num++] = cfd->table_cache()->NewIterator(
              read_options, file_options_compactions,
              cfd->internal_comparator(), fmd, range_del_agg,
              c->mutable_cf_options(),
              /*table_reader_ptr=*/nullptr,
              /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
              /*arena=*/nullptr,
              /*skip_filters=*/false,
              /*level=*/static_cast<int>(c->level(which)),
              MaxFileSizeForL0MetaPin(c->mutable_cf_options()),
              /*smallest_compaction_key=*/nullptr,
              /*largest_compaction_key=*/nullptr,
              /*allow_unprepared_value=*/false,
              /*range_del_read_seqno=*/nullptr,
              /*range_del_iter=*/&range_tombstone_iter);
          range_tombstones.emplace_back(std::move(range_tombstone_iter),
                                        nullptr);
        }
      } else {
        // Create concatenating iterator for the files from this level
        std::unique_ptr<TruncatedRangeDelIterator>** tombstone_iter_ptr =
            nullptr;
        list[num++] = new LevelIterator(
            cfd->table_cache(), read_options, file_options_compactions,
            cfd->internal_comparator(), flevel, c->mutable_cf_options(),
            /*should_sample=*/false,
            /*no per level latency histogram=*/nullptr,
            TableReaderCaller::kCompaction, /*skip_filters=*/false,
            /*level=*/static_cast<int>(c->level(which)), range_del_agg,
            c->boundaries(which), false, &tombstone_iter_ptr);
        range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
      }
    }
  }
  TEST_SYNC_POINT_CALLBACK(
      "VersionSet::MakeInputIterator:NewCompactionMergingIterator",
      &num_input_files);
  assert(num <= space);
  InternalIterator* result = NewCompactionMergingIterator(
      &c->column_family_data()->internal_comparator(), list,
      static_cast<int>(num), range_tombstones, /*arena=*/nullptr,
      c->column_family_data()->internal_stats());
  delete[] list;
  return result;
}
Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
                                      FileMetaData** meta,
                                      ColumnFamilyData** cfd) {
  for (auto cfd_iter : *column_family_set_) {
    if (!cfd_iter->initialized()) {
      continue;
    }
    Version* version = cfd_iter->current();
    const auto* vstorage = version->storage_info();
    for (int level = 0; level < vstorage->num_levels(); level++) {
      for (const auto& file : vstorage->LevelFiles(level)) {
        if (file->fd.GetNumber() == number) {
          *meta = file;
          *filelevel = level;
          *cfd = cfd_iter;
          return Status::OK();
        }
      }
    }
  }
  return Status::NotFound("File not present in any level");
}
void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
  if (!metadata) {
    return;
  }
  assert(metadata);
  size_t count = 0;
  for (auto cfd : *column_family_set_) {
    if (cfd->IsDropped() || !cfd->initialized()) {
      continue;
    }
    for (int level = 0; level < cfd->NumberLevels(); level++) {
      count += cfd->current()->storage_info()->LevelFiles(level).size();
    }
  }
  metadata->reserve(count);
  for (auto cfd : *column_family_set_) {
    if (cfd->IsDropped() || !cfd->initialized()) {
      continue;
    }
    for (int level = 0; level < cfd->NumberLevels(); level++) {
      for (const auto& file :
           cfd->current()->storage_info()->LevelFiles(level)) {
        LiveFileMetaData filemetadata;
        filemetadata.column_family_name = cfd->GetName();
        uint32_t path_id = file->fd.GetPathId();
        if (path_id < cfd->ioptions().cf_paths.size()) {
          filemetadata.db_path = cfd->ioptions().cf_paths[path_id].path;
        } else {
          assert(!cfd->ioptions().cf_paths.empty());
          filemetadata.db_path = cfd->ioptions().cf_paths.back().path;
        }
        filemetadata.directory = filemetadata.db_path;
        const uint64_t file_number = file->fd.GetNumber();
        filemetadata.name = MakeTableFileName("", file_number);
        filemetadata.relative_filename = filemetadata.name.substr(1);
        filemetadata.file_number = file_number;
        filemetadata.level = level;
        filemetadata.size = file->fd.GetFileSize();
        filemetadata.smallestkey = file->smallest.user_key().ToString();
        filemetadata.largestkey = file->largest.user_key().ToString();
        filemetadata.smallest_seqno = file->fd.smallest_seqno;
        filemetadata.largest_seqno = file->fd.largest_seqno;
        filemetadata.num_reads_sampled =
            file->stats.num_reads_sampled.load(std::memory_order_relaxed);
        filemetadata.being_compacted = file->being_compacted;
        filemetadata.num_entries = file->num_entries;
        filemetadata.num_deletions = file->num_deletions;
        filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
        filemetadata.file_checksum = file->file_checksum;
        filemetadata.file_checksum_func_name = file->file_checksum_func_name;
        filemetadata.temperature = file->temperature;
        filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
        filemetadata.file_creation_time = file->TryGetFileCreationTime();
        filemetadata.epoch_number = file->epoch_number;
        metadata->push_back(filemetadata);
      }
    }
  }
}
void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
                                  std::vector<ObsoleteBlobFileInfo>* blob_files,
                                  std::vector<std::string>* manifest_filenames,
                                  uint64_t min_pending_output) {
  assert(files);
  assert(blob_files);
  assert(manifest_filenames);
  assert(files->empty());
  assert(blob_files->empty());
  assert(manifest_filenames->empty());

  std::vector<ObsoleteFileInfo> pending_files;
  for (auto& f : obsolete_files_) {
    if (f.metadata->fd.GetNumber() < min_pending_output) {
      files->emplace_back(std::move(f));
    } else {
      pending_files.emplace_back(std::move(f));
    }
  }
  obsolete_files_.swap(pending_files);

  std::vector<ObsoleteBlobFileInfo> pending_blob_files;
  for (auto& blob_file : obsolete_blob_files_) {
    if (blob_file.GetBlobFileNumber() < min_pending_output) {
      blob_files->emplace_back(std::move(blob_file));
    } else {
      pending_blob_files.emplace_back(std::move(blob_file));
    }
  }
  obsolete_blob_files_.swap(pending_blob_files);

  obsolete_manifests_.swap(*manifest_filenames);
}
uint64_t VersionSet::GetObsoleteSstFilesSize() const {
  uint64_t ret = 0;
  for (auto& f : obsolete_files_) {
    if (f.metadata != nullptr) {
      ret += f.metadata->fd.GetFileSize();
    }
  }
  return ret;
}
ColumnFamilyData* VersionSet::CreateColumnFamily(
    const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
    const VersionEdit* edit, bool read_only) {
  assert(edit->IsColumnFamilyAdd());
  // Unchanging LSM tree implies no writes to the CF
  assert(!unchanging_ || read_only);

  MutableCFOptions dummy_cf_options;
  Version* dummy_versions =
      new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_);
  // Ref() dummy version once so that later we can call Unref() to delete it
  // by avoiding calling "delete" explicitly (~Version is private)
  dummy_versions->Ref();
  auto new_cfd = column_family_set_->CreateColumnFamily(
      edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions,
      cf_options, read_only);

  Version* v = new Version(new_cfd, this, file_options_,
                           new_cfd->GetLatestMutableCFOptions(), io_tracer_,
                           current_version_number_++);

  constexpr bool update_stats = false;
  v->PrepareAppend(read_options, update_stats);
  AppendVersion(new_cfd, v);
  // GetLatestMutableCFOptions() is safe here without mutex since the
  // cfd is not available to client
  new_cfd->CreateNewMemtable(LastSequence());
  new_cfd->SetLogNumber(edit->GetLogNumber());
  return new_cfd;
}
uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
  uint64_t count = 0;
  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
    count++;
  }
  return count;
}
uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
  std::unordered_set<uint64_t> unique_files;
  uint64_t total_files_size = 0;
  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
    VersionStorageInfo* storage_info = v->storage_info();
    for (int level = 0; level < storage_info->num_levels_; level++) {
      for (const auto& file_meta : storage_info->LevelFiles(level)) {
        if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
            unique_files.end()) {
          unique_files.insert(file_meta->fd.packed_number_and_path_id);
          total_files_size += file_meta->fd.GetFileSize();
        }
      }
    }
  }
  return total_files_size;
}
uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
  std::unordered_set<uint64_t> unique_blob_files;
  uint64_t all_versions_blob_file_size = 0;
  for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
    // iterate all the versions
    const auto* vstorage = v->storage_info();
    assert(vstorage);
    const auto& blob_files = vstorage->GetBlobFiles();
    for (const auto& meta : blob_files) {
      assert(meta);
      const uint64_t blob_file_number = meta->GetBlobFileNumber();
      if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
        // find Blob file that has not been counted
        unique_blob_files.insert(blob_file_number);
        all_versions_blob_file_size += meta->GetBlobFileSize();
      }
    }
  }
  return all_versions_blob_file_size;
}
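
// Sanity-checks a single SST file against its MANIFEST entry: the on-disk
// size must match the recorded size, and, when
// verify_sst_unique_id_in_manifest is enabled, the table is opened through
// the table cache (FindTable) so that the unique id recorded in the manifest
// can be checked as part of opening the table.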
Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
                                      ColumnFamilyData* cfd,
                                      const std::string& fpath, int level,
                                      const FileMetaData& meta) {
  uint64_t fsize = 0;
  Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
  if (status.ok()) {
    if (fsize != meta.fd.GetFileSize()) {
      status = Status::Corruption("File size mismatch: " + fpath);
    }
  }
  if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
    assert(cfd);
    TableCache* table_cache = cfd->table_cache();
    assert(table_cache);

    const auto& cf_opts = cfd->GetLatestMutableCFOptions();
    size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(cf_opts);
    const FileOptions& file_opts = file_options();

    Version* version = cfd->current();
    assert(version);
    VersionStorageInfo& storage_info = version->storage_info_;
    const InternalKeyComparator* icmp = storage_info.InternalComparator();
    assert(icmp);

    InternalStats* internal_stats = cfd->internal_stats();

    TableCache::TypedHandle* handle = nullptr;
    FileMetaData meta_copy = meta;
    status = table_cache->FindTable(
        read_options, file_opts, *icmp, meta_copy, &handle, cf_opts,
        /*no_io=*/false, internal_stats->GetFileReadHist(level), false, level,
        /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
        meta_copy.temperature);
    if (handle) {
      table_cache->get_cache().Release(handle);
    }
  }
  return status;
}
ReactiveVersionSet::ReactiveVersionSet(
    const std::string& dbname, const ImmutableDBOptions* _db_options,
    const FileOptions& _file_options, Cache* table_cache,
    WriteBufferManager* write_buffer_manager, WriteController* write_controller,
    const std::shared_ptr<IOTracer>& io_tracer)
    : VersionSet(dbname, _db_options, _file_options, table_cache,
                 write_buffer_manager, write_controller,
                 /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
                 /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
                 /*error_handler=*/nullptr, /*unchanging=*/false) {}

ReactiveVersionSet::~ReactiveVersionSet() = default;
Status ReactiveVersionSet::Recover(
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
    std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
    std::unique_ptr<Status>* manifest_reader_status) {
  assert(manifest_reader != nullptr);
  assert(manifest_reporter != nullptr);
  assert(manifest_reader_status != nullptr);

  manifest_reader_status->reset(new Status());
  manifest_reporter->reset(new LogReporter());
  static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
      manifest_reader_status->get();
  Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
  if (!s.ok()) {
    return s;
  }
  log::Reader* reader = manifest_reader->get();
  assert(reader);

  manifest_tailer_.reset(new ManifestTailer(
      column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_,
      read_options_, EpochNumberRequirement::kMightMissing));

  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());

  s = manifest_tailer_->status();
  if (s.ok()) {
    RecoverEpochNumbers();
  }
  return s;
}
Status ReactiveVersionSet::ReadAndApply(
    InstrumentedMutex* mu,
    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
    Status* manifest_read_status,
    std::unordered_set<ColumnFamilyData*>* cfds_changed,
    std::vector<std::string>* files_to_delete) {
  assert(manifest_reader != nullptr);
  assert(cfds_changed != nullptr);
  mu->AssertHeld();

  Status s;
  log::Reader* reader = manifest_reader->get();
  assert(reader);
  s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
  if (!s.ok()) {
    return s;
  }
  manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
  s = manifest_tailer_->status();
  if (s.ok()) {
    *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
  }
  if (files_to_delete) {
    *files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles();
  }
  return s;
}
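
// Re-reads CURRENT and, if it now points at a different MANIFEST than the one
// being tailed, opens the new file and swaps in a fresh
// FragmentBufferedReader. Returns Status::TryAgain() when the primary has
// already switched manifests and deleted the old one before the secondary
// could open it.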
Status ReactiveVersionSet::MaybeSwitchManifest(
    log::Reader::Reporter* reporter,
    std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
  assert(manifest_reader != nullptr);
  Status s;
  std::string manifest_path;
  s = GetCurrentManifestPath(dbname_, fs_.get(), /*is_retry=*/false,
                             &manifest_path, &manifest_file_number_);
  if (!s.ok()) {
    return s;
  }
  std::unique_ptr<FSSequentialFile> manifest_file;
  if (manifest_reader->get() != nullptr &&
      manifest_reader->get()->file()->file_name() == manifest_path) {
    // CURRENT points to the same MANIFEST as before, no need to switch
    // MANIFEST.
    return s;
  }
  assert(nullptr == manifest_reader->get() ||
         manifest_reader->get()->file()->file_name() != manifest_path);
  s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
  if (s.IsNotFound()) {
    return Status::TryAgain(
        "The primary may have switched to a new MANIFEST and deleted the old "
        "one.");
  } else if (!s.ok()) {
    return s;
  }
  TEST_SYNC_POINT(
      "ReactiveVersionSet::MaybeSwitchManifest:"
      "AfterGetCurrentManifestPath:0");
  TEST_SYNC_POINT(
      "ReactiveVersionSet::MaybeSwitchManifest:"
      "AfterGetCurrentManifestPath:1");
  // The primary can also delete the MANIFEST while the secondary is reading
  // it. This is OK on POSIX. For other file systems, maybe create a hard link
  // to MANIFEST. The hard link should be cleaned up later by the secondary.
  s = fs_->NewSequentialFile(manifest_path,
                             fs_->OptimizeForManifestRead(file_options_),
                             &manifest_file, nullptr);
  std::unique_ptr<SequentialFileReader> manifest_file_reader;
  if (s.ok()) {
    manifest_file_reader.reset(new SequentialFileReader(
        std::move(manifest_file), manifest_path,
        db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
    manifest_reader->reset(new log::FragmentBufferedReader(
        nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
        0 /* log_number */));
    ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
                   manifest_path.c_str());
    if (manifest_tailer_) {
      manifest_tailer_->PrepareToReadNewManifest();
    }
  } else if (s.IsPathNotFound()) {
    // This can happen if the primary switches to a new MANIFEST after the
    // secondary reads the CURRENT file but before the secondary actually tries
    // to open the MANIFEST.
    s = Status::TryAgain(
        "The primary may have switched to a new MANIFEST and deleted the old "
        "one.");
  }
  return s;
}
#ifndef NDEBUG
uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
  assert(manifest_tailer_);
  return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
}
#endif  // !NDEBUG

std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
  assert(manifest_tailer_);
  return manifest_tailer_->GetReadBuffer().replay_buffer();
}

}  // namespace ROCKSDB_NAMESPACE