// db_bench_tool.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifdef GFLAGS
#ifdef NUMA
#include <numa.h>
#include <numaif.h>
#endif
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <atomic>
#include <cinttypes>
#include <condition_variable>
#include <cstddef>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>
#include "db/db_impl/db_impl.h"
#include "db/malloc_stats.h"
#include "db/version_set.h"
#include "hdfs/env_hdfs.h"
#include "monitoring/histogram.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/stats_history.h"
#include "rocksdb/utilities/object_registry.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/options_util.h"
#include "rocksdb/utilities/sim_cache.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "rocksdb/write_batch.h"
#include "test_util/testutil.h"
#include "test_util/transaction_test_util.h"
#include "util/cast_util.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/gflags_compat.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/stderr_logger.h"
#include "util/string_util.h"
#include "util/xxhash.h"
#include "utilities/blob_db/blob_db.h"
#include "utilities/merge_operators.h"
#include "utilities/merge_operators/bytesxor.h"
#include "utilities/merge_operators/sortlist.h"
#include "utilities/persistent_cache/block_cache_tier.h"
#ifdef OS_WIN
#include <io.h>  // open/close
#endif
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
using GFLAGS_NAMESPACE::RegisterFlagValidator;
using GFLAGS_NAMESPACE::SetUsageMessage;
DEFINE_string(
    benchmarks,
    "fillseq,"
    "fillseqdeterministic,"
    "fillsync,"
    "fillrandom,"
    "filluniquerandomdeterministic,"
    "overwrite,"
    "readrandom,"
    "newiterator,"
    "newiteratorwhilewriting,"
    "seekrandom,"
    "seekrandomwhilewriting,"
    "seekrandomwhilemerging,"
    "readseq,"
    "readreverse,"
    "compact,"
    "compactall,"
    "multireadrandom,"
    "mixgraph,"
    "readseq,"
    "readtorowcache,"
    "readtocache,"
    "readreverse,"
    "readwhilewriting,"
    "readwhilemerging,"
    "readwhilescanning,"
    "readrandomwriterandom,"
    "updaterandom,"
    "xorupdaterandom,"
    "randomwithverify,"
    "fill100K,"
    "crc32c,"
    "xxhash,"
    "compress,"
    "uncompress,"
    "acquireload,"
    "fillseekseq,"
    "randomtransaction,"
    "randomreplacekeys,"
    "timeseries,"
    "getmergeoperands",
    "Comma-separated list of operations to run in the specified"
    " order. Available benchmarks:\n"
    "\tfillseq -- write N values in sequential key"
    " order in async mode\n"
    "\tfillseqdeterministic -- write N values in the specified"
    " key order and keep the shape of the LSM tree\n"
    "\tfillrandom -- write N values in random key order in async"
    " mode\n"
    "\tfilluniquerandomdeterministic -- write N values in a random"
    " key order and keep the shape of the LSM tree\n"
    "\toverwrite -- overwrite N values in random key order in"
    " async mode\n"
    "\tfillsync -- write N/1000 values in random key order in "
    "sync mode\n"
    "\tfill100K -- write N/1000 100K values in random order in"
    " async mode\n"
    "\tdeleteseq -- delete N keys in sequential order\n"
    "\tdeleterandom -- delete N keys in random order\n"
    "\treadseq -- read N times sequentially\n"
    "\treadtocache -- 1 thread reading database sequentially\n"
    "\treadreverse -- read N times in reverse order\n"
    "\treadrandom -- read N times in random order\n"
    "\treadmissing -- read N missing keys in random order\n"
    "\treadwhilewriting -- 1 writer, N threads doing random "
    "reads\n"
    "\treadwhilemerging -- 1 merger, N threads doing random "
    "reads\n"
    "\treadwhilescanning -- 1 thread doing full table scan, "
    "N threads doing random reads\n"
    "\treadrandomwriterandom -- N threads doing random-read, "
    "random-write\n"
    "\tupdaterandom -- N threads doing read-modify-write for random "
    "keys\n"
    "\txorupdaterandom -- N threads doing read-XOR-write for "
    "random keys\n"
    "\tappendrandom -- N threads doing read-modify-write with "
    "growing values\n"
    "\tmergerandom -- same as updaterandom/appendrandom using merge"
    " operator. "
    "Must be used with merge_operator\n"
    "\treadrandommergerandom -- perform N random read-or-merge "
    "operations. Must be used with merge_operator\n"
    "\tnewiterator -- repeated iterator creation\n"
    "\tseekrandom -- N random seeks, call Next seek_nexts times "
    "per seek\n"
    "\tseekrandomwhilewriting -- seekrandom and 1 thread doing "
    "overwrite\n"
    "\tseekrandomwhilemerging -- seekrandom and 1 thread doing "
    "merge\n"
    "\tcrc32c -- repeated crc32c of 4K of data\n"
    "\txxhash -- repeated xxHash of 4K of data\n"
    "\tacquireload -- load N*1000 times\n"
    "\tfillseekseq -- write N values in sequential key, then read "
    "them by seeking to each key\n"
    "\trandomtransaction -- execute N random transactions and "
    "verify correctness\n"
    "\trandomreplacekeys -- randomly replaces N keys by deleting "
    "the old version and putting the new version\n\n"
    "\ttimeseries -- 1 writer generates time series data "
    "and multiple readers doing random reads on id\n\n"
    "Meta operations:\n"
  182. "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n"
  183. "\tcompactall -- Compact the entire DB\n"
  184. "\tstats -- Print DB stats\n"
  185. "\tresetstats -- Reset DB stats\n"
  186. "\tlevelstats -- Print the number of files and bytes per level\n"
  187. "\tsstables -- Print sstable info\n"
  188. "\theapprofile -- Dump a heap profile (if supported by this port)\n"
  189. "\treplay -- replay the trace file specified with trace_file\n"
  190. "\tgetmergeoperands -- Insert lots of merge records which are a list of "
  191. "sorted ints for a key and then compare performance of lookup for another "
  192. "key "
  193. "by doing a Get followed by binary searching in the large sorted list vs "
  194. "doing a GetMergeOperands and binary searching in the operands which are"
  195. "sorted sub-lists. The MergeOperator used is sortlist.h\n");
DEFINE_int64(num, 1000000, "Number of key/values to place in database");
DEFINE_int64(numdistinct, 1000,
             "Number of distinct keys to use. Used in RandomWithVerify to "
             "read/write on fewer keys so that gets are more likely to find "
             "the key and puts are more likely to update the same key");
DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
DEFINE_int32(
    num_hot_column_families, 0,
    "Number of Hot Column Families. If more than 0, only write to this "
    "number of column families. After finishing all the writes to them, "
    "create new set of column families and insert to them. Only used "
    "when num_column_families > 1.");
DEFINE_string(column_family_distribution, "",
              "Comma-separated list of percentages, where the ith element "
              "indicates the probability of an op using the ith column "
              "family. The number of elements must be "
              "`num_hot_column_families` if specified; otherwise, it must be "
              "`num_column_families`. The sum of elements must be 100. E.g., "
              "if `num_column_families=4`, and `num_hot_column_families=0`, "
              "a valid list could be \"10,20,30,40\".");
DEFINE_int64(reads, -1,
             "Number of read operations to do. "
             "If negative, do FLAGS_num reads.");
DEFINE_int64(deletes, -1,
             "Number of delete operations to do. "
             "If negative, do FLAGS_num deletions.");
DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality");
DEFINE_int64(seed, 0,
             "Seed base for random number generators. "
             "When 0 it is deterministic.");
DEFINE_int32(threads, 1, "Number of concurrent threads to run.");
DEFINE_int32(duration, 0,
             "Time in seconds for the random-ops tests to run."
             " When 0 then num & reads determine the test duration");
DEFINE_string(value_size_distribution_type, "fixed",
              "Value size distribution type: fixed, uniform, normal");
DEFINE_int32(value_size, 100, "Size of each value in fixed distribution");
static unsigned int value_size = 100;
DEFINE_int32(value_size_min, 100, "Min size of random value");
DEFINE_int32(value_size_max, 102400, "Max size of random value");
DEFINE_int32(seek_nexts, 0,
             "How many times to call Next() after Seek() in "
             "fillseekseq, seekrandom, seekrandomwhilewriting and "
             "seekrandomwhilemerging");
DEFINE_bool(reverse_iterator, false,
            "When true use Prev rather than Next for iterators that do "
            "Seek and then Next");
DEFINE_int64(max_scan_distance, 0,
             "Used to define iterate_upper_bound (or iterate_lower_bound "
             "if FLAGS_reverse_iterator is set to true) when value is "
             "nonzero");
DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
DEFINE_int64(batch_size, 1, "Batch size");
static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
  return true;
}
static bool ValidateUint32Range(const char* flagname, uint64_t value) {
  if (value > std::numeric_limits<uint32_t>::max()) {
    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
            (unsigned long)value);
    return false;
  }
  return true;
}
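// A minimal sketch of how a validator like ValidateUint32Range is attached
// to a flag (this mirrors the FLAGS_subcompactions registration further
// below; FLAGS_some_flag is a hypothetical placeholder):
//
//   static const bool some_flag_dummy __attribute__((__unused__)) =
//       RegisterFlagValidator(&FLAGS_some_flag, &ValidateUint32Range);
//
// gflags runs the validator whenever the flag is assigned, including from
// the command line, and rejects the new value when the validator returns
// false.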
DEFINE_int32(key_size, 16, "size of each key");
DEFINE_int32(num_multi_db, 0,
             "Number of DBs used in the benchmark. 0 means single DB.");
DEFINE_double(compression_ratio, 0.5,
              "Arrange to generate values that shrink"
              " to this fraction of their original size after compression");
DEFINE_double(read_random_exp_range, 0.0,
              "Read random's key will be generated using distribution of "
              "num * exp(-r) where r is uniform number from 0 to this value. "
              "The larger the number is, the more skewed the reads are. "
              "Only used in readrandom and multireadrandom benchmarks.");
DEFINE_bool(histogram, false, "Print histogram of operation timings");
DEFINE_bool(enable_numa, false,
            "Make operations aware of NUMA architecture and bind memory "
            "and cpus corresponding to nodes together. In NUMA, memory "
            "in the same node as a CPU is closer to it than memory in "
            "other nodes. Reads can be faster when the process is bound to "
            "the CPU and memory of the same node. Use \"$numactl --hardware\" "
            "to see the NUMA memory architecture.");
DEFINE_int64(db_write_buffer_size,
             ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
             "Number of bytes to buffer in all memtables before compacting");
DEFINE_bool(cost_write_buffer_to_cache, false,
            "The usage of memtable is charged to the block cache");
DEFINE_int64(write_buffer_size, ROCKSDB_NAMESPACE::Options().write_buffer_size,
             "Number of bytes to buffer in memtable before compacting");
DEFINE_int32(max_write_buffer_number,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number,
             "The number of in-memory memtables. Each memtable is of size"
             " write_buffer_size bytes.");
DEFINE_int32(min_write_buffer_number_to_merge,
             ROCKSDB_NAMESPACE::Options().min_write_buffer_number_to_merge,
  290. "The minimum number of write buffers that will be merged together"
  291. "before writing to storage. This is cheap because it is an"
  292. "in-memory merge. If this feature is not enabled, then all these"
  293. "write buffers are flushed to L0 as separate files and this "
  294. "increases read amplification because a get request has to check"
  295. " in all of these files. Also, an in-memory merge may result in"
  296. " writing less data to storage if there are duplicate records "
  297. " in each of these individual write buffers.");
DEFINE_int32(max_write_buffer_number_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
             "The total maximum number of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed. If this value is set to -1, "
             "'max_write_buffer_number' will be used.");
DEFINE_int64(max_write_buffer_size_to_maintain,
             ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
             "The total maximum size of write buffers to maintain in memory "
             "including copies of buffers that have already been flushed. "
             "Unlike max_write_buffer_number, this parameter does not affect "
             "flushing. This controls the minimum amount of write history "
             "that will be available in memory for conflict checking when "
             "Transactions are used. If this value is too low, some "
             "transactions may fail at commit time due to not being able to "
             "determine whether there were any write conflicts. Setting this "
             "value to 0 will cause write buffers to be freed immediately "
             "after they are flushed. If this value is set to -1, "
             "'max_write_buffer_number' will be used.");
DEFINE_int32(max_background_jobs,
             ROCKSDB_NAMESPACE::Options().max_background_jobs,
             "The maximum number of concurrent background jobs that can occur "
             "in parallel.");
DEFINE_int32(num_bottom_pri_threads, 0,
             "The number of threads in the bottom-priority thread pool (used "
             "by universal compaction only).");
DEFINE_int32(num_high_pri_threads, 0,
             "The number of threads in the high-priority thread pool.");
DEFINE_int32(num_low_pri_threads, 0,
             "The number of threads in the low-priority thread pool.");
DEFINE_int32(max_background_compactions,
             ROCKSDB_NAMESPACE::Options().max_background_compactions,
             "The maximum number of concurrent background compactions"
             " that can occur in parallel.");
DEFINE_int32(base_background_compactions, -1, "DEPRECATED");
DEFINE_uint64(subcompactions, 1,
              "Maximum number of subcompactions to divide L0-L1 compactions "
              "into.");
static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
DEFINE_int32(max_background_flushes,
             ROCKSDB_NAMESPACE::Options().max_background_flushes,
             "The maximum number of concurrent background flushes"
             " that can occur in parallel.");
static ROCKSDB_NAMESPACE::CompactionStyle FLAGS_compaction_style_e;
DEFINE_int32(compaction_style,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_style,
             "style of compaction: level-based, universal and fifo");
static ROCKSDB_NAMESPACE::CompactionPri FLAGS_compaction_pri_e;
DEFINE_int32(compaction_pri,
             (int32_t)ROCKSDB_NAMESPACE::Options().compaction_pri,
  359. "priority of files to compaction: by size or by data age");
DEFINE_int32(universal_size_ratio, 0,
             "Percentage flexibility while comparing file size"
             " (for universal compaction only).");
DEFINE_int32(universal_min_merge_width, 0,
             "The minimum number of files in a"
             " single compaction run (for universal compaction only).");
DEFINE_int32(universal_max_merge_width, 0,
             "The max number of files to compact"
             " in universal style compaction");
DEFINE_int32(universal_max_size_amplification_percent, 0,
             "The max size amplification for universal style compaction");
DEFINE_int32(universal_compression_size_percent, -1,
             "The percentage of the database to compress for universal "
             "compaction. -1 means compress everything.");
DEFINE_bool(universal_allow_trivial_move, false,
            "Allow trivial move in universal compaction.");
DEFINE_int64(cache_size, 8 << 20,  // 8MB
             "Number of bytes to use as a cache of uncompressed data");
DEFINE_int32(cache_numshardbits, 6,
             "Number of shards for the block cache"
             " is 2 ** cache_numshardbits. Negative means use default "
             "settings. This is applied only if FLAGS_cache_size is "
             "non-negative.");
DEFINE_double(cache_high_pri_pool_ratio, 0.0,
              "Ratio of block cache reserve for high pri blocks. "
              "If > 0.0, we also enable "
              "cache_index_and_filter_blocks_with_high_priority.");
DEFINE_bool(use_clock_cache, false,
            "Replace default LRU block cache with clock cache.");
DEFINE_int64(simcache_size, -1,
             "Number of bytes to use as a simcache of "
  388. "uncompressed data. Nagative value disables simcache.");
DEFINE_bool(cache_index_and_filter_blocks, false,
            "Cache index/filter blocks in block cache.");
DEFINE_bool(partition_index_and_filters, false,
            "Partition index and filter blocks.");
DEFINE_bool(partition_index, false, "Partition index blocks");
DEFINE_int64(metadata_block_size,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().metadata_block_size,
             "Max partition size when partitioning index/filters");
// The default reduces the overhead of checking the duration limit on flash.
// With an HDD, which offers much lower throughput, this number is better set
// to 1.
DEFINE_int32(ops_between_duration_checks, 1000,
             "Check duration limit every x ops");
DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
            "Pin index/filter blocks of L0 files in block cache.");
DEFINE_bool(
    pin_top_level_index_and_filter, false,
    "Pin top-level index of partitioned index/filter blocks in block cache.");
DEFINE_int32(block_size,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_size),
             "Number of bytes in a block.");
DEFINE_int32(format_version,
             static_cast<int32_t>(
                 ROCKSDB_NAMESPACE::BlockBasedTableOptions().format_version),
             "Format version of SST files.");
DEFINE_int32(
    block_restart_interval,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_restart_interval,
    "Number of keys between restart points "
    "for delta encoding of keys in data block.");
DEFINE_int32(
    index_block_restart_interval,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().index_block_restart_interval,
    "Number of keys between restart points "
    "for delta encoding of keys in index block.");
DEFINE_int32(
    read_amp_bytes_per_bit,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().read_amp_bytes_per_bit,
    "Number of bytes per bit to be used in block read-amp bitmap");
DEFINE_bool(
    enable_index_compression,
    ROCKSDB_NAMESPACE::BlockBasedTableOptions().enable_index_compression,
    "Compress the index block");
DEFINE_bool(block_align,
            ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
            "Align data blocks on page size");
DEFINE_bool(use_data_block_hash_index, false,
            "If true, use kDataBlockBinaryAndHash "
            "instead of kDataBlockBinarySearch. "
            "This is only valid when BlockBasedTable is used");
DEFINE_double(data_block_hash_table_util_ratio, 0.75,
              "util ratio for data block hash index table. "
              "This is only valid if use_data_block_hash_index is "
              "set to true");
DEFINE_int64(compressed_cache_size, -1,
             "Number of bytes to use as a cache of compressed data.");
DEFINE_int64(row_cache_size, 0,
             "Number of bytes to use as a cache of individual rows"
             " (0 = disabled).");
DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
             "Maximum number of files to keep open at the same time"
             " (use default if == 0)");
DEFINE_int32(file_opening_threads,
             ROCKSDB_NAMESPACE::Options().max_file_opening_threads,
  451. "If open_files is set to -1, this option set the number of "
  452. "threads that will be used to open files during DB::Open()");
DEFINE_bool(new_table_reader_for_compaction_inputs, true,
            "If true, uses a separate file handle for compaction inputs");
DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size");
DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
             "Maximum windows randomaccess buffer size");
DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
             "Maximum write buffer for Writable File");
DEFINE_int32(bloom_bits, -1,
             "Bloom filter bits per key. Negative means"
             " use default settings.");
DEFINE_double(memtable_bloom_size_ratio, 0,
              "Ratio of memtable size used for bloom filter. 0 means no "
              "bloom filter.");
DEFINE_bool(memtable_whole_key_filtering, false,
            "Try to use whole key bloom filter in memtables.");
DEFINE_bool(memtable_use_huge_page, false,
            "Try to use huge page in memtables.");
DEFINE_bool(use_existing_db, false,
            "If true, do not destroy the existing"
            " database. If you set this flag and also specify a benchmark "
            "that wants a fresh database, that benchmark will fail.");
DEFINE_bool(use_existing_keys, false,
            "If true, uses existing keys in the DB, "
            "rather than generating new ones. This involves some startup "
            "latency to load all keys into memory. It is supported for the "
            "same read/overwrite benchmarks as `-use_existing_db=true`, which "
            "must also be set for this flag to be enabled. When this flag is "
            "set, the value for `-num` will be ignored.");
DEFINE_bool(show_table_properties, false,
            "If true, then per-level table"
            " properties will be printed on every stats-interval when"
            " stats_interval is set and stats_per_interval is on.");
DEFINE_string(db, "", "Use the db with the following name.");
// Read cache flags
DEFINE_string(read_cache_path, "",
              "If not empty string, a read cache will be used in this path");
DEFINE_int64(read_cache_size, 4LL * 1024 * 1024 * 1024,
             "Maximum size of the read cache");
DEFINE_bool(read_cache_direct_write, true,
            "Whether to use Direct IO for writing to the read cache");
DEFINE_bool(read_cache_direct_read, true,
            "Whether to use Direct IO for reading from read cache");
DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter");
static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
  if (value >= 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname,
            value);
    return false;
  }
  return true;
}
DEFINE_bool(verify_checksum, true,
            "Verify checksum for every block read"
            " from storage");
DEFINE_bool(statistics, false, "Database statistics");
DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers,
             "stats level for statistics");
DEFINE_string(statistics_string, "", "Serialized statistics string");
static class std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
DEFINE_int64(writes, -1,
             "Number of write operations to do. If negative, do"
             " --num writes.");
DEFINE_bool(finish_after_writes, false,
            "Write thread terminates after all writes are finished");
DEFINE_bool(sync, false, "Sync all writes to disk");
DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
DEFINE_bool(disable_wal, false, "If true, do not write WAL for write.");
DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
DEFINE_string(truth_db, "/dev/shm/truth_db/dbbench",
              "Truth key/values used when using verify");
DEFINE_int32(num_levels, 7, "The total number of levels");
DEFINE_int64(target_file_size_base,
             ROCKSDB_NAMESPACE::Options().target_file_size_base,
             "Target file size at level-1");
DEFINE_int32(target_file_size_multiplier,
             ROCKSDB_NAMESPACE::Options().target_file_size_multiplier,
             "A multiplier to compute target level-N file size (N >= 2)");
DEFINE_uint64(max_bytes_for_level_base,
              ROCKSDB_NAMESPACE::Options().max_bytes_for_level_base,
              "Max bytes for level-1");
DEFINE_bool(level_compaction_dynamic_level_bytes, false,
            "Whether level size base is dynamic");
DEFINE_double(max_bytes_for_level_multiplier, 10,
              "A multiplier to compute max bytes for level-N (N >= 2)");
static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
DEFINE_string(max_bytes_for_level_multiplier_additional, "",
              "A vector that specifies additional fanout per level");
DEFINE_int32(level0_stop_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_stop_writes_trigger,
             "Number of files in level-0"
             " that will trigger put stop.");
DEFINE_int32(level0_slowdown_writes_trigger,
             ROCKSDB_NAMESPACE::Options().level0_slowdown_writes_trigger,
             "Number of files in level-0"
             " that will slow down writes.");
DEFINE_int32(level0_file_num_compaction_trigger,
             ROCKSDB_NAMESPACE::Options().level0_file_num_compaction_trigger,
             "Number of files in level-0"
             " when compactions start");
static bool ValidateInt32Percent(const char* flagname, int32_t value) {
  if (value <= 0 || value >= 100) {
    fprintf(stderr, "Invalid value for --%s: %d, 0 < pct < 100\n", flagname,
            value);
    return false;
  }
  return true;
}
DEFINE_int32(readwritepercent, 90,
             "Ratio of reads to reads/writes (expressed as a percentage) for "
             "the ReadRandomWriteRandom workload. The default value 90 means "
             "90% of all read/write operations are reads. In other words, 9 "
             "gets for every 1 put.");
DEFINE_int32(mergereadpercent, 70,
             "Ratio of merges to merges&reads (expressed as a percentage) "
             "for the ReadRandomMergeRandom workload. The default value 70 "
             "means 70% out of all read and merge operations are merges. In "
             "other words, 7 merges for every 3 gets.");
DEFINE_int32(deletepercent, 2,
             "Percentage of deletes out of reads/writes/deletes (used in "
             "RandomWithVerify only). RandomWithVerify calculates "
             "writepercent as (100 - FLAGS_readwritepercent - deletepercent), "
             "so deletepercent must be smaller than (100 - "
             "FLAGS_readwritepercent)");
DEFINE_bool(optimize_filters_for_hits, false,
            "Optimizes bloom filters for workloads where most lookups return "
            "a value. For now this doesn't create bloom filters for the max "
            "level of the LSM to reduce metadata that should fit in RAM.");
DEFINE_uint64(delete_obsolete_files_period_micros, 0,
              "Ignored. Left here for backward compatibility");

DEFINE_int64(writes_before_delete_range, 0,
             "Number of writes before DeleteRange is called regularly.");

DEFINE_int64(writes_per_range_tombstone, 0,
             "Number of writes between range tombstones");

DEFINE_int64(range_tombstone_width, 100, "Number of keys in tombstone's range");

DEFINE_int64(max_num_range_tombstones, 0,
             "Maximum number of range tombstones to insert.");

DEFINE_bool(expand_range_tombstones, false,
            "Expand range tombstone into sequential regular tombstones.");

#ifndef ROCKSDB_LITE
// Transactions Options
DEFINE_bool(optimistic_transaction_db, false,
            "Open an OptimisticTransactionDB instance. "
            "Required for randomtransaction benchmark.");
DEFINE_bool(transaction_db, false,
            "Open a TransactionDB instance. "
            "Required for randomtransaction benchmark.");

DEFINE_uint64(transaction_sets, 2,
              "Number of keys each transaction will "
              "modify (use in RandomTransaction only). Max: 9999");

DEFINE_bool(transaction_set_snapshot, false,
            "Setting to true will have each transaction call SetSnapshot()"
            " upon creation.");

DEFINE_int32(transaction_sleep, 0,
             "Max microseconds to sleep in between "
             "reading and writing a value (used in RandomTransaction only).");

DEFINE_uint64(transaction_lock_timeout, 100,
              "If using a transaction_db, specifies the lock wait timeout in"
              " milliseconds before failing a transaction waiting on a lock");

DEFINE_string(
    options_file, "",
    "The path to a RocksDB options file. If specified, then db_bench will "
    "run with the RocksDB options in the default column family of the "
    "specified options file. "
    "Note that with this setting, db_bench will ONLY accept the following "
    "RocksDB options related command-line arguments, all other arguments "
    "that are related to RocksDB options will be ignored:\n"
    "\t--use_existing_db\n"
    "\t--use_existing_keys\n"
    "\t--statistics\n"
    "\t--row_cache_size\n"
    "\t--row_cache_numshardbits\n"
    "\t--enable_io_prio\n"
    "\t--dump_malloc_stats\n"
    "\t--num_multi_db\n");

// FIFO Compaction Options
DEFINE_uint64(fifo_compaction_max_table_files_size_mb, 0,
              "The limit of total table file sizes to trigger FIFO compaction");

DEFINE_bool(fifo_compaction_allow_compaction, true,
            "Allow compaction in FIFO compaction.");

DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");

// Blob DB Options
DEFINE_bool(use_blob_db, false,
            "Open a BlobDB instance. "
            "Required for large value benchmark.");

DEFINE_bool(
    blob_db_enable_gc,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
    "Enable BlobDB garbage collection.");

DEFINE_double(
    blob_db_gc_cutoff,
    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
    "Cutoff ratio for BlobDB garbage collection.");

DEFINE_bool(blob_db_is_fifo,
            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
            "Enable FIFO eviction strategy in BlobDB.");

DEFINE_uint64(blob_db_max_db_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
              "Max size limit of the directory where blob files are stored.");

DEFINE_uint64(
    blob_db_max_ttl_range, 0,
    "TTL range to generate BlobDB data (in seconds). 0 means no TTL.");

DEFINE_uint64(blob_db_ttl_range_secs,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
              "TTL bucket size to use when creating blob files.");

DEFINE_uint64(blob_db_min_blob_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
              "Smallest blob to store in a file. Blobs smaller than this "
              "will be inlined with the key in the LSM tree.");

DEFINE_uint64(blob_db_bytes_per_sync,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
              "Bytes to sync blob file at.");

DEFINE_uint64(blob_db_file_size,
              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
              "Target size of each blob file.");

DEFINE_string(blob_db_compression_type, "snappy",
              "Algorithm to use to compress blob in blob file");
static enum ROCKSDB_NAMESPACE::CompressionType
    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;

// Secondary DB instance Options
DEFINE_bool(use_secondary_db, false,
            "Open a RocksDB secondary instance. A primary instance can be "
            "running in another db_bench process.");

DEFINE_string(secondary_path, "",
              "Path to a directory used by the secondary instance to store "
              "private files, e.g. info log.");

DEFINE_int32(secondary_update_interval, 5,
             "Secondary instance attempts to catch up with the primary every "
             "secondary_update_interval seconds.");
#endif  // ROCKSDB_LITE
DEFINE_bool(report_bg_io_stats, false,
            "Measure time spent on I/Os while in compactions.");
DEFINE_bool(use_stderr_info_logger, false,
            "Write info logs to stderr instead of to LOG file.");

DEFINE_string(trace_file, "", "Trace workload to a file.");
DEFINE_int32(trace_replay_fast_forward, 1,
             "Fast forward trace replay, must be >= 1.");
DEFINE_int32(block_cache_trace_sampling_frequency, 1,
             "Block cache trace sampling frequency, termed s. It uses spatial "
             "downsampling and samples accesses to one out of s blocks.");
DEFINE_int64(
    block_cache_trace_max_trace_file_size_in_bytes,
    uint64_t{64} * 1024 * 1024 * 1024,
    "The maximum block cache trace file size in bytes. Block cache accesses "
    "will not be logged if the trace file size exceeds this threshold. "
    "Default is 64 GB.");
DEFINE_string(block_cache_trace_file, "", "Block cache trace file path.");
DEFINE_int32(trace_replay_threads, 1,
             "The number of threads to replay, must be >= 1.");
static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
    const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "none"))
    return ROCKSDB_NAMESPACE::kNoCompression;
  else if (!strcasecmp(ctype, "snappy"))
    return ROCKSDB_NAMESPACE::kSnappyCompression;
  else if (!strcasecmp(ctype, "zlib"))
    return ROCKSDB_NAMESPACE::kZlibCompression;
  else if (!strcasecmp(ctype, "bzip2"))
    return ROCKSDB_NAMESPACE::kBZip2Compression;
  else if (!strcasecmp(ctype, "lz4"))
    return ROCKSDB_NAMESPACE::kLZ4Compression;
  else if (!strcasecmp(ctype, "lz4hc"))
    return ROCKSDB_NAMESPACE::kLZ4HCCompression;
  else if (!strcasecmp(ctype, "xpress"))
    return ROCKSDB_NAMESPACE::kXpressCompression;
  else if (!strcasecmp(ctype, "zstd"))
    return ROCKSDB_NAMESPACE::kZSTD;

  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
  return ROCKSDB_NAMESPACE::kSnappyCompression;  // default value
}

static std::string ColumnFamilyName(size_t i) {
  if (i == 0) {
    return ROCKSDB_NAMESPACE::kDefaultColumnFamilyName;
  } else {
    char name[100];
    snprintf(name, sizeof(name), "column_family_name_%06zu", i);
    return std::string(name);
  }
}
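// Illustration (not from the original source): ColumnFamilyName(0) yields
// kDefaultColumnFamilyName ("default"), while e.g. ColumnFamilyName(12)
// yields "column_family_name_000012" via the %06zu format above.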
DEFINE_string(compression_type, "snappy",
              "Algorithm to use to compress the database");
static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_compression_type_e =
    ROCKSDB_NAMESPACE::kSnappyCompression;
DEFINE_int64(sample_for_compression, 0,
             "Sample every N blocks for compression");
DEFINE_int32(compression_level, ROCKSDB_NAMESPACE::CompressionOptions().level,
             "Compression level. The meaning of this value is library-"
             "dependent. If unset, we try to use the default for the library "
             "specified in `--compression_type`");

DEFINE_int32(compression_max_dict_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().max_dict_bytes,
             "Maximum size of dictionary used to prime the compression "
             "library.");

DEFINE_int32(compression_zstd_max_train_bytes,
             ROCKSDB_NAMESPACE::CompressionOptions().zstd_max_train_bytes,
             "Maximum size of training data passed to zstd's dictionary "
             "trainer.");

DEFINE_int32(min_level_to_compress, -1,
             "If non-negative, compression starts from this level. Levels "
             "with number < min_level_to_compress are not compressed. "
             "Otherwise, apply compression_type to all levels.");
static bool ValidateTableCacheNumshardbits(const char* flagname,
                                           int32_t value) {
  if (0 >= value || value > 20) {
    fprintf(stderr, "Invalid value for --%s: %d, must be 0 < val <= 20\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_int32(table_cache_numshardbits, 4, "");
#ifndef ROCKSDB_LITE
DEFINE_string(env_uri, "",
              "URI for registry Env lookup. Mutually exclusive with --hdfs.");
#endif  // ROCKSDB_LITE
DEFINE_string(hdfs, "",
              "Name of hdfs environment. Mutually exclusive with --env_uri.");

static std::shared_ptr<ROCKSDB_NAMESPACE::Env> env_guard;

static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default();

DEFINE_int64(stats_interval, 0,
             "Stats are reported every N operations when this is greater than "
             "zero. When 0 the interval grows over time.");

DEFINE_int64(stats_interval_seconds, 0,
             "Report stats every N seconds. This overrides stats_interval "
             "when both are > 0.");

DEFINE_int32(stats_per_interval, 0,
             "Reports additional stats per interval when this is greater "
             "than 0.");
DEFINE_int64(report_interval_seconds, 0,
             "If greater than zero, it will write simple stats in CSV format "
             "to --report_file every N seconds");
DEFINE_string(report_file, "report.csv",
              "Filename where some simple stats are reported to (if "
              "--report_interval_seconds is bigger than 0)");
DEFINE_int32(thread_status_per_interval, 0,
             "Takes and reports a snapshot of the current status of each "
             "thread when this is greater than 0.");
DEFINE_int32(perf_level, ROCKSDB_NAMESPACE::PerfLevel::kDisable,
             "Level of perf collection");

static bool ValidateRateLimit(const char* flagname, double value) {
  const double EPSILON = 1e-10;
  if (value < -EPSILON) {
    fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
            flagname, value);
    return false;
  }
  return true;
}
DEFINE_double(soft_rate_limit, 0.0, "DEPRECATED");

DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");

DEFINE_uint64(soft_pending_compaction_bytes_limit, 64ull * 1024 * 1024 * 1024,
              "Slowdown writes if pending compaction bytes exceed this number");

DEFINE_uint64(hard_pending_compaction_bytes_limit, 128ull * 1024 * 1024 * 1024,
              "Stop writes if pending compaction bytes exceed this number");

DEFINE_uint64(delayed_write_rate, 8388608u,
              "Limited bytes allowed to DB when soft_rate_limit or "
              "level0_slowdown_writes_trigger triggers");

DEFINE_bool(enable_pipelined_write, true,
            "Allow WAL and memtable writes to be pipelined");
DEFINE_bool(unordered_write, false,
            "Enable the unordered write feature, which provides higher "
            "throughput but relaxes the guarantees around atomic reads and "
            "immutable snapshots");
DEFINE_bool(allow_concurrent_memtable_write, true,
            "Allow multi-writers to update mem tables in parallel.");

DEFINE_bool(inplace_update_support,
            ROCKSDB_NAMESPACE::Options().inplace_update_support,
            "Support in-place memtable update for smaller or same-size values");

DEFINE_uint64(inplace_update_num_locks,
              ROCKSDB_NAMESPACE::Options().inplace_update_num_locks,
              "Number of RW locks to protect in-place memtable updates");

DEFINE_bool(enable_write_thread_adaptive_yield, true,
            "Use a yielding spin loop for brief writer thread waits.");

DEFINE_uint64(
    write_thread_max_yield_usec, 100,
    "Maximum microseconds for enable_write_thread_adaptive_yield operation.");

DEFINE_uint64(write_thread_slow_yield_usec, 3,
              "The threshold at which a slow yield is considered a signal that "
              "other processes or threads want the core.");

DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
             "When hard_rate_limit is set then this is the max time a put will"
             " be stalled.");

DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");

DEFINE_bool(rate_limiter_auto_tuned, false,
            "Enable dynamic adjustment of rate limit according to demand for "
            "background I/O");

DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit");
DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000,
              "Interval at which the sine wave write_rate_limit is "
              "recalculated");
DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d");

DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d");
DEFINE_bool(rate_limit_bg_reads, false,
            "Use options.rate_limiter on compaction reads");

DEFINE_uint64(
    benchmark_write_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the writes going into RocksDB. "
    "This is the global rate in bytes/second.");

// the parameters of mix_graph
DEFINE_double(keyrange_dist_a, 0.0,
              "The parameter 'a' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_b, 0.0,
              "The parameter 'b' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_c, 0.0,
              "The parameter 'c' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_double(keyrange_dist_d, 0.0,
              "The parameter 'd' of prefix average access distribution "
              "f(x)=a*exp(b*x)+c*exp(d*x)");
DEFINE_int64(keyrange_num, 1,
             "The number of key ranges that are in the same prefix "
             "group, each prefix range will have its key access "
             "distribution");
DEFINE_double(key_dist_a, 0.0,
              "The parameter 'a' of key access distribution model f(x)=a*x^b");
DEFINE_double(key_dist_b, 0.0,
              "The parameter 'b' of key access distribution model f(x)=a*x^b");
DEFINE_double(value_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_k, 0.0,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(value_sigma, 0.0,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_theta, 0.0,
              "The parameter 'theta' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_k, 0.0,
              "The parameter 'k' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(iter_sigma, 0.0,
              "The parameter 'sigma' of Generalized Pareto Distribution "
              "f(x)=(1/sigma)*(1+k*(x-theta)/sigma)^-(1/k+1)");
DEFINE_double(mix_get_ratio, 1.0,
              "The ratio of Get queries of mix_graph workload");
DEFINE_double(mix_put_ratio, 0.0,
              "The ratio of Put queries of mix_graph workload");
DEFINE_double(mix_seek_ratio, 0.0,
              "The ratio of Seek queries of mix_graph workload");
DEFINE_int64(mix_max_scan_len, 10000, "The max scan length of Iterator");
DEFINE_int64(mix_ave_kv_size, 512,
             "The average key-value size of this workload");
DEFINE_int64(mix_max_value_size, 1024, "The max value size of this workload");
DEFINE_double(
    sine_mix_rate_noise, 0.0,
    "Add noise to the sine rate; the noise ratio is between 0.0 and 1.0");
DEFINE_bool(sine_mix_rate, false,
            "Enable the sine QPS control on the mix workload");
DEFINE_uint64(
    sine_mix_rate_interval_milliseconds, 10000,
    "Interval at which the sine wave read_rate_limit is recalculated");
DEFINE_int64(mix_accesses, -1,
             "The total query accesses of mix_graph workload");

DEFINE_uint64(
    benchmark_read_rate_limit, 0,
    "If non-zero, db_bench will rate-limit the reads from RocksDB. This "
    "is the global rate in ops/second.");

DEFINE_uint64(max_compaction_bytes,
              ROCKSDB_NAMESPACE::Options().max_compaction_bytes,
              "Max bytes allowed in one compaction");

#ifndef ROCKSDB_LITE
DEFINE_bool(readonly, false, "Run read only benchmarks.");

DEFINE_bool(print_malloc_stats, false,
            "Print malloc stats to stdout after benchmarks finish.");
#endif  // ROCKSDB_LITE

DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions");

DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds.");
DEFINE_uint64(wal_size_limit_MB, 0,
              "Set the size limit for the WAL Files in MB.");
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");

DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads,
            "Allow reads to occur via mmap-ing files");

DEFINE_bool(mmap_write, ROCKSDB_NAMESPACE::Options().allow_mmap_writes,
            "Allow writes to occur via mmap-ing files");

DEFINE_bool(use_direct_reads, ROCKSDB_NAMESPACE::Options().use_direct_reads,
            "Use O_DIRECT for reading data");

DEFINE_bool(use_direct_io_for_flush_and_compaction,
            ROCKSDB_NAMESPACE::Options().use_direct_io_for_flush_and_compaction,
            "Use O_DIRECT for background flush and compaction writes");

DEFINE_bool(advise_random_on_open,
            ROCKSDB_NAMESPACE::Options().advise_random_on_open,
            "Advise random access on table file open");

DEFINE_string(compaction_fadvice, "NORMAL",
              "Access pattern advice when a file is compacted");
static auto FLAGS_compaction_fadvice_e =
    ROCKSDB_NAMESPACE::Options().access_hint_on_compaction_start;

DEFINE_bool(use_tailing_iterator, false,
            "Use tailing iterator to access a series of keys instead of get");

DEFINE_bool(use_adaptive_mutex, ROCKSDB_NAMESPACE::Options().use_adaptive_mutex,
            "Use adaptive mutex");

DEFINE_uint64(bytes_per_sync, ROCKSDB_NAMESPACE::Options().bytes_per_sync,
              "Allows OS to incrementally sync SST files to disk while they"
              " are being written, in the background. Issue one request for"
              " every bytes_per_sync written. 0 turns it off.");

DEFINE_uint64(wal_bytes_per_sync,
              ROCKSDB_NAMESPACE::Options().wal_bytes_per_sync,
              "Allows OS to incrementally sync WAL files to disk while they"
              " are being written, in the background. Issue one request for"
              " every wal_bytes_per_sync written. 0 turns it off.");

DEFINE_bool(use_single_deletes, true,
            "Use single deletes (used in RandomReplaceKeys only).");

DEFINE_double(stddev, 2000.0,
              "Standard deviation of normal distribution used for picking keys"
              " (used in RandomReplaceKeys only).");

DEFINE_int32(key_id_range, 100000,
             "Range of possible value of key id (used in TimeSeries only).");
DEFINE_string(expire_style, "none",
              "Style to remove expired time entries. Can be one of the "
              "options below: none (do not expire data), compaction_filter "
              "(use a compaction filter to remove expired data), delete (seek "
              "IDs and remove expired data) (used in TimeSeries only).");
DEFINE_uint64(
    time_range, 100000,
    "Range of timestamps that are stored in the database (used in TimeSeries "
    "only).");

DEFINE_int32(num_deletion_threads, 1,
             "Number of threads to do deletion (used in TimeSeries and delete "
             "expire_style only).");

DEFINE_int32(max_successive_merges, 0,
             "Maximum number of successive merge operations on a key in the "
             "memtable");

static bool ValidatePrefixSize(const char* flagname, int32_t value) {
  if (value < 0 || value >= 2000000000) {
    fprintf(stderr,
            "Invalid value for --%s: %d. 0 <= PrefixSize <= 2000000000\n",
            flagname, value);
    return false;
  }
  return true;
}

DEFINE_int32(prefix_size, 0,
             "control the prefix size for HashSkipList and plain table");
DEFINE_int64(keys_per_prefix, 0,
             "control average number of keys generated per prefix, 0 means no "
             "special handling of the prefix, i.e. use the prefix that comes "
             "with the generated random number.");
DEFINE_bool(total_order_seek, false,
            "Enable total order seek regardless of index format.");
DEFINE_bool(prefix_same_as_start, false,
            "Enforce iterator to return keys with prefix same as seek key.");
DEFINE_bool(
    seek_missing_prefix, false,
    "Iterator seeks to keys with non-existent prefixes. Requires "
    "prefix_size > 8");
DEFINE_int32(memtable_insert_with_hint_prefix_size, 0,
             "If non-zero, enable "
             "memtable insert with hint with the given prefix size.");
DEFINE_bool(enable_io_prio, false,
            "Lower the background flush/compaction threads' IO priority");
DEFINE_bool(enable_cpu_prio, false,
            "Lower the background flush/compaction threads' CPU priority");
DEFINE_bool(identity_as_first_hash, false,
            "the first hash function of cuckoo table becomes an identity "
            "function. This is only valid when key is 8 bytes");
DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG ");
DEFINE_uint64(stats_dump_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
              "Gap between printing stats to log in seconds");
DEFINE_uint64(stats_persist_period_sec,
              ROCKSDB_NAMESPACE::Options().stats_persist_period_sec,
              "Gap between persisting stats in seconds");
DEFINE_bool(persist_stats_to_disk,
            ROCKSDB_NAMESPACE::Options().persist_stats_to_disk,
            "whether to persist stats to disk");
DEFINE_uint64(stats_history_buffer_size,
              ROCKSDB_NAMESPACE::Options().stats_history_buffer_size,
              "Max number of stats snapshots to keep in memory");
DEFINE_int64(multiread_stride, 0,
             "Stride length for the keys in a MultiGet batch");
DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");

enum RepFactory {
  kSkipList,
  kPrefixHash,
  kVectorRep,
  kHashLinkedList,
};

static enum RepFactory StringToRepFactory(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "skip_list"))
    return kSkipList;
  else if (!strcasecmp(ctype, "prefix_hash"))
    return kPrefixHash;
  else if (!strcasecmp(ctype, "vector"))
    return kVectorRep;
  else if (!strcasecmp(ctype, "hash_linkedlist"))
    return kHashLinkedList;
  1036. fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
  return kSkipList;
}

static enum RepFactory FLAGS_rep_factory;
DEFINE_string(memtablerep, "skip_list", "");
DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count");
DEFINE_bool(use_plain_table, false,
            "if use plain table instead of block-based table format");
DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format");
DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false,
            "if use kHashSearch instead of kBinarySearch. "
            "This is only valid when we use BlockTable");
DEFINE_bool(use_block_based_filter, false,
            "if use kBlockBasedFilter instead of kFullFilter for filter "
            "block. This is only valid when we use BlockTable");
DEFINE_string(merge_operator, "",
              "The merge operator to use with the database. "
              "If a new merge operator is specified, be sure to use a fresh "
              "database. The possible merge operators are defined in "
              "utilities/merge_operators.h");
DEFINE_int32(skip_list_lookahead, 0,
             "Used with skip_list memtablerep; try linear search first for "
             "this many steps from the previous position");
DEFINE_bool(report_file_operations, false,
            "if report number of file operations");
DEFINE_int32(readahead_size, 0, "Iterator readahead size");

static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);

static const bool FLAGS_hard_rate_limit_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);

static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

static const bool FLAGS_key_size_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

static const bool FLAGS_cache_numshardbits_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_cache_numshardbits,
                          &ValidateCacheNumshardbits);

static const bool FLAGS_readwritepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_readwritepercent, &ValidateInt32Percent);

DEFINE_int32(disable_seek_compaction, false,
             "Not used, left here for backwards compatibility");

static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) =
    RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent);

static const bool FLAGS_table_cache_numshardbits_dummy
    __attribute__((__unused__)) =
        RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
                              &ValidateTableCacheNumshardbits);

namespace ROCKSDB_NAMESPACE {

namespace {
struct ReportFileOpCounters {
  std::atomic<int> open_counter_;
  std::atomic<int> read_counter_;
  std::atomic<int> append_counter_;
  std::atomic<uint64_t> bytes_read_;
  std::atomic<uint64_t> bytes_written_;
};
// A special Env to record and report file operations in db_bench
class ReportFileOpEnv : public EnvWrapper {
 public:
  explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); }

  void reset() {
    counters_.open_counter_ = 0;
    counters_.read_counter_ = 0;
    counters_.append_counter_ = 0;
    counters_.bytes_read_ = 0;
    counters_.bytes_written_ = 0;
  }

  Status NewSequentialFile(const std::string& f,
                           std::unique_ptr<SequentialFile>* r,
                           const EnvOptions& soptions) override {
    class CountingFile : public SequentialFile {
     private:
      std::unique_ptr<SequentialFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<SequentialFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}

      Status Read(size_t n, Slice* result, char* scratch) override {
        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Read(n, result, scratch);
        counters_->bytes_read_.fetch_add(result->size(),
                                         std::memory_order_relaxed);
        return rv;
      }

      Status Skip(uint64_t n) override { return target_->Skip(n); }
    };

    Status s = target()->NewSequentialFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  Status NewRandomAccessFile(const std::string& f,
                             std::unique_ptr<RandomAccessFile>* r,
                             const EnvOptions& soptions) override {
    class CountingFile : public RandomAccessFile {
     private:
      std::unique_ptr<RandomAccessFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<RandomAccessFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}

      Status Read(uint64_t offset, size_t n, Slice* result,
                  char* scratch) const override {
        counters_->read_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Read(offset, n, result, scratch);
        counters_->bytes_read_.fetch_add(result->size(),
                                         std::memory_order_relaxed);
        return rv;
      }
    };

    Status s = target()->NewRandomAccessFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
                         const EnvOptions& soptions) override {
    class CountingFile : public WritableFile {
     private:
      std::unique_ptr<WritableFile> target_;
      ReportFileOpCounters* counters_;

     public:
      CountingFile(std::unique_ptr<WritableFile>&& target,
                   ReportFileOpCounters* counters)
          : target_(std::move(target)), counters_(counters) {}

      Status Append(const Slice& data) override {
        counters_->append_counter_.fetch_add(1, std::memory_order_relaxed);
        Status rv = target_->Append(data);
        counters_->bytes_written_.fetch_add(data.size(),
                                            std::memory_order_relaxed);
        return rv;
      }

      Status Truncate(uint64_t size) override {
        return target_->Truncate(size);
      }
      Status Close() override { return target_->Close(); }
      Status Flush() override { return target_->Flush(); }
      Status Sync() override { return target_->Sync(); }
    };

    Status s = target()->NewWritableFile(f, r, soptions);
    if (s.ok()) {
      counters()->open_counter_.fetch_add(1, std::memory_order_relaxed);
      r->reset(new CountingFile(std::move(*r), counters()));
    }
    return s;
  }

  // getter
  ReportFileOpCounters* counters() { return &counters_; }

 private:
  ReportFileOpCounters counters_;
};

}  // namespace

enum DistributionType : unsigned char {
  kFixed = 0,
  kUniform,
  kNormal
};

static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed;

static enum DistributionType StringToDistributionType(const char* ctype) {
  assert(ctype);

  if (!strcasecmp(ctype, "fixed"))
    return kFixed;
  else if (!strcasecmp(ctype, "uniform"))
    return kUniform;
  else if (!strcasecmp(ctype, "normal"))
    return kNormal;

  fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
  return kFixed;  // default value
}

class BaseDistribution {
 public:
  BaseDistribution(unsigned int min, unsigned int max)
      : min_value_size_(min), max_value_size_(max) {}
  virtual ~BaseDistribution() {}

  unsigned int Generate() {
    auto val = Get();
    if (NeedTruncate()) {
      val = std::max(min_value_size_, val);
      val = std::min(max_value_size_, val);
    }
    return val;
  }

 private:
  virtual unsigned int Get() = 0;
  virtual bool NeedTruncate() { return true; }
  unsigned int min_value_size_;
  unsigned int max_value_size_;
};

class FixedDistribution : public BaseDistribution {
 public:
  FixedDistribution(unsigned int size)
      : BaseDistribution(size, size), size_(size) {}

 private:
  virtual unsigned int Get() override { return size_; }
  virtual bool NeedTruncate() override { return false; }
  unsigned int size_;
};

class NormalDistribution : public BaseDistribution,
                           public std::normal_distribution<double> {
 public:
  NormalDistribution(unsigned int min, unsigned int max)
      : BaseDistribution(min, max),
        // 99.7% values within the range [min, max].
        std::normal_distribution<double>(
            (double)(min + max) / 2.0 /*mean*/,
            (double)(max - min) / 6.0 /*stddev*/),
        gen_(rd_()) {}

 private:
  virtual unsigned int Get() override {
    return static_cast<unsigned int>((*this)(gen_));
  }
  std::random_device rd_;
  std::mt19937 gen_;
};

class UniformDistribution
    : public BaseDistribution,
      public std::uniform_int_distribution<unsigned int> {
 public:
  UniformDistribution(unsigned int min, unsigned int max)
      : BaseDistribution(min, max),
        std::uniform_int_distribution<unsigned int>(min, max),
        gen_(rd_()) {}

 private:
  virtual unsigned int Get() override { return (*this)(gen_); }
  virtual bool NeedTruncate() override { return false; }
  std::random_device rd_;
  std::mt19937 gen_;
};

// Helper for quickly generating random data.
class RandomGenerator {
 private:
  std::string data_;
  unsigned int pos_;
  std::unique_ptr<BaseDistribution> dist_;

 public:
  RandomGenerator() {
    auto max_value_size = FLAGS_value_size_max;
    switch (FLAGS_value_size_distribution_type_e) {
      case kUniform:
        dist_.reset(new UniformDistribution(FLAGS_value_size_min,
                                            FLAGS_value_size_max));
        break;
      case kNormal:
        dist_.reset(new NormalDistribution(FLAGS_value_size_min,
                                           FLAGS_value_size_max));
        break;
      case kFixed:
      default:
        dist_.reset(new FixedDistribution(value_size));
        max_value_size = value_size;
    }
    // We use a limited amount of data over and over again and ensure
    // that it is larger than the compression window (32KB), and also
    // large enough to serve all typical value sizes we want to write.
    Random rnd(301);
    std::string piece;
    while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
      // Add a short fragment that is as compressible as specified
      // by FLAGS_compression_ratio.
      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
      data_.append(piece);
    }
    pos_ = 0;
  }

  Slice Generate(unsigned int len) {
    assert(len <= data_.size());
    if (pos_ + len > data_.size()) {
      pos_ = 0;
    }
    pos_ += len;
    return Slice(data_.data() + pos_ - len, len);
  }

  Slice Generate() {
    auto len = dist_->Generate();
    return Generate(len);
  }
};
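// Example usage (illustrative only, not part of the original file):
//   RandomGenerator gen;
//   Slice v1 = gen.Generate();     // length drawn from the value size dist
//   Slice v2 = gen.Generate(100);  // exactly 100 bytes
// Both slices point into the generator's internal buffer, which stays alive
// and unmodified for the generator's lifetime.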
static void AppendWithSpace(std::string* str, Slice msg) {
  if (msg.empty()) return;
  if (!str->empty()) {
    str->push_back(' ');
  }
  str->append(msg.data(), msg.size());
}

struct DBWithColumnFamilies {
  std::vector<ColumnFamilyHandle*> cfh;
  DB* db;
#ifndef ROCKSDB_LITE
  OptimisticTransactionDB* opt_txn_db;
#endif  // ROCKSDB_LITE
  std::atomic<size_t> num_created;  // Need to be updated after all the
                                    // new entries in cfh are set.
  size_t num_hot;  // Number of column families to be queried at each moment.
                   // After each CreateNewCf(), another num_hot number of new
                   // Column families will be created and used to be queried.
  port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
  std::vector<int> cfh_idx_to_prob;  // ith index holds probability of
                                     // operating on cfh[i].

  DBWithColumnFamilies()
      : db(nullptr)
#ifndef ROCKSDB_LITE
        ,
        opt_txn_db(nullptr)
#endif  // ROCKSDB_LITE
  {
    cfh.clear();
    num_created = 0;
    num_hot = 0;
  }

  DBWithColumnFamilies(const DBWithColumnFamilies& other)
      : cfh(other.cfh),
        db(other.db),
#ifndef ROCKSDB_LITE
        opt_txn_db(other.opt_txn_db),
#endif  // ROCKSDB_LITE
        num_created(other.num_created.load()),
        num_hot(other.num_hot),
        cfh_idx_to_prob(other.cfh_idx_to_prob) {
  }

  void DeleteDBs() {
    std::for_each(cfh.begin(), cfh.end(),
                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
    cfh.clear();
#ifndef ROCKSDB_LITE
    if (opt_txn_db) {
      delete opt_txn_db;
      opt_txn_db = nullptr;
    } else {
      delete db;
      db = nullptr;
    }
#else
    delete db;
    db = nullptr;
#endif  // ROCKSDB_LITE
  }

  ColumnFamilyHandle* GetCfh(int64_t rand_num) {
    assert(num_hot > 0);
    size_t rand_offset = 0;
    if (!cfh_idx_to_prob.empty()) {
      assert(cfh_idx_to_prob.size() == num_hot);
      int sum = 0;
      while (sum + cfh_idx_to_prob[rand_offset] < rand_num % 100) {
        sum += cfh_idx_to_prob[rand_offset];
        ++rand_offset;
      }
      assert(rand_offset < cfh_idx_to_prob.size());
    } else {
      rand_offset = rand_num % num_hot;
    }
    return cfh[num_created.load(std::memory_order_acquire) - num_hot +
               rand_offset];
  }
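  // Example (illustrative only): with num_hot == 2 and
  // cfh_idx_to_prob == {70, 30}, GetCfh() walks the cumulative distribution
  // of rand_num % 100, so roughly 70% of calls land on the first hot column
  // family and the remaining ~30% on the second.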
  // stage: assume CFs from 0 to stage * num_hot have been created. Need to
  // create CFs from stage * num_hot + 1 through (stage + 1) * num_hot.
  void CreateNewCf(ColumnFamilyOptions options, int64_t stage) {
    MutexLock l(&create_cf_mutex);
    if ((stage + 1) * num_hot <= num_created) {
      // Already created.
      return;
    }
    auto new_num_created = num_created + num_hot;
    assert(new_num_created <= cfh.size());
    for (size_t i = num_created; i < new_num_created; i++) {
      Status s =
          db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i]));
      if (!s.ok()) {
        fprintf(stderr, "create column family error: %s\n",
                s.ToString().c_str());
        abort();
      }
    }
    num_created.store(new_num_created, std::memory_order_release);
  }
};
// a class that reports stats to CSV file
class ReporterAgent {
 public:
  ReporterAgent(Env* env, const std::string& fname,
                uint64_t report_interval_secs)
      : env_(env),
        total_ops_done_(0),
        last_report_(0),
        report_interval_secs_(report_interval_secs),
        stop_(false) {
    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
    if (s.ok()) {
      s = report_file_->Append(Header() + "\n");
    }
    if (s.ok()) {
      s = report_file_->Flush();
    }
    if (!s.ok()) {
      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
              s.ToString().c_str());
      abort();
    }

    reporting_thread_ = port::Thread([&]() { SleepAndReport(); });
  }

  ~ReporterAgent() {
    {
      std::unique_lock<std::mutex> lk(mutex_);
      stop_ = true;
      stop_cv_.notify_all();
    }
    reporting_thread_.join();
  }

  // thread safe
  void ReportFinishedOps(int64_t num_ops) {
    total_ops_done_.fetch_add(num_ops);
  }

 private:
  std::string Header() const { return "secs_elapsed,interval_qps"; }

  void SleepAndReport() {
    auto time_started = env_->NowMicros();
    while (true) {
      {
        std::unique_lock<std::mutex> lk(mutex_);
        if (stop_ ||
            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
                              [&]() { return stop_; })) {
          // stopping
          break;
        }
        // else -> timeout, which means time for a report!
      }
      auto total_ops_done_snapshot = total_ops_done_.load();
      // round the seconds elapsed
      auto secs_elapsed =
          (env_->NowMicros() - time_started + kMicrosInSecond / 2) /
          kMicrosInSecond;
      std::string report = ToString(secs_elapsed) + "," +
                           ToString(total_ops_done_snapshot - last_report_) +
                           "\n";
      auto s = report_file_->Append(report);
      if (s.ok()) {
        s = report_file_->Flush();
      }
      if (!s.ok()) {
        fprintf(stderr,
                "Can't write to report file (%s), stopping the reporting\n",
                s.ToString().c_str());
        break;
      }
      last_report_ = total_ops_done_snapshot;
    }
  }

  Env* env_;
  std::unique_ptr<WritableFile> report_file_;
  std::atomic<int64_t> total_ops_done_;
  int64_t last_report_;
  const uint64_t report_interval_secs_;
  ROCKSDB_NAMESPACE::port::Thread reporting_thread_;
  std::mutex mutex_;
  // will notify on stop
  std::condition_variable stop_cv_;
  bool stop_;
};
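// Illustrative output (values invented for the example): the report file
// written by ReporterAgent is a two-column CSV, one row per interval, where
// the second column is the number of ops finished since the previous row:
//   secs_elapsed,interval_qps
//   10,1038110
//   20,983240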
enum OperationType : unsigned char {
  kRead = 0,
  kWrite,
  kDelete,
  kSeek,
  kMerge,
  kUpdate,
  kCompress,
  kUncompress,
  kCrc,
  kHash,
  kOthers
};

static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
    OperationTypeString = {
        {kRead, "read"},
        {kWrite, "write"},
        {kDelete, "delete"},
        {kSeek, "seek"},
        {kMerge, "merge"},
        {kUpdate, "update"},
        {kCompress, "compress"},
  1530. {kCompress, "uncompress"},
  1531. {kCrc, "crc"},
  1532. {kHash, "hash"},
  1533. {kOthers, "op"}
  1534. };
  1535. class CombinedStats;
  1536. class Stats {
  1537. private:
  1538. int id_;
  1539. uint64_t start_;
  1540. uint64_t sine_interval_;
  1541. uint64_t finish_;
  1542. double seconds_;
  1543. uint64_t done_;
  1544. uint64_t last_report_done_;
  1545. uint64_t next_report_;
  1546. uint64_t bytes_;
  1547. uint64_t last_op_finish_;
  1548. uint64_t last_report_finish_;
  1549. std::unordered_map<OperationType, std::shared_ptr<HistogramImpl>,
  1550. std::hash<unsigned char>> hist_;
  1551. std::string message_;
  1552. bool exclude_from_merge_;
  ReporterAgent* reporter_agent_ = nullptr;  // does not own
  friend class CombinedStats;

 public:
  Stats() { Start(-1); }

  void SetReporterAgent(ReporterAgent* reporter_agent) {
    reporter_agent_ = reporter_agent;
  }

  void Start(int id) {
    id_ = id;
    next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
    last_op_finish_ = start_;
    hist_.clear();
    done_ = 0;
    last_report_done_ = 0;
    bytes_ = 0;
    seconds_ = 0;
    start_ = FLAGS_env->NowMicros();
    sine_interval_ = FLAGS_env->NowMicros();
    finish_ = start_;
    last_report_finish_ = start_;
    message_.clear();
    // When set, stats from this thread won't be merged with others.
    exclude_from_merge_ = false;
  }

  void Merge(const Stats& other) {
    if (other.exclude_from_merge_)
      return;

    for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) {
      auto this_it = hist_.find(it->first);
      if (this_it != hist_.end()) {
        this_it->second->Merge(*(other.hist_.at(it->first)));
      } else {
        hist_.insert({ it->first, it->second });
      }
    }

    done_ += other.done_;
    bytes_ += other.bytes_;
    seconds_ += other.seconds_;
    if (other.start_ < start_) start_ = other.start_;
    if (other.finish_ > finish_) finish_ = other.finish_;

    // Just keep the messages from one thread
    if (message_.empty()) message_ = other.message_;
  }

  void Stop() {
    finish_ = FLAGS_env->NowMicros();
    seconds_ = (finish_ - start_) * 1e-6;
  }

  void AddMessage(Slice msg) {
    AppendWithSpace(&message_, msg);
  }

  void SetId(int id) { id_ = id; }
  void SetExcludeFromMerge() { exclude_from_merge_ = true; }

  void PrintThreadStatus() {
    std::vector<ThreadStatus> thread_list;
    FLAGS_env->GetThreadList(&thread_list);

    fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n",
            "ThreadID", "ThreadType", "cfName", "Operation",
            "ElapsedTime", "Stage", "State", "OperationProperties");

    int64_t current_time = 0;
    FLAGS_env->GetCurrentTime(&current_time);
    for (auto ts : thread_list) {
      fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s",
              ts.thread_id,
              ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(),
              ts.cf_name.c_str(),
              ThreadStatus::GetOperationName(ts.operation_type).c_str(),
              ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(),
              ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(),
              ThreadStatus::GetStateName(ts.state_type).c_str());

      auto op_properties = ThreadStatus::InterpretOperationProperties(
          ts.operation_type, ts.op_properties);
      for (const auto& op_prop : op_properties) {
        fprintf(stderr, " %s %" PRIu64 " |",
                op_prop.first.c_str(), op_prop.second);
      }
      fprintf(stderr, "\n");
    }
  }

  void ResetSineInterval() {
    sine_interval_ = FLAGS_env->NowMicros();
  }

  uint64_t GetSineInterval() {
    return sine_interval_;
  }

  uint64_t GetStart() {
    return start_;
  }

  void ResetLastOpTime() {
    // Set to now to avoid latency from calls to SleepForMicroseconds
    last_op_finish_ = FLAGS_env->NowMicros();
  }

  void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops,
                   enum OperationType op_type = kOthers) {
    if (reporter_agent_) {
      reporter_agent_->ReportFinishedOps(num_ops);
    }
    if (FLAGS_histogram) {
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t micros = now - last_op_finish_;

      if (hist_.find(op_type) == hist_.end()) {
        auto hist_temp = std::make_shared<HistogramImpl>();
        hist_.insert({op_type, std::move(hist_temp)});
      }
      hist_[op_type]->Add(micros);

      if (micros > 20000 && !FLAGS_stats_interval) {
        fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, "");
        fflush(stderr);
      }
      last_op_finish_ = now;
    }

    done_ += num_ops;
    if (done_ >= next_report_) {
      if (!FLAGS_stats_interval) {
        if (next_report_ < 1000) next_report_ += 100;
        else if (next_report_ < 5000) next_report_ += 500;
        else if (next_report_ < 10000) next_report_ += 1000;
        else if (next_report_ < 50000) next_report_ += 5000;
        else if (next_report_ < 100000) next_report_ += 10000;
        else if (next_report_ < 500000) next_report_ += 50000;
        else next_report_ += 100000;
        fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, "");
      } else {
        uint64_t now = FLAGS_env->NowMicros();
        int64_t usecs_since_last = now - last_report_finish_;

        // Determine whether to print status where interval is either
        // each N operations or each N seconds.

        if (FLAGS_stats_interval_seconds &&
            usecs_since_last < (FLAGS_stats_interval_seconds * 1000000)) {
          // Don't check again for this many operations
          next_report_ += FLAGS_stats_interval;
        } else {
          fprintf(stderr,
                  "%s ... thread %d: (%" PRIu64 ",%" PRIu64 ") ops and "
                  "(%.1f,%.1f) ops/second in (%.6f,%.6f) seconds\n",
                  FLAGS_env->TimeToString(now / 1000000).c_str(),
                  id_,
                  done_ - last_report_done_, done_,
                  (done_ - last_report_done_) /
                      (usecs_since_last / 1000000.0),
                  done_ / ((now - start_) / 1000000.0),
                  (now - last_report_finish_) / 1000000.0,
                  (now - start_) / 1000000.0);

          if (id_ == 0 && FLAGS_stats_per_interval) {
            std::string stats;

            if (db_with_cfh && db_with_cfh->num_created.load()) {
              for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) {
                if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                    &stats))
                  fprintf(stderr, "%s\n", stats.c_str());
                if (FLAGS_show_table_properties) {
                  for (int level = 0; level < FLAGS_num_levels; ++level) {
                    if (db->GetProperty(
                            db_with_cfh->cfh[i],
                            "rocksdb.aggregated-table-properties-at-level" +
                                ToString(level),
                            &stats)) {
                      if (stats.find("# entries=0") == std::string::npos) {
                        fprintf(stderr, "Level[%d]: %s\n", level,
                                stats.c_str());
                      }
                    }
                  }
                }
              }
            } else if (db) {
              if (db->GetProperty("rocksdb.stats", &stats)) {
                fprintf(stderr, "%s\n", stats.c_str());
              }
              if (FLAGS_show_table_properties) {
                for (int level = 0; level < FLAGS_num_levels; ++level) {
                  if (db->GetProperty(
                          "rocksdb.aggregated-table-properties-at-level" +
                              ToString(level),
                          &stats)) {
                    if (stats.find("# entries=0") == std::string::npos) {
                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
                    }
                  }
                }
              }
            }
          }

          next_report_ += FLAGS_stats_interval;
          last_report_finish_ = now;
          last_report_done_ = done_;
        }
      }
      if (id_ == 0 && FLAGS_thread_status_per_interval) {
        PrintThreadStatus();
      }
      fflush(stderr);
    }
  }

  void AddBytes(int64_t n) {
    bytes_ += n;
  }

  void Report(const Slice& name) {
    // Pretend at least one op was done in case we are running a benchmark
    // that does not call FinishedOps().
    if (done_ < 1) done_ = 1;

    std::string extra;
    if (bytes_ > 0) {
      // Rate is computed on actual elapsed time, not the sum of per-thread
      // elapsed times.
      double elapsed = (finish_ - start_) * 1e-6;
      char rate[100];
      snprintf(rate, sizeof(rate), "%6.1f MB/s",
               (bytes_ / 1048576.0) / elapsed);
      extra = rate;
    }
    AppendWithSpace(&extra, message_);
    double elapsed = (finish_ - start_) * 1e-6;
    double throughput = (double)done_ / elapsed;

    fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec;%s%s\n",
            name.ToString().c_str(),
            seconds_ * 1e6 / done_,
            (long)throughput,
            (extra.empty() ? "" : " "),
            extra.c_str());
    if (FLAGS_histogram) {
      for (auto it = hist_.begin(); it != hist_.end(); ++it) {
        fprintf(stdout, "Microseconds per %s:\n%s\n",
                OperationTypeString[it->first].c_str(),
                it->second->ToString().c_str());
      }
    }
    if (FLAGS_report_file_operations) {
      ReportFileOpEnv* env = static_cast<ReportFileOpEnv*>(FLAGS_env);
      ReportFileOpCounters* counters = env->counters();
      fprintf(stdout, "Num files opened: %d\n",
              counters->open_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num Read(): %d\n",
              counters->read_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num Append(): %d\n",
              counters->append_counter_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num bytes read: %" PRIu64 "\n",
              counters->bytes_read_.load(std::memory_order_relaxed));
      fprintf(stdout, "Num bytes written: %" PRIu64 "\n",
              counters->bytes_written_.load(std::memory_order_relaxed));
      env->reset();
    }
    fflush(stdout);
  }
};

class CombinedStats {
 public:
  void AddStats(const Stats& stat) {
    uint64_t total_ops = stat.done_;
    uint64_t total_bytes_ = stat.bytes_;
    double elapsed;

    if (total_ops < 1) {
      total_ops = 1;
    }

    elapsed = (stat.finish_ - stat.start_) * 1e-6;
    throughput_ops_.emplace_back(total_ops / elapsed);

    if (total_bytes_ > 0) {
      double mbs = (total_bytes_ / 1048576.0);
      throughput_mbs_.emplace_back(mbs / elapsed);
    }
  }

  void Report(const std::string& bench_name) {
    const char* name = bench_name.c_str();
    int num_runs = static_cast<int>(throughput_ops_.size());

    if (throughput_mbs_.size() == throughput_ops_.size()) {
      fprintf(stdout,
              "%s [AVG %d runs] : %d ops/sec; %6.1f MB/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
              CalcAvg(throughput_mbs_), name, num_runs,
              static_cast<int>(CalcMedian(throughput_ops_)),
              CalcMedian(throughput_mbs_));
    } else {
      fprintf(stdout,
              "%s [AVG %d runs] : %d ops/sec\n"
              "%s [MEDIAN %d runs] : %d ops/sec\n",
              name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)), name,
              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
    }
  }

 private:
  double CalcAvg(std::vector<double> data) {
    double avg = 0;
    for (double x : data) {
      avg += x;
    }
    avg = avg / data.size();
    return avg;
  }

  double CalcMedian(std::vector<double> data) {
    assert(data.size() > 0);

    std::sort(data.begin(), data.end());
    size_t mid = data.size() / 2;
    if (data.size() % 2 == 1) {
      // Odd number of entries
      return data[mid];
    } else {
      // Even number of entries
      return (data[mid] + data[mid - 1]) / 2;
    }
  }

  std::vector<double> throughput_ops_;
  std::vector<double> throughput_mbs_;
};

class TimestampEmulator {
 private:
  std::atomic<uint64_t> timestamp_;

 public:
  TimestampEmulator() : timestamp_(0) {}
  uint64_t Get() const { return timestamp_.load(); }
  void Inc() { timestamp_++; }
};

// State shared by all concurrent executions of the same benchmark.
struct SharedState {
  port::Mutex mu;
  port::CondVar cv;
  int total;
  int perf_level;
  std::shared_ptr<RateLimiter> write_rate_limiter;
  std::shared_ptr<RateLimiter> read_rate_limiter;

  // Each thread goes through the following states:
  //    (1) initializing
  //    (2) waiting for others to be initialized
  //    (3) running
  //    (4) done

  long num_initialized;
  long num_done;
  bool start;

  SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { }
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  int tid;        // 0..n-1 when running in n threads
  Random64 rand;  // Has different seeds for different threads
  Stats stats;
  SharedState* shared;

  /* implicit */ ThreadState(int index)
      : tid(index),
        rand((FLAGS_seed ? FLAGS_seed : 1000) + index) {
  }
};

class Duration {
 public:
  Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) {
    max_seconds_ = max_seconds;
    max_ops_ = max_ops;
    ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops;
    ops_ = 0;
    start_at_ = FLAGS_env->NowMicros();
  }

  int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; }

  bool Done(int64_t increment) {
    if (increment <= 0) increment = 1;  // avoid Done(0) and infinite loops
    ops_ += increment;

    if (max_seconds_) {
      // Recheck every approx 1000 ops (exact iff increment is a factor of 1000)
  1909. auto granularity = FLAGS_ops_between_duration_checks;
  1910. if ((ops_ / granularity) != ((ops_ - increment) / granularity)) {
  1911. uint64_t now = FLAGS_env->NowMicros();
  1912. return ((now - start_at_) / 1000000) >= max_seconds_;
  1913. } else {
  1914. return false;
  1915. }
  1916. } else {
  1917. return ops_ > max_ops_;
  1918. }
  1919. }
  1920. private:
  1921. uint64_t max_seconds_;
  1922. int64_t max_ops_;
  1923. int64_t ops_per_stage_;
  1924. int64_t ops_;
  1925. uint64_t start_at_;
  1926. };
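// Example (sketch, not a specific call site from this file): a benchmark
// loop typically drives Duration with either a time budget or an op budget,
//   Duration duration(FLAGS_duration, num_);
//   while (!duration.Done(1)) { /* perform one op */ }
// When max_seconds_ is non-zero the elapsed time decides when to stop;
// otherwise the op count does.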
class Benchmark {
 private:
  std::shared_ptr<Cache> cache_;
  std::shared_ptr<Cache> compressed_cache_;
  std::shared_ptr<const FilterPolicy> filter_policy_;
  const SliceTransform* prefix_extractor_;
  DBWithColumnFamilies db_;
  std::vector<DBWithColumnFamilies> multi_dbs_;
  int64_t num_;
  int key_size_;
  int prefix_size_;
  int64_t keys_per_prefix_;
  int64_t entries_per_batch_;
  int64_t writes_before_delete_range_;
  int64_t writes_per_range_tombstone_;
  int64_t range_tombstone_width_;
  int64_t max_num_range_tombstones_;
  WriteOptions write_options_;
  Options open_options_;  // keep options around to properly destroy db later
#ifndef ROCKSDB_LITE
  TraceOptions trace_options_;
  TraceOptions block_cache_trace_options_;
#endif
  int64_t reads_;
  int64_t deletes_;
  double read_random_exp_range_;
  int64_t writes_;
  int64_t readwrites_;
  int64_t merge_keys_;
  bool report_file_operations_;
  bool use_blob_db_;
  std::vector<std::string> keys_;
  class ErrorHandlerListener : public EventListener {
   public:
#ifndef ROCKSDB_LITE
    ErrorHandlerListener()
        : mutex_(),
          cv_(&mutex_),
          no_auto_recovery_(false),
          recovery_complete_(false) {}

    ~ErrorHandlerListener() override {}

    void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/,
                              Status /*bg_error*/,
                              bool* auto_recovery) override {
      if (*auto_recovery && no_auto_recovery_) {
        *auto_recovery = false;
      }
    }

    void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override {
      InstrumentedMutexLock l(&mutex_);
      recovery_complete_ = true;
      cv_.SignalAll();
    }

    bool WaitForRecovery(uint64_t abs_time_us) {
      InstrumentedMutexLock l(&mutex_);
      if (!recovery_complete_) {
        cv_.TimedWait(abs_time_us);
      }
      if (recovery_complete_) {
        recovery_complete_ = false;
        return true;
      }
      return false;
    }

    void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }

   private:
    InstrumentedMutex mutex_;
    InstrumentedCondVar cv_;
    bool no_auto_recovery_;
    bool recovery_complete_;
#else   // ROCKSDB_LITE
    bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; }
    void EnableAutoRecovery(bool /*enable*/) {}
#endif  // ROCKSDB_LITE
  };
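  // Usage note (summary; the call sites are elsewhere in this file): the
  // listener_ instance below is meant to be registered with the DB so a
  // benchmark can block in WaitForRecovery() until a background error has
  // been auto-recovered, while EnableAutoRecovery(false) keeps the error
  // sticky instead.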
  std::shared_ptr<ErrorHandlerListener> listener_;

  bool SanityCheck() {
    if (FLAGS_compression_ratio > 1 || FLAGS_compression_ratio < 0) {
      fprintf(stderr, "compression_ratio should be between 0 and 1\n");
      return false;
    }
    return true;
  }
  inline bool CompressSlice(const CompressionInfo& compression_info,
                            const Slice& input, std::string* compressed) {
    bool ok = true;
    switch (FLAGS_compression_type_e) {
      case ROCKSDB_NAMESPACE::kSnappyCompression:
        ok = Snappy_Compress(compression_info, input.data(), input.size(),
                             compressed);
        break;
      case ROCKSDB_NAMESPACE::kZlibCompression:
        ok = Zlib_Compress(compression_info, 2, input.data(), input.size(),
                           compressed);
        break;
      case ROCKSDB_NAMESPACE::kBZip2Compression:
        ok = BZip2_Compress(compression_info, 2, input.data(), input.size(),
                            compressed);
        break;
      case ROCKSDB_NAMESPACE::kLZ4Compression:
        ok = LZ4_Compress(compression_info, 2, input.data(), input.size(),
                          compressed);
        break;
      case ROCKSDB_NAMESPACE::kLZ4HCCompression:
        ok = LZ4HC_Compress(compression_info, 2, input.data(), input.size(),
                            compressed);
        break;
      case ROCKSDB_NAMESPACE::kXpressCompression:
        ok = XPRESS_Compress(input.data(), input.size(), compressed);
        break;
      case ROCKSDB_NAMESPACE::kZSTD:
        ok = ZSTD_Compress(compression_info, input.data(), input.size(),
                          compressed);
        break;
      default:
        ok = false;
    }
    return ok;
  }
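  // Note (an observation on the switch above): the Zlib, BZip2, LZ4 and
  // LZ4HC branches pass a hard-coded compress-format-version argument of 2;
  // only the compression type itself is selected via --compression_type.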
  void PrintHeader() {
    PrintEnvironment();
    fprintf(stdout, "Keys: %d bytes each\n", FLAGS_key_size);
    auto avg_value_size = FLAGS_value_size;
    if (FLAGS_value_size_distribution_type_e == kFixed) {
      fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
    } else {
      avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2;
      fprintf(stdout,
              "Values: %d avg bytes each (%d bytes after compression)\n",
              avg_value_size,
              static_cast<int>(avg_value_size * FLAGS_compression_ratio + 0.5));
      fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n",
              FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min,
              FLAGS_value_size_max);
    }
    fprintf(stdout, "Entries: %" PRIu64 "\n", num_);
    fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size);
    fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_);
    fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
            ((static_cast<int64_t>(FLAGS_key_size + avg_value_size) * num_) /
             1048576.0));
    fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
            (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) *
              num_) /
             1048576.0));
    fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n",
            FLAGS_benchmark_write_rate_limit);
    fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n",
            FLAGS_benchmark_read_rate_limit);
    if (FLAGS_enable_numa) {
      fprintf(stderr, "Running in NUMA enabled mode.\n");
#ifndef NUMA
      fprintf(stderr, "NUMA is not defined in the system.\n");
      exit(1);
#else
      if (numa_available() == -1) {
        fprintf(stderr, "NUMA is not supported by the system.\n");
        exit(1);
      }
#endif
    }

    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
    fprintf(stdout, "Compression: %s\n", compression.c_str());
    fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
            FLAGS_sample_for_compression);

    switch (FLAGS_rep_factory) {
      case kPrefixHash:
        fprintf(stdout, "Memtablerep: prefix_hash\n");
        break;
      case kSkipList:
        fprintf(stdout, "Memtablerep: skip_list\n");
        break;
      case kVectorRep:
        fprintf(stdout, "Memtablerep: vector\n");
        break;
      case kHashLinkedList:
        fprintf(stdout, "Memtablerep: hash_linkedlist\n");
        break;
    }
    fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);

    PrintWarnings(compression.c_str());
    fprintf(stdout, "------------------------------------------------\n");
  }
  void PrintWarnings(const char* compression) {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    fprintf(
        stdout,
        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
#endif
#ifndef NDEBUG
    fprintf(stdout,
            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    if (FLAGS_compression_type_e != ROCKSDB_NAMESPACE::kNoCompression) {
      // The test string should not be too small.
      const int len = FLAGS_block_size;
      std::string input_str(len, 'y');
      std::string compressed;
      CompressionOptions opts;
      CompressionContext context(FLAGS_compression_type_e);
      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                           FLAGS_compression_type_e,
                           FLAGS_sample_for_compression);
      bool result = CompressSlice(info, Slice(input_str), &compressed);
      if (!result) {
        fprintf(stdout, "WARNING: %s compression is not enabled\n",
                compression);
      } else if (compressed.size() >= input_str.size()) {
        fprintf(stdout, "WARNING: %s compression is not effective\n",
                compression);
      }
    }
  }
  // Currently the following isn't equivalent to OS_LINUX.
#if defined(__linux)
  static Slice TrimSpace(Slice s) {
    unsigned int start = 0;
    while (start < s.size() && isspace(s[start])) {
      start++;
    }
    unsigned int limit = static_cast<unsigned int>(s.size());
    while (limit > start && isspace(s[limit - 1])) {
      limit--;
    }
    return Slice(s.data() + start, limit - start);
  }
#endif
  void PrintEnvironment() {
    fprintf(stderr, "RocksDB: version %d.%d\n", kMajorVersion, kMinorVersion);

#if defined(__linux)
    time_t now = time(nullptr);
    char buf[52];
    // Lint complains about ctime() usage, so replace it with ctime_r(). The
    // requirement is to provide a buffer which is at least 26 bytes.
    fprintf(stderr, "Date: %s", ctime_r(&now, buf));  // ctime_r() adds newline

    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
    if (cpuinfo != nullptr) {
      char line[1000];
      int num_cpus = 0;
      std::string cpu_type;
      std::string cache_size;
      while (fgets(line, sizeof(line), cpuinfo) != nullptr) {
        const char* sep = strchr(line, ':');
        if (sep == nullptr) {
          continue;
        }
        Slice key = TrimSpace(Slice(line, sep - 1 - line));
        Slice val = TrimSpace(Slice(sep + 1));
        if (key == "model name") {
          ++num_cpus;
          cpu_type = val.ToString();
        } else if (key == "cache size") {
          cache_size = val.ToString();
        }
      }
      fclose(cpuinfo);
      fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
      fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
    }
#endif
  }
  static bool KeyExpired(const TimestampEmulator* timestamp_emulator,
                         const Slice& key) {
    const char* pos = key.data();
    pos += 8;
    uint64_t timestamp = 0;
    if (port::kLittleEndian) {
      int bytes_to_fill = 8;
      for (int i = 0; i < bytes_to_fill; ++i) {
        timestamp |= (static_cast<uint64_t>(static_cast<unsigned char>(pos[i]))
                      << ((bytes_to_fill - i - 1) << 3));
      }
    } else {
      memcpy(&timestamp, pos, sizeof(timestamp));
    }
    return timestamp_emulator->Get() - timestamp > FLAGS_time_range;
  }
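  // Key layout assumed above (it matches what the timeseries benchmark
  // writes): bytes [0, 8) hold the key id and bytes [8, 16) hold a
  // big-endian timestamp, e.g. timestamp 1 is stored at offset 8 as
  // 00 00 00 00 00 00 00 01. A key counts as expired once it is more than
  // FLAGS_time_range ticks behind the emulator's current time.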
  class ExpiredTimeFilter : public CompactionFilter {
   public:
    explicit ExpiredTimeFilter(
        const std::shared_ptr<TimestampEmulator>& timestamp_emulator)
        : timestamp_emulator_(timestamp_emulator) {}
    bool Filter(int /*level*/, const Slice& key,
                const Slice& /*existing_value*/, std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return KeyExpired(timestamp_emulator_.get(), key);
    }
    const char* Name() const override { return "ExpiredTimeFilter"; }

   private:
    std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  };
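  // ExpiredTimeFilter is installed by the "timeseries" benchmark when
  // --expire_style=compaction_filter (see Run() below), so compactions drop
  // keys whose embedded timestamp has fallen out of FLAGS_time_range.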
  class KeepFilter : public CompactionFilter {
   public:
    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
                std::string* /*new_value*/,
                bool* /*value_changed*/) const override {
      return false;
    }

    const char* Name() const override { return "KeepFilter"; }
  };
  std::shared_ptr<Cache> NewCache(int64_t capacity) {
    if (capacity <= 0) {
      return nullptr;
    }
    if (FLAGS_use_clock_cache) {
      auto cache = NewClockCache(static_cast<size_t>(capacity),
                                 FLAGS_cache_numshardbits);
      if (!cache) {
        fprintf(stderr, "Clock cache not supported.\n");
        exit(1);
      }
      return cache;
    } else {
      return NewLRUCache(
          static_cast<size_t>(capacity), FLAGS_cache_numshardbits,
          false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio);
    }
  }
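  // Flag sketch: --cache_size=8388608 yields an 8 MB LRU cache here, adding
  // --use_clock_cache switches to the clock cache, and a non-positive
  // --cache_size disables the block cache entirely.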
 public:
  Benchmark()
      : cache_(NewCache(FLAGS_cache_size)),
        compressed_cache_(NewCache(FLAGS_compressed_cache_size)),
        filter_policy_(FLAGS_bloom_bits >= 0
                           ? NewBloomFilterPolicy(FLAGS_bloom_bits,
                                                  FLAGS_use_block_based_filter)
                           : nullptr),
        prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
        num_(FLAGS_num),
        key_size_(FLAGS_key_size),
        prefix_size_(FLAGS_prefix_size),
        keys_per_prefix_(FLAGS_keys_per_prefix),
        entries_per_batch_(1),
        reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
        read_random_exp_range_(0.0),
        writes_(FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes),
        readwrites_(
            (FLAGS_writes < 0 && FLAGS_reads < 0)
                ? FLAGS_num
                : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
        merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
        report_file_operations_(FLAGS_report_file_operations),
#ifndef ROCKSDB_LITE
        use_blob_db_(FLAGS_use_blob_db)
#else
        use_blob_db_(false)
#endif  // !ROCKSDB_LITE
  {
    // Use a SimCache wrapper over the block cache when --simcache_size is set.
    if (FLAGS_simcache_size >= 0) {
      if (FLAGS_cache_numshardbits >= 1) {
        cache_ =
            NewSimCache(cache_, FLAGS_simcache_size, FLAGS_cache_numshardbits);
      } else {
        cache_ = NewSimCache(cache_, FLAGS_simcache_size, 0);
      }
    }

    if (report_file_operations_) {
      if (!FLAGS_hdfs.empty()) {
        fprintf(stderr,
                "--hdfs and --report_file_operations cannot be enabled "
                "at the same time\n");
        exit(1);
      }
      FLAGS_env = new ReportFileOpEnv(FLAGS_env);
    }

    if (FLAGS_prefix_size > FLAGS_key_size) {
      fprintf(stderr, "prefix size is larger than key size\n");
      exit(1);
    }
    std::vector<std::string> files;
    FLAGS_env->GetChildren(FLAGS_db, &files);
    for (size_t i = 0; i < files.size(); i++) {
      if (Slice(files[i]).starts_with("heap-")) {
        FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
      }
    }
    if (!FLAGS_use_existing_db) {
      Options options;
      options.env = FLAGS_env;
      if (!FLAGS_wal_dir.empty()) {
        options.wal_dir = FLAGS_wal_dir;
      }
#ifndef ROCKSDB_LITE
      if (use_blob_db_) {
        blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions());
      }
#endif  // !ROCKSDB_LITE
      DestroyDB(FLAGS_db, options);
      if (!FLAGS_wal_dir.empty()) {
        FLAGS_env->DeleteDir(FLAGS_wal_dir);
      }

      if (FLAGS_num_multi_db > 1) {
        FLAGS_env->CreateDir(FLAGS_db);
        if (!FLAGS_wal_dir.empty()) {
          FLAGS_env->CreateDir(FLAGS_wal_dir);
        }
      }
    }

    listener_.reset(new ErrorHandlerListener());
  }
  ~Benchmark() {
    db_.DeleteDBs();
    delete prefix_extractor_;
    if (cache_.get() != nullptr) {
      // this will leak, but we're shutting down so nobody cares
      cache_->DisownData();
    }
  }
  Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
    char* data = new char[key_size_];
    const char* const_data = data;
    key_guard->reset(const_data);
    return Slice(key_guard->get(), key_size_);
  }
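  // Note: the returned Slice points into the buffer owned by *key_guard, so
  // the guard must outlive every use of the Slice. Callers therefore keep
  // both, e.g.
  //   std::unique_ptr<const char[]> key_guard;
  //   Slice key = AllocateKey(&key_guard);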
  // Generate a key according to the given specification and random number.
  // If keys_per_prefix_ is positive, the resulting key has the following
  // format, with extra trailing bytes either cut off or padded with '0':
  //   ----------------------------
  //   | prefix 00000 | key 00000 |
  //   ----------------------------
  // The prefix value is derived from the key value. If keys_per_prefix_ is
  // 0, the key is simply a binary representation of the random number
  // followed by trailing '0's:
  //   ----------------------------
  //   | key 00000                |
  //   ----------------------------
  void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) {
    if (!keys_.empty()) {
      assert(FLAGS_use_existing_keys);
      assert(keys_.size() == static_cast<size_t>(num_keys));
      assert(v < static_cast<uint64_t>(num_keys));
      *key = keys_[v];
      return;
    }
    char* start = const_cast<char*>(key->data());
    char* pos = start;
    if (keys_per_prefix_ > 0) {
      int64_t num_prefix = num_keys / keys_per_prefix_;
      int64_t prefix = v % num_prefix;
      int bytes_to_fill = std::min(prefix_size_, 8);
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (prefix >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&prefix), bytes_to_fill);
      }
      if (prefix_size_ > 8) {
        // fill the rest with 0s
        memset(pos + 8, '0', prefix_size_ - 8);
      }
      pos += prefix_size_;
    }

    int bytes_to_fill = std::min(key_size_ - static_cast<int>(pos - start), 8);
    if (port::kLittleEndian) {
      for (int i = 0; i < bytes_to_fill; ++i) {
        pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
      }
    } else {
      memcpy(pos, static_cast<void*>(&v), bytes_to_fill);
    }
    pos += bytes_to_fill;
    if (key_size_ > pos - start) {
      memset(pos, '0', key_size_ - (pos - start));
    }
  }
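  // Worked example (little-endian host, no prefix): with key_size_ = 12 and
  // keys_per_prefix_ = 0, GenerateKeyFromInt(v = 0x0102, ...) writes the
  // 8-byte big-endian encoding 00 00 00 00 00 00 01 02 followed by four '0'
  // pad bytes, so keys sort in the numeric order of v.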
  void GenerateKeyFromIntForSeek(uint64_t v, int64_t num_keys, Slice* key) {
    GenerateKeyFromInt(v, num_keys, key);
    if (FLAGS_seek_missing_prefix) {
      assert(prefix_size_ > 8);
      char* key_ptr = const_cast<char*>(key->data());
      // This relies on GenerateKeyFromInt filling the padding with '0's.
      // Putting a '1' there creates a non-existing prefix.
      key_ptr[8] = '1';
    }
  }
  std::string GetPathForMultiple(std::string base_name, size_t id) {
    if (!base_name.empty()) {
#ifndef OS_WIN
      if (base_name.back() != '/') {
        base_name += '/';
      }
#else
      if (base_name.back() != '\\') {
        base_name += '\\';
      }
#endif
    }
    return base_name + ToString(id);
  }
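  // e.g. GetPathForMultiple("/tmp/dbbench", 2) returns "/tmp/dbbench/2"
  // (with '\\' as the separator on Windows); an empty base name yields "2".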
  void VerifyDBFromDB(std::string& truth_db_name) {
    DBWithColumnFamilies truth_db;
    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
    std::unique_ptr<Iterator> db_iter(db_.db->NewIterator(ro));
    // Verify that all the key/values in truth_db are retrievable in db with
    // ::Get
    fprintf(stderr, "Verifying db >= truth_db with ::Get...\n");
    for (truth_iter->SeekToFirst(); truth_iter->Valid(); truth_iter->Next()) {
      std::string value;
      s = db_.db->Get(ro, truth_iter->key(), &value);
      assert(s.ok());
      // TODO(myabandeh): provide debugging hints
      assert(Slice(value) == truth_iter->value());
    }
    // Verify that the db iterator does not give any extra key/value
    fprintf(stderr, "Verifying db == truth_db...\n");
    for (db_iter->SeekToFirst(), truth_iter->SeekToFirst(); db_iter->Valid();
         db_iter->Next(), truth_iter->Next()) {
      assert(truth_iter->Valid());
      assert(truth_iter->value() == db_iter->value());
    }
    // No keys should be left unchecked in truth_db
    assert(!truth_iter->Valid());
    fprintf(stderr, "...Verified\n");
  }
  void Run() {
    if (!SanityCheck()) {
      exit(1);
    }
    Open(&open_options_);
    PrintHeader();
    std::stringstream benchmark_stream(FLAGS_benchmarks);
    std::string name;
    std::unique_ptr<ExpiredTimeFilter> filter;
    while (std::getline(benchmark_stream, name, ',')) {
      // Sanitize parameters
      num_ = FLAGS_num;
      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
      writes_ = (FLAGS_writes < 0 ? FLAGS_num : FLAGS_writes);
      deletes_ = (FLAGS_deletes < 0 ? FLAGS_num : FLAGS_deletes);
      value_size = FLAGS_value_size;
      key_size_ = FLAGS_key_size;
      entries_per_batch_ = FLAGS_batch_size;
      writes_before_delete_range_ = FLAGS_writes_before_delete_range;
      writes_per_range_tombstone_ = FLAGS_writes_per_range_tombstone;
      range_tombstone_width_ = FLAGS_range_tombstone_width;
      max_num_range_tombstones_ = FLAGS_max_num_range_tombstones;
      write_options_ = WriteOptions();
      read_random_exp_range_ = FLAGS_read_random_exp_range;
      if (FLAGS_sync) {
        write_options_.sync = true;
      }
      write_options_.disableWAL = FLAGS_disable_wal;

      void (Benchmark::*method)(ThreadState*) = nullptr;
      void (Benchmark::*post_process_method)() = nullptr;

      bool fresh_db = false;
      int num_threads = FLAGS_threads;

      int num_repeat = 1;
      int num_warmup = 0;
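      // A benchmark name may carry bracketed arguments, parsed below: e.g.
      // "readrandom[X5-W2]" warms up the benchmark twice (W2) and then runs
      // it five times (X5), reporting combined stats across the repeats.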
      if (!name.empty() && *name.rbegin() == ']') {
        auto it = name.find('[');
        if (it == std::string::npos) {
          fprintf(stderr, "unknown benchmark arguments '%s'\n", name.c_str());
          exit(1);
        }
        std::string args = name.substr(it + 1);
        args.resize(args.size() - 1);
        name.resize(it);

        std::string bench_arg;
        std::stringstream args_stream(args);
        while (std::getline(args_stream, bench_arg, '-')) {
          if (bench_arg.empty()) {
            continue;
          }
          if (bench_arg[0] == 'X') {
            // Repeat the benchmark n times
            std::string num_str = bench_arg.substr(1);
            num_repeat = std::stoi(num_str);
          } else if (bench_arg[0] == 'W') {
            // Warm up the benchmark n times before the measured runs
            std::string num_str = bench_arg.substr(1);
            num_warmup = std::stoi(num_str);
          }
        }
      }
      // Both fillseqdeterministic and filluniquerandomdeterministic fill
      // every level except the max level with UNIQUE_RANDOM, and fill the
      // max level in fillseq and filluniquerandom order, respectively.
      if (name == "fillseqdeterministic" ||
          name == "filluniquerandomdeterministic") {
        if (!FLAGS_disable_auto_compactions) {
          fprintf(stderr,
                  "Please set disable_auto_compactions when running "
                  "FillDeterministic benchmarks\n");
          exit(1);
        }
        if (num_threads > 1) {
          fprintf(stderr,
                  "filldeterministic multithreaded not supported"
                  ", use 1 thread\n");
          num_threads = 1;
        }
        fresh_db = true;
        if (name == "fillseqdeterministic") {
          method = &Benchmark::WriteSeqDeterministic;
        } else {
          method = &Benchmark::WriteUniqueRandomDeterministic;
        }
  2536. } else if (name == "fillseq") {
  2537. fresh_db = true;
  2538. method = &Benchmark::WriteSeq;
  2539. } else if (name == "fillbatch") {
  2540. fresh_db = true;
  2541. entries_per_batch_ = 1000;
  2542. method = &Benchmark::WriteSeq;
  2543. } else if (name == "fillrandom") {
  2544. fresh_db = true;
  2545. method = &Benchmark::WriteRandom;
  2546. } else if (name == "filluniquerandom") {
  2547. fresh_db = true;
  2548. if (num_threads > 1) {
  2549. fprintf(stderr,
  2550. "filluniquerandom multithreaded not supported"
  2551. ", use 1 thread");
  2552. num_threads = 1;
  2553. }
  2554. method = &Benchmark::WriteUniqueRandom;
  2555. } else if (name == "overwrite") {
  2556. method = &Benchmark::WriteRandom;
  2557. } else if (name == "fillsync") {
  2558. fresh_db = true;
  2559. num_ /= 1000;
  2560. write_options_.sync = true;
  2561. method = &Benchmark::WriteRandom;
  2562. } else if (name == "fill100K") {
  2563. fresh_db = true;
  2564. num_ /= 1000;
  2565. value_size = 100 * 1000;
  2566. method = &Benchmark::WriteRandom;
  2567. } else if (name == "readseq") {
  2568. method = &Benchmark::ReadSequential;
  2569. } else if (name == "readtorowcache") {
  2570. if (!FLAGS_use_existing_keys || !FLAGS_row_cache_size) {
  2571. fprintf(stderr,
  2572. "Please set use_existing_keys to true and specify a "
  2573. "row cache size in readtorowcache benchmark\n");
  2574. exit(1);
  2575. }
  2576. method = &Benchmark::ReadToRowCache;
  2577. } else if (name == "readtocache") {
  2578. method = &Benchmark::ReadSequential;
  2579. num_threads = 1;
  2580. reads_ = num_;
  2581. } else if (name == "readreverse") {
  2582. method = &Benchmark::ReadReverse;
  2583. } else if (name == "readrandom") {
  2584. if (FLAGS_multiread_stride) {
  2585. fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
  2586. entries_per_batch_);
  2587. }
  2588. method = &Benchmark::ReadRandom;
  2589. } else if (name == "readrandomfast") {
  2590. method = &Benchmark::ReadRandomFast;
  2591. } else if (name == "multireadrandom") {
  2592. fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
  2593. entries_per_batch_);
  2594. method = &Benchmark::MultiReadRandom;
  2595. } else if (name == "mixgraph") {
  2596. method = &Benchmark::MixGraph;
  2597. } else if (name == "readmissing") {
  2598. ++key_size_;
  2599. method = &Benchmark::ReadRandom;
  2600. } else if (name == "newiterator") {
  2601. method = &Benchmark::IteratorCreation;
  2602. } else if (name == "newiteratorwhilewriting") {
  2603. num_threads++; // Add extra thread for writing
  2604. method = &Benchmark::IteratorCreationWhileWriting;
  2605. } else if (name == "seekrandom") {
  2606. method = &Benchmark::SeekRandom;
  2607. } else if (name == "seekrandomwhilewriting") {
  2608. num_threads++; // Add extra thread for writing
  2609. method = &Benchmark::SeekRandomWhileWriting;
  2610. } else if (name == "seekrandomwhilemerging") {
  2611. num_threads++; // Add extra thread for merging
  2612. method = &Benchmark::SeekRandomWhileMerging;
  2613. } else if (name == "readrandomsmall") {
  2614. reads_ /= 1000;
  2615. method = &Benchmark::ReadRandom;
  2616. } else if (name == "deleteseq") {
  2617. method = &Benchmark::DeleteSeq;
  2618. } else if (name == "deleterandom") {
  2619. method = &Benchmark::DeleteRandom;
  2620. } else if (name == "readwhilewriting") {
  2621. num_threads++; // Add extra thread for writing
  2622. method = &Benchmark::ReadWhileWriting;
  2623. } else if (name == "readwhilemerging") {
  2624. num_threads++; // Add extra thread for writing
  2625. method = &Benchmark::ReadWhileMerging;
  2626. } else if (name == "readwhilescanning") {
  2627. num_threads++; // Add extra thread for scaning
  2628. method = &Benchmark::ReadWhileScanning;
  2629. } else if (name == "readrandomwriterandom") {
  2630. method = &Benchmark::ReadRandomWriteRandom;
  2631. } else if (name == "readrandommergerandom") {
  2632. if (FLAGS_merge_operator.empty()) {
  2633. fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
  2634. name.c_str());
  2635. exit(1);
  2636. }
  2637. method = &Benchmark::ReadRandomMergeRandom;
  2638. } else if (name == "updaterandom") {
  2639. method = &Benchmark::UpdateRandom;
  2640. } else if (name == "xorupdaterandom") {
  2641. method = &Benchmark::XORUpdateRandom;
  2642. } else if (name == "appendrandom") {
  2643. method = &Benchmark::AppendRandom;
  2644. } else if (name == "mergerandom") {
  2645. if (FLAGS_merge_operator.empty()) {
  2646. fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
  2647. name.c_str());
  2648. exit(1);
  2649. }
  2650. method = &Benchmark::MergeRandom;
  2651. } else if (name == "randomwithverify") {
  2652. method = &Benchmark::RandomWithVerify;
  2653. } else if (name == "fillseekseq") {
  2654. method = &Benchmark::WriteSeqSeekSeq;
  2655. } else if (name == "compact") {
  2656. method = &Benchmark::Compact;
  2657. } else if (name == "compactall") {
  2658. CompactAll();
  2659. } else if (name == "crc32c") {
  2660. method = &Benchmark::Crc32c;
  2661. } else if (name == "xxhash") {
  2662. method = &Benchmark::xxHash;
  2663. } else if (name == "acquireload") {
  2664. method = &Benchmark::AcquireLoad;
  2665. } else if (name == "compress") {
  2666. method = &Benchmark::Compress;
  2667. } else if (name == "uncompress") {
  2668. method = &Benchmark::Uncompress;
  2669. #ifndef ROCKSDB_LITE
  2670. } else if (name == "randomtransaction") {
  2671. method = &Benchmark::RandomTransaction;
  2672. post_process_method = &Benchmark::RandomTransactionVerify;
  2673. #endif // ROCKSDB_LITE
  2674. } else if (name == "randomreplacekeys") {
  2675. fresh_db = true;
  2676. method = &Benchmark::RandomReplaceKeys;
  2677. } else if (name == "timeseries") {
  2678. timestamp_emulator_.reset(new TimestampEmulator());
  2679. if (FLAGS_expire_style == "compaction_filter") {
  2680. filter.reset(new ExpiredTimeFilter(timestamp_emulator_));
  2681. fprintf(stdout, "Compaction filter is used to remove expired data");
          open_options_.compaction_filter = filter.get();
        }
        fresh_db = true;
        method = &Benchmark::TimeSeries;
      } else if (name == "stats") {
        PrintStats("rocksdb.stats");
      } else if (name == "resetstats") {
        ResetStats();
      } else if (name == "verify") {
        VerifyDBFromDB(FLAGS_truth_db);
      } else if (name == "levelstats") {
        PrintStats("rocksdb.levelstats");
      } else if (name == "sstables") {
        PrintStats("rocksdb.sstables");
      } else if (name == "stats_history") {
        PrintStatsHistory();
      } else if (name == "replay") {
        if (num_threads > 1) {
          fprintf(stderr, "Multi-threaded replay is not yet supported\n");
          exit(1);
        }
        if (FLAGS_trace_file == "") {
          fprintf(stderr, "Please set --trace_file to be replayed from\n");
          exit(1);
        }
        method = &Benchmark::Replay;
      } else if (name == "getmergeoperands") {
        method = &Benchmark::GetMergeOperands;
      } else if (!name.empty()) {  // No error message for empty name
        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
        exit(1);
      }
      if (fresh_db) {
        if (FLAGS_use_existing_db) {
          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
                  name.c_str());
          method = nullptr;
        } else {
          if (db_.db != nullptr) {
            db_.DeleteDBs();
            DestroyDB(FLAGS_db, open_options_);
          }
          Options options = open_options_;
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
            delete multi_dbs_[i].db;
            if (!open_options_.wal_dir.empty()) {
              options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
            }
            DestroyDB(GetPathForMultiple(FLAGS_db, i), options);
          }
          multi_dbs_.clear();
        }
        Open(&open_options_);  // use open_options for the last accessed
      }
      if (method != nullptr) {
        fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

#ifndef ROCKSDB_LITE
        // A trace_file option can be provided for both trace and replay
        // operations, but db_bench does not yet support tracing and replaying
        // at the same time, so start tracing only when this is not a replay.
  2743. if (FLAGS_trace_file != "" && name != "replay") {
  2744. std::unique_ptr<TraceWriter> trace_writer;
  2745. Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
  2746. FLAGS_trace_file, &trace_writer);
  2747. if (!s.ok()) {
  2748. fprintf(stderr, "Encountered an error starting a trace, %s\n",
  2749. s.ToString().c_str());
  2750. exit(1);
  2751. }
  2752. s = db_.db->StartTrace(trace_options_, std::move(trace_writer));
  2753. if (!s.ok()) {
  2754. fprintf(stderr, "Encountered an error starting a trace, %s\n",
  2755. s.ToString().c_str());
  2756. exit(1);
  2757. }
  2758. fprintf(stdout, "Tracing the workload to: [%s]\n",
  2759. FLAGS_trace_file.c_str());
  2760. }
  2761. // Start block cache tracing.
  2762. if (!FLAGS_block_cache_trace_file.empty()) {
  2763. // Sanity checks.
  2764. if (FLAGS_block_cache_trace_sampling_frequency <= 0) {
  2765. fprintf(stderr,
  2766. "Block cache trace sampling frequency must be higher than "
  2767. "0.\n");
  2768. exit(1);
  2769. }
  2770. if (FLAGS_block_cache_trace_max_trace_file_size_in_bytes <= 0) {
  2771. fprintf(stderr,
  2772. "The maximum file size for block cache tracing must be "
  2773. "higher than 0.\n");
  2774. exit(1);
  2775. }
  2776. block_cache_trace_options_.max_trace_file_size =
  2777. FLAGS_block_cache_trace_max_trace_file_size_in_bytes;
  2778. block_cache_trace_options_.sampling_frequency =
  2779. FLAGS_block_cache_trace_sampling_frequency;
  2780. std::unique_ptr<TraceWriter> block_cache_trace_writer;
  2781. Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(),
  2782. FLAGS_block_cache_trace_file,
  2783. &block_cache_trace_writer);
  2784. if (!s.ok()) {
  2785. fprintf(stderr,
  2786. "Encountered an error when creating trace writer, %s\n",
  2787. s.ToString().c_str());
  2788. exit(1);
  2789. }
  2790. s = db_.db->StartBlockCacheTrace(block_cache_trace_options_,
  2791. std::move(block_cache_trace_writer));
  2792. if (!s.ok()) {
  2793. fprintf(
  2794. stderr,
  2795. "Encountered an error when starting block cache tracing, %s\n",
  2796. s.ToString().c_str());
  2797. exit(1);
  2798. }
  2799. fprintf(stdout, "Tracing block cache accesses to: [%s]\n",
  2800. FLAGS_block_cache_trace_file.c_str());
  2801. }
  2802. #endif // ROCKSDB_LITE
  2803. if (num_warmup > 0) {
  2804. printf("Warming up benchmark by running %d times\n", num_warmup);
  2805. }
  2806. for (int i = 0; i < num_warmup; i++) {
  2807. RunBenchmark(num_threads, name, method);
  2808. }
  2809. if (num_repeat > 1) {
  2810. printf("Running benchmark for %d times\n", num_repeat);
  2811. }
  2812. CombinedStats combined_stats;
  2813. for (int i = 0; i < num_repeat; i++) {
  2814. Stats stats = RunBenchmark(num_threads, name, method);
  2815. combined_stats.AddStats(stats);
  2816. }
  2817. if (num_repeat > 1) {
  2818. combined_stats.Report(name);
  2819. }
  2820. }
  2821. if (post_process_method != nullptr) {
  2822. (this->*post_process_method)();
  2823. }
  2824. }
    if (secondary_update_thread_) {
      secondary_update_stopped_.store(1, std::memory_order_relaxed);
      secondary_update_thread_->join();
      secondary_update_thread_.reset();
    }

#ifndef ROCKSDB_LITE
    if (name != "replay" && FLAGS_trace_file != "") {
      Status s = db_.db->EndTrace();
      if (!s.ok()) {
        fprintf(stderr, "Encountered an error ending the trace, %s\n",
                s.ToString().c_str());
      }
    }
    if (!FLAGS_block_cache_trace_file.empty()) {
      Status s = db_.db->EndBlockCacheTrace();
      if (!s.ok()) {
        fprintf(stderr,
                "Encountered an error ending the block cache tracing, %s\n",
                s.ToString().c_str());
      }
    }
#endif  // ROCKSDB_LITE

    if (FLAGS_statistics) {
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
    }
    if (FLAGS_simcache_size >= 0) {
      fprintf(stdout, "SIMULATOR CACHE STATISTICS:\n%s\n",
              static_cast_with_check<SimCache, Cache>(cache_.get())
                  ->ToString()
                  .c_str());
    }

#ifndef ROCKSDB_LITE
    if (FLAGS_use_secondary_db) {
      fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n",
              secondary_db_updates_);
    }
#endif  // ROCKSDB_LITE
  }
 private:
  std::shared_ptr<TimestampEmulator> timestamp_emulator_;
  std::unique_ptr<port::Thread> secondary_update_thread_;
  std::atomic<int> secondary_update_stopped_{0};
#ifndef ROCKSDB_LITE
  uint64_t secondary_db_updates_ = 0;
#endif  // ROCKSDB_LITE

  struct ThreadArg {
    Benchmark* bm;
    SharedState* shared;
    ThreadState* thread;
    void (Benchmark::*method)(ThreadState*);
  };
  static void ThreadBody(void* v) {
    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
    SharedState* shared = arg->shared;
    ThreadState* thread = arg->thread;
    {
      MutexLock l(&shared->mu);
      shared->num_initialized++;
      if (shared->num_initialized >= shared->total) {
        shared->cv.SignalAll();
      }
      while (!shared->start) {
        shared->cv.Wait();
      }
    }

    SetPerfLevel(static_cast<PerfLevel>(shared->perf_level));
    perf_context.EnablePerLevelPerfContext();
    thread->stats.Start(thread->tid);
    (arg->bm->*(arg->method))(thread);
    thread->stats.Stop();

    {
      MutexLock l(&shared->mu);
      shared->num_done++;
      if (shared->num_done >= shared->total) {
        shared->cv.SignalAll();
      }
    }
  }
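  // ThreadBody walks each worker through the SharedState lifecycle noted
  // above: it registers itself as initialized, blocks until start is set,
  // runs the benchmark method, then counts itself done and wakes the
  // coordinating thread in RunBenchmark().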
  Stats RunBenchmark(int n, Slice name,
                     void (Benchmark::*method)(ThreadState*)) {
    SharedState shared;
    shared.total = n;
    shared.num_initialized = 0;
    shared.num_done = 0;
    shared.start = false;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      shared.write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }
    if (FLAGS_benchmark_read_rate_limit > 0) {
      shared.read_rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_benchmark_read_rate_limit, 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
    }

    std::unique_ptr<ReporterAgent> reporter_agent;
    if (FLAGS_report_interval_seconds > 0) {
      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
                                             FLAGS_report_interval_seconds));
    }

    ThreadArg* arg = new ThreadArg[n];

    for (int i = 0; i < n; i++) {
#ifdef NUMA
      if (FLAGS_enable_numa) {
        // Perform a local allocation of memory to threads in the NUMA node.
        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
        numa_exit_on_error = 1;
        int numa_node = i % n_nodes;
        bitmask* nodes = numa_allocate_nodemask();
        numa_bitmask_clearall(nodes);
        numa_bitmask_setbit(nodes, numa_node);
        // numa_bind() binds the process to the node, and these properties are
        // passed on to the thread created in the StartThread method called
        // later in the loop.
        numa_bind(nodes);
        numa_set_strict(1);
        numa_free_nodemask(nodes);
      }
#endif
      arg[i].bm = this;
      arg[i].method = method;
      arg[i].shared = &shared;
      arg[i].thread = new ThreadState(i);
      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
      arg[i].thread->shared = &shared;
      FLAGS_env->StartThread(ThreadBody, &arg[i]);
    }

    shared.mu.Lock();
    while (shared.num_initialized < n) {
      shared.cv.Wait();
    }

    shared.start = true;
    shared.cv.SignalAll();
    while (shared.num_done < n) {
      shared.cv.Wait();
    }
    shared.mu.Unlock();

    // Stats for some threads can be excluded.
    Stats merge_stats;
    for (int i = 0; i < n; i++) {
      merge_stats.Merge(arg[i].thread->stats);
    }
    merge_stats.Report(name);

    for (int i = 0; i < n; i++) {
      delete arg[i].thread;
    }
    delete[] arg;

    return merge_stats;
  }
  void Crc32c(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = FLAGS_block_size;  // use --block_size option for db_bench
    std::string labels = "(" + ToString(FLAGS_block_size) + " per op)";
    const char* label = labels.c_str();

    std::string data(size, 'x');
    int64_t bytes = 0;
    uint32_t crc = 0;
    while (bytes < 500 * 1048576) {
      crc = crc32c::Value(data.data(), size);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCrc);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }
  void xxHash(ThreadState* thread) {
    // Checksum about 500MB of data total
    const int size = 4096;
    const char* label = "(4K per op)";
    std::string data(size, 'x');
    int64_t bytes = 0;
    unsigned int xxh32 = 0;
    while (bytes < 500 * 1048576) {
      xxh32 = XXH32(data.data(), size, 0);
      thread->stats.FinishedOps(nullptr, nullptr, 1, kHash);
      bytes += size;
    }
    // Print so result is not dead
    fprintf(stderr, "... xxh32=0x%x\r", static_cast<unsigned int>(xxh32));

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(label);
  }
  void AcquireLoad(ThreadState* thread) {
    int dummy;
    std::atomic<void*> ap(&dummy);
    int count = 0;
    void* ptr = nullptr;
    thread->stats.AddMessage("(each op is 1000 loads)");
    while (count < 100000) {
      for (int i = 0; i < 1000; i++) {
        ptr = ap.load(std::memory_order_acquire);
      }
      count++;
      thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
    }
    if (ptr == nullptr) exit(1);  // Disable unused variable warning.
  }
  void Compress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
    std::string compressed;
    CompressionOptions opts;
    CompressionContext context(FLAGS_compression_type_e);
    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
                         FLAGS_compression_type_e,
                         FLAGS_sample_for_compression);

    // Compress 1G
    while (ok && bytes < int64_t(1) << 30) {
      compressed.clear();
      ok = CompressSlice(info, input, &compressed);
      produced += compressed.size();
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      char buf[340];
      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
               (produced * 100.0) / bytes);
      thread->stats.AddMessage(buf);
      thread->stats.AddBytes(bytes);
    }
  }
  void Uncompress(ThreadState* thread) {
    RandomGenerator gen;
    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    CompressionContext compression_ctx(FLAGS_compression_type_e);
    CompressionOptions compression_opts;
    CompressionInfo compression_info(
        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
        FLAGS_compression_type_e, FLAGS_sample_for_compression);
    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
    UncompressionInfo uncompression_info(uncompression_ctx,
                                         UncompressionDict::GetEmptyDict(),
                                         FLAGS_compression_type_e);

    bool ok = CompressSlice(compression_info, input, &compressed);
    int64_t bytes = 0;
    int decompress_size;
    while (ok && bytes < 1024 * 1048576) {
      CacheAllocationPtr uncompressed;
      switch (FLAGS_compression_type_e) {
        case ROCKSDB_NAMESPACE::kSnappyCompression: {
          // get size and allocate here to make comparison fair
          size_t ulength = 0;
          if (!Snappy_GetUncompressedLength(compressed.data(),
                                            compressed.size(), &ulength)) {
            ok = false;
            break;
          }
          uncompressed = AllocateBlock(ulength, nullptr);
          ok = Snappy_Uncompress(compressed.data(), compressed.size(),
                                 uncompressed.get());
          break;
        }
        case ROCKSDB_NAMESPACE::kZlibCompression:
          uncompressed =
              Zlib_Uncompress(uncompression_info, compressed.data(),
                              compressed.size(), &decompress_size, 2);
          ok = uncompressed.get() != nullptr;
          break;
        case ROCKSDB_NAMESPACE::kBZip2Compression:
          uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(),
                                          &decompress_size, 2);
          ok = uncompressed.get() != nullptr;
          break;
        case ROCKSDB_NAMESPACE::kLZ4Compression:
          uncompressed = LZ4_Uncompress(uncompression_info, compressed.data(),
                                        compressed.size(), &decompress_size, 2);
          ok = uncompressed.get() != nullptr;
          break;
        case ROCKSDB_NAMESPACE::kLZ4HCCompression:
          uncompressed = LZ4_Uncompress(uncompression_info, compressed.data(),
                                        compressed.size(), &decompress_size, 2);
          ok = uncompressed.get() != nullptr;
          break;
        case ROCKSDB_NAMESPACE::kXpressCompression:
          uncompressed.reset(XPRESS_Uncompress(
              compressed.data(), compressed.size(), &decompress_size));
          ok = uncompressed.get() != nullptr;
          break;
        case ROCKSDB_NAMESPACE::kZSTD:
          uncompressed = ZSTD_Uncompress(uncompression_info, compressed.data(),
                                         compressed.size(), &decompress_size);
          ok = uncompressed.get() != nullptr;
          break;
        default:
          ok = false;
      }
      bytes += input.size();
      thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
    }

    if (!ok) {
      thread->stats.AddMessage("(compression failure)");
    } else {
      thread->stats.AddBytes(bytes);
    }
  }
  // Returns true if the options were initialized from the specified
  // options file.
  bool InitializeOptionsFromFile(Options* opts) {
#ifndef ROCKSDB_LITE
    printf("Initializing RocksDB Options from the specified file\n");
    DBOptions db_opts;
    std::vector<ColumnFamilyDescriptor> cf_descs;
    if (FLAGS_options_file != "") {
      auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts,
                                   &cf_descs);
      db_opts.env = FLAGS_env;
      if (s.ok()) {
        *opts = Options(db_opts, cf_descs[0].options);
        return true;
      }
      fprintf(stderr, "Unable to load options file %s --- %s\n",
              FLAGS_options_file.c_str(), s.ToString().c_str());
      exit(1);
    }
#else
    (void)opts;
#endif
    return false;
  }
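  // Option-initialization flow (sketch): when --options_file is given, the
  // DB options and the first column family's options come from that file
  // (only env is overridden with FLAGS_env); otherwise
  // InitializeOptionsFromFlags() below builds everything from the
  // individual command-line flags.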
  void InitializeOptionsFromFlags(Options* opts) {
    printf("Initializing RocksDB Options from command-line flags\n");
    Options& options = *opts;

    assert(db_.db == nullptr);

    options.env = FLAGS_env;
    options.max_open_files = FLAGS_open_files;
    if (FLAGS_cost_write_buffer_to_cache || FLAGS_db_write_buffer_size != 0) {
      options.write_buffer_manager.reset(
          new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
    }
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
        FLAGS_min_write_buffer_number_to_merge;
    options.max_write_buffer_number_to_maintain =
        FLAGS_max_write_buffer_number_to_maintain;
    options.max_write_buffer_size_to_maintain =
        FLAGS_max_write_buffer_size_to_maintain;
    options.max_background_jobs = FLAGS_max_background_jobs;
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
    options.compaction_pri = FLAGS_compaction_pri_e;
    options.allow_mmap_reads = FLAGS_mmap_read;
    options.allow_mmap_writes = FLAGS_mmap_write;
    options.use_direct_reads = FLAGS_use_direct_reads;
    options.use_direct_io_for_flush_and_compaction =
        FLAGS_use_direct_io_for_flush_and_compaction;
#ifndef ROCKSDB_LITE
    options.ttl = FLAGS_fifo_compaction_ttl;
    options.compaction_options_fifo = CompactionOptionsFIFO(
        FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
        FLAGS_fifo_compaction_allow_compaction);
#endif  // ROCKSDB_LITE
    if (FLAGS_prefix_size != 0) {
      options.prefix_extractor.reset(
          NewFixedPrefixTransform(FLAGS_prefix_size));
    }
    if (FLAGS_use_uint64_comparator) {
      options.comparator = test::Uint64Comparator();
      if (FLAGS_key_size != 8) {
        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
        exit(1);
      }
    }
    if (FLAGS_use_stderr_info_logger) {
      options.info_log.reset(new StderrLogger());
    }
    options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0;
    options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio;
    options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
    if (FLAGS_memtable_insert_with_hint_prefix_size > 0) {
      options.memtable_insert_with_hint_prefix_extractor.reset(
          NewCappedPrefixTransform(
              FLAGS_memtable_insert_with_hint_prefix_size));
    }
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_file_opening_threads = FLAGS_file_opening_threads;
    options.new_table_reader_for_compaction_inputs =
        FLAGS_new_table_reader_for_compaction_inputs;
    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
    options.log_readahead_size = FLAGS_log_readahead_size;
    options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
    options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
    options.use_fsync = FLAGS_use_fsync;
    options.num_levels = FLAGS_num_levels;
    options.target_file_size_base = FLAGS_target_file_size_base;
    options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
    options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
    options.level_compaction_dynamic_level_bytes =
        FLAGS_level_compaction_dynamic_level_bytes;
    options.max_bytes_for_level_multiplier =
        FLAGS_max_bytes_for_level_multiplier;
    if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
                                     FLAGS_rep_factory == kHashLinkedList)) {
      fprintf(stderr,
              "prefix_size should be non-zero if PrefixHash or "
              "HashLinkedList memtablerep is used\n");
      exit(1);
    }
    switch (FLAGS_rep_factory) {
      case kSkipList:
        options.memtable_factory.reset(
            new SkipListFactory(FLAGS_skip_list_lookahead));
        break;
#ifndef ROCKSDB_LITE
      case kPrefixHash:
        options.memtable_factory.reset(
            NewHashSkipListRepFactory(FLAGS_hash_bucket_count));
        break;
      case kHashLinkedList:
        options.memtable_factory.reset(
            NewHashLinkListRepFactory(FLAGS_hash_bucket_count));
        break;
      case kVectorRep:
        options.memtable_factory.reset(new VectorRepFactory);
        break;
#else
      default:
        fprintf(stderr, "Only skip list is supported in lite mode\n");
        exit(1);
#endif  // ROCKSDB_LITE
    }
    if (FLAGS_use_plain_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_rep_factory != kPrefixHash &&
          FLAGS_rep_factory != kHashLinkedList) {
        fprintf(stderr,
                "Warning: plain table is used with a skip-list memtable\n");
      }
      int bloom_bits_per_key = FLAGS_bloom_bits;
      if (bloom_bits_per_key < 0) {
        bloom_bits_per_key = 0;
      }

      PlainTableOptions plain_table_options;
      plain_table_options.user_key_len = FLAGS_key_size;
      plain_table_options.bloom_bits_per_key = bloom_bits_per_key;
      plain_table_options.hash_table_ratio = 0.75;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewPlainTableFactory(plain_table_options));
#else
      fprintf(stderr, "Plain table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else if (FLAGS_use_cuckoo_table) {
#ifndef ROCKSDB_LITE
      if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }

      if (!FLAGS_mmap_read) {
        fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
        exit(1);
      }

      ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory = std::shared_ptr<TableFactory>(
          NewCuckooTableFactory(table_options));
#else
      fprintf(stderr, "Cuckoo table is not supported in lite mode\n");
      exit(1);
#endif  // ROCKSDB_LITE
    } else {
      BlockBasedTableOptions block_based_options;
      if (FLAGS_use_hash_search) {
        if (FLAGS_prefix_size == 0) {
          fprintf(stderr,
                  "prefix_size must be non-zero when use_hash_search is "
                  "enabled\n");
          exit(1);
        }
        block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
      if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
        if (FLAGS_use_hash_search) {
          fprintf(stderr,
                  "use_hash_search is incompatible with "
                  "partition index and is ignored\n");
        }
        block_based_options.index_type =
            BlockBasedTableOptions::kTwoLevelIndexSearch;
        block_based_options.metadata_block_size = FLAGS_metadata_block_size;
        if (FLAGS_partition_index_and_filters) {
          block_based_options.partition_filters = true;
        }
      }
      if (cache_ == nullptr) {
        block_based_options.no_block_cache = true;
      }
      block_based_options.cache_index_and_filter_blocks =
          FLAGS_cache_index_and_filter_blocks;
      block_based_options.pin_l0_filter_and_index_blocks_in_cache =
          FLAGS_pin_l0_filter_and_index_blocks_in_cache;
      block_based_options.pin_top_level_index_and_filter =
          FLAGS_pin_top_level_index_and_filter;
      if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
        block_based_options.cache_index_and_filter_blocks_with_high_priority =
            true;
      }
      block_based_options.block_cache = cache_;
      block_based_options.block_cache_compressed = compressed_cache_;
      block_based_options.block_size = FLAGS_block_size;
      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
      block_based_options.index_block_restart_interval =
          FLAGS_index_block_restart_interval;
      block_based_options.filter_policy = filter_policy_;
      block_based_options.format_version =
          static_cast<uint32_t>(FLAGS_format_version);
      block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
      block_based_options.enable_index_compression =
          FLAGS_enable_index_compression;
      block_based_options.block_align = FLAGS_block_align;
      if (FLAGS_use_data_block_hash_index) {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinaryAndHash;
      } else {
        block_based_options.data_block_index_type =
            ROCKSDB_NAMESPACE::BlockBasedTableOptions::kDataBlockBinarySearch;
      }
      block_based_options.data_block_hash_table_util_ratio =
          FLAGS_data_block_hash_table_util_ratio;
  3356. if (FLAGS_read_cache_path != "") {
  3357. #ifndef ROCKSDB_LITE
  3358. Status rc_status;
  3359. // Read cache need to be provided with a the Logger, we will put all
  3360. // reac cache logs in the read cache path in a file named rc_LOG
  3361. rc_status = FLAGS_env->CreateDirIfMissing(FLAGS_read_cache_path);
  3362. std::shared_ptr<Logger> read_cache_logger;
  3363. if (rc_status.ok()) {
  3364. rc_status = FLAGS_env->NewLogger(FLAGS_read_cache_path + "/rc_LOG",
  3365. &read_cache_logger);
  3366. }
  3367. if (rc_status.ok()) {
  3368. PersistentCacheConfig rc_cfg(FLAGS_env, FLAGS_read_cache_path,
  3369. FLAGS_read_cache_size,
  3370. read_cache_logger);
  3371. rc_cfg.enable_direct_reads = FLAGS_read_cache_direct_read;
  3372. rc_cfg.enable_direct_writes = FLAGS_read_cache_direct_write;
  3373. rc_cfg.writer_qdepth = 4;
  3374. rc_cfg.writer_dispatch_size = 4 * 1024;
  3375. auto pcache = std::make_shared<BlockCacheTier>(rc_cfg);
  3376. block_based_options.persistent_cache = pcache;
  3377. rc_status = pcache->Open();
  3378. }
  3379. if (!rc_status.ok()) {
  3380. fprintf(stderr, "Error initializing read cache, %s\n",
  3381. rc_status.ToString().c_str());
  3382. exit(1);
  3383. }
  3384. #else
  3385. fprintf(stderr, "Read cache is not supported in LITE\n");
  3386. exit(1);
  3387. #endif
  3388. }
      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
    if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() > 0) {
      if (FLAGS_max_bytes_for_level_multiplier_additional_v.size() !=
          static_cast<unsigned int>(FLAGS_num_levels)) {
        fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                static_cast<int>(
                    FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
        exit(1);
      }
      options.max_bytes_for_level_multiplier_additional =
          FLAGS_max_bytes_for_level_multiplier_additional_v;
    }
    options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
    options.level0_file_num_compaction_trigger =
        FLAGS_level0_file_num_compaction_trigger;
    options.level0_slowdown_writes_trigger =
        FLAGS_level0_slowdown_writes_trigger;
    options.compression = FLAGS_compression_type_e;
    options.sample_for_compression = FLAGS_sample_for_compression;
    options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
    options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
    options.max_total_wal_size = FLAGS_max_total_wal_size;
    if (FLAGS_min_level_to_compress >= 0) {
      assert(FLAGS_min_level_to_compress <= FLAGS_num_levels);
      options.compression_per_level.resize(FLAGS_num_levels);
      for (int i = 0; i < FLAGS_min_level_to_compress; i++) {
        options.compression_per_level[i] = kNoCompression;
      }
      for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) {
        options.compression_per_level[i] = FLAGS_compression_type_e;
      }
    }
    options.soft_rate_limit = FLAGS_soft_rate_limit;
    options.hard_rate_limit = FLAGS_hard_rate_limit;
    options.soft_pending_compaction_bytes_limit =
        FLAGS_soft_pending_compaction_bytes_limit;
    options.hard_pending_compaction_bytes_limit =
        FLAGS_hard_pending_compaction_bytes_limit;
    options.delayed_write_rate = FLAGS_delayed_write_rate;
    options.allow_concurrent_memtable_write =
        FLAGS_allow_concurrent_memtable_write;
    options.inplace_update_support = FLAGS_inplace_update_support;
    options.inplace_update_num_locks = FLAGS_inplace_update_num_locks;
    options.enable_write_thread_adaptive_yield =
        FLAGS_enable_write_thread_adaptive_yield;
    options.enable_pipelined_write = FLAGS_enable_pipelined_write;
    options.unordered_write = FLAGS_unordered_write;
    options.write_thread_max_yield_usec = FLAGS_write_thread_max_yield_usec;
    options.write_thread_slow_yield_usec = FLAGS_write_thread_slow_yield_usec;
    options.rate_limit_delay_max_milliseconds =
        FLAGS_rate_limit_delay_max_milliseconds;
    options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
    options.max_compaction_bytes = FLAGS_max_compaction_bytes;
    options.disable_auto_compactions = FLAGS_disable_auto_compactions;
    options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
    // fill storage options
    options.advise_random_on_open = FLAGS_advise_random_on_open;
    options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
    options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
    options.bytes_per_sync = FLAGS_bytes_per_sync;
    options.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
    // merge operator options
    options.merge_operator =
        MergeOperators::CreateFromStringId(FLAGS_merge_operator);
    if (options.merge_operator == nullptr && !FLAGS_merge_operator.empty()) {
      fprintf(stderr, "invalid merge operator: %s\n",
              FLAGS_merge_operator.c_str());
      exit(1);
    }
    options.max_successive_merges = FLAGS_max_successive_merges;
    options.report_bg_io_stats = FLAGS_report_bg_io_stats;
    // set universal style compaction configurations, if applicable
    if (FLAGS_universal_size_ratio != 0) {
      options.compaction_options_universal.size_ratio =
          FLAGS_universal_size_ratio;
    }
    if (FLAGS_universal_min_merge_width != 0) {
      options.compaction_options_universal.min_merge_width =
          FLAGS_universal_min_merge_width;
    }
    if (FLAGS_universal_max_merge_width != 0) {
      options.compaction_options_universal.max_merge_width =
          FLAGS_universal_max_merge_width;
    }
    if (FLAGS_universal_max_size_amplification_percent != 0) {
      options.compaction_options_universal.max_size_amplification_percent =
          FLAGS_universal_max_size_amplification_percent;
    }
    if (FLAGS_universal_compression_size_percent != -1) {
      options.compaction_options_universal.compression_size_percent =
          FLAGS_universal_compression_size_percent;
    }
    options.compaction_options_universal.allow_trivial_move =
        FLAGS_universal_allow_trivial_move;
    if (FLAGS_thread_status_per_interval > 0) {
      options.enable_thread_tracking = true;
    }
#ifndef ROCKSDB_LITE
    if (FLAGS_readonly && FLAGS_transaction_db) {
      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
      exit(1);
    }
    if (FLAGS_use_secondary_db &&
        (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
      fprintf(stderr,
              "Cannot use use_secondary_db flag with transaction_db or "
              "optimistic_transaction_db\n");
      exit(1);
    }
#endif  // ROCKSDB_LITE
  }
  void InitializeOptionsGeneral(Options* opts) {
    Options& options = *opts;
    options.create_missing_column_families = FLAGS_num_column_families > 1;
    options.statistics = dbstats;
    options.wal_dir = FLAGS_wal_dir;
    options.create_if_missing = !FLAGS_use_existing_db;
    options.dump_malloc_stats = FLAGS_dump_malloc_stats;
    options.stats_dump_period_sec =
        static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
    options.stats_persist_period_sec =
        static_cast<unsigned int>(FLAGS_stats_persist_period_sec);
    options.persist_stats_to_disk = FLAGS_persist_stats_to_disk;
    options.stats_history_buffer_size =
        static_cast<size_t>(FLAGS_stats_history_buffer_size);
    options.compression_opts.level = FLAGS_compression_level;
    options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
    options.compression_opts.zstd_max_train_bytes =
        FLAGS_compression_zstd_max_train_bytes;
    // If this is a block based table, set some related options
    if (options.table_factory->Name() == BlockBasedTableFactory::kName &&
        options.table_factory->GetOptions() != nullptr) {
      BlockBasedTableOptions* table_options =
          reinterpret_cast<BlockBasedTableOptions*>(
              options.table_factory->GetOptions());
      if (FLAGS_cache_size) {
        table_options->block_cache = cache_;
      }
      if (FLAGS_bloom_bits >= 0) {
        table_options->filter_policy.reset(NewBloomFilterPolicy(
            FLAGS_bloom_bits, FLAGS_use_block_based_filter));
      }
    }
    if (FLAGS_row_cache_size) {
      if (FLAGS_cache_numshardbits >= 1) {
        options.row_cache =
            NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
      } else {
        options.row_cache = NewLRUCache(FLAGS_row_cache_size);
      }
    }
    if (FLAGS_enable_io_prio) {
      FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
    }
    if (FLAGS_enable_cpu_prio) {
      FLAGS_env->LowerThreadPoolCPUPriority(Env::LOW);
      FLAGS_env->LowerThreadPoolCPUPriority(Env::HIGH);
    }
    options.env = FLAGS_env;
    if (FLAGS_sine_write_rate) {
      FLAGS_benchmark_write_rate_limit = static_cast<uint64_t>(SineRate(0));
    }
    if (FLAGS_rate_limiter_bytes_per_sec > 0) {
      if (FLAGS_rate_limit_bg_reads &&
          !FLAGS_new_table_reader_for_compaction_inputs) {
        fprintf(stderr,
                "rate limit compaction reads must have "
                "new_table_reader_for_compaction_inputs set\n");
        exit(1);
      }
      options.rate_limiter.reset(NewGenericRateLimiter(
          FLAGS_rate_limiter_bytes_per_sec, 100 * 1000 /* refill_period_us */,
          10 /* fairness */,
          FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
                                    : RateLimiter::Mode::kWritesOnly,
          FLAGS_rate_limiter_auto_tuned));
    }
    options.listeners.emplace_back(listener_);
    if (FLAGS_num_multi_db <= 1) {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
      multi_dbs_.resize(FLAGS_num_multi_db);
      auto wal_dir = options.wal_dir;
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
        if (!wal_dir.empty()) {
          options.wal_dir = GetPathForMultiple(wal_dir, i);
        }
        OpenDb(options, GetPathForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
      options.wal_dir = wal_dir;
    }
    // KeepFilter is a noop filter; it can be used to test compaction filters.
    if (FLAGS_use_keep_filter) {
      options.compaction_filter = new KeepFilter();
      fprintf(stdout, "A noop compaction filter is used\n");
    }
    if (FLAGS_use_existing_keys) {
      // Only works on a single database.
      assert(db_.db != nullptr);
      ReadOptions read_opts;
      read_opts.total_order_seek = true;
      Iterator* iter = db_.db->NewIterator(read_opts);
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        keys_.emplace_back(iter->key().ToString());
      }
      delete iter;
      FLAGS_num = keys_.size();
    }
  }
  void Open(Options* opts) {
    if (!InitializeOptionsFromFile(opts)) {
      InitializeOptionsFromFlags(opts);
    }
    InitializeOptionsGeneral(opts);
  }
  void OpenDb(Options options, const std::string& db_name,
              DBWithColumnFamilies* db) {
    Status s;
    // Open with column families if necessary.
    if (FLAGS_num_column_families > 1) {
      size_t num_hot = FLAGS_num_column_families;
      if (FLAGS_num_hot_column_families > 0 &&
          FLAGS_num_hot_column_families < FLAGS_num_column_families) {
        num_hot = FLAGS_num_hot_column_families;
      } else {
        FLAGS_num_hot_column_families = FLAGS_num_column_families;
      }
      std::vector<ColumnFamilyDescriptor> column_families;
      for (size_t i = 0; i < num_hot; i++) {
        column_families.push_back(ColumnFamilyDescriptor(
            ColumnFamilyName(i), ColumnFamilyOptions(options)));
      }
      std::vector<int> cfh_idx_to_prob;
      if (!FLAGS_column_family_distribution.empty()) {
        std::stringstream cf_prob_stream(FLAGS_column_family_distribution);
        std::string cf_prob;
        int sum = 0;
        while (std::getline(cf_prob_stream, cf_prob, ',')) {
          cfh_idx_to_prob.push_back(std::stoi(cf_prob));
          sum += cfh_idx_to_prob.back();
        }
        if (sum != 100) {
          fprintf(stderr, "column_family_distribution items must sum to 100\n");
          exit(1);
        }
        if (cfh_idx_to_prob.size() != num_hot) {
          fprintf(stderr,
                  "got %" ROCKSDB_PRIszt
                  " column_family_distribution items; expected "
                  "%" ROCKSDB_PRIszt "\n",
                  cfh_idx_to_prob.size(), num_hot);
          exit(1);
        }
      }
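      // Illustrative flag usage (an assumed invocation, not from this file):
      // with --num_column_families=3 --num_hot_column_families=3, passing
      // --column_family_distribution=60,30,10 sends ~60%/30%/10% of
      // operations to the three hot column families; the entries must sum
      // to 100 and their count must match the number of hot families.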
#ifndef ROCKSDB_LITE
      if (FLAGS_readonly) {
        s = DB::OpenForReadOnly(options, db_name, column_families,
                                &db->cfh, &db->db);
      } else if (FLAGS_optimistic_transaction_db) {
        s = OptimisticTransactionDB::Open(options, db_name, column_families,
                                          &db->cfh, &db->opt_txn_db);
        if (s.ok()) {
          db->db = db->opt_txn_db->GetBaseDB();
        }
      } else if (FLAGS_transaction_db) {
        TransactionDB* ptr;
        TransactionDBOptions txn_db_options;
        if (options.unordered_write) {
          options.two_write_queues = true;
          txn_db_options.skip_concurrency_control = true;
          txn_db_options.write_policy = WRITE_PREPARED;
        }
        s = TransactionDB::Open(options, txn_db_options, db_name,
                                column_families, &db->cfh, &ptr);
        if (s.ok()) {
          db->db = ptr;
        }
      } else {
        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
      }
#else
      s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
#endif  // ROCKSDB_LITE
      db->cfh.resize(FLAGS_num_column_families);
      db->num_created = num_hot;
      db->num_hot = num_hot;
      db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
#ifndef ROCKSDB_LITE
    } else if (FLAGS_readonly) {
      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else if (FLAGS_optimistic_transaction_db) {
      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
      if (s.ok()) {
        db->db = db->opt_txn_db->GetBaseDB();
      }
    } else if (FLAGS_transaction_db) {
      TransactionDB* ptr = nullptr;
      TransactionDBOptions txn_db_options;
      if (options.unordered_write) {
        options.two_write_queues = true;
        txn_db_options.skip_concurrency_control = true;
        txn_db_options.write_policy = WRITE_PREPARED;
      }
      s = CreateLoggerFromOptions(db_name, options, &options.info_log);
      if (s.ok()) {
        s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
      }
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_blob_db) {
      blob_db::BlobDBOptions blob_db_options;
      blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
      blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
      blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
      blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
      blob_db::BlobDB* ptr = nullptr;
      s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr);
      if (s.ok()) {
        db->db = ptr;
      }
    } else if (FLAGS_use_secondary_db) {
      if (FLAGS_secondary_path.empty()) {
        std::string default_secondary_path;
        FLAGS_env->GetTestDirectory(&default_secondary_path);
        default_secondary_path += "/dbbench_secondary";
        FLAGS_secondary_path = default_secondary_path;
      }
      s = DB::OpenAsSecondary(options, db_name, FLAGS_secondary_path, &db->db);
      if (s.ok() && FLAGS_secondary_update_interval > 0) {
        secondary_update_thread_.reset(new port::Thread(
            [this](int interval, DBWithColumnFamilies* _db) {
              while (0 == secondary_update_stopped_.load(
                              std::memory_order_relaxed)) {
                Status secondary_update_status =
                    _db->db->TryCatchUpWithPrimary();
                if (!secondary_update_status.ok()) {
                  fprintf(stderr, "Failed to catch up with primary: %s\n",
                          secondary_update_status.ToString().c_str());
                  break;
                }
                ++secondary_db_updates_;
                FLAGS_env->SleepForMicroseconds(interval * 1000000);
              }
            },
            FLAGS_secondary_update_interval, db));
      }
#endif  // ROCKSDB_LITE
    } else {
      s = DB::Open(options, db_name, &db->db);
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
      exit(1);
    }
  }
  enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM };

  void WriteSeqDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL);
  }

  void WriteUniqueRandomDeterministic(ThreadState* thread) {
    DoDeterministicCompact(thread, open_options_.compaction_style,
                           UNIQUE_RANDOM);
  }

  void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); }

  void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); }

  void WriteUniqueRandom(ThreadState* thread) {
    DoWrite(thread, UNIQUE_RANDOM);
  }
  class KeyGenerator {
   public:
    KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,
                 uint64_t /*num_per_set*/ = 64 * 1024)
        : rand_(rand), mode_(mode), num_(num), next_(0) {
      if (mode_ == UNIQUE_RANDOM) {
        // NOTE: if memory consumption of this approach becomes a concern,
        // we can either break it into pieces and only random shuffle a section
        // each time. Alternatively, use a bit map implementation
        // (https://reviews.facebook.net/differential/diff/54627/)
        values_.resize(num_);
        for (uint64_t i = 0; i < num_; ++i) {
          values_[i] = i;
        }
        std::shuffle(
            values_.begin(), values_.end(),
            std::default_random_engine(static_cast<unsigned int>(FLAGS_seed)));
      }
    }

    uint64_t Next() {
      switch (mode_) {
        case SEQUENTIAL:
          return next_++;
        case RANDOM:
          return rand_->Next() % num_;
        case UNIQUE_RANDOM:
          assert(next_ < num_);
          return values_[next_++];
      }
      assert(false);
      return std::numeric_limits<uint64_t>::max();
    }

   private:
    Random64* rand_;
    WriteMode mode_;
    const uint64_t num_;
    uint64_t next_;
    std::vector<uint64_t> values_;
  };
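  // Illustrative use of KeyGenerator (a sketch, not called from this file):
  // drawing `num` keys in UNIQUE_RANDOM mode visits every key in [0, num)
  // exactly once, in a shuffled order that is reproducible for a fixed
  // --seed:
  //
  //   Random64 rng(FLAGS_seed);
  //   KeyGenerator gen(&rng, UNIQUE_RANDOM, /*num=*/1000);
  //   for (uint64_t i = 0; i < 1000; ++i) {
  //     uint64_t key_id = gen.Next();  // a permutation of 0..999
  //   }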
  DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; }

  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
    return SelectDBWithCfh(thread->rand.Next());
  }

  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
    if (db_.db != nullptr) {
      return &db_;
    } else {
      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
  }

  double SineRate(double x) {
    return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d;
  }
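  // SineRate models the target write rate as rate(x) = a*sin(b*x + c) + d,
  // with x in seconds. For example (values assumed, not defaults): with
  // --sine_a=1000 --sine_b=1 --sine_c=0 --sine_d=2000 the rate oscillates
  // between 1000 and 3000 bytes/sec with a period of 2*pi seconds.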
  void DoWrite(ThreadState* thread, WriteMode write_mode) {
    const int test_duration = write_mode == RANDOM ? FLAGS_duration : 0;
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;
    size_t num_key_gens = 1;
    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
    int64_t max_ops = num_ops * num_key_gens;
    int64_t ops_per_stage = max_ops;
    if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) {
      ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families /
                                       FLAGS_num_hot_column_families) +
                      1;
    }
    Duration duration(test_duration, max_ops, ops_per_stage);
    for (size_t i = 0; i < num_key_gens; i++) {
      key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode,
                                         num_ + max_num_range_tombstones_,
                                         ops_per_stage));
    }
    if (num_ != FLAGS_num) {
      char msg[100];
      snprintf(msg, sizeof(msg), "(%" PRIu64 " ops)", num_);
      thread->stats.AddMessage(msg);
    }
    RandomGenerator gen;
    WriteBatch batch;
    Status s;
    int64_t bytes = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::unique_ptr<const char[]> begin_key_guard;
    Slice begin_key = AllocateKey(&begin_key_guard);
    std::unique_ptr<const char[]> end_key_guard;
    Slice end_key = AllocateKey(&end_key_guard);
    std::vector<std::unique_ptr<const char[]>> expanded_key_guards;
    std::vector<Slice> expanded_keys;
    if (FLAGS_expand_range_tombstones) {
      expanded_key_guards.resize(range_tombstone_width_);
      for (auto& expanded_key_guard : expanded_key_guards) {
        expanded_keys.emplace_back(AllocateKey(&expanded_key_guard));
      }
    }
    int64_t stage = 0;
    int64_t num_written = 0;
    while (!duration.Done(entries_per_batch_)) {
      if (duration.GetStage() != stage) {
        stage = duration.GetStage();
        if (db_.db != nullptr) {
          db_.CreateNewCf(open_options_, stage);
        } else {
          for (auto& db : multi_dbs_) {
            db.CreateNewCf(open_options_, stage);
          }
        }
      }
      size_t id = thread->rand.Next() % num_key_gens;
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
      batch.Clear();
      int64_t batch_bytes = 0;
      for (int64_t j = 0; j < entries_per_batch_; j++) {
        int64_t rand_num = key_gens[id]->Next();
        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
        Slice val = gen.Generate();
        if (use_blob_db_) {
#ifndef ROCKSDB_LITE
          blob_db::BlobDB* blobdb =
              static_cast<blob_db::BlobDB*>(db_with_cfh->db);
          if (FLAGS_blob_db_max_ttl_range > 0) {
            int ttl = rand() % FLAGS_blob_db_max_ttl_range;
            s = blobdb->PutWithTTL(write_options_, key, val, ttl);
          } else {
            s = blobdb->Put(write_options_, key, val);
          }
#endif  // ROCKSDB_LITE
        } else if (FLAGS_num_column_families <= 1) {
          batch.Put(key, val);
        } else {
          // We use the same rand_num as the seed for key and column family so
          // that we can deterministically find the cfh corresponding to a
          // particular key while reading the key.
          batch.Put(db_with_cfh->GetCfh(rand_num), key, val);
        }
        batch_bytes += val.size() + key_size_;
        bytes += val.size() + key_size_;
        ++num_written;
        if (writes_per_range_tombstone_ > 0 &&
            num_written > writes_before_delete_range_ &&
            (num_written - writes_before_delete_range_) /
                    writes_per_range_tombstone_ <=
                max_num_range_tombstones_ &&
            (num_written - writes_before_delete_range_) %
                    writes_per_range_tombstone_ ==
                0) {
          int64_t begin_num = key_gens[id]->Next();
          if (FLAGS_expand_range_tombstones) {
            for (int64_t offset = 0; offset < range_tombstone_width_;
                 ++offset) {
              GenerateKeyFromInt(begin_num + offset, FLAGS_num,
                                 &expanded_keys[offset]);
              if (use_blob_db_) {
#ifndef ROCKSDB_LITE
                s = db_with_cfh->db->Delete(write_options_,
                                            expanded_keys[offset]);
#endif  // ROCKSDB_LITE
              } else if (FLAGS_num_column_families <= 1) {
                batch.Delete(expanded_keys[offset]);
              } else {
                batch.Delete(db_with_cfh->GetCfh(rand_num),
                             expanded_keys[offset]);
              }
            }
          } else {
            GenerateKeyFromInt(begin_num, FLAGS_num, &begin_key);
            GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num,
                               &end_key);
            if (use_blob_db_) {
#ifndef ROCKSDB_LITE
              s = db_with_cfh->db->DeleteRange(
                  write_options_, db_with_cfh->db->DefaultColumnFamily(),
                  begin_key, end_key);
#endif  // ROCKSDB_LITE
            } else if (FLAGS_num_column_families <= 1) {
              batch.DeleteRange(begin_key, end_key);
            } else {
              batch.DeleteRange(db_with_cfh->GetCfh(rand_num), begin_key,
                                end_key);
            }
          }
        }
      }
      if (thread->shared->write_rate_limiter.get() != nullptr) {
        thread->shared->write_rate_limiter->Request(
            batch_bytes, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kWrite);
        // Set time at which last op finished to Now() to hide latency and
        // sleep from rate limiter. Also, do the check once per batch, not
        // once per write.
        thread->stats.ResetLastOpTime();
      }
      if (!use_blob_db_) {
        s = db_with_cfh->db->Write(write_options_, &batch);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db,
                                entries_per_batch_, kWrite);
      if (FLAGS_sine_write_rate) {
        uint64_t now = FLAGS_env->NowMicros();
        uint64_t usecs_since_last;
        if (now > thread->stats.GetSineInterval()) {
          usecs_since_last = now - thread->stats.GetSineInterval();
        } else {
          usecs_since_last = 0;
        }
        if (usecs_since_last >
            (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) {
          double usecs_since_start =
              static_cast<double>(now - thread->stats.GetStart());
          thread->stats.ResetSineInterval();
          uint64_t write_rate =
              static_cast<uint64_t>(SineRate(usecs_since_start / 1000000.0));
          thread->shared->write_rate_limiter.reset(
              NewGenericRateLimiter(write_rate));
        }
      }
      if (!s.ok()) {
        s = listener_->WaitForRecovery(600000000) ? Status::OK() : s;
      }
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
    }
    thread->stats.AddBytes(bytes);
  }
  Status DoDeterministicCompact(ThreadState* thread,
                                CompactionStyle compaction_style,
                                WriteMode write_mode) {
#ifndef ROCKSDB_LITE
    ColumnFamilyMetaData meta;
    std::vector<DB*> db_list;
    if (db_.db != nullptr) {
      db_list.push_back(db_.db);
    } else {
      for (auto& db : multi_dbs_) {
        db_list.push_back(db.db);
      }
    }
    std::vector<Options> options_list;
    for (auto db : db_list) {
      options_list.push_back(db->GetOptions());
      if (compaction_style != kCompactionStyleFIFO) {
        db->SetOptions({{"disable_auto_compactions", "1"},
                        {"level0_slowdown_writes_trigger", "400000000"},
                        {"level0_stop_writes_trigger", "400000000"}});
      } else {
        db->SetOptions({{"disable_auto_compactions", "1"}});
      }
    }
    assert(!db_list.empty());
    auto num_db = db_list.size();
    size_t num_levels = static_cast<size_t>(open_options_.num_levels);
    size_t output_level = open_options_.num_levels - 1;
    std::vector<std::vector<std::vector<SstFileMetaData>>> sorted_runs(num_db);
    std::vector<size_t> num_files_at_level0(num_db, 0);
    if (compaction_style == kCompactionStyleLevel) {
      if (num_levels <= 1) {
        return Status::InvalidArgument("num_levels should be larger than 1");
      }
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
          if (sorted_runs[i].size() == output_level) {
            auto& L1 = sorted_runs[i].back();
            L1.erase(L1.begin(), L1.begin() + L1.size() / 3);
            should_stop = true;
            continue;
          }
        }
        writes_ /=
            static_cast<int64_t>(open_options_.max_bytes_for_level_multiplier);
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels - 1) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          std::cout << sorted_runs[i][j].size() << std::endl;
          db->CompactFiles(compactionOptions,
                           {sorted_runs[i][j].back().name,
                            sorted_runs[i][j].front().name},
                           static_cast<int>(output_level - j) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleUniversal) {
      auto ratio = open_options_.compaction_options_universal.size_ratio;
      bool should_stop = false;
      while (!should_stop) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        for (size_t i = 0; i < num_db; i++) {
          auto db = db_list[i];
          db->Flush(FlushOptions());
          db->GetColumnFamilyMetaData(&meta);
          if (num_files_at_level0[i] == meta.levels[0].files.size() ||
              writes_ == 0) {
            should_stop = true;
            continue;
          }
          sorted_runs[i].emplace_back(
              meta.levels[0].files.begin(),
              meta.levels[0].files.end() - num_files_at_level0[i]);
          num_files_at_level0[i] = meta.levels[0].files.size();
          if (sorted_runs[i].back().size() == 1) {
            should_stop = true;
            continue;
          }
        }
        writes_ = static_cast<int64_t>(writes_ * static_cast<double>(100) /
                                       (ratio + 200));
      }
      for (size_t i = 0; i < num_db; i++) {
        if (sorted_runs[i].size() < num_levels) {
          fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                  num_levels);
          exit(1);
        }
      }
      for (size_t i = 0; i < num_db; i++) {
        auto db = db_list[i];
        auto compactionOptions = CompactionOptions();
        compactionOptions.compression = FLAGS_compression_type_e;
        auto options = db->GetOptions();
        MutableCFOptions mutable_cf_options(options);
        for (size_t j = 0; j < sorted_runs[i].size(); j++) {
          compactionOptions.output_file_size_limit = MaxFileSizeForLevel(
              mutable_cf_options, static_cast<int>(output_level),
              compaction_style);
          db->CompactFiles(
              compactionOptions,
              {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name},
              (output_level > j ? static_cast<int>(output_level - j)
                                : 0) /*level*/);
        }
      }
    } else if (compaction_style == kCompactionStyleFIFO) {
      if (num_levels != 1) {
        return Status::InvalidArgument(
            "num_levels should be 1 for FIFO compaction");
      }
      if (FLAGS_num_multi_db != 0) {
        return Status::InvalidArgument("Doesn't support multiDB");
      }
      auto db = db_list[0];
      std::vector<std::string> file_names;
      while (true) {
        if (sorted_runs[0].empty()) {
          DoWrite(thread, write_mode);
        } else {
          DoWrite(thread, UNIQUE_RANDOM);
        }
        db->Flush(FlushOptions());
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        if (total_size >=
            db->GetOptions().compaction_options_fifo.max_table_files_size) {
          for (auto file_meta : meta.levels[0].files) {
            file_names.emplace_back(file_meta.name);
          }
          break;
        }
      }
      // TODO(shuzhang1989): Investigate why CompactFiles is not working
      // auto compactionOptions = CompactionOptions();
      // db->CompactFiles(compactionOptions, file_names, 0);
      auto compactionOptions = CompactRangeOptions();
      db->CompactRange(compactionOptions, nullptr, nullptr);
    } else {
      fprintf(stdout,
              "%-12s : skipped (-compaction_style=kCompactionStyleNone)\n",
              "filldeterministic");
      return Status::InvalidArgument("None compaction is not supported");
    }
    // Verify seqno and key range
    // Note: the seqno gets changed at the max level by an implementation
    // optimization, so skip the check of the max level.
#ifndef NDEBUG
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      db->GetColumnFamilyMetaData(&meta);
      // verify the number of sorted runs
      if (compaction_style == kCompactionStyleLevel) {
        assert(num_levels - 1 == sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleUniversal) {
        assert(meta.levels[0].files.size() + num_levels - 1 ==
               sorted_runs[k].size());
      } else if (compaction_style == kCompactionStyleFIFO) {
        // TODO(gzh): FIFO compaction
        db->GetColumnFamilyMetaData(&meta);
        auto total_size = meta.levels[0].size;
        assert(total_size <=
               db->GetOptions().compaction_options_fifo.max_table_files_size);
        break;
      }
      // verify smallest/largest seqno and key range of each sorted run
      auto max_level = num_levels - 1;
      int level;
      for (size_t i = 0; i < sorted_runs[k].size(); i++) {
        level = static_cast<int>(max_level - i);
        SequenceNumber sorted_run_smallest_seqno = kMaxSequenceNumber;
        SequenceNumber sorted_run_largest_seqno = 0;
        std::string sorted_run_smallest_key, sorted_run_largest_key;
        bool first_key = true;
        for (auto fileMeta : sorted_runs[k][i]) {
          sorted_run_smallest_seqno =
              std::min(sorted_run_smallest_seqno, fileMeta.smallest_seqno);
          sorted_run_largest_seqno =
              std::max(sorted_run_largest_seqno, fileMeta.largest_seqno);
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.smallestkey, sorted_run_smallest_key) < 0) {
            sorted_run_smallest_key = fileMeta.smallestkey;
          }
          if (first_key ||
              db->DefaultColumnFamily()->GetComparator()->Compare(
                  fileMeta.largestkey, sorted_run_largest_key) > 0) {
            sorted_run_largest_key = fileMeta.largestkey;
          }
          first_key = false;
        }
        if (compaction_style == kCompactionStyleLevel ||
            (compaction_style == kCompactionStyleUniversal && level > 0)) {
          SequenceNumber level_smallest_seqno = kMaxSequenceNumber;
          SequenceNumber level_largest_seqno = 0;
          for (auto fileMeta : meta.levels[level].files) {
            level_smallest_seqno =
                std::min(level_smallest_seqno, fileMeta.smallest_seqno);
            level_largest_seqno =
                std::max(level_largest_seqno, fileMeta.largest_seqno);
          }
          assert(sorted_run_smallest_key ==
                 meta.levels[level].files.front().smallestkey);
          assert(sorted_run_largest_key ==
                 meta.levels[level].files.back().largestkey);
          if (level != static_cast<int>(max_level)) {
            // compaction at max_level would change sequence numbers
            assert(sorted_run_smallest_seqno == level_smallest_seqno);
            assert(sorted_run_largest_seqno == level_largest_seqno);
          }
        } else if (compaction_style == kCompactionStyleUniversal) {
          // level <= 0 means sorted runs on level 0
          auto level0_file =
              meta.levels[0].files[sorted_runs[k].size() - 1 - i];
          assert(sorted_run_smallest_key == level0_file.smallestkey);
          assert(sorted_run_largest_key == level0_file.largestkey);
          if (level != static_cast<int>(max_level)) {
            assert(sorted_run_smallest_seqno == level0_file.smallest_seqno);
            assert(sorted_run_largest_seqno == level0_file.largest_seqno);
          }
        }
      }
    }
#endif
    // print the size of each sorted_run
    for (size_t k = 0; k < num_db; k++) {
      auto db = db_list[k];
      fprintf(stdout,
              "---------------------- DB %" ROCKSDB_PRIszt
              " LSM ---------------------\n",
              k);
      db->GetColumnFamilyMetaData(&meta);
      for (auto& levelMeta : meta.levels) {
        if (levelMeta.files.empty()) {
          continue;
        }
        if (levelMeta.level == 0) {
          for (auto& fileMeta : levelMeta.files) {
            fprintf(stdout, "Level[%d]: %s(size: %" ROCKSDB_PRIszt " bytes)\n",
                    levelMeta.level, fileMeta.name.c_str(), fileMeta.size);
          }
        } else {
          fprintf(stdout, "Level[%d]: %s - %s(total size: %" PRIi64 " bytes)\n",
                  levelMeta.level, levelMeta.files.front().name.c_str(),
                  levelMeta.files.back().name.c_str(), levelMeta.size);
        }
      }
    }
    for (size_t i = 0; i < num_db; i++) {
      db_list[i]->SetOptions(
          {{"disable_auto_compactions",
            std::to_string(options_list[i].disable_auto_compactions)},
           {"level0_slowdown_writes_trigger",
            std::to_string(options_list[i].level0_slowdown_writes_trigger)},
           {"level0_stop_writes_trigger",
            std::to_string(options_list[i].level0_stop_writes_trigger)}});
    }
    return Status::OK();
#else
    (void)thread;
    (void)compaction_style;
    (void)write_mode;
    fprintf(stderr, "RocksDB Lite doesn't support filldeterministic\n");
    return Status::NotSupported(
        "RocksDB Lite doesn't support filldeterministic");
#endif  // ROCKSDB_LITE
  }
  void ReadSequential(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadSequential(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }

  void ReadSequential(ThreadState* thread, DB* db) {
    ReadOptions options(FLAGS_verify_checksum, true);
    options.tailing = FLAGS_use_tailing_iterator;
    Iterator* iter = db->NewIterator(options);
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
  void ReadToRowCache(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int64_t key_rand = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    while (key_rand < FLAGS_num) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use the same key_rand as the seed for key and column family so
      // that we can deterministically find the cfh corresponding to a
      // particular key, as is done in the DoWrite method. (key_rand is
      // advanced only after the Get so the cfh seed matches the key.)
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                 &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(options,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }
      key_rand++;
      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
  void ReadReverse(ThreadState* thread) {
    if (db_.db != nullptr) {
      ReadReverse(thread, db_.db);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }

  void ReadReverse(ThreadState* thread, DB* db) {
    Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true));
    int64_t i = 0;
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
      thread->stats.FinishedOps(nullptr, db, 1, kRead);
      ++i;
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          i % 1024 == 1023) {
        thread->shared->read_rate_limiter->Request(1024, Env::IO_HIGH,
                                                   nullptr /* stats */,
                                                   RateLimiter::OpType::kRead);
      }
    }
    delete iter;
    thread->stats.AddBytes(bytes);
  }
  void ReadRandomFast(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t nonexist = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::string value;
    DB* db = SelectDBWithCfh(thread)->db;
    int64_t pot = 1;
    while (pot < FLAGS_num) {
      pot <<= 1;
    }
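    // pot is now the smallest power of two >= FLAGS_num, so masking with
    // (pot - 1) below is a cheap substitute for a modulo. For example (an
    // assumed setting), FLAGS_num == 1000000 gives pot == 1048576, and
    // roughly (pot - FLAGS_num) / pot of the drawn keys fall at or past
    // FLAGS_num and are counted as non-existent.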
    Duration duration(FLAGS_duration, reads_);
    do {
      for (int i = 0; i < 100; ++i) {
        int64_t key_rand = thread->rand.Next() & (pot - 1);
        GenerateKeyFromInt(key_rand, FLAGS_num, &key);
        ++read;
        auto status = db->Get(options, key, &value);
        if (status.ok()) {
          ++found;
        } else if (!status.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n",
                  status.ToString().c_str());
          abort();
        }
        if (key_rand >= FLAGS_num) {
          ++nonexist;
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            100, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, 100, kRead);
    } while (!duration.Done(100));

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(%" PRIu64 " of %" PRIu64 " found, "
             "issued %" PRIu64 " non-exist keys)\n",
             found, read, nonexist);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
  int64_t GetRandomKey(Random64* rand) {
    uint64_t rand_int = rand->Next();
    int64_t key_rand;
    if (read_random_exp_range_ == 0) {
      key_rand = rand_int % FLAGS_num;
    } else {
      const uint64_t kBigInt = static_cast<uint64_t>(1U) << 62;
      long double order = -static_cast<long double>(rand_int % kBigInt) /
                          static_cast<long double>(kBigInt) *
                          read_random_exp_range_;
      long double exp_ran = std::exp(order);
      uint64_t rand_num =
          static_cast<uint64_t>(exp_ran * static_cast<long double>(FLAGS_num));
      // Map to a different number to avoid locality.
      const uint64_t kBigPrime = 0x5bd1e995;
      // Overflow behaves like % (2^64) and has little impact on the results.
      key_rand = static_cast<int64_t>((rand_num * kBigPrime) % FLAGS_num);
    }
    return key_rand;
  }
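  // How the exponential mapping above skews access (a worked example with an
  // assumed value, not a default): with read_random_exp_range_ == 2.0,
  // `order` is uniform in (-2, 0], so exp(order) lies in (e^-2, 1] ~=
  // (0.135, 1]; multiplying by FLAGS_num concentrates draws toward the high
  // end of the key range, and the multiply by kBigPrime then scatters that
  // hot range across the key space to avoid physical locality.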
  void ReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    int num_keys = 0;
    int64_t key_rand = GetRandomKey(&thread->rand);
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      // We use the same key_rand as the seed for key and column family so
      // that we can deterministically find the cfh corresponding to a
      // particular key, as is done in the DoWrite method.
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      if (entries_per_batch_ > 1 && FLAGS_multiread_stride) {
        if (++num_keys == entries_per_batch_) {
          num_keys = 0;
          key_rand = GetRandomKey(&thread->rand);
          if ((key_rand + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
              FLAGS_num) {
            key_rand = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
          }
        } else {
          key_rand += FLAGS_multiread_stride;
        }
      } else {
        key_rand = GetRandomKey(&thread->rand);
      }
      read++;
      Status s;
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                 &pinnable_val);
      } else {
        pinnable_val.Reset();
        s = db_with_cfh->db->Get(options,
                                 db_with_cfh->db->DefaultColumnFamily(), key,
                                 &pinnable_val);
      }
      if (s.ok()) {
        found++;
        bytes += key.size() + pinnable_val.size();
      } else if (!s.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
        abort();
      }
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
  // Calls MultiGet over a list of keys from a random distribution.
  // Reports the total number of keys found in the stats message.
  void MultiReadRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t num_multireads = 0;
    int64_t found = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    std::vector<Slice> keys;
    std::vector<std::unique_ptr<const char[]>> key_guards;
    std::vector<std::string> values(entries_per_batch_);
    PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_];
    std::unique_ptr<PinnableSlice[]> pin_values_guard(pin_values);
    std::vector<Status> stat_list(entries_per_batch_);
    while (static_cast<int64_t>(keys.size()) < entries_per_batch_) {
      key_guards.push_back(std::unique_ptr<const char[]>());
      keys.push_back(AllocateKey(&key_guards.back()));
    }
    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      if (FLAGS_multiread_stride) {
        int64_t key = GetRandomKey(&thread->rand);
        if ((key + (entries_per_batch_ - 1) * FLAGS_multiread_stride) >=
            static_cast<int64_t>(FLAGS_num)) {
          key = FLAGS_num - entries_per_batch_ * FLAGS_multiread_stride;
        }
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(key, FLAGS_num, &keys[i]);
          key += FLAGS_multiread_stride;
        }
      } else {
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          GenerateKeyFromInt(GetRandomKey(&thread->rand), FLAGS_num, &keys[i]);
        }
      }
      if (!FLAGS_multiread_batched) {
        std::vector<Status> statuses = db->MultiGet(options, keys, &values);
        assert(static_cast<int64_t>(statuses.size()) == entries_per_batch_);
        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (statuses[i].ok()) {
            ++found;
          } else if (!statuses[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    statuses[i].ToString().c_str());
            abort();
          }
        }
      } else {
        db->MultiGet(options, db->DefaultColumnFamily(), keys.size(),
                     keys.data(), pin_values, stat_list.data());
        read += entries_per_batch_;
        num_multireads++;
        for (int64_t i = 0; i < entries_per_batch_; ++i) {
          if (stat_list[i].ok()) {
            ++found;
          } else if (!stat_list[i].IsNotFound()) {
            fprintf(stderr, "MultiGet returned an error: %s\n",
                    stat_list[i].ToString().c_str());
            abort();
          }
          stat_list[i] = Status::OK();
          pin_values[i].Reset();
        }
      }
      if (thread->shared->read_rate_limiter.get() != nullptr &&
          num_multireads % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256 * entries_per_batch_, Env::IO_HIGH, nullptr /* stats */,
            RateLimiter::OpType::kRead);
      }
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kRead);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddMessage(msg);
  }
  // The inverse CDF of the Pareto distribution
  int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
    double ret;
    if (k == 0.0) {
      ret = theta - sigma * std::log(u);
    } else {
      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
    }
    return static_cast<int64_t>(ceil(ret));
  }
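  // For the generalized Pareto CDF F(x) = 1 - (1 + k*(x - theta)/sigma)^(-1/k)
  // (reducing to 1 - exp(-(x - theta)/sigma) when k == 0), solving F(x) = 1-u
  // for x gives exactly the two branches above. A worked example with assumed
  // inputs u = 0.5, theta = 0, k = 0, sigma = 226: the result is
  // -226 * ln(0.5) ~= 157, i.e. the distribution's median.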
  // The inverse function of the power distribution (y = a * x^b)
  int64_t PowerCdfInversion(double u, double a, double b) {
    double ret;
    ret = std::pow((u / a), (1 / b));
    return static_cast<int64_t>(ceil(ret));
  }
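  // Derivation: from y = a * x^b, x = (y / a)^(1/b). A worked example with
  // assumed inputs a = 0.002, b = 0.4: for u = 0.8,
  // x = (0.8 / 0.002)^(1/0.4) = 400^2.5 ~= 3.2e6, so larger u maps to much
  // larger key IDs.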
  // Add noise to the QPS
  double AddNoise(double origin, double noise_ratio) {
    if (noise_ratio < 0.0 || noise_ratio > 1.0) {
      return origin;
    }
    int band_int = static_cast<int>(FLAGS_sine_a);
    double delta = (rand() % band_int - band_int / 2) * noise_ratio;
    if (origin + delta < 0) {
      return origin;
    } else {
      return (origin + delta);
    }
  }
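  // Example (assumed values): with FLAGS_sine_a == 1000 and
  // noise_ratio == 0.5, delta is drawn from roughly [-250, +250), so an
  // origin QPS of 2000 becomes a value in about [1750, 2250).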
  // Decide the ratio of different query types
  // 0 Get, 1 Put, 2 Seek, 3 SeekForPrev, 4 Delete, 5 SingleDelete, 6 Merge
  class QueryDecider {
   public:
    std::vector<int> type_;
    std::vector<double> ratio_;
    int range_;

    QueryDecider() {}
    ~QueryDecider() {}

    Status Initiate(std::vector<double> ratio_input) {
      int range_max = 1000;
      double sum = 0.0;
      for (auto& ratio : ratio_input) {
        sum += ratio;
      }
      range_ = 0;
      for (auto& ratio : ratio_input) {
        range_ += static_cast<int>(ceil(range_max * (ratio / sum)));
        type_.push_back(range_);
        ratio_.push_back(ratio / sum);
      }
      return Status::OK();
    }

    int GetType(int64_t rand_num) {
      if (rand_num < 0) {
        rand_num = rand_num * (-1);
      }
      assert(range_ != 0);
      int pos = static_cast<int>(rand_num % range_);
      for (int i = 0; i < static_cast<int>(type_.size()); i++) {
        if (pos < type_[i]) {
          return i;
        }
      }
      return 0;
    }
  };
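  // Illustrative use of QueryDecider (a sketch, not called from this file):
  // with assumed ratios {80, 15, 5} for Get/Put/Seek, Initiate builds the
  // cumulative buckets {800, 950, 1000} over range_ == 1000, so GetType
  // returns 0 for ~80% of random values, 1 for ~15%, and 2 for ~5%:
  //
  //   QueryDecider decider;
  //   decider.Initiate({80.0, 15.0, 5.0});
  //   int query_type = decider.GetType(thread->rand.Next() % 1000);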
  // KeyrangeUnit is the struct of a keyrange. It is used in a keyrange vector
  // to map a random value to one keyrange based on the hotness.
  struct KeyrangeUnit {
    int64_t keyrange_start;
    int64_t keyrange_access;
    int64_t keyrange_keys;
  };
  // From our observations, the prefix hotness (key-range hotness) follows
  // the two-term exponential distribution: f(x) = a*exp(b*x) + c*exp(d*x).
  // However, we cannot directly use the inverse function to pick a
  // key-range from a random distribution. To achieve this, we create a list
  // of KeyrangeUnit; each KeyrangeUnit occupies a range of integers whose
  // size is decided based on the hotness of the key-range. When a random
  // value is generated from the uniform distribution, we map it to the
  // KeyrangeUnit vector and one KeyrangeUnit is selected. The probability of
  // a KeyrangeUnit being selected equals the hotness of this KeyrangeUnit.
  // After that, the key can be randomly allocated within the key-range of
  // this KeyrangeUnit, or we can use the power distribution (y = a*x^b) to
  // generate the offset of the key in the selected key-range. In this way,
  // we generate the key ID based on the hotness of the prefix as well as the
  // key hotness distribution.
  class GenerateTwoTermExpKeys {
   public:
    int64_t keyrange_rand_max_;
    int64_t keyrange_size_;
    int64_t keyrange_num_;
    bool initiated_;
    std::vector<KeyrangeUnit> keyrange_set_;

    GenerateTwoTermExpKeys() {
      keyrange_rand_max_ = FLAGS_num;
      initiated_ = false;
    }
    ~GenerateTwoTermExpKeys() {}
    // Initiate the KeyrangeUnit vector and calculate the size of each
    // KeyrangeUnit.
    Status InitiateExpDistribution(int64_t total_keys, double prefix_a,
                                   double prefix_b, double prefix_c,
                                   double prefix_d) {
      int64_t amplify = 0;
      int64_t keyrange_start = 0;
      initiated_ = true;
      if (FLAGS_keyrange_num <= 0) {
        keyrange_num_ = 1;
      } else {
        keyrange_num_ = FLAGS_keyrange_num;
      }
      keyrange_size_ = total_keys / keyrange_num_;

      // Calculate each key-range's share based on the input parameters
      for (int64_t pfx = keyrange_num_; pfx >= 1; pfx--) {
        // Step 1. Calculate the probability that this key-range will be
        // accessed in a query. It is based on the two-term exponential
        // distribution.
        double keyrange_p = prefix_a * std::exp(prefix_b * pfx) +
                            prefix_c * std::exp(prefix_d * pfx);
        if (keyrange_p < std::pow(10.0, -16.0)) {
          keyrange_p = 0.0;
        }
        // Step 2. Calculate the amplify factor.
        // In order to allocate a query to a key-range based on the random
        // number generated for this query, we need to extend the probability
        // of each key-range from [0,1] to [0, amplify]. Amplify is calculated
        // as 1/(smallest key-range probability). In this way, we ensure that
        // every key-range is assigned an integer that is >= 0.
        if (amplify == 0 && keyrange_p > 0) {
          amplify = static_cast<int64_t>(std::floor(1 / keyrange_p)) + 1;
        }
        // Step 3. For each key-range, calculate its position in the
        // [0, amplify] range, including the start and the size
        // (keyrange_access).
        KeyrangeUnit p_unit;
        p_unit.keyrange_start = keyrange_start;
        if (0.0 >= keyrange_p) {
          p_unit.keyrange_access = 0;
        } else {
          p_unit.keyrange_access =
              static_cast<int64_t>(std::floor(amplify * keyrange_p));
        }
        p_unit.keyrange_keys = keyrange_size_;
        keyrange_set_.push_back(p_unit);
        keyrange_start += p_unit.keyrange_access;
      }
      keyrange_rand_max_ = keyrange_start;

      // Step 4. Shuffle the key-ranges randomly.
      // Since the access probability is calculated from small to large, if
      // we did not re-allocate them, hot key-ranges would always sit at the
      // end and cold key-ranges at the beginning of the key space. Therefore,
      // the key-ranges are shuffled, and the random seed is decided only by
      // the key-range hotness distribution. With the same distribution
      // parameters, the shuffle results are the same.
      Random64 rand_loca(keyrange_rand_max_);
      for (int64_t i = 0; i < FLAGS_keyrange_num; i++) {
        int64_t pos = rand_loca.Next() % FLAGS_keyrange_num;
        assert(i >= 0 && i < static_cast<int64_t>(keyrange_set_.size()) &&
               pos >= 0 && pos < static_cast<int64_t>(keyrange_set_.size()));
        std::swap(keyrange_set_[i], keyrange_set_[pos]);
      }

      // Step 5. Recalculate the prefix start position after shuffling.
      int64_t offset = 0;
      for (auto& p_unit : keyrange_set_) {
        p_unit.keyrange_start = offset;
        offset += p_unit.keyrange_access;
      }
      return Status::OK();
    }
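    // Worked example (illustrative only): with three key-ranges whose raw
    // probabilities are {0.7, 0.2, 0.1}, the smallest probability is 0.1,
    // so amplify = floor(1/0.1) + 1 = 11 and the integer shares become
    // floor(11*0.7)=7, floor(11*0.2)=2, floor(11*0.1)=1. A uniform random
    // value in [0, 7+2+1) then lands in a key-range with roughly the
    // intended probability. The hypothetical helper below, not called by
    // db_bench, just restates that arithmetic.
    static int64_t ExampleAmplifiedShare(double p, double p_min) {
      int64_t amplify = static_cast<int64_t>(std::floor(1 / p_min)) + 1;
      return static_cast<int64_t>(std::floor(amplify * p));
    }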
    // Generate the Key ID according to the input ini_rand and key
    // distribution.
    int64_t DistGetKeyID(int64_t ini_rand, double key_dist_a,
                         double key_dist_b) {
      int64_t keyrange_rand = ini_rand % keyrange_rand_max_;

      // Calculate and select one key-range that contains the new key
      int64_t start = 0, end = static_cast<int64_t>(keyrange_set_.size());
      while (start + 1 < end) {
        int64_t mid = start + (end - start) / 2;
        assert(mid >= 0 && mid < static_cast<int64_t>(keyrange_set_.size()));
        if (keyrange_rand < keyrange_set_[mid].keyrange_start) {
          end = mid;
        } else {
          start = mid;
        }
      }
      int64_t keyrange_id = start;

      // Select one key in the key-range and compose the keyID
      int64_t key_offset = 0, key_seed;
      if (key_dist_a == 0.0 && key_dist_b == 0.0) {
        key_offset = ini_rand % keyrange_size_;
      } else {
        key_seed = static_cast<int64_t>(
            ceil(std::pow((ini_rand / key_dist_a), (1 / key_dist_b))));
        Random64 rand_key(key_seed);
        key_offset = static_cast<int64_t>(rand_key.Next()) % keyrange_size_;
      }
      return keyrange_size_ * keyrange_id + key_offset;
    }
  };
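  // Illustrative sketch of the inversion used inside DistGetKeyID above:
  // key hotness is modeled as y = a * x^b, so given a value y drawn for a
  // query, x = (y / a)^(1/b) recovers the key seed. This free-standing
  // restatement is hypothetical and not called by any benchmark; it assumes
  // a > 0 and b != 0.
  static int64_t ExamplePowerInversion(int64_t y, double a, double b) {
    // Inverse of y = a * x^b.
    return static_cast<int64_t>(std::ceil(std::pow(y / a, 1 / b)));
  }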
  // The social graph workload mixed with Get, Put, and Iterator queries.
  // The value size and iterator length follow the Pareto distribution.
  // The overall key access follows the power distribution. If the user
  // models the workload based on different key-ranges (or different
  // prefixes), the two-term exponential distribution can be used to fit the
  // workload. The user needs to decide the ratio between Get, Put, and
  // Iterator queries before starting the benchmark.
  void MixGraph(ThreadState* thread) {
    int64_t read = 0;  // including single gets and Next of iterators
    int64_t gets = 0;
    int64_t puts = 0;
    int64_t found = 0;
    int64_t seek = 0;
    int64_t seek_found = 0;
    int64_t bytes = 0;
    const int64_t default_value_max = 1 * 1024 * 1024;
    int64_t value_max = default_value_max;
    int64_t scan_len_max = FLAGS_mix_max_scan_len;
    double write_rate = 1000000.0;
    double read_rate = 1000000.0;
    bool use_prefix_modeling = false;
    GenerateTwoTermExpKeys gen_exp;
    std::vector<double> ratio{FLAGS_mix_get_ratio, FLAGS_mix_put_ratio,
                              FLAGS_mix_seek_ratio};
    char value_buffer[default_value_max];
    QueryDecider query;
    RandomGenerator gen;
    Status s;
    if (value_max > FLAGS_mix_max_value_size) {
      value_max = FLAGS_mix_max_value_size;
    }
    ReadOptions options(FLAGS_verify_checksum, true);
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    PinnableSlice pinnable_val;
    query.Initiate(ratio);

    // Initialize the rate limiters that bound the QPS
    if (FLAGS_sine_a != 0 || FLAGS_sine_d != 0) {
      thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
          static_cast<int64_t>(read_rate), 100000 /* refill_period_us */,
          10 /* fairness */, RateLimiter::Mode::kReadsOnly));
      thread->shared->write_rate_limiter.reset(
          NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
    }

    // Decide whether the user wants prefix-based key generation
    if (FLAGS_keyrange_dist_a != 0.0 || FLAGS_keyrange_dist_b != 0.0 ||
        FLAGS_keyrange_dist_c != 0.0 || FLAGS_keyrange_dist_d != 0.0) {
      use_prefix_modeling = true;
      gen_exp.InitiateExpDistribution(
          FLAGS_num, FLAGS_keyrange_dist_a, FLAGS_keyrange_dist_b,
          FLAGS_keyrange_dist_c, FLAGS_keyrange_dist_d);
    }

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t ini_rand, rand_v, key_rand, key_seed;
      ini_rand = GetRandomKey(&thread->rand);
      rand_v = ini_rand % FLAGS_num;
      double u = static_cast<double>(rand_v) / FLAGS_num;

      // Generate the keyID based on the key hotness and prefix hotness
      if (use_prefix_modeling) {
        key_rand =
            gen_exp.DistGetKeyID(ini_rand, FLAGS_key_dist_a, FLAGS_key_dist_b);
      } else {
        key_seed = PowerCdfInversion(u, FLAGS_key_dist_a, FLAGS_key_dist_b);
        Random64 rand(key_seed);
        key_rand = static_cast<int64_t>(rand.Next()) % FLAGS_num;
      }
      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      int query_type = query.GetType(rand_v);

      // Change the QPS following the sine wave
      uint64_t now = FLAGS_env->NowMicros();
      uint64_t usecs_since_last;
      if (now > thread->stats.GetSineInterval()) {
        usecs_since_last = now - thread->stats.GetSineInterval();
      } else {
        usecs_since_last = 0;
      }

      if (usecs_since_last >
          (FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000})) {
        double usecs_since_start =
            static_cast<double>(now - thread->stats.GetStart());
        thread->stats.ResetSineInterval();
        double mix_rate_with_noise = AddNoise(
            SineRate(usecs_since_start / 1000000.0), FLAGS_sine_mix_rate_noise);
        read_rate = mix_rate_with_noise * (query.ratio_[0] + query.ratio_[2]);
        write_rate =
            mix_rate_with_noise * query.ratio_[1] * FLAGS_mix_ave_kv_size;

        thread->shared->write_rate_limiter.reset(
            NewGenericRateLimiter(static_cast<int64_t>(write_rate)));
        thread->shared->read_rate_limiter.reset(NewGenericRateLimiter(
            static_cast<int64_t>(read_rate),
            FLAGS_sine_mix_rate_interval_milliseconds * uint64_t{1000}, 10,
            RateLimiter::Mode::kReadsOnly));
      }

      // Start the query
      if (query_type == 0) {
        // the Get query
        gets++;
        read++;
        if (FLAGS_num_column_families > 1) {
          s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key,
                                   &pinnable_val);
        } else {
          pinnable_val.Reset();
          s = db_with_cfh->db->Get(options,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   &pinnable_val);
        }
        if (s.ok()) {
          found++;
          bytes += key.size() + pinnable_val.size();
        } else if (!s.IsNotFound()) {
          fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str());
          abort();
        }

        if (thread->shared->read_rate_limiter.get() != nullptr &&
            read % 256 == 255) {
          thread->shared->read_rate_limiter->Request(
              256, Env::IO_HIGH, nullptr /* stats */,
              RateLimiter::OpType::kRead);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kRead);
      } else if (query_type == 1) {
        // the Put query
        puts++;
        int64_t val_size = ParetoCdfInversion(
            u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma);
        if (val_size < 0) {
          val_size = 10;
        } else if (val_size > value_max) {
          val_size = val_size % value_max;
        }
        s = db_with_cfh->db->Put(
            write_options_, key,
            gen.Generate(static_cast<unsigned int>(val_size)));
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }

        if (thread->shared->write_rate_limiter) {
          thread->shared->write_rate_limiter->Request(
              key.size() + val_size, Env::IO_HIGH, nullptr /*stats*/,
              RateLimiter::OpType::kWrite);
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kWrite);
      } else if (query_type == 2) {
        // the Seek query
        if (db_with_cfh->db != nullptr) {
          Iterator* single_iter = nullptr;
          single_iter = db_with_cfh->db->NewIterator(options);
          if (single_iter != nullptr) {
            single_iter->Seek(key);
            seek++;
            read++;
            if (single_iter->Valid() && single_iter->key().compare(key) == 0) {
              seek_found++;
            }
            int64_t scan_length =
                ParetoCdfInversion(u, FLAGS_iter_theta, FLAGS_iter_k,
                                   FLAGS_iter_sigma) %
                scan_len_max;
            for (int64_t j = 0; j < scan_length && single_iter->Valid(); j++) {
              Slice value = single_iter->value();
              memcpy(value_buffer, value.data(),
                     std::min(value.size(), sizeof(value_buffer)));
              bytes += single_iter->key().size() + single_iter->value().size();
              single_iter->Next();
              assert(single_iter->status().ok());
            }
          }
          delete single_iter;
        }
        thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1, kSeek);
      }
    }
    char msg[256];
    snprintf(msg, sizeof(msg),
             "( Gets:%" PRIu64 " Puts:%" PRIu64 " Seek:%" PRIu64 " of %" PRIu64
             " in %" PRIu64 " found)\n",
             gets, puts, seek, found, read);

    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
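  // Illustrative sketch of the Pareto inversion MixGraph uses above for
  // value sizes and scan lengths. Given u uniform in (0,1), the generalized
  // Pareto inverse CDF is x = theta + sigma*(u^{-k} - 1)/k, degenerating to
  // theta - sigma*ln(u) when k == 0. This hypothetical helper mirrors that
  // math; db_bench's own ParetoCdfInversion is defined elsewhere in this
  // file and is the one actually called.
  static int64_t ExampleParetoInversion(double u, double theta, double k,
                                        double sigma) {
    double ret;
    if (k == 0.0) {
      ret = theta - sigma * std::log(u);
    } else {
      ret = theta + sigma * (std::pow(u, -1 * k) - 1) / k;
    }
    return static_cast<int64_t>(std::ceil(ret));
  }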
  void IteratorCreation(ThreadState* thread) {
    Duration duration(FLAGS_duration, reads_);
    ReadOptions options(FLAGS_verify_checksum, true);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      Iterator* iter = db->NewIterator(options);
      delete iter;
      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }
  }

  void IteratorCreationWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      IteratorCreation(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }
  void SeekRandom(ThreadState* thread) {
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;
    ReadOptions options(FLAGS_verify_checksum, true);
    options.total_order_seek = FLAGS_total_order_seek;
    options.prefix_same_as_start = FLAGS_prefix_same_as_start;
    options.tailing = FLAGS_use_tailing_iterator;
    options.readahead_size = FLAGS_readahead_size;

    Iterator* single_iter = nullptr;
    std::vector<Iterator*> multi_iters;
    if (db_.db != nullptr) {
      single_iter = db_.db->NewIterator(options);
    } else {
      for (const auto& db_with_cfh : multi_dbs_) {
        multi_iters.push_back(db_with_cfh.db->NewIterator(options));
      }
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    std::unique_ptr<const char[]> upper_bound_key_guard;
    Slice upper_bound = AllocateKey(&upper_bound_key_guard);
    std::unique_ptr<const char[]> lower_bound_key_guard;
    Slice lower_bound = AllocateKey(&lower_bound_key_guard);

    Duration duration(FLAGS_duration, reads_);
    char value_buffer[256];
    while (!duration.Done(1)) {
      int64_t seek_pos = thread->rand.Next() % FLAGS_num;
      GenerateKeyFromIntForSeek(static_cast<uint64_t>(seek_pos), FLAGS_num,
                                &key);
      if (FLAGS_max_scan_distance != 0) {
        if (FLAGS_reverse_iterator) {
          GenerateKeyFromInt(
              static_cast<uint64_t>(std::max(
                  static_cast<int64_t>(0), seek_pos - FLAGS_max_scan_distance)),
              FLAGS_num, &lower_bound);
          options.iterate_lower_bound = &lower_bound;
        } else {
          auto min_num =
              std::min(FLAGS_num, seek_pos + FLAGS_max_scan_distance);
          GenerateKeyFromInt(static_cast<uint64_t>(min_num), FLAGS_num,
                             &upper_bound);
          options.iterate_upper_bound = &upper_bound;
        }
      }

      if (!FLAGS_use_tailing_iterator) {
        if (db_.db != nullptr) {
          delete single_iter;
          single_iter = db_.db->NewIterator(options);
        } else {
          for (auto iter : multi_iters) {
            delete iter;
          }
          multi_iters.clear();
          for (const auto& db_with_cfh : multi_dbs_) {
            multi_iters.push_back(db_with_cfh.db->NewIterator(options));
          }
        }
      }
      // Pick an Iterator to use
      Iterator* iter_to_use = single_iter;
      if (single_iter == nullptr) {
        iter_to_use = multi_iters[thread->rand.Next() % multi_iters.size()];
      }

      iter_to_use->Seek(key);
      read++;
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }

      for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) {
        // Copy out the iterator's value to make sure we actually read it.
        Slice value = iter_to_use->value();
        memcpy(value_buffer, value.data(),
               std::min(value.size(), sizeof(value_buffer)));
        bytes += iter_to_use->key().size() + iter_to_use->value().size();

        if (!FLAGS_reverse_iterator) {
          iter_to_use->Next();
        } else {
          iter_to_use->Prev();
        }
        assert(iter_to_use->status().ok());
      }

      if (thread->shared->read_rate_limiter.get() != nullptr &&
          read % 256 == 255) {
        thread->shared->read_rate_limiter->Request(
            256, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }

      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete single_iter;
    for (auto iter : multi_iters) {
      delete iter;
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n",
             found, read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
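  // Minimal sketch of the bounded-scan pattern SeekRandom uses above:
  // setting ReadOptions::iterate_upper_bound lets RocksDB invalidate a
  // forward iterator once it reaches the bound, instead of relying on the
  // caller's loop limit. The key literals here are made up for illustration,
  // and the helper itself is hypothetical.
  static void ExampleBoundedScan(DB* db) {
    ReadOptions ro;
    Slice upper("key9");  // hypothetical exclusive upper bound
    ro.iterate_upper_bound = &upper;
    std::unique_ptr<Iterator> it(db->NewIterator(ro));
    for (it->Seek("key0"); it->Valid(); it->Next()) {
      // The iterator becomes !Valid() once keys reach "key9", so no
      // explicit comparison against the bound is needed in the loop body.
    }
    assert(it->status().ok());
  }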
  void SeekRandomWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void SeekRandomWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      SeekRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }
  void DoDelete(ThreadState* thread, bool seq) {
    WriteBatch batch;
    Duration duration(seq ? 0 : FLAGS_duration, deletes_);
    int64_t i = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    while (!duration.Done(entries_per_batch_)) {
      DB* db = SelectDB(thread);
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; ++j) {
        const int64_t k = seq ? i + j : (thread->rand.Next() % FLAGS_num);
        GenerateKeyFromInt(k, FLAGS_num, &key);
        batch.Delete(key);
      }
      auto s = db->Write(write_options_, &batch);
      thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
      i += entries_per_batch_;
    }
  }

  void DeleteSeq(ThreadState* thread) {
    DoDelete(thread, true);
  }

  void DeleteRandom(ThreadState* thread) {
    DoDelete(thread, false);
  }

  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kWrite);
    }
  }

  void ReadWhileMerging(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGWriter(thread, kMerge);
    }
  }
  void BGWriter(ThreadState* thread, enum OperationType write_merge) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    uint32_t written = 0;
    bool hint_printed = false;

    while (true) {
      DB* db = SelectDB(thread);
      {
        MutexLock l(&thread->shared->mu);
        if (FLAGS_finish_after_writes && written == writes_) {
          fprintf(stderr, "Exiting the writer after %u writes...\n", written);
          break;
        }
        if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
          // Other threads have finished
          if (FLAGS_finish_after_writes) {
            // Wait for the writes to be finished
            if (!hint_printed) {
              fprintf(stderr, "Reads are finished. Have %d more writes to do\n",
                      static_cast<int>(writes_) - written);
              hint_printed = true;
            }
          } else {
            // Finish the writes immediately
            break;
          }
        }
      }

      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      Status s;

      Slice val = gen.Generate();
      if (write_merge == kWrite) {
        s = db->Put(write_options_, key, val);
      } else {
        s = db->Merge(write_options_, key, val);
      }
      written++;

      if (!s.ok()) {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size();
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
            key.size() + val.size(), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
      }
    }
    thread->stats.AddBytes(bytes);
  }
  void ReadWhileScanning(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
    } else {
      BGScan(thread);
    }
  }

  void BGScan(ThreadState* thread) {
    if (FLAGS_num_multi_db > 0) {
      fprintf(stderr, "Not supporting multiple DBs.\n");
      abort();
    }
    assert(db_.db != nullptr);
    ReadOptions read_options;
    Iterator* iter = db_.db->NewIterator(read_options);

    fprintf(stderr, "num reads to do %" PRIu64 "\n", reads_);
    Duration duration(FLAGS_duration, reads_);
    uint64_t num_seek_to_first = 0;
    uint64_t num_next = 0;
    while (!duration.Done(1)) {
      if (!iter->Valid()) {
        iter->SeekToFirst();
        num_seek_to_first++;
      } else if (!iter->status().ok()) {
        fprintf(stderr, "Iterator error: %s\n",
                iter->status().ToString().c_str());
        abort();
      } else {
        iter->Next();
        num_next++;
      }
      thread->stats.FinishedOps(&db_, db_.db, 1, kSeek);
    }
    delete iter;
  }
  // Given a key K and value V, this puts (K+"0", V), (K+"1", V), (K+"2", V)
  // into the DB atomically, i.e. in a single batch. Also see GetMany.
  Status PutMany(DB* db, const WriteOptions& writeoptions, const Slice& key,
                 const Slice& value) {
    std::string suffixes[3] = {"2", "1", "0"};
    std::string keys[3];

    WriteBatch batch;
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Put(keys[i], value);
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }

  // Given a key K, this deletes keys K+"0", K+"1", and K+"2" from the DB
  // atomically, i.e. in a single batch. Also see GetMany.
  Status DeleteMany(DB* db, const WriteOptions& writeoptions,
                    const Slice& key) {
    std::string suffixes[3] = {"1", "2", "0"};
    std::string keys[3];

    WriteBatch batch;
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      batch.Delete(keys[i]);
    }

    s = db->Write(writeoptions, &batch);
    return s;
  }
  // Given a key K and value V, this gets values for K+"0", K+"1" and K+"2"
  // in the same snapshot, and verifies that all the values are identical.
  // ASSUMES that PutMany was used to put (K, V) into the DB.
  Status GetMany(DB* db, const ReadOptions& readoptions, const Slice& key,
                 std::string* value) {
    std::string suffixes[3] = {"0", "1", "2"};
    std::string keys[3];
    Slice key_slices[3];
    std::string values[3];
    ReadOptions readoptionscopy = readoptions;
    readoptionscopy.snapshot = db->GetSnapshot();
    Status s;
    for (int i = 0; i < 3; i++) {
      keys[i] = key.ToString() + suffixes[i];
      key_slices[i] = keys[i];
      s = db->Get(readoptionscopy, key_slices[i], value);
      if (!s.ok() && !s.IsNotFound()) {
        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
        values[i] = "";
        // we continue after error rather than exiting so that we can
        // find more errors if any
      } else if (s.IsNotFound()) {
        values[i] = "";
      } else {
        values[i] = *value;
      }
    }
    db->ReleaseSnapshot(readoptionscopy.snapshot);

    if ((values[0] != values[1]) || (values[1] != values[2])) {
      fprintf(stderr, "inconsistent values for key %s: %s, %s, %s\n",
              key.ToString().c_str(), values[0].c_str(), values[1].c_str(),
              values[2].c_str());
      // we continue after error rather than exiting so that we can
      // find more errors if any
    }

    return s;
  }
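  // Minimal usage sketch for the PutMany/GetMany pair above (illustrative;
  // the key and value literals are made up, and this helper is not called
  // by any benchmark). Because GetMany pins a snapshot before its three
  // Gets, a concurrent PutMany can never make it observe a mix of old and
  // new values.
  void ExamplePutGetMany(DB* db) {
    Status s = PutMany(db, WriteOptions(), "k1", "v1");
    assert(s.ok());
    std::string value;
    s = GetMany(db, ReadOptions(), "k1", &value);
    assert(s.ok() && value == "v1");
  }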
  // Differs from readrandomwriterandom in the following ways:
  // (a) Uses GetMany/PutMany to read/write key values. Refer to those funcs.
  // (b) Does deletes as well (per FLAGS_deletepercent).
  // (c) In order to achieve a high % of 'found' during lookups, and to do
  // multiple writes (including puts and deletes), it uses up to
  // FLAGS_numdistinct distinct keys instead of FLAGS_num distinct keys.
  // (d) Does not have a MultiGet option.
  void RandomWithVerify(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int delete_weight = 0;
    int64_t gets_done = 0;
    int64_t puts_done = 0;
    int64_t deletes_done = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    // the number of iterations is the larger of reads_ or writes_
    for (int64_t i = 0; i < readwrites_; i++) {
      DB* db = SelectDB(thread);
      if (get_weight == 0 && put_weight == 0 && delete_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        delete_weight = FLAGS_deletepercent;
        put_weight = 100 - get_weight - delete_weight;
      }
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct,
                         FLAGS_numdistinct, &key);
      if (get_weight > 0) {
        // do all the gets first
        Status s = GetMany(db, options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "getmany error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        gets_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = PutMany(db, write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        puts_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      } else if (delete_weight > 0) {
        Status s = DeleteMany(db, write_options_, key);
        if (!s.ok()) {
          fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
          exit(1);
        }
        delete_weight--;
        deletes_done++;
        thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
      }
    }
    char msg[128];
    snprintf(msg, sizeof(msg),
             "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             gets_done, puts_done, deletes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
  // This is different from ReadWhileWriting because it does not use
  // an extra thread.
  void ReadRandomWriteRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int get_weight = 0;
    int put_weight = 0;
    int64_t reads_done = 0;
    int64_t writes_done = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of reads_ or writes_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
      if (get_weight == 0 && put_weight == 0) {
        // one batch completed, reinitialize for next batch
        get_weight = FLAGS_readwritepercent;
        put_weight = 100 - get_weight;
      }
      if (get_weight > 0) {
        // do all the gets first
        Status s = db->Get(options, key, &value);
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          found++;
        }
        get_weight--;
        reads_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      } else if (put_weight > 0) {
        // then do all the corresponding number of puts
        // for all the gets we have done earlier
        Status s = db->Put(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
          exit(1);
        }
        put_weight--;
        writes_done++;
        thread->stats.FinishedOps(nullptr, db, 1, kWrite);
      }
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64
             " found:%" PRIu64 ")",
             reads_done, writes_done, readwrites_, found);
    thread->stats.AddMessage(msg);
  }
  // Read-modify-write for random keys
  void UpdateRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;
    Duration duration(FLAGS_duration, readwrites_);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of reads_ or writes_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size();
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      }

      if (thread->shared->write_rate_limiter) {
        thread->shared->write_rate_limiter->Request(
            key.size() + value.size(), Env::IO_HIGH, nullptr /*stats*/,
            RateLimiter::OpType::kWrite);
      }

      Slice val = gen.Generate();
      Status s = db->Put(write_options_, key, val);
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size();
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
  // Read-XOR-write for random keys. XORs the existing value with a randomly
  // generated value, and stores the result. Assuming A is the array of bytes
  // representing the existing value, we generate an array B of the same
  // size, then compute C = A^B as C[i] = A[i]^B[i], and store C.
  void XORUpdateRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string existing_value;
    int64_t found = 0;
    Duration duration(FLAGS_duration, readwrites_);

    BytesXOROperator xor_operator;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of reads_ or writes_
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &existing_value);
      if (status.ok()) {
        ++found;
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        exit(1);
      }

      Slice value =
          gen.Generate(static_cast<unsigned int>(existing_value.size()));
      std::string new_value;

      if (status.ok()) {
        Slice existing_value_slice = Slice(existing_value);
        xor_operator.XOR(&existing_value_slice, value, &new_value);
      } else {
        xor_operator.XOR(nullptr, value, &new_value);
      }

      Status s = db->Put(write_options_, key, Slice(new_value));
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      thread->stats.FinishedOps(nullptr, db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
             "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found);
    thread->stats.AddMessage(msg);
  }
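  // A minimal sketch of the byte-wise XOR the comment above describes,
  // written independently of BytesXOROperator (hypothetical helper, not
  // called by any benchmark, and not guaranteed to match BytesXOROperator's
  // exact semantics). When the operands differ in length, the shorter one
  // is treated as zero-padded, so the surviving tail is copied through.
  static std::string ExampleXorBytes(const std::string& a,
                                     const std::string& b) {
    const std::string& longer = a.size() >= b.size() ? a : b;
    const std::string& shorter = a.size() >= b.size() ? b : a;
    std::string c = longer;  // start from the longer operand's bytes
    for (size_t i = 0; i < shorter.size(); ++i) {
      c[i] = static_cast<char>(c[i] ^ shorter[i]);  // C[i] = A[i]^B[i]
    }
    return c;
  }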
  // Read-modify-write for random keys.
  // Each operation causes the value to grow by value_size (simulating an
  // append). Generally used for benchmarking against merges of similar type.
  void AppendRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t found = 0;
    int64_t bytes = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of reads_ or writes_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);

      auto status = db->Get(options, key, &value);
      if (status.ok()) {
        ++found;
        bytes += key.size() + value.size();
      } else if (!status.IsNotFound()) {
        fprintf(stderr, "Get returned an error: %s\n",
                status.ToString().c_str());
        abort();
      } else {
        // If not existing, then just assume an empty string of data
        value.clear();
      }

      // Update the value (by appending data)
      Slice operand = gen.Generate();
      if (value.size() > 0) {
        // Use a delimiter to match the semantics for StringAppendOperator
        value.append(1, ',');
      }
      value.append(operand.data(), operand.size());

      // Write back to the database
      Status s = db->Put(write_options_, key, value);
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + value.size();
      thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
    }

    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")",
             readwrites_, found);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
  // Read-modify-write for random keys (using MergeOperator).
  // The merge operator to use should be defined by FLAGS_merge_operator.
  // Adjust FLAGS_value_size so that the values are reasonable for this
  // operator. Assumes that the merge operator is non-null (i.e. is
  // well-defined).
  //
  // For example, use FLAGS_merge_operator="uint64add" and FLAGS_value_size=8
  // to simulate random additions over 64-bit integers using merge.
  //
  // The number of merges on the same key can be controlled by adjusting
  // FLAGS_merge_keys.
  void MergeRandom(ThreadState* thread) {
    RandomGenerator gen;
    int64_t bytes = 0;
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // The number of iterations is the larger of reads_ or writes_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
      int64_t key_rand = thread->rand.Next() % merge_keys_;
      GenerateKeyFromInt(key_rand, merge_keys_, &key);

      Status s;
      Slice val = gen.Generate();
      if (FLAGS_num_column_families > 1) {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->GetCfh(key_rand), key, val);
      } else {
        s = db_with_cfh->db->Merge(write_options_,
                                   db_with_cfh->db->DefaultColumnFamily(), key,
                                   val);
      }

      if (!s.ok()) {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes += key.size() + val.size();
      thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
    }

    // Print some statistics
    char msg[100];
    snprintf(msg, sizeof(msg), "( updates:%" PRIu64 ")", readwrites_);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
  }
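  // Sketch of the "uint64add" example from the comment above MergeRandom:
  // each operand is a 64-bit integer encoded as 8 little-endian bytes, and
  // merging adds them. The encoding loop and the "counter" key are made up
  // for illustration; only db->Merge() itself is the real API, and this
  // helper is not called by any benchmark.
  static void ExampleUint64AddMerge(DB* db) {
    uint64_t operand = 42;
    char buf[sizeof(operand)];
    // Encode the operand in fixed-width little-endian form so the merge
    // operator can decode it regardless of host byte order.
    for (size_t i = 0; i < sizeof(operand); ++i) {
      buf[i] = static_cast<char>((operand >> (8 * i)) & 0xFF);
    }
    // With the DB opened using a "uint64add" merge operator, two merges of
    // 42 would make a subsequent Get decode to 84.
    Status s = db->Merge(WriteOptions(), "counter", Slice(buf, sizeof(buf)));
    assert(s.ok());
  }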
  // Read and merge random keys. The numbers of reads and merges are
  // controlled by adjusting FLAGS_num and FLAGS_mergereadpercent. The number
  // of distinct keys (and thus also the number of reads and merges on the
  // same key) can be adjusted with FLAGS_merge_keys.
  //
  // As with MergeRandom, the merge operator to use should be defined by
  // FLAGS_merge_operator.
  void ReadRandomMergeRandom(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    RandomGenerator gen;
    std::string value;
    int64_t num_hits = 0;
    int64_t num_gets = 0;
    int64_t num_merges = 0;
    size_t max_length = 0;

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    // the number of iterations is the larger of reads_ or writes_
    Duration duration(FLAGS_duration, readwrites_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);
      GenerateKeyFromInt(thread->rand.Next() % merge_keys_, merge_keys_, &key);

      bool do_merge =
          static_cast<int>(thread->rand.Next() % 100) < FLAGS_mergereadpercent;

      if (do_merge) {
        Status s = db->Merge(write_options_, key, gen.Generate());
        if (!s.ok()) {
          fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
          exit(1);
        }
        num_merges++;
        thread->stats.FinishedOps(nullptr, db, 1, kMerge);
      } else {
        Status s = db->Get(options, key, &value);
        if (value.length() > max_length) {
          max_length = value.length();
        }
        if (!s.ok() && !s.IsNotFound()) {
          fprintf(stderr, "get error: %s\n", s.ToString().c_str());
          // we continue after error rather than exiting so that we can
          // find more errors if any
        } else if (!s.IsNotFound()) {
          num_hits++;
        }
        num_gets++;
        thread->stats.FinishedOps(nullptr, db, 1, kRead);
      }
    }

    char msg[100];
    snprintf(msg, sizeof(msg),
             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
             num_gets, num_merges, readwrites_, num_hits, max_length);
    thread->stats.AddMessage(msg);
  }
  void WriteSeqSeekSeq(ThreadState* thread) {
    writes_ = FLAGS_num;
    DoWrite(thread, SEQUENTIAL);
    // exclude writes from the ops/sec calculation
    thread->stats.Start(thread->tid);

    DB* db = SelectDB(thread);
    std::unique_ptr<Iterator> iter(
        db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)));

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    for (int64_t i = 0; i < FLAGS_num; ++i) {
      GenerateKeyFromInt(i, FLAGS_num, &key);
      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);

      for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) {
        if (!FLAGS_reverse_iterator) {
          iter->Next();
        } else {
          iter->Prev();
        }
        GenerateKeyFromInt(++i, FLAGS_num, &key);
        assert(iter->Valid() && iter->key() == key);
        thread->stats.FinishedOps(nullptr, db, 1, kSeek);
      }

      iter->Seek(key);
      assert(iter->Valid() && iter->key() == key);
      thread->stats.FinishedOps(nullptr, db, 1, kSeek);
    }
  }
  bool binary_search(std::vector<int>& data, int start, int end, int key) {
    if (data.empty()) return false;
    if (start > end) return false;
    int mid = start + (end - start) / 2;
    if (mid > static_cast<int>(data.size()) - 1) return false;
    if (data[mid] == key) {
      return true;
    } else if (data[mid] > key) {
      return binary_search(data, start, mid - 1, key);
    } else {
      return binary_search(data, mid + 1, end, key);
    }
  }
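  // Quick usage check for binary_search above (illustrative helper, not
  // called by any benchmark): the vector must already be sorted ascending,
  // and start/end are inclusive indices into it.
  void ExampleBinarySearchUsage() {
    std::vector<int> data{1, 3, 5, 7, 9};
    assert(binary_search(data, 0, static_cast<int>(data.size()) - 1, 7));
    assert(!binary_search(data, 0, static_cast<int>(data.size()) - 1, 4));
  }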
  // Does a bunch of merge operations for a key (key1) where the merge
  // operand is a sorted list. Then a performance comparison is done between
  // doing a Get for key1 followed by searching for another key (key2) in the
  // large sorted list, versus calling GetMergeOperands for key1 and then
  // searching for key2 in each of the sorted sub-lists. The latter case is
  // expected to be a lot faster.
  void GetMergeOperands(ThreadState* thread) {
    DB* db = SelectDB(thread);
    const int kTotalValues = 100000;
    const int kListSize = 100;
    std::string key = "my_key";
    std::string value;

    for (int i = 1; i < kTotalValues; i++) {
      if (i % kListSize == 0) {
        // Remove trailing ','
        value.pop_back();
        db->Merge(WriteOptions(), key, value);
        value.clear();
      } else {
        value.append(std::to_string(i)).append(",");
      }
    }

    SortList s;
    std::vector<int> data;
    // This value can be experimented with, and it will demonstrate the perf
    // difference between doing a Get and searching for lookup_key in the
    // resultant large sorted list versus doing GetMergeOperands and searching
    // for lookup_key within the resultant sorted sub-lists.
    int lookup_key = 1;

    // Get API call
    std::cout << "--- Get API call --- \n";
    PinnableSlice p_slice;
    uint64_t st = FLAGS_env->NowNanos();
    db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &p_slice);
    s.MakeVector(data, p_slice);
    bool found =
        binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
    std::cout << "Found key? " << std::to_string(found) << "\n";
    uint64_t sp = FLAGS_env->NowNanos();
    std::cout << "Get: " << (sp - st) / 1000000000.0 << " seconds\n";
    std::string* dat_ = p_slice.GetSelf();
    std::cout << "Sample data from Get API call: " << dat_->substr(0, 10)
              << "\n";
    data.clear();

    // GetMergeOperands API call
    std::cout << "--- GetMergeOperands API --- \n";
    std::vector<PinnableSlice> a_slice((kTotalValues / kListSize) + 1);
    st = FLAGS_env->NowNanos();
    int number_of_operands = 0;
    GetMergeOperandsOptions get_merge_operands_options;
    get_merge_operands_options.expected_max_number_of_operands =
        (kTotalValues / 100) + 1;
    db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(), key,
                         a_slice.data(), &get_merge_operands_options,
                         &number_of_operands);
    for (PinnableSlice& psl : a_slice) {
      s.MakeVector(data, psl);
      found =
          binary_search(data, 0, static_cast<int>(data.size() - 1), lookup_key);
      data.clear();
      if (found) break;
    }
    std::cout << "Found key? " << std::to_string(found) << "\n";
    sp = FLAGS_env->NowNanos();
    std::cout << "Get Merge operands: " << (sp - st) / 1000000000.0
              << " seconds \n";
    int to_print = 0;
    std::cout << "Sample data from GetMergeOperands API call: ";
    for (PinnableSlice& psl : a_slice) {
      std::cout << "List: " << to_print << " : " << *psl.GetSelf() << "\n";
      if (to_print++ > 2) break;
    }
  }
#ifndef ROCKSDB_LITE

  // This benchmark stress-tests Transactions. For a given --duration (or
  // total number of --writes), a Transaction will perform a
  // read-modify-write to increment the value of a key in each of
  // N (--transaction_sets) sets of keys (where each set has --num keys).
  // If --threads is set, this will be done in parallel.
  //
  // To test transactions, use --transaction_db=true. Not setting this
  // parameter will run the same benchmark without transactions.
  //
  // RandomTransactionVerify() will then validate the correctness of the
  // results by checking if the sum of all keys in each set is the same.
  void RandomTransaction(ThreadState* thread) {
    ReadOptions options(FLAGS_verify_checksum, true);
    Duration duration(FLAGS_duration, readwrites_);
    ReadOptions read_options(FLAGS_verify_checksum, true);
    uint16_t num_prefix_ranges = static_cast<uint16_t>(FLAGS_transaction_sets);
    uint64_t transactions_done = 0;

    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
      fprintf(stderr, "invalid value for transaction_sets\n");
      abort();
    }

    TransactionOptions txn_options;
    txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
    txn_options.set_snapshot = FLAGS_transaction_set_snapshot;

    RandomTransactionInserter inserter(&thread->rand, write_options_,
                                       read_options, FLAGS_num,
                                       num_prefix_ranges);

    if (FLAGS_num_multi_db > 1) {
      fprintf(stderr,
              "Cannot run RandomTransaction benchmark with "
              "FLAGS_multi_db > 1.");
      abort();
    }

    while (!duration.Done(1)) {
      bool success;

      // RandomTransactionInserter will attempt to insert a key in each of
      // the FLAGS_transaction_sets key sets
      if (FLAGS_optimistic_transaction_db) {
        success = inserter.OptimisticTransactionDBInsert(db_.opt_txn_db);
      } else if (FLAGS_transaction_db) {
        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
        success = inserter.TransactionDBInsert(txn_db, txn_options);
      } else {
        success = inserter.DBInsert(db_.db);
      }

      if (!success) {
        fprintf(stderr, "Unexpected error: %s\n",
                inserter.GetLastStatus().ToString().c_str());
        abort();
      }

      thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers);
      transactions_done++;
    }

    char msg[100];
    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
      snprintf(msg, sizeof(msg),
               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
               transactions_done, inserter.GetFailureCount());
    } else {
      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
    }
    thread->stats.AddMessage(msg);

    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
    thread->stats.AddBytes(static_cast<int64_t>(inserter.GetBytesInserted()));
  }
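  // Minimal sketch of the read-modify-write pattern that
  // RandomTransactionInserter performs per key set (illustrative; the
  // "set0_key0" key, the +1 increment, and the assert-based error handling
  // are made up, and this helper is not called by any benchmark).
  // GetForUpdate both reads the key and locks it, so the increment cannot
  // race with another transaction.
  static void ExampleTxnIncrement(TransactionDB* txn_db) {
    WriteOptions write_options;
    ReadOptions read_options;
    Transaction* txn = txn_db->BeginTransaction(write_options);
    assert(txn != nullptr);
    std::string value;
    Status s = txn->GetForUpdate(read_options, "set0_key0", &value);
    if (s.ok() || s.IsNotFound()) {
      uint64_t counter = s.ok() ? std::stoull(value) : 0;
      s = txn->Put("set0_key0", std::to_string(counter + 1));
    }
    if (s.ok()) {
      s = txn->Commit();
    }
    assert(s.ok());
    delete txn;
  }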
  // Verifies consistency of data after RandomTransaction() has been run.
  // Since each iteration of RandomTransaction() incremented a key in each
  // set by the same value, the sum of the keys in each set should be the
  // same.
  void RandomTransactionVerify() {
    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
      // transactions not used, nothing to verify.
      return;
    }

    Status s = RandomTransactionInserter::Verify(
        db_.db, static_cast<uint16_t>(FLAGS_transaction_sets));

    if (s.ok()) {
      fprintf(stdout, "RandomTransactionVerify Success.\n");
    } else {
      fprintf(stdout, "RandomTransactionVerify FAILED!!\n");
    }
  }
#endif  // ROCKSDB_LITE
  // Writes and deletes random keys without overwriting keys.
  //
  // This benchmark is intended to partially replicate the behavior of
  // MyRocks secondary indices: all data is stored in keys and updates
  // happen by deleting the old version of the key and inserting the new
  // version.
  void RandomReplaceKeys(ThreadState* thread) {
    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);
    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
    size_t max_counter = 50;
    RandomGenerator gen;

    Status s;
    DB* db = SelectDB(thread);
    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
      s = db->Put(write_options_, key, gen.Generate());
      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }
    }

    db->GetSnapshot();

    std::default_random_engine generator;
    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
                                                  FLAGS_stddev);
    Duration duration(FLAGS_duration, FLAGS_num);
    while (!duration.Done(1)) {
      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
                                static_cast<int64_t>(0));
      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                         &key);
      s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
                                   : db->Delete(write_options_, key);
      if (s.ok()) {
        counters[key_id] = (counters[key_id] + 1) % max_counter;
        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
                           &key);
        s = db->Put(write_options_, key, Slice());
      }

      if (!s.ok()) {
        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
        exit(1);
      }

      thread->stats.FinishedOps(nullptr, db, 1, kOthers);
    }

    char msg[200];
    snprintf(msg, sizeof(msg),
             "use single deletes: %d, "
             "standard deviation: %lf\n",
             FLAGS_use_single_deletes, FLAGS_stddev);
    thread->stats.AddMessage(msg);
  }
  void TimeSeriesReadOrDelete(ThreadState* thread, bool do_deletion) {
    ReadOptions options(FLAGS_verify_checksum, true);
    int64_t read = 0;
    int64_t found = 0;
    int64_t bytes = 0;

    Iterator* iter = nullptr;
    // Only works on a single database
    assert(db_.db != nullptr);
    iter = db_.db->NewIterator(options);

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    char value_buffer[256];
    while (true) {
      {
        MutexLock l(&thread->shared->mu);
        if (thread->shared->num_done >= 1) {
          // The write thread has finished
          break;
        }
      }

      if (!FLAGS_use_tailing_iterator) {
        delete iter;
        iter = db_.db->NewIterator(options);
      }
      // Pick a random key id
      int64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      GenerateKeyFromInt(key_id, FLAGS_num, &key);

      // Reset the last 8 bytes (the timestamp portion) to 0
      char* start = const_cast<char*>(key.data());
      start += key.size() - 8;
      memset(start, 0, 8);
      ++read;

      bool key_found = false;
      // Seek the prefix
      for (iter->Seek(key); iter->Valid() && iter->key().starts_with(key);
           iter->Next()) {
        key_found = true;
        // Copy out the iterator's value to make sure we actually read it.
        if (do_deletion) {
          bytes += iter->key().size();
          if (KeyExpired(timestamp_emulator_.get(), iter->key())) {
            thread->stats.FinishedOps(&db_, db_.db, 1, kDelete);
            db_.db->Delete(write_options_, iter->key());
          } else {
            break;
          }
        } else {
          bytes += iter->key().size() + iter->value().size();
          thread->stats.FinishedOps(&db_, db_.db, 1, kRead);
          Slice value = iter->value();
          memcpy(value_buffer, value.data(),
                 std::min(value.size(), sizeof(value_buffer)));

          assert(iter->status().ok());
        }
      }
      found += key_found;

      if (thread->shared->read_rate_limiter.get() != nullptr) {
        thread->shared->read_rate_limiter->Request(
            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
      }
    }
    delete iter;

    char msg[100];
    snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found,
             read);
    thread->stats.AddBytes(bytes);
    thread->stats.AddMessage(msg);
    if (FLAGS_perf_level > ROCKSDB_NAMESPACE::PerfLevel::kDisable) {
      thread->stats.AddMessage(std::string("PERF_CONTEXT:\n") +
                               get_perf_context()->ToString());
    }
  }
  void TimeSeriesWrite(ThreadState* thread) {
    // Special thread that keeps writing until other threads are done.
    RandomGenerator gen;
    int64_t bytes = 0;

    // Don't merge stats from this thread with the readers.
    thread->stats.SetExcludeFromMerge();

    std::unique_ptr<RateLimiter> write_rate_limiter;
    if (FLAGS_benchmark_write_rate_limit > 0) {
      write_rate_limiter.reset(
          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
    }

    std::unique_ptr<const char[]> key_guard;
    Slice key = AllocateKey(&key_guard);

    Duration duration(FLAGS_duration, writes_);
    while (!duration.Done(1)) {
      DB* db = SelectDB(thread);

      uint64_t key_id = thread->rand.Next() % FLAGS_key_id_range;
      // Write the key id
      GenerateKeyFromInt(key_id, FLAGS_num, &key);
      // Write the timestamp
      char* start = const_cast<char*>(key.data());
      char* pos = start + 8;
      int bytes_to_fill =
          std::min(key_size_ - static_cast<int>(pos - start), 8);
      uint64_t timestamp_value = timestamp_emulator_->Get();
      if (port::kLittleEndian) {
        for (int i = 0; i < bytes_to_fill; ++i) {
          pos[i] = (timestamp_value >> ((bytes_to_fill - i - 1) << 3)) & 0xFF;
        }
      } else {
        memcpy(pos, static_cast<void*>(&timestamp_value), bytes_to_fill);
      }

      timestamp_emulator_->Inc();

      Status s;
      Slice val = gen.Generate();
      s = db->Put(write_options_, key, val);

      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
      bytes = key.size() + val.size();
      thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
      thread->stats.AddBytes(bytes);

      if (FLAGS_benchmark_write_rate_limit > 0) {
        write_rate_limiter->Request(
            key.size() + val.size(), Env::IO_HIGH,
            nullptr /* stats */, RateLimiter::OpType::kWrite);
      }
    }
  }
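  // Illustrative sketch of the timestamp encoding used in TimeSeriesWrite
  // above: storing the value most-significant byte first makes the raw key
  // bytes sort in the same order as the integers, so an iterator walks a
  // key prefix in time order. Hypothetical standalone helper, not called by
  // any benchmark; it assumes out points at 8 writable bytes.
  static void ExampleEncodeBigEndian64(uint64_t v, char* out) {
    for (int i = 0; i < 8; ++i) {
      // Highest byte first: out[0] gets bits 56..63, out[7] gets bits 0..7.
      out[i] = static_cast<char>((v >> ((7 - i) << 3)) & 0xFF);
    }
  }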
  void TimeSeries(ThreadState* thread) {
    if (thread->tid > 0) {
      bool do_deletion = FLAGS_expire_style == "delete" &&
                         thread->tid <= FLAGS_num_deletion_threads;
      TimeSeriesReadOrDelete(thread, do_deletion);
    } else {
      TimeSeriesWrite(thread);
      thread->stats.Stop();
      thread->stats.Report("timeseries write");
    }
  }
  6106. void Compact(ThreadState* thread) {
  6107. DB* db = SelectDB(thread);
  6108. CompactRangeOptions cro;
  6109. cro.bottommost_level_compaction =
  6110. BottommostLevelCompaction::kForceOptimized;
  6111. db->CompactRange(cro, nullptr, nullptr);
  6112. }
  6113. void CompactAll() {
  6114. if (db_.db != nullptr) {
  6115. db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  6116. }
  6117. for (const auto& db_with_cfh : multi_dbs_) {
  6118. db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  6119. }
  6120. }
  6121. void ResetStats() {
  6122. if (db_.db != nullptr) {
  6123. db_.db->ResetStats();
  6124. }
  6125. for (const auto& db_with_cfh : multi_dbs_) {
  6126. db_with_cfh.db->ResetStats();
  6127. }
  6128. }
  6129. void PrintStatsHistory() {
  6130. if (db_.db != nullptr) {
  6131. PrintStatsHistoryImpl(db_.db, false);
  6132. }
  6133. for (const auto& db_with_cfh : multi_dbs_) {
  6134. PrintStatsHistoryImpl(db_with_cfh.db, true);
  6135. }
  6136. }
  void PrintStatsHistoryImpl(DB* db, bool print_header) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }

    std::unique_ptr<StatsHistoryIterator> shi;
    Status s = db->GetStatsHistory(0, port::kMaxUint64, &shi);
    if (!s.ok()) {
      fprintf(stdout, "%s\n", s.ToString().c_str());
      return;
    }
    assert(shi);
    while (shi->Valid()) {
      uint64_t stats_time = shi->GetStatsTime();
      fprintf(stdout, "------ %s ------\n",
              TimeToHumanString(static_cast<int>(stats_time)).c_str());
      for (auto& entry : shi->GetStatsMap()) {
        fprintf(stdout, " %" PRIu64 " %s %" PRIu64 "\n", stats_time,
                entry.first.c_str(), entry.second);
      }
      shi->Next();
    }
  }
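
  // Print the DB property named by `key` (e.g. "rocksdb.stats") for every
  // open DB.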
  void PrintStats(const char* key) {
    if (db_.db != nullptr) {
      PrintStats(db_.db, key, false);
    }
    for (const auto& db_with_cfh : multi_dbs_) {
      PrintStats(db_with_cfh.db, key, true);
    }
  }
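
  // Print a single DB's property value, optionally preceded by a header
  // identifying the DB (used when benchmarking multiple DBs).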
  void PrintStats(DB* db, const char* key, bool print_header = false) {
    if (print_header) {
      fprintf(stdout, "\n==== DB: %s ===\n", db->GetName().c_str());
    }
    std::string stats;
    if (!db->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
  }
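
  // Replay a trace captured with --trace_file against the primary DB;
  // multi-DB runs are not replayed.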
  void Replay(ThreadState* thread) {
    if (db_.db != nullptr) {
      Replay(thread, &db_);
    }
  }
  void Replay(ThreadState* /*thread*/, DBWithColumnFamilies* db_with_cfh) {
    Status s;
    std::unique_ptr<TraceReader> trace_reader;
    s = NewFileTraceReader(FLAGS_env, EnvOptions(), FLAGS_trace_file,
                           &trace_reader);
    if (!s.ok()) {
      fprintf(
          stderr,
          "Encountered an error creating a TraceReader from the trace file. "
          "Error: %s\n",
          s.ToString().c_str());
      exit(1);
    }
    Replayer replayer(db_with_cfh->db, db_with_cfh->cfh,
                      std::move(trace_reader));
    replayer.SetFastForward(
        static_cast<uint32_t>(FLAGS_trace_replay_fast_forward));
    s = replayer.MultiThreadReplay(
        static_cast<uint32_t>(FLAGS_trace_replay_threads));
    if (s.ok()) {
      fprintf(stdout, "Replay started from trace_file: %s\n",
              FLAGS_trace_file.c_str());
    } else {
      fprintf(stderr, "Starting replay failed. Error: %s\n",
              s.ToString().c_str());
    }
  }
};
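
// Tool entry point: parse command-line flags, translate string-valued flags
// into their enum equivalents, configure the Env and its thread pools, then
// construct a Benchmark instance and run the requested workloads.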
int db_bench_tool(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  static bool initialized = false;
  if (!initialized) {
    SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                    " [OPTIONS]...");
    initialized = true;
  }
  ParseCommandLineFlags(&argc, &argv, true);
  FLAGS_compaction_style_e =
      (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style;
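
  // Statistics come either from --statistics (the built-in implementation)
  // or from --statistics_string (a custom Statistics object resolved through
  // the ObjectRegistry); the two flags are mutually exclusive.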
#ifndef ROCKSDB_LITE
  if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
    fprintf(stderr,
            "Cannot provide both --statistics and --statistics_string.\n");
    exit(1);
  }
  if (!FLAGS_statistics_string.empty()) {
    Status s = ObjectRegistry::NewInstance()->NewSharedObject<Statistics>(
        FLAGS_statistics_string, &dbstats);
    if (dbstats == nullptr) {
      fprintf(stderr,
              "No Statistics registered matching string: %s status=%s\n",
              FLAGS_statistics_string.c_str(), s.ToString().c_str());
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE
  if (FLAGS_statistics) {
    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
  }
  if (dbstats) {
    dbstats->set_stats_level(static_cast<StatsLevel>(FLAGS_stats_level));
  }
  FLAGS_compaction_pri_e =
      (ROCKSDB_NAMESPACE::CompactionPri)FLAGS_compaction_pri;
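
  // --max_bytes_for_level_multiplier_additional is a comma-separated list of
  // per-level multipliers; split and convert it to integers here.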
  std::vector<std::string> fanout = ROCKSDB_NAMESPACE::StringSplit(
      FLAGS_max_bytes_for_level_multiplier_additional, ',');
  for (size_t j = 0; j < fanout.size(); j++) {
    FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
#ifndef CYGWIN
        std::stoi(fanout[j]));
#else
        stoi(fanout[j]));
#endif
  }
  FLAGS_compression_type_e =
      StringToCompressionType(FLAGS_compression_type.c_str());

#ifndef ROCKSDB_LITE
  FLAGS_blob_db_compression_type_e =
      StringToCompressionType(FLAGS_blob_db_compression_type.c_str());

  if (!FLAGS_hdfs.empty() && !FLAGS_env_uri.empty()) {
    fprintf(stderr, "Cannot provide both --hdfs and --env_uri.\n");
    exit(1);
  } else if (!FLAGS_env_uri.empty()) {
    Status s = Env::LoadEnv(FLAGS_env_uri, &FLAGS_env, &env_guard);
    if (FLAGS_env == nullptr) {
      fprintf(stderr, "No Env registered for URI: %s\n",
              FLAGS_env_uri.c_str());
      exit(1);
    }
  }
#endif  // ROCKSDB_LITE

  if (FLAGS_use_existing_keys && !FLAGS_use_existing_db) {
    fprintf(stderr,
            "`-use_existing_db` must be true for `-use_existing_keys` to be "
            "settable\n");
    exit(1);
  }

  if (!FLAGS_hdfs.empty()) {
    FLAGS_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs);
  }
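
  // Map the textual --compaction_fadvice value onto the corresponding file
  // access-pattern hint; an unrecognized value is reported but not fatal.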
  if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NONE"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NONE;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "NORMAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::NORMAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "SEQUENTIAL"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
  else if (!strcasecmp(FLAGS_compaction_fadvice.c_str(), "WILLNEED"))
    FLAGS_compaction_fadvice_e = ROCKSDB_NAMESPACE::Options::WILLNEED;
  else {
    fprintf(stdout, "Unknown compaction fadvice: %s\n",
            FLAGS_compaction_fadvice.c_str());
  }
  FLAGS_value_size_distribution_type_e =
      StringToDistributionType(FLAGS_value_size_distribution_type.c_str());

  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());

  // Note options sanitization may increase thread pool sizes according to
  // max_background_flushes/max_background_compactions/max_background_jobs
  FLAGS_env->SetBackgroundThreads(FLAGS_num_high_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::HIGH);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_bottom_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::BOTTOM);
  FLAGS_env->SetBackgroundThreads(FLAGS_num_low_pri_threads,
                                  ROCKSDB_NAMESPACE::Env::Priority::LOW);

  // Choose a location for the test database if none given with --db=<path>
  if (FLAGS_db.empty()) {
    std::string default_db_path;
    FLAGS_env->GetTestDirectory(&default_db_path);
    default_db_path += "/dbbench";
    FLAGS_db = default_db_path;
  }

  if (FLAGS_stats_interval_seconds > 0) {
    // When both are set, FLAGS_stats_interval_seconds controls the reporting
    // period; FLAGS_stats_interval (in ops) only determines how often the
    // elapsed time is checked against it.
    FLAGS_stats_interval = 1000;
  }

  if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
    fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
    exit(1);
  }
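
  // All flags are resolved; run the requested benchmarks.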
  ROCKSDB_NAMESPACE::Benchmark benchmark;
  benchmark.Run();

#ifndef ROCKSDB_LITE
  if (FLAGS_print_malloc_stats) {
    std::string stats_string;
    ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string);
    fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str());
  }
#endif  // ROCKSDB_LITE

  return 0;
}
}  // namespace ROCKSDB_NAMESPACE
#endif