#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import functools
import gc
import heapq
import random
import sys
import time
from collections import OrderedDict
from os import path

import numpy as np

kSampleSize = 64  # The sample size used when performing eviction.
kMicrosInSecond = 1000000
kSecondsInMinute = 60
kSecondsInHour = 3600


class TraceRecord:
    """
    A trace record represents a block access.
    It holds the same struct as BlockCacheTraceRecord in
    trace_replay/block_cache_tracer.h
    """

    def __init__(
        self,
        access_time,
        block_id,
        block_type,
        block_size,
        cf_id,
        cf_name,
        level,
        fd,
        caller,
        no_insert,
        get_id,
        key_id,
        kv_size,
        is_hit,
        referenced_key_exist_in_block,
        num_keys_in_block,
        table_id,
        seq_number,
        block_key_size,
        key_size,
        block_offset_in_file,
        next_access_seq_no,
    ):
        self.access_time = access_time
        self.block_id = block_id
        self.block_type = block_type
        self.block_size = block_size + block_key_size
        self.cf_id = cf_id
        self.cf_name = cf_name
        self.level = level
        self.fd = fd
        self.caller = caller
        self.no_insert = no_insert == 1
        self.get_id = get_id
        self.key_id = key_id
        self.kv_size = kv_size
        self.is_hit = is_hit == 1
        self.referenced_key_exist_in_block = referenced_key_exist_in_block == 1
        self.num_keys_in_block = num_keys_in_block
        self.table_id = table_id
        self.seq_number = seq_number
        self.block_key_size = block_key_size
        self.key_size = key_size
        self.block_offset_in_file = block_offset_in_file
        self.next_access_seq_no = next_access_seq_no


class CacheEntry:
    """A cache entry stored in the cache."""

    def __init__(
        self,
        value_size,
        cf_id,
        level,
        block_type,
        table_id,
        access_number,
        time_s,
        num_hits=0,
    ):
        self.value_size = value_size
        self.last_access_number = access_number
        self.num_hits = num_hits
        self.cf_id = cf_id
        self.level = level
        self.block_type = block_type
        self.last_access_time = time_s
        self.insertion_time = time_s
        self.table_id = table_id

    def __repr__(self):
        """Debug string."""
        return "(s={},last={},hits={},cf={},l={},bt={})\n".format(
            self.value_size,
            self.last_access_number,
            self.num_hits,
            self.cf_id,
            self.level,
            self.block_type,
        )

    def cost_class(self, cost_class_label):
        if cost_class_label == "table_bt":
            return "{}-{}".format(self.table_id, self.block_type)
        elif cost_class_label == "table":
            return "{}".format(self.table_id)
        elif cost_class_label == "bt":
            return "{}".format(self.block_type)
        elif cost_class_label == "cf":
            return "{}".format(self.cf_id)
        elif cost_class_label == "cf_bt":
            return "{}-{}".format(self.cf_id, self.block_type)
        elif cost_class_label == "table_level_bt":
            return "{}-{}-{}".format(self.table_id, self.level, self.block_type)
        assert False, "Unknown cost class label {}".format(cost_class_label)
        return None


class HashEntry:
    """A hash entry stored in a hash table."""

    def __init__(self, key, hash, value):
        self.key = key
        self.hash = hash
        self.value = value

    def __repr__(self):
        return "k={},h={},v=[{}]".format(self.key, self.hash, self.value)


class HashTable:
    """
    A custom implementation of a hash table that supports fast random
    sampling. It resolves hash collisions by chaining and grows/shrinks the
    table upon insertion/deletion to keep lookups and random samplings fast.
    """

    def __init__(self):
        self.initial_size = 32
        self.table = [None] * self.initial_size
        self.elements = 0

    def random_sample(self, sample_size):
        """Randomly sample 'sample_size' hash entries from the table."""
        samples = []
        index = random.randint(0, len(self.table) - 1)
        pos = index
        # Starting from index, add hash entries to the sample list until
        # sample_size is met or we run out of entries.
        while True:
            if self.table[pos] is not None:
                for i in range(len(self.table[pos])):
                    if self.table[pos][i] is None:
                        continue
                    samples.append(self.table[pos][i])
                    if len(samples) == sample_size:
                        break
            pos += 1
            pos = pos % len(self.table)
            if pos == index or len(samples) == sample_size:
                break
        assert len(samples) <= sample_size
        return samples

    def __repr__(self):
        all_entries = []
        for i in range(len(self.table)):
            if self.table[i] is None:
                continue
            for j in range(len(self.table[i])):
                if self.table[i][j] is not None:
                    all_entries.append(self.table[i][j])
        return "{}".format(all_entries)

    def values(self):
        all_values = []
        for i in range(len(self.table)):
            if self.table[i] is None:
                continue
            for j in range(len(self.table[i])):
                if self.table[i][j] is not None:
                    all_values.append(self.table[i][j].value)
        return all_values

    def __len__(self):
        return self.elements

    def insert(self, key, hash, value):
        """
        Insert a hash entry in the table. Replace the old entry if it already
        exists.
        """
        self.grow()
        inserted = False
        index = hash % len(self.table)
        if self.table[index] is None:
            self.table[index] = []
        # Search for the entry first.
        for i in range(len(self.table[index])):
            if self.table[index][i] is None:
                continue
            if self.table[index][i].hash == hash and self.table[index][i].key == key:
                # The entry already exists in the table.
                self.table[index][i] = HashEntry(key, hash, value)
                return
        # Find an empty slot.
        for i in range(len(self.table[index])):
            if self.table[index][i] is None:
                self.table[index][i] = HashEntry(key, hash, value)
                inserted = True
                break
        if not inserted:
            self.table[index].append(HashEntry(key, hash, value))
        self.elements += 1

    def resize(self, new_size):
        if new_size == len(self.table):
            return
        if new_size < self.initial_size:
            return
        if self.elements < 100:
            return
        new_table = [None] * new_size
        # Rehash every entry of 'self.table' into new_table.
        for i in range(len(self.table)):
            entries = self.table[i]
            if entries is None:
                continue
            for j in range(len(entries)):
                if entries[j] is None:
                    continue
                index = entries[j].hash % new_size
                if new_table[index] is None:
                    new_table[index] = []
                new_table[index].append(entries[j])
        self.table = new_table
        del new_table
        # Manually invoke the Python garbage collector here to free the
        # memory, as 'self.table' might be very large.
        gc.collect()

    def grow(self):
        if self.elements < 4 * len(self.table):
            return
        new_size = int(len(self.table) * 1.5)
        self.resize(new_size)

    def delete(self, key, hash):
        index = hash % len(self.table)
        deleted = False
        deleted_entry = None
        if self.table[index] is None:
            return
        for i in range(len(self.table[index])):
            if (
                self.table[index][i] is not None
                and self.table[index][i].hash == hash
                and self.table[index][i].key == key
            ):
                deleted_entry = self.table[index][i]
                self.table[index][i] = None
                self.elements -= 1
                deleted = True
                break
        if deleted:
            self.shrink()
        return deleted_entry

    def shrink(self):
        if self.elements * 2 >= len(self.table):
            return
        new_size = int(len(self.table) * 0.7)
        self.resize(new_size)

    def lookup(self, key, hash):
        index = hash % len(self.table)
        if self.table[index] is None:
            return None
        for i in range(len(self.table[index])):
            if (
                self.table[index][i] is not None
                and self.table[index][i].hash == hash
                and self.table[index][i].key == key
            ):
                return self.table[index][i].value
        return None


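# Illustrative usage sketch (not exercised by the simulator itself): the table
# maps a caller-provided key to a value under a caller-supplied hash, so
# sampling-based eviction can draw candidates from random buckets:
#
#   table = HashTable()
#   table.insert("b1", hash=12345, value="block-1")
#   assert table.lookup("b1", hash=12345) == "block-1"
#   candidates = table.random_sample(kSampleSize)  # a list of HashEntry objects
#   table.delete("b1", hash=12345)

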
class MissRatioStats:
    def __init__(self, time_unit):
        self.num_misses = 0
        self.num_accesses = 0
        self.time_unit = time_unit
        self.time_misses = {}
        self.time_miss_bytes = {}
        self.time_accesses = {}

    def update_metrics(self, access_time, is_hit, miss_bytes):
        access_time //= kMicrosInSecond * self.time_unit
        self.num_accesses += 1
        if access_time not in self.time_accesses:
            self.time_accesses[access_time] = 0
        self.time_accesses[access_time] += 1
        if not is_hit:
            self.num_misses += 1
            if access_time not in self.time_misses:
                self.time_misses[access_time] = 0
                self.time_miss_bytes[access_time] = 0
            self.time_misses[access_time] += 1
            self.time_miss_bytes[access_time] += miss_bytes

    def reset_counter(self):
        self.num_misses = 0
        self.num_accesses = 0
        self.time_miss_bytes.clear()
        self.time_misses.clear()
        self.time_accesses.clear()

    def compute_miss_bytes(self):
        miss_bytes = []
        for at in self.time_miss_bytes:
            miss_bytes.append(self.time_miss_bytes[at])
        miss_bytes = sorted(miss_bytes)
        avg_miss_bytes = 0
        p95_miss_bytes = 0
        for i in range(len(miss_bytes)):
            avg_miss_bytes += float(miss_bytes[i]) / float(len(miss_bytes))
        p95_index = min(int(0.95 * float(len(miss_bytes))), len(miss_bytes) - 1)
        p95_miss_bytes = miss_bytes[p95_index]
        return avg_miss_bytes, p95_miss_bytes

    def miss_ratio(self):
        return float(self.num_misses) * 100.0 / float(self.num_accesses)

    def write_miss_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        start //= kMicrosInSecond * self.time_unit
        end //= kMicrosInSecond * self.time_unit
        header_file_path = "{}/header-ml-miss-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        if not path.exists(header_file_path):
            with open(header_file_path, "w+") as header_file:
                header = "time"
                for trace_time in range(start, end):
                    header += ",{}".format(trace_time)
                header_file.write(header + "\n")
        file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(file_path, "w+") as file:
            row = "{}".format(cache_type)
            for trace_time in range(start, end):
                row += ",{}".format(self.time_misses.get(trace_time, 0))
            file.write(row + "\n")

    def write_miss_ratio_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        start //= kMicrosInSecond * self.time_unit
        end //= kMicrosInSecond * self.time_unit
        header_file_path = "{}/header-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        if not path.exists(header_file_path):
            with open(header_file_path, "w+") as header_file:
                header = "time"
                for trace_time in range(start, end):
                    header += ",{}".format(trace_time)
                header_file.write(header + "\n")
        file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(file_path, "w+") as file:
            row = "{}".format(cache_type)
            for trace_time in range(start, end):
                naccesses = self.time_accesses.get(trace_time, 0)
                miss_ratio = 0
                if naccesses > 0:
                    miss_ratio = float(
                        self.time_misses.get(trace_time, 0) * 100.0
                    ) / float(naccesses)
                row += ",{0:.2f}".format(miss_ratio)
            file.write(row + "\n")


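# A quick sketch of the time bucketing above (illustrative values): with
# time_unit=60 (per-minute stats), an access 125 seconds into the trace, i.e.
# access_time=125000000 microseconds, lands in bucket
# 125000000 // (1000000 * 60) == 2, so every access in minute [120s, 180s)
# shares one entry in time_accesses/time_misses.

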
class PolicyStats:
    def __init__(self, time_unit, policies):
        self.time_selected_polices = {}
        self.time_accesses = {}
        self.policy_names = {}
        self.time_unit = time_unit
        for i in range(len(policies)):
            self.policy_names[i] = policies[i].policy_name()

    def update_metrics(self, access_time, selected_policy):
        access_time //= kMicrosInSecond * self.time_unit
        if access_time not in self.time_accesses:
            self.time_accesses[access_time] = 0
        self.time_accesses[access_time] += 1
        if access_time not in self.time_selected_polices:
            self.time_selected_polices[access_time] = {}
        policy_name = self.policy_names[selected_policy]
        if policy_name not in self.time_selected_polices[access_time]:
            self.time_selected_polices[access_time][policy_name] = 0
        self.time_selected_polices[access_time][policy_name] += 1

    def write_policy_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        start //= kMicrosInSecond * self.time_unit
        end //= kMicrosInSecond * self.time_unit
        header_file_path = "{}/header-ml-policy-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        if not path.exists(header_file_path):
            with open(header_file_path, "w+") as header_file:
                header = "time"
                for trace_time in range(start, end):
                    header += ",{}".format(trace_time)
                header_file.write(header + "\n")
        file_path = "{}/data-ml-policy-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(file_path, "w+") as file:
            for policy in self.policy_names:
                policy_name = self.policy_names[policy]
                row = "{}-{}".format(cache_type, policy_name)
                for trace_time in range(start, end):
                    row += ",{}".format(
                        self.time_selected_polices.get(trace_time, {}).get(
                            policy_name, 0
                        )
                    )
                file.write(row + "\n")

    def write_policy_ratio_timeline(
        self, cache_type, cache_size, target_cf_name, result_dir, start, end
    ):
        start //= kMicrosInSecond * self.time_unit
        end //= kMicrosInSecond * self.time_unit
        header_file_path = "{}/header-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        if not path.exists(header_file_path):
            with open(header_file_path, "w+") as header_file:
                header = "time"
                for trace_time in range(start, end):
                    header += ",{}".format(trace_time)
                header_file.write(header + "\n")
        file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format(
            result_dir, self.time_unit, cache_type, cache_size, target_cf_name
        )
        with open(file_path, "w+") as file:
            for policy in self.policy_names:
                policy_name = self.policy_names[policy]
                row = "{}-{}".format(cache_type, policy_name)
                for trace_time in range(start, end):
                    naccesses = self.time_accesses.get(trace_time, 0)
                    ratio = 0
                    if naccesses > 0:
                        ratio = float(
                            self.time_selected_polices.get(trace_time, {}).get(
                                policy_name, 0
                            )
                            * 100.0
                        ) / float(naccesses)
                    row += ",{0:.2f}".format(ratio)
                file.write(row + "\n")


class Policy(object):
    """
    A policy maintains a set of evicted keys. It returns a reward of one to
    itself if it has not evicted a missing key. Otherwise, it gives itself 0
    reward.
    """

    def __init__(self):
        self.evicted_keys = {}

    def evict(self, key, max_size):
        self.evicted_keys[key] = 0

    def delete(self, key):
        self.evicted_keys.pop(key, None)

    def prioritize_samples(self, samples, auxilliary_info):
        raise NotImplementedError

    def policy_name(self):
        raise NotImplementedError

    def generate_reward(self, key):
        if key in self.evicted_keys:
            return 0
        return 1


class LRUPolicy(Policy):
    def prioritize_samples(self, samples, auxilliary_info):
        return sorted(
            samples,
            key=functools.cmp_to_key(
                lambda e1, e2: e1.value.last_access_number
                - e2.value.last_access_number
            ),
        )

    def policy_name(self):
        return "lru"


class MRUPolicy(Policy):
    def prioritize_samples(self, samples, auxilliary_info):
        return sorted(
            samples,
            key=functools.cmp_to_key(
                lambda e1, e2: e2.value.last_access_number
                - e1.value.last_access_number
            ),
        )

    def policy_name(self):
        return "mru"


class LFUPolicy(Policy):
    def prioritize_samples(self, samples, auxilliary_info):
        return sorted(
            samples,
            key=functools.cmp_to_key(
                lambda e1, e2: e1.value.num_hits - e2.value.num_hits
            ),
        )

    def policy_name(self):
        return "lfu"


class HyperbolicPolicy(Policy):
    """
    An implementation of hyperbolic caching.

    Aaron Blankstein, Siddhartha Sen, and Michael J. Freedman. 2017.
    Hyperbolic caching: flexible caching for web applications. In Proceedings
    of the 2017 USENIX Annual Technical Conference (USENIX ATC '17).
    USENIX Association, Berkeley, CA, USA, 499-511.
    """

    def compare(self, e1, e2, now):
        e1_duration = max(0, (now - e1.value.insertion_time) / kMicrosInSecond) * float(
            e1.value.value_size
        )
        e2_duration = max(0, (now - e2.value.insertion_time) / kMicrosInSecond) * float(
            e2.value.value_size
        )
        if e1_duration == e2_duration:
            return e1.value.num_hits - e2.value.num_hits
        if e1_duration == 0:
            return 1
        if e2_duration == 0:
            return -1
        diff = (float(e1.value.num_hits) / float(e1_duration)) - (
            float(e2.value.num_hits) / float(e2_duration)
        )
        if diff == 0:
            return 0
        elif diff > 0:
            return 1
        else:
            return -1

    def prioritize_samples(self, samples, auxilliary_info):
        assert len(auxilliary_info) == 3
        now = auxilliary_info[0]
        return sorted(
            samples, key=functools.cmp_to_key(lambda e1, e2: self.compare(e1, e2, now))
        )

    def policy_name(self):
        return "hb"


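# The comparator above orders samples by the hyperbolic priority
#   priority(e) = num_hits(e) / (value_size(e) * seconds_in_cache(e)),
# lowest first, so the least valuable entry is evicted first. For example
# (illustrative numbers), between two entries of equal size and age, the one
# with 2 hits is evicted before the one with 10 hits.

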
class CostClassPolicy(Policy):
    """
    We calculate the hit density of a cost class as
    number of hits / (total size in cache * average duration in the cache).
    An entry has a higher priority if its class's hit density is higher.
    """

    def compare(self, e1, e2, now, cost_classes, cost_class_label):
        e1_class = e1.value.cost_class(cost_class_label)
        e2_class = e2.value.cost_class(cost_class_label)
        assert e1_class in cost_classes
        assert e2_class in cost_classes
        e1_entry = cost_classes[e1_class]
        e2_entry = cost_classes[e2_class]
        e1_density = e1_entry.density(now)
        e2_density = e2_entry.density(now)
        e1_hits = cost_classes[e1_class].hits
        e2_hits = cost_classes[e2_class].hits
        if e1_density == e2_density:
            return e1_hits - e2_hits
        if e1_entry.num_entries_in_cache == 0:
            return -1
        if e2_entry.num_entries_in_cache == 0:
            return 1
        if e1_density == 0:
            return 1
        if e2_density == 0:
            return -1
        diff = (float(e1_hits) / float(e1_density)) - (
            float(e2_hits) / float(e2_density)
        )
        if diff == 0:
            return 0
        elif diff > 0:
            return 1
        else:
            return -1

    def prioritize_samples(self, samples, auxilliary_info):
        assert len(auxilliary_info) == 3
        now = auxilliary_info[0]
        cost_classes = auxilliary_info[1]
        cost_class_label = auxilliary_info[2]
        return sorted(
            samples,
            key=functools.cmp_to_key(
                lambda e1, e2: self.compare(e1, e2, now, cost_classes, cost_class_label)
            ),
        )

    def policy_name(self):
        return "cc"


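# A worked example of cost classes (illustrative numbers): with
# cost_class_label="cf_bt", all blocks of column family 5 and block type 7
# share the class "5-7". If that class currently holds entries totaling
# 48 KB with 12 hits and an average residency of 60 seconds, its hit density
# is 12 / (48 * 1024 * 60); entries from a class with a higher hit density
# are kept longer by the comparator above.

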
class Cache(object):
    """
    This is the base class for the implementations of alternative cache
    replacement policies.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        self.cache_size = cache_size
        self.used_size = 0
        self.per_second_miss_ratio_stats = MissRatioStats(1)
        self.miss_ratio_stats = MissRatioStats(kSecondsInMinute)
        self.per_hour_miss_ratio_stats = MissRatioStats(kSecondsInHour)
        # 0: disabled. 1: enabled. Insert both the row and the referenced
        # data block.
        # 2: enabled. Insert only the row but NOT the referenced data block.
        self.enable_cache_row_key = enable_cache_row_key
        self.get_id_row_key_map = {}
        self.max_seen_get_id = 0
        self.retain_get_id_range = 100000

    def block_key(self, trace_record):
        return "b{}".format(trace_record.block_id)

    def row_key(self, trace_record):
        return "g{}-{}".format(trace_record.fd, trace_record.key_id)

    def _lookup(self, trace_record, key, hash):
        """
        Look up the key in the cache.
        Returns true upon a cache hit, false otherwise.
        """
        raise NotImplementedError

    def _evict(self, trace_record, key, hash, value_size):
        """
        Evict entries in the cache until there is enough room to insert the
        new entry with 'value_size'.
        """
        raise NotImplementedError

    def _insert(self, trace_record, key, hash, value_size):
        """
        Insert the new entry into the cache.
        """
        raise NotImplementedError

    def _should_admit(self, trace_record, key, hash, value_size):
        """
        A custom admission policy to decide whether we should admit the new
        entry upon a cache miss.
        Returns true if the new entry should be admitted, false otherwise.
        """
        raise NotImplementedError

    def cache_name(self):
        """
        The name of the replacement policy.
        """
        raise NotImplementedError

    def is_ml_cache(self):
        return False

    def _update_stats(self, access_time, is_hit, miss_bytes):
        self.per_second_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)
        self.miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)
        self.per_hour_miss_ratio_stats.update_metrics(access_time, is_hit, miss_bytes)

    def access(self, trace_record):
        """
        Access a trace record. The simulator calls this function to access a
        trace record.
        """
        assert self.used_size <= self.cache_size
        if (
            self.enable_cache_row_key > 0
            and trace_record.caller == 1
            and trace_record.key_id != 0
            and trace_record.get_id != 0
        ):
            # This is a Get request.
            self._access_row(trace_record)
            return
        is_hit = self._access_kv(
            trace_record,
            self.block_key(trace_record),
            trace_record.block_id,
            trace_record.block_size,
            trace_record.no_insert,
        )
        self._update_stats(
            trace_record.access_time, is_hit=is_hit, miss_bytes=trace_record.block_size
        )

    def _access_row(self, trace_record):
        row_key = self.row_key(trace_record)
        self.max_seen_get_id = max(self.max_seen_get_id, trace_record.get_id)
        self.get_id_row_key_map.pop(
            self.max_seen_get_id - self.retain_get_id_range, None
        )
        if trace_record.get_id not in self.get_id_row_key_map:
            self.get_id_row_key_map[trace_record.get_id] = {}
            self.get_id_row_key_map[trace_record.get_id]["h"] = False
        if self.get_id_row_key_map[trace_record.get_id]["h"]:
            # We treat future accesses as hits since this get request
            # completes.
            # print("row hit 1")
            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
            return
        if row_key not in self.get_id_row_key_map[trace_record.get_id]:
            # First time seeing this key.
            is_hit = self._access_kv(
                trace_record,
                key=row_key,
                hash=trace_record.key_id,
                value_size=trace_record.kv_size,
                no_insert=False,
            )
            inserted = False
            if trace_record.kv_size > 0:
                inserted = True
            self.get_id_row_key_map[trace_record.get_id][row_key] = inserted
            self.get_id_row_key_map[trace_record.get_id]["h"] = is_hit
        if self.get_id_row_key_map[trace_record.get_id]["h"]:
            # We treat future accesses as hits since this get request
            # completes.
            # print("row hit 2")
            self._update_stats(trace_record.access_time, is_hit=True, miss_bytes=0)
            return
        # Access its blocks.
        no_insert = trace_record.no_insert
        if (
            self.enable_cache_row_key == 2
            and trace_record.kv_size > 0
            and trace_record.block_type == 9
        ):
            no_insert = True
        is_hit = self._access_kv(
            trace_record,
            key=self.block_key(trace_record),
            hash=trace_record.block_id,
            value_size=trace_record.block_size,
            no_insert=no_insert,
        )
        self._update_stats(
            trace_record.access_time, is_hit, miss_bytes=trace_record.block_size
        )
        if (
            trace_record.kv_size > 0
            and not self.get_id_row_key_map[trace_record.get_id][row_key]
        ):
            # Insert the row key-value pair.
            self._access_kv(
                trace_record,
                key=row_key,
                hash=trace_record.key_id,
                value_size=trace_record.kv_size,
                no_insert=False,
            )
            # Mark as inserted.
            self.get_id_row_key_map[trace_record.get_id][row_key] = True

    def _access_kv(self, trace_record, key, hash, value_size, no_insert):
        # Sanity checks.
        assert self.used_size <= self.cache_size
        if self._lookup(trace_record, key, hash):
            # A cache hit.
            return True
        if no_insert or value_size <= 0:
            return False
        # A cache miss.
        if value_size > self.cache_size:
            # The block is too large to fit into the cache.
            return False
        self._evict(trace_record, key, hash, value_size)
        if self._should_admit(trace_record, key, hash, value_size):
            self._insert(trace_record, key, hash, value_size)
            self.used_size += value_size
        return False


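# Sketch of the dispatch above (illustrative, not executed here): subclasses
# only supply _lookup/_evict/_insert/_should_admit, and the simulator drives
# them through access(). Assuming "record" is a populated TraceRecord:
#
#   cache = LRUCache(cache_size=1024 * 1024, enable_cache_row_key=0)
#   cache.access(record)
#   print(cache.miss_ratio_stats.miss_ratio())  # aggregate miss ratio, in %

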
class CostClassEntry:
    """
    A cost class maintains aggregated statistics of cached entries in a class.
    For example, we may define block type as a class. Then, cached blocks of
    the same type will share one cost class entry.
    """

    def __init__(self):
        self.hits = 0
        self.num_entries_in_cache = 0
        self.size_in_cache = 0
        self.sum_insertion_times = 0
        self.sum_last_access_time = 0

    def insert(self, trace_record, key, value_size):
        self.size_in_cache += value_size
        self.num_entries_in_cache += 1
        self.sum_insertion_times += trace_record.access_time / kMicrosInSecond
        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond

    def remove(self, insertion_time, last_access_time, key, value_size, num_hits):
        self.hits -= num_hits
        self.num_entries_in_cache -= 1
        self.sum_insertion_times -= insertion_time / kMicrosInSecond
        self.size_in_cache -= value_size
        self.sum_last_access_time -= last_access_time / kMicrosInSecond

    def update_on_hit(self, trace_record, last_access_time):
        self.hits += 1
        self.sum_last_access_time -= last_access_time / kMicrosInSecond
        self.sum_last_access_time += trace_record.access_time / kMicrosInSecond

    def avg_lifetime_in_cache(self, now):
        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
        return now / kMicrosInSecond - avg_insertion_time

    def avg_last_access_time(self):
        if self.num_entries_in_cache == 0:
            return 0
        return float(self.sum_last_access_time) / float(self.num_entries_in_cache)

    def avg_size(self):
        if self.num_entries_in_cache == 0:
            return 0
        return float(self.size_in_cache) / float(self.num_entries_in_cache)

    def density(self, now):
        avg_insertion_time = self.sum_insertion_times / self.num_entries_in_cache
        in_cache_duration = now / kMicrosInSecond - avg_insertion_time
        return self.size_in_cache * in_cache_duration


class MLCache(Cache):
    """
    MLCache is the base class for implementations of alternative replacement
    policies using reinforcement learning.
    """

    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
        super(MLCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = HashTable()
        self.policy_stats = PolicyStats(kSecondsInMinute, policies)
        self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies)
        self.policies = policies
        self.cost_classes = {}
        self.cost_class_label = cost_class_label

    def is_ml_cache(self):
        return True

    def _lookup(self, trace_record, key, hash):
        value = self.table.lookup(key, hash)
        if value is not None:
            # Update the entry's cost class statistics.
            if self.cost_class_label is not None:
                cost_class = value.cost_class(self.cost_class_label)
                assert cost_class in self.cost_classes
                self.cost_classes[cost_class].update_on_hit(
                    trace_record, value.last_access_time
                )
            # Update the entry's last access time.
            self.table.insert(
                key,
                hash,
                CacheEntry(
                    value_size=value.value_size,
                    cf_id=value.cf_id,
                    level=value.level,
                    block_type=value.block_type,
                    table_id=value.table_id,
                    access_number=self.miss_ratio_stats.num_accesses,
                    time_s=trace_record.access_time,
                    num_hits=value.num_hits + 1,
                ),
            )
            return True
        return False

    def _evict(self, trace_record, key, hash, value_size):
        # Select a policy, randomly sample kSampleSize keys from the cache,
        # then evict keys in the sample set until we have enough room for the
        # new entry.
        policy_index = self._select_policy(trace_record, key)
        assert policy_index < len(self.policies) and policy_index >= 0
        self.policies[policy_index].delete(key)
        self.policy_stats.update_metrics(trace_record.access_time, policy_index)
        self.per_hour_policy_stats.update_metrics(
            trace_record.access_time, policy_index
        )
        while self.used_size + value_size > self.cache_size:
            # Randomly sample n entries.
            samples = self.table.random_sample(kSampleSize)
            samples = self.policies[policy_index].prioritize_samples(
                samples,
                [trace_record.access_time, self.cost_classes, self.cost_class_label],
            )
            for hash_entry in samples:
                assert self.table.delete(hash_entry.key, hash_entry.hash) is not None
                self.used_size -= hash_entry.value.value_size
                self.policies[policy_index].evict(
                    key=hash_entry.key, max_size=self.table.elements
                )
                # Update the entry's cost class statistics.
                if self.cost_class_label is not None:
                    cost_class = hash_entry.value.cost_class(self.cost_class_label)
                    assert cost_class in self.cost_classes
                    self.cost_classes[cost_class].remove(
                        hash_entry.value.insertion_time,
                        hash_entry.value.last_access_time,
                        key,
                        hash_entry.value.value_size,
                        hash_entry.value.num_hits,
                    )
                if self.used_size + value_size <= self.cache_size:
                    break

    def _insert(self, trace_record, key, hash, value_size):
        assert self.used_size + value_size <= self.cache_size
        entry = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            self.miss_ratio_stats.num_accesses,
            trace_record.access_time,
        )
        # Update the entry's cost class statistics.
        if self.cost_class_label is not None:
            cost_class = entry.cost_class(self.cost_class_label)
            if cost_class not in self.cost_classes:
                self.cost_classes[cost_class] = CostClassEntry()
            self.cost_classes[cost_class].insert(trace_record, key, value_size)
        self.table.insert(key, hash, entry)

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def _select_policy(self, trace_record, key):
        raise NotImplementedError


class ThompsonSamplingCache(MLCache):
    """
    An implementation of Thompson Sampling for the Bernoulli Bandit.

    Daniel J. Russo, Benjamin Van Roy, Abbas Kazerouni, Ian Osband,
    and Zheng Wen. 2018. A Tutorial on Thompson Sampling. Found.
    Trends Mach. Learn. 11, 1 (July 2018), 1-96.
    DOI: https://doi.org/10.1561/2200000070
    """

    def __init__(
        self,
        cache_size,
        enable_cache_row_key,
        policies,
        cost_class_label,
        init_a=1,
        init_b=1,
    ):
        super(ThompsonSamplingCache, self).__init__(
            cache_size, enable_cache_row_key, policies, cost_class_label
        )
        # Each policy arm keeps a Beta(a, b) posterior over its reward.
        self._as = [init_a] * len(self.policies)
        self._bs = [init_b] * len(self.policies)

    def _select_policy(self, trace_record, key):
        if len(self.policies) == 1:
            return 0
        samples = [
            np.random.beta(self._as[x], self._bs[x]) for x in range(len(self.policies))
        ]
        selected_policy = max(range(len(self.policies)), key=lambda x: samples[x])
        reward = self.policies[selected_policy].generate_reward(key)
        assert reward <= 1 and reward >= 0
        self._as[selected_policy] += reward
        self._bs[selected_policy] += 1 - reward
        return selected_policy

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format(
                self.cost_class_label
            )
        return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label)


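# The bandit update above in isolation (illustrative): each arm's Beta
# posterior sharpens as rewards arrive, and sampling from it naturally
# balances exploration and exploitation.
#
#   a, b = 1, 1                    # Beta(1, 1) = uniform prior
#   for reward in (1, 1, 0, 1):    # hypothetical rewards for one arm
#       a += reward
#       b += 1 - reward
#   draw = np.random.beta(a, b)    # sampled win rate; E[draw] = 4/6

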
class LinUCBCache(MLCache):
    """
    An implementation of LinUCB with disjoint linear models.

    Lihong Li, Wei Chu, John Langford, and Robert E. Schapire. 2010.
    A contextual-bandit approach to personalized news article recommendation.
    In Proceedings of the 19th International Conference on World Wide Web
    (WWW '10). ACM, New York, NY, USA, 661-670.
    DOI: http://dx.doi.org/10.1145/1772690.1772758
    """

    def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label):
        super(LinUCBCache, self).__init__(
            cache_size, enable_cache_row_key, policies, cost_class_label
        )
        self.nfeatures = 4  # Block type, level, cf.
        self.th = np.zeros((len(self.policies), self.nfeatures))
        self.eps = 0.2
        self.b = np.zeros_like(self.th)
        self.A = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
        self.A_inv = np.zeros((len(self.policies), self.nfeatures, self.nfeatures))
        for i in range(len(self.policies)):
            self.A[i] = np.identity(self.nfeatures)
            self.A_inv[i] = np.identity(self.nfeatures)
        self.th_hat = np.zeros_like(self.th)
        self.p = np.zeros(len(self.policies))
        self.alph = 0.2

    def _select_policy(self, trace_record, key):
        if len(self.policies) == 1:
            return 0
        x_i = np.zeros(self.nfeatures)  # The current context vector.
        x_i[0] = trace_record.block_type
        x_i[1] = trace_record.level
        x_i[2] = trace_record.cf_id
        p = np.zeros(len(self.policies))
        for a in range(len(self.policies)):
            self.th_hat[a] = self.A_inv[a].dot(self.b[a])
            ta = x_i.dot(self.A_inv[a]).dot(x_i)
            a_upper_ci = self.alph * np.sqrt(ta)
            a_mean = self.th_hat[a].dot(x_i)
            p[a] = a_mean + a_upper_ci
        # Add a tiny jitter to break ties randomly.
        p = p + (np.random.random(len(p)) * 0.000001)
        selected_policy = p.argmax()
        reward = self.policies[selected_policy].generate_reward(key)
        assert reward <= 1 and reward >= 0
        self.A[selected_policy] += np.outer(x_i, x_i)
        self.b[selected_policy] += reward * x_i
        self.A_inv[selected_policy] = np.linalg.inv(self.A[selected_policy])
        del x_i
        return selected_policy

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format(
                self.cost_class_label
            )
        return "LinUCB with cost class {} (linucb)".format(self.cost_class_label)


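# The per-arm update above follows disjoint-model LinUCB: for arm a and
# context x, the score is th_hat_a . x + alph * sqrt(x' A_a^-1 x), where
# A_a = I + sum of x x' and b_a = sum of reward * x over the rounds in which
# arm a was chosen, and th_hat_a = A_a^-1 b_a is the ridge-regression
# estimate of the arm's reward model.

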
class OPTCacheEntry:
    """
    A cache entry for the OPT algorithm. The entries are sorted based on their
    next access sequence numbers in reverse order, i.e., the entry whose next
    access is the furthest in the future is ordered before other entries.
    """

    def __init__(self, key, next_access_seq_no, value_size):
        self.key = key
        self.next_access_seq_no = next_access_seq_no
        self.value_size = value_size
        self.is_removed = False

    def __lt__(self, other):
        if other.next_access_seq_no != self.next_access_seq_no:
            return self.next_access_seq_no > other.next_access_seq_no
        return self.value_size < other.value_size

    def __repr__(self):
        return "({} {} {} {})".format(
            self.key, self.next_access_seq_no, self.value_size, self.is_removed
        )


class PQTable:
    """
    A hash table with a priority queue.
    """

    def __init__(self):
        # A list of entries arranged in a heap sorted based on the entry's
        # custom implementation of __lt__.
        self.pq = []
        self.table = {}

    def pqinsert(self, entry):
        """Add a new key or update the priority of an existing key."""
        # Remove the entry from the table first.
        removed_entry = self.table.pop(entry.key, None)
        if removed_entry:
            # Mark as removed since there is no 'remove' API in heapq.
            # Instead, an entry in pq is removed lazily when calling pop.
            removed_entry.is_removed = True
        self.table[entry.key] = entry
        heapq.heappush(self.pq, entry)
        return removed_entry

    def pqpop(self):
        while self.pq:
            entry = heapq.heappop(self.pq)
            if not entry.is_removed:
                del self.table[entry.key]
                return entry
        return None

    def pqpeek(self):
        while self.pq:
            entry = self.pq[0]
            if not entry.is_removed:
                return entry
            heapq.heappop(self.pq)
        return None

    def __contains__(self, k):
        return k in self.table

    def __getitem__(self, k):
        return self.table[k]

    def __len__(self):
        return len(self.table)

    def values(self):
        return self.table.values()


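# Lazy deletion in action (illustrative): re-inserting a key leaves the stale
# heap entry behind but flags it, so pqpop skips it on the way out.
#
#   pq = PQTable()
#   pq.pqinsert(OPTCacheEntry("k", next_access_seq_no=10, value_size=1))
#   pq.pqinsert(OPTCacheEntry("k", next_access_seq_no=5, value_size=1))
#   assert pq.pqpop().next_access_seq_no == 5  # the stale entry is skipped

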
class OPTCache(Cache):
    """
    An implementation of the Belady MIN algorithm. OPTCache evicts the entry
    in the cache whose next access occurs furthest in the future.
    Note that the Belady MIN algorithm is optimal assuming all blocks have
    the same size and a missing entry is always inserted into the cache.
    Neither is true for the block cache trace, since blocks have different
    sizes and we may not insert a block into the cache upon a cache miss.
    However, it is still useful as a "theoretical upper bound" on the lowest
    miss ratio we can achieve given a cache size.

    L. A. Belady. 1966. A Study of Replacement Algorithms for a
    Virtual-storage Computer. IBM Syst. J. 5, 2 (June 1966), 78-101.
    DOI: http://dx.doi.org/10.1147/sj.52.0078
    """

    def __init__(self, cache_size):
        super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0)
        self.table = PQTable()

    def _lookup(self, trace_record, key, hash):
        if key not in self.table:
            return False
        # A cache hit. Update its next access time.
        assert (
            self.table.pqinsert(
                OPTCacheEntry(
                    key, trace_record.next_access_seq_no, self.table[key].value_size
                )
            )
            is not None
        )
        return True

    def _evict(self, trace_record, key, hash, value_size):
        while self.used_size + value_size > self.cache_size:
            evict_entry = self.table.pqpop()
            assert evict_entry is not None
            self.used_size -= evict_entry.value_size

    def _insert(self, trace_record, key, hash, value_size):
        assert (
            self.table.pqinsert(
                OPTCacheEntry(key, trace_record.next_access_seq_no, value_size)
            )
            is None
        )

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def cache_name(self):
        return "Belady MIN (opt)"


class GDSizeEntry:
    """
    A cache entry for the greedy dual size replacement policy.
    """

    def __init__(self, key, value_size, priority):
        self.key = key
        self.value_size = value_size
        self.priority = priority
        self.is_removed = False

    def __lt__(self, other):
        if other.priority != self.priority:
            return self.priority < other.priority
        return self.value_size < other.value_size

    def __repr__(self):
        return "({} {} {} {})".format(
            self.key, self.priority, self.value_size, self.is_removed
        )


class GDSizeCache(Cache):
    """
    An implementation of the greedy dual size algorithm.
    We define the cost of an entry as its size.

    See https://www.usenix.org/legacy/publications/library/proceedings/usits97/full_papers/cao/cao_html/node8.html
    and N. Young. The k-server dual and loose competitiveness for paging.
    Algorithmica, June 1994, vol. 11, (no. 6): 525-41.
    A rewritten version of "On-line caching as cache size varies",
    in The 2nd Annual ACM-SIAM Symposium on Discrete Algorithms, 241-250, 1991.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = PQTable()
        self.L = 0.0

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid GreedyDualSize (gdsize_hybrid)"
        return "GreedyDualSize (gdsize)"

    def _lookup(self, trace_record, key, hash):
        if key not in self.table:
            return False
        # A cache hit. Update its priority.
        entry = self.table[key]
        assert (
            self.table.pqinsert(
                GDSizeEntry(key, entry.value_size, self.L + entry.value_size)
            )
            is not None
        )
        return True

    def _evict(self, trace_record, key, hash, value_size):
        while self.used_size + value_size > self.cache_size:
            evict_entry = self.table.pqpop()
            assert evict_entry is not None
            self.L = evict_entry.priority
            self.used_size -= evict_entry.value_size

    def _insert(self, trace_record, key, hash, value_size):
        assert (
            self.table.pqinsert(GDSizeEntry(key, value_size, self.L + value_size))
            is None
        )

    def _should_admit(self, trace_record, key, hash, value_size):
        return True


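# Greedy dual size in miniature (illustrative numbers): start with L = 0,
# insert A (size 4) -> priority 4, insert B (size 1) -> priority 1. Eviction
# pops B first (lowest priority) and raises L to 1, so the next insert
# C (size 2) gets priority L + 2 = 3, and a later hit on A re-inserts it at
# L + 4 = 5. The inflation value L ages out long-idle entries while refreshing
# recently touched ones.

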
class Deque(object):
    """A Deque class facilitates the implementation of LRU and ARC."""

    def __init__(self):
        self.od = OrderedDict()

    def appendleft(self, k):
        if k in self.od:
            del self.od[k]
        self.od[k] = None

    def pop(self):
        item = self.od.popitem(last=False) if self.od else None
        if item is not None:
            return item[0]
        return None

    def remove(self, k):
        del self.od[k]

    def __len__(self):
        return len(self.od)

    def __contains__(self, k):
        return k in self.od

    def __iter__(self):
        return reversed(self.od)

    def __repr__(self):
        return "Deque(%r)" % (list(self),)


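# MRU-to-LRU bookkeeping with the Deque above (illustrative):
#
#   d = Deque()
#   d.appendleft("a")
#   d.appendleft("b")
#   d.appendleft("a")        # re-access: "a" becomes the most recent key
#   assert list(d) == ["a", "b"]
#   assert d.pop() == "b"    # pop() always removes the least recent key

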
class ARCCache(Cache):
    """
    An implementation of ARC. ARC assumes that all blocks have the same size.
    The sizes of index and filter blocks are variable. To accommodate this,
    we modified ARC as follows:
    1) We use 16 KB as the average block size and calculate the number of
       blocks (c) in the cache.
    2) When we insert an entry, the cache evicts entries in both t1 and t2
       queues until it has enough space for the new entry. This also requires
       modification of the algorithm to maintain a maximum of 2*c blocks.

    Nimrod Megiddo and Dharmendra S. Modha. 2003. ARC: A Self-Tuning, Low
    Overhead Replacement Cache. In Proceedings of the 2nd USENIX Conference
    on File and Storage Technologies (FAST '03). USENIX Association,
    Berkeley, CA, USA, 115-130.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(ARCCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = {}
        self.c = cache_size // (16 * 1024)  # Number of elements in the cache.
        self.p = 0  # Target size for the list T1.
        # L1: keys accessed only once recently.
        self.t1 = Deque()  # T1: recent cache entries.
        self.b1 = Deque()  # B1: ghost entries recently evicted from T1.
        # L2: keys accessed at least twice recently.
        self.t2 = Deque()  # T2: frequent entries.
        self.b2 = Deque()  # B2: ghost entries recently evicted from T2.

    def _replace(self, key, value_size):
        while self.used_size + value_size > self.cache_size:
            if self.t1 and ((key in self.b2) or (len(self.t1) > self.p)):
                old = self.t1.pop()
                self.b1.appendleft(old)
            else:
                if self.t2:
                    old = self.t2.pop()
                    self.b2.appendleft(old)
                else:
                    old = self.t1.pop()
                    self.b1.appendleft(old)
            self.used_size -= self.table[old].value_size
            del self.table[old]

    def _lookup(self, trace_record, key, hash):
        # Case I: key is in T1 or T2.
        # Move key to the MRU position in T2.
        if key in self.t1:
            self.t1.remove(key)
            self.t2.appendleft(key)
            return True
        if key in self.t2:
            self.t2.remove(key)
            self.t2.appendleft(key)
            return True
        return False

    def _evict(self, trace_record, key, hash, value_size):
        # Case II: key is in B1.
        # Move x from B1 to the MRU position in T2 (also fetch x to the cache).
        if key in self.b1:
            self.p = min(self.c, self.p + max(len(self.b2) // len(self.b1), 1))
            self._replace(key, value_size)
            self.b1.remove(key)
            self.t2.appendleft(key)
            return
        # Case III: key is in B2.
        # Move x from B2 to the MRU position in T2 (also fetch x to the cache).
        if key in self.b2:
            self.p = max(0, self.p - max(len(self.b1) // len(self.b2), 1))
            self._replace(key, value_size)
            self.b2.remove(key)
            self.t2.appendleft(key)
            return
        # Case IV: key is not in (T1 u B1 u T2 u B2).
        self._replace(key, value_size)
        while len(self.t1) + len(self.b1) >= self.c and self.b1:
            self.b1.pop()
        total = len(self.t1) + len(self.b1) + len(self.t2) + len(self.b2)
        while total >= (2 * self.c) and self.b2:
            self.b2.pop()
            total -= 1
        # Finally, move it to the MRU position in T1.
        self.t1.appendleft(key)
        return

    def _insert(self, trace_record, key, hash, value_size):
        self.table[key] = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            0,
            trace_record.access_time,
        )

    def _should_admit(self, trace_record, key, hash, value_size):
        return True

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid Adaptive Replacement Cache (arc_hybrid)"
        return "Adaptive Replacement Cache (arc)"




class LRUCache(Cache):
    """
    A strict LRU queue.
    """

    def __init__(self, cache_size, enable_cache_row_key):
        super(LRUCache, self).__init__(cache_size, enable_cache_row_key)
        self.table = {}
        self.lru = Deque()

    def cache_name(self):
        if self.enable_cache_row_key:
            return "Hybrid LRU (lru_hybrid)"
        return "LRU (lru)"

    def _lookup(self, trace_record, key, hash):
        if key not in self.table:
            return False
        # A cache hit. Update LRU queue.
        self.lru.remove(key)
        self.lru.appendleft(key)
        return True

    def _evict(self, trace_record, key, hash, value_size):
        while self.used_size + value_size > self.cache_size:
            evict_key = self.lru.pop()
            self.used_size -= self.table[evict_key].value_size
            del self.table[evict_key]

    def _insert(self, trace_record, key, hash, value_size):
        self.table[key] = CacheEntry(
            value_size,
            trace_record.cf_id,
            trace_record.level,
            trace_record.block_type,
            trace_record.table_id,
            0,
            trace_record.access_time,
        )
        self.lru.appendleft(key)

    def _should_admit(self, trace_record, key, hash, value_size):
        return True
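
# Minimal usage sketch (illustrative values; `record` is a TraceRecord as
# constructed in run() below):
#   cache = LRUCache(cache_size=1024 * 1024, enable_cache_row_key=0)
#   cache.access(record)  # the base-class access() drives _lookup/_evict/_insert
#   print(cache.miss_ratio_stats.miss_ratio())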


class TraceCache(Cache):
    """
    A trace cache. Lookup returns true if the trace observes a cache hit.
    It is used to replay the cache hits observed in the trace.
    """

    def __init__(self, cache_size):
        super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0)

    def _lookup(self, trace_record, key, hash):
        return trace_record.is_hit

    def _evict(self, trace_record, key, hash, value_size):
        pass

    def _insert(self, trace_record, key, hash, value_size):
        pass

    def _should_admit(self, trace_record, key, hash, value_size):
        return False

    def cache_name(self):
        return "Trace"


def parse_cache_size(cs):
    cs = cs.replace("\n", "")
    if cs[-1] == "M":
        return int(cs[:-1]) * 1024 * 1024
    if cs[-1] == "G":
        return int(cs[:-1]) * 1024 * 1024 * 1024
    if cs[-1] == "T":
        return int(cs[:-1]) * 1024 * 1024 * 1024 * 1024
    return int(cs)
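
# Examples (binary units, as implemented above):
#   parse_cache_size("16M")  == 16 * 1024 * 1024
#   parse_cache_size("1G")   == 1024 ** 3
#   parse_cache_size("1024") == 1024  # a plain byte count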


def create_cache(cache_type, cache_size, downsample_size):
    # Scale down the simulated cache size by the trace sampling frequency.
    cache_size = cache_size / downsample_size
    enable_cache_row_key = 0
    if "hybridn" in cache_type:
        enable_cache_row_key = 2
        cache_type = cache_type[:-8]  # Strip the trailing "_hybridn".
    if "hybrid" in cache_type:
        enable_cache_row_key = 1
        cache_type = cache_type[:-7]  # Strip the trailing "_hybrid".
    if cache_type == "ts":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
            cost_class_label=None,
        )
    elif cache_type == "linucb":
        return LinUCBCache(
            cache_size,
            enable_cache_row_key,
            [LRUPolicy(), LFUPolicy(), HyperbolicPolicy()],
            cost_class_label=None,
        )
    elif cache_type == "pylru":
        return ThompsonSamplingCache(
            cache_size, enable_cache_row_key, [LRUPolicy()], cost_class_label=None
        )
    elif cache_type == "pymru":
        return ThompsonSamplingCache(
            cache_size, enable_cache_row_key, [MRUPolicy()], cost_class_label=None
        )
    elif cache_type == "pylfu":
        return ThompsonSamplingCache(
            cache_size, enable_cache_row_key, [LFUPolicy()], cost_class_label=None
        )
    elif cache_type == "pyhb":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [HyperbolicPolicy()],
            cost_class_label=None,
        )
    elif cache_type == "pycctbbt":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [CostClassPolicy()],
            cost_class_label="table_bt",
        )
    elif cache_type == "pycccf":
        return ThompsonSamplingCache(
            cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="cf"
        )
    elif cache_type == "pycctblevelbt":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [CostClassPolicy()],
            cost_class_label="table_level_bt",
        )
    elif cache_type == "pycccfbt":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [CostClassPolicy()],
            cost_class_label="cf_bt",
        )
    elif cache_type == "pycctb":
        return ThompsonSamplingCache(
            cache_size,
            enable_cache_row_key,
            [CostClassPolicy()],
            cost_class_label="table",
        )
    elif cache_type == "pyccbt":
        return ThompsonSamplingCache(
            cache_size, enable_cache_row_key, [CostClassPolicy()], cost_class_label="bt"
        )
    elif cache_type == "opt":
        if enable_cache_row_key:
            print("opt does not support hybrid mode.")
            assert False
        return OPTCache(cache_size)
    elif cache_type == "trace":
        if enable_cache_row_key:
            print("trace does not support hybrid mode.")
            assert False
        return TraceCache(cache_size)
    elif cache_type == "lru":
        return LRUCache(cache_size, enable_cache_row_key)
    elif cache_type == "arc":
        return ARCCache(cache_size, enable_cache_row_key)
    elif cache_type == "gdsize":
        return GDSizeCache(cache_size, enable_cache_row_key)
    else:
        print("Unknown cache type {}".format(cache_type))
        assert False
    return None
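
# Example (illustrative): a hybrid ARC cache for a trace sampled 1-in-10,
# simulating a 1 GB cache scaled down to ~100 MB:
#   cache = create_cache("arc_hybrid", parse_cache_size("1G"), 10)
#   assert cache.cache_name() == "Hybrid Adaptive Replacement Cache (arc_hybrid)"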


class BlockAccessTimeline:
    """
    BlockAccessTimeline stores all accesses of a block.
    """

    def __init__(self):
        self.accesses = []
        # Index 0 is the block's first access; the "next" access seen while
        # processing access i is the one stored at index i + 1, so the cursor
        # starts at 1.
        self.current_access_index = 1

    def get_next_access(self):
        if self.current_access_index == len(self.accesses):
            return sys.maxsize
        next_access_seq_no = self.accesses[self.current_access_index]
        self.current_access_index += 1
        return next_access_seq_no
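
# Example (illustrative): for a block accessed at sequence numbers [3, 7, 12],
# successive get_next_access() calls return 7, then 12, then sys.maxsize
# (meaning "never accessed again"), which is what OPT uses to pick its victim.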


def percent(e1, e2):
    if e2 == 0:
        return -1
    return float(e1) * 100.0 / float(e2)
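
# Example: percent(25, 200) == 12.5; percent(1, 0) == -1 (a sentinel for an
# undefined percentage).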


def is_target_cf(access_cf, target_cf_name):
    if target_cf_name == "all":
        return True
    return access_cf == target_cf_name


def run(
    trace_file_path,
    cache_type,
    cache,
    warmup_seconds,
    max_accesses_to_process,
    target_cf_name,
):
    warmup_complete = False
    trace_miss_ratio_stats = MissRatioStats(kSecondsInMinute)
    access_seq_no = 0
    time_interval = 1
    start_time = time.time()
    trace_start_time = 0
    trace_duration = 0
    is_opt_cache = False
    if cache.cache_name() == "Belady MIN (opt)":
        is_opt_cache = True

    block_access_timelines = {}
    num_no_inserts = 0
    num_blocks_with_no_size = 0
    num_inserts_block_with_no_size = 0
    if is_opt_cache:
        # Read all accesses into memory and store each block's access times so
        # that OPT can evict the cached key whose next access is furthest in
        # the future.
        print("Preprocessing block traces.")
        with open(trace_file_path, "r") as trace_file:
            for line in trace_file:
                if (
                    max_accesses_to_process != -1
                    and access_seq_no > max_accesses_to_process
                ):
                    break
                ts = line.split(",")
                timestamp = int(ts[0])
                cf_name = ts[5]
                if not is_target_cf(cf_name, target_cf_name):
                    continue
                if trace_start_time == 0:
                    trace_start_time = timestamp
                trace_duration = timestamp - trace_start_time
                block_id = int(ts[1])
                block_size = int(ts[3])
                no_insert = int(ts[9])
                if block_id not in block_access_timelines:
                    block_access_timelines[block_id] = BlockAccessTimeline()
                    if block_size == 0:
                        num_blocks_with_no_size += 1
                block_access_timelines[block_id].accesses.append(access_seq_no)
                access_seq_no += 1
                if no_insert == 1:
                    num_no_inserts += 1
                if no_insert == 0 and block_size == 0:
                    num_inserts_block_with_no_size += 1
                if access_seq_no % 100 != 0:
                    continue
                # Report progress every 10 seconds.
                now = time.time()
                if now - start_time > time_interval * 10:
                    print(
                        "Took {} seconds to process {} trace records with trace "
                        "duration of {} seconds. Throughput: {} records/second.".format(
                            now - start_time,
                            access_seq_no,
                            trace_duration / 1000000,
                            access_seq_no / (now - start_time),
                        )
                    )
                    time_interval += 1
        print(
            "Trace contains {0} blocks, {1} ({2:.2f}%) blocks with no size, "
            "{3} accesses, {4} ({5:.2f}%) accesses with no_insert, and "
            "{6} ({7:.2f}%) accesses that want to insert but whose block size "
            "is 0.".format(
                len(block_access_timelines),
                num_blocks_with_no_size,
                percent(num_blocks_with_no_size, len(block_access_timelines)),
                access_seq_no,
                num_no_inserts,
                percent(num_no_inserts, access_seq_no),
                num_inserts_block_with_no_size,
                percent(num_inserts_block_with_no_size, access_seq_no),
            )
        )

    # Reset the counters before the simulation pass.
    access_seq_no = 0
    time_interval = 1
    start_time = time.time()
    trace_start_time = 0
    trace_duration = 0
    print("Running simulated {} cache on block traces.".format(cache.cache_name()))
    with open(trace_file_path, "r") as trace_file:
        for line in trace_file:
            if (
                max_accesses_to_process != -1
                and access_seq_no > max_accesses_to_process
            ):
                break
            if access_seq_no % 1000000 == 0:
                # Force a python gc periodically to reduce memory usage.
                gc.collect()
            ts = line.split(",")
            timestamp = int(ts[0])
            cf_name = ts[5]
            if not is_target_cf(cf_name, target_cf_name):
                continue
            if trace_start_time == 0:
                trace_start_time = timestamp
            trace_duration = timestamp - trace_start_time
            if (
                not warmup_complete
                and warmup_seconds > 0
                and trace_duration > warmup_seconds * 1000000
            ):
                # The warmup window has passed; start measuring from scratch.
                cache.miss_ratio_stats.reset_counter()
                warmup_complete = True
            next_access_seq_no = 0
            block_id = int(ts[1])
            if is_opt_cache:
                next_access_seq_no = block_access_timelines[block_id].get_next_access()
            record = TraceRecord(
                access_time=int(ts[0]),
                block_id=int(ts[1]),
                block_type=int(ts[2]),
                block_size=int(ts[3]),
                cf_id=int(ts[4]),
                cf_name=ts[5],
                level=int(ts[6]),
                fd=int(ts[7]),
                caller=int(ts[8]),
                no_insert=int(ts[9]),
                get_id=int(ts[10]),
                key_id=int(ts[11]),
                kv_size=int(ts[12]),
                is_hit=int(ts[13]),
                referenced_key_exist_in_block=int(ts[14]),
                num_keys_in_block=int(ts[15]),
                table_id=int(ts[16]),
                seq_number=int(ts[17]),
                block_key_size=int(ts[18]),
                key_size=int(ts[19]),
                block_offset_in_file=int(ts[20]),
                next_access_seq_no=next_access_seq_no,
            )
            trace_miss_ratio_stats.update_metrics(
                record.access_time, is_hit=record.is_hit, miss_bytes=record.block_size
            )
            cache.access(record)
            access_seq_no += 1
            del record
            del ts
            if access_seq_no % 100 != 0:
                continue
            # Report progress every 10 seconds.
            now = time.time()
            if now - start_time > time_interval * 10:
                print(
                    "Took {} seconds to process {} trace records with trace "
                    "duration of {} seconds. Throughput: {} records/second. "
                    "Trace miss ratio {}".format(
                        now - start_time,
                        access_seq_no,
                        trace_duration / 1000000,
                        access_seq_no / (now - start_time),
                        trace_miss_ratio_stats.miss_ratio(),
                    )
                )
                time_interval += 1
                print(
                    "{},0,0,{},{},{}".format(
                        cache_type,
                        cache.cache_size,
                        cache.miss_ratio_stats.miss_ratio(),
                        cache.miss_ratio_stats.num_accesses,
                    )
                )
    now = time.time()
    print(
        "Took {} seconds to process {} trace records with trace duration of {} "
        "seconds. Throughput: {} records/second. Trace miss ratio {}".format(
            now - start_time,
            access_seq_no,
            trace_duration / 1000000,
            access_seq_no / (now - start_time),
            trace_miss_ratio_stats.miss_ratio(),
        )
    )
    print(
        "{},0,0,{},{},{}".format(
            cache_type,
            cache.cache_size,
            cache.miss_ratio_stats.miss_ratio(),
            cache.miss_ratio_stats.num_accesses,
        )
    )
    return trace_start_time, trace_duration
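
# Each "{cache_type},0,0,{cache_size},{miss_ratio},{num_accesses}" line above
# (and the data-ml-mrc file below) is one point on a miss ratio curve; the two
# zero columns appear to be fixed placeholders in this output format.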


def report_stats(
    cache,
    cache_type,
    cache_size,
    target_cf_name,
    result_dir,
    trace_start_time,
    trace_end_time,
):
    cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name)
    with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file:
        mrc_file.write(
            "{},0,0,{},{},{}\n".format(
                cache_type,
                cache_size,
                cache.miss_ratio_stats.miss_ratio(),
                cache.miss_ratio_stats.num_accesses,
            )
        )
    cache_stats = [
        cache.per_second_miss_ratio_stats,
        cache.miss_ratio_stats,
        cache.per_hour_miss_ratio_stats,
    ]
    for cache_stat in cache_stats:
        avg_miss_bytes, p95_miss_bytes = cache_stat.compute_miss_bytes()
        with open(
            "{}/data-ml-avgmb-{}-{}".format(
                result_dir, cache_stat.time_unit, cache_label
            ),
            "w+",
        ) as mb_file:
            mb_file.write(
                "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes)
            )
        with open(
            "{}/data-ml-p95mb-{}-{}".format(
                result_dir, cache_stat.time_unit, cache_label
            ),
            "w+",
        ) as mb_file:
            mb_file.write(
                "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes)
            )
        cache_stat.write_miss_timeline(
            cache_type,
            cache_size,
            target_cf_name,
            result_dir,
            trace_start_time,
            trace_end_time,
        )
        cache_stat.write_miss_ratio_timeline(
            cache_type,
            cache_size,
            target_cf_name,
            result_dir,
            trace_start_time,
            trace_end_time,
        )
    if not cache.is_ml_cache():
        return
    policy_stats = [cache.policy_stats, cache.per_hour_policy_stats]
    for policy_stat in policy_stats:
        policy_stat.write_policy_timeline(
            cache_type,
            cache_size,
            target_cf_name,
            result_dir,
            trace_start_time,
            trace_end_time,
        )
        policy_stat.write_policy_ratio_timeline(
            cache_type,
            cache_size,
            target_cf_name,
            result_dir,
            trace_start_time,
            trace_end_time,
        )


if __name__ == "__main__":
    if len(sys.argv) <= 8:
        print(
            "Must provide 8 arguments.\n"
            "1) Cache type (ts, linucb, arc, lru, opt, pylru, pymru, pylfu, "
            "pyhb, gdsize, trace). One may evaluate the hybrid row_block cache "
            "by appending '_hybrid' to a cache_type, e.g., ts_hybrid. "
            "Note that hybrid is not supported with opt and trace.\n"
            "2) Cache size (xM, xG, xT).\n"
            "3) The sampling frequency used to collect the trace. (The "
            "simulation scales down the cache size by the sampling frequency.)\n"
            "4) Warmup seconds (the number of seconds used for warmup).\n"
            "5) Trace file path.\n"
            "6) Result directory (a directory that saves generated results).\n"
            "7) Max number of accesses to process (-1 means all).\n"
            "8) The target column family. (The simulation will only run "
            "accesses on the target column family. If it is set to all, "
            "it will run against all accesses.)"
        )
        exit(1)
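
    # Example invocation (paths are placeholders):
    #   python block_cache_pysim.py lru_hybrid 16G 10 3600 /tmp/block_trace.csv \
    #       /tmp/sim_results -1 all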
  1743. print("Arguments: {}".format(sys.argv))
  1744. cache_type = sys.argv[1]
  1745. cache_size = parse_cache_size(sys.argv[2])
  1746. downsample_size = int(sys.argv[3])
  1747. warmup_seconds = int(sys.argv[4])
  1748. trace_file_path = sys.argv[5]
  1749. result_dir = sys.argv[6]
  1750. max_accesses_to_process = int(sys.argv[7])
  1751. target_cf_name = sys.argv[8]
  1752. cache = create_cache(cache_type, cache_size, downsample_size)
  1753. trace_start_time, trace_duration = run(
  1754. trace_file_path,
  1755. cache_type,
  1756. cache,
  1757. warmup_seconds,
  1758. max_accesses_to_process,
  1759. target_cf_name,
  1760. )
  1761. trace_end_time = trace_start_time + trace_duration
  1762. report_stats(
  1763. cache,
  1764. cache_type,
  1765. cache_size,
  1766. target_cf_name,
  1767. result_dir,
  1768. trace_start_time,
  1769. trace_end_time,
  1770. )