db_crashtest.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. #!/usr/bin/env python2
  2. # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  3. import os
  4. import sys
  5. import time
  6. import random
  7. import tempfile
  8. import subprocess
  9. import shutil
  10. import argparse
  11. # params overwrite priority:
  12. # for default:
  13. # default_params < {blackbox,whitebox}_default_params < args
  14. # for simple:
  15. # default_params < {blackbox,whitebox}_default_params <
  16. # simple_default_params <
  17. # {blackbox,whitebox}_simple_default_params < args
  18. # for cf_consistency:
  19. # default_params < {blackbox,whitebox}_default_params <
  20. # cf_consistency_params < args
  21. # for txn:
  22. # default_params < {blackbox,whitebox}_default_params < txn_params < args
# Scratch file shared across db_stress runs: passed via --expected_values_path
# so a restarted instance can verify data written by a killed predecessor.
expected_values_file = tempfile.NamedTemporaryFile()

# Base parameter set for every test flavor. Values may be plain constants or
# zero-argument callables; callables are sampled once per db_stress run by
# finalize_and_sanitize(), which also enforces cross-option constraints.
default_params = {
    "acquire_snapshot_one_in": 10000,
    "block_size": 16384,
    "bloom_bits": lambda: random.choice([random.randint(0, 19),
                                         random.lognormvariate(2.3, 1.3)]),
    "cache_index_and_filter_blocks": lambda: random.randint(0, 1),
    "cache_size": 1048576,
    "checkpoint_one_in": 1000000,
    "compression_type": lambda: random.choice(
        ["none", "snappy", "zlib", "bzip2", "lz4", "lz4hc", "xpress", "zstd"]),
    "bottommost_compression_type": lambda:
        "disable" if random.randint(0, 1) == 0 else
        random.choice(
            ["none", "snappy", "zlib", "bzip2", "lz4", "lz4hc", "xpress",
             "zstd"]),
    "checksum_type": lambda: random.choice(["kCRC32c", "kxxHash", "kxxHash64"]),
    "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
    "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
    "clear_column_family_one_in": 0,
    "compact_files_one_in": 1000000,
    "compact_range_one_in": 1000000,
    "delpercent": 4,
    "delrangepercent": 1,
    "destroy_db_initially": 0,
    "enable_pipelined_write": lambda: random.randint(0, 1),
    "expected_values_path": expected_values_file.name,
    "flush_one_in": 1000000,
    "get_live_files_and_wal_files_one_in": 1000000,
    # Temporarily disable hash index
    "index_type": lambda: random.choice([0, 2]),
    "max_background_compactions": 20,
    "max_bytes_for_level_base": 10485760,
    "max_key": 100000000,
    "max_write_buffer_number": 3,
    "mmap_read": lambda: random.randint(0, 1),
    "nooverwritepercent": 1,
    "open_files": lambda: random.choice([-1, 500000]),
    "partition_filters": lambda: random.randint(0, 1),
    "pause_background_one_in": 1000000,
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
    "recycle_log_file_num": lambda: random.randint(0, 1),
    "reopen": 20,
    "snapshot_hold_ops": 100000,
    "long_running_snapshots": lambda: random.randint(0, 1),
    "subcompactions": lambda: random.randint(1, 4),
    "target_file_size_base": 2097152,
    "target_file_size_multiplier": 2,
    "use_direct_reads": lambda: random.randint(0, 1),
    "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1),
    "use_full_merge_v1": lambda: random.randint(0, 1),
    "use_merge": lambda: random.randint(0, 1),
    "verify_checksum": 1,
    "write_buffer_size": 4 * 1024 * 1024,
    "writepercent": 35,
    # 5 is listed twice to weight the newest format version more heavily.
    "format_version": lambda: random.choice([2, 3, 4, 5, 5]),
    "index_block_restart_interval": lambda: random.choice(range(1, 16)),
    "use_multiget": lambda: random.randint(0, 1),
    # Repeated 0s bias the choice toward the feature being disabled.
    "periodic_compaction_seconds":
        lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
    "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
    # Test small max_manifest_file_size in a smaller chance, as most of the
    # time we want manifest history to be preserved to help debug
    "max_manifest_file_size": lambda: random.choice(
        [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]),
    # Sync mode might make test runs slower so running it in a smaller chance
    "sync": lambda: random.choice(
        [1 if t == 0 else 0 for t in range(0, 20)]),
    # Disable compaction_readahead_size because the test is not passing.
    # "compaction_readahead_size" : lambda : random.choice(
    #     [0, 0, 1024 * 1024]),
    "db_write_buffer_size": lambda: random.choice(
        [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024]),
    # NOTE(review): these two are evaluated once at import time, not per run
    # (no lambda) — presumably intentional, as with txn_params; confirm.
    "avoid_unnecessary_blocking_io": random.randint(0, 1),
    "write_dbid_to_manifest": random.randint(0, 1),
    "max_write_batch_group_size_bytes": lambda: random.choice(
        [16, 64, 1024 * 1024, 16 * 1024 * 1024]),
    "level_compaction_dynamic_level_bytes": True,
    "verify_checksum_one_in": 1000000,
    "verify_db_one_in": 100000,
    "continuous_verification_interval": 0,
    "max_key_len": 3,
    "key_len_percent_dist": "1,30,69"
}
  109. _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
  110. def get_dbname(test_name):
  111. test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
  112. if test_tmpdir is None or test_tmpdir == "":
  113. dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
  114. else:
  115. dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name
  116. shutil.rmtree(dbname, True)
  117. os.mkdir(dbname)
  118. return dbname
  119. def is_direct_io_supported(dbname):
  120. with tempfile.NamedTemporaryFile(dir=dbname) as f:
  121. try:
  122. os.open(f.name, os.O_DIRECT)
  123. except:
  124. return False
  125. return True
# Overrides applied on top of default_params for blackbox mode, where this
# script kills db_stress from the outside at fixed intervals.
blackbox_default_params = {
    # total time for this script to test db_stress
    "duration": 6000,
    # time for one db_stress instance to run
    "interval": 120,
    # since we will be killing anyway, use large value for ops_per_thread
    "ops_per_thread": 100000000,
    "set_options_one_in": 10000,
    "test_batches_snapshots": 1,
}

# Overrides for whitebox mode, where db_stress kills itself from the inside
# via kill_random_test (see whitebox_crash_main).
whitebox_default_params = {
    "duration": 10000,
    "log2_keys_per_lock": 10,
    "ops_per_thread": 200000,
    # Odds used to derive the kill_random_test option for each kill mode.
    "random_kill_odd": 888887,
    "test_batches_snapshots": lambda: random.randint(0, 1),
}
# Overrides for --simple runs: a single column family and a reduced,
# more deterministic option set.
simple_default_params = {
    "allow_concurrent_memtable_write": lambda: random.randint(0, 1),
    "column_families": 1,
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
    "prefixpercent": 0,
    "readpercent": 50,
    "prefix_size": -1,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
    "write_buffer_size": 32 * 1024 * 1024,
    "level_compaction_dynamic_level_bytes": False,
}

# Extra simple-mode overrides, split by test type.
blackbox_simple_default_params = {
    "open_files": -1,
    "set_options_one_in": 0,
}

whitebox_simple_default_params = {}
# Overrides for --cf_consistency runs (cross-column-family consistency test).
cf_consistency_params = {
    "disable_wal": lambda: random.randint(0, 1),
    "reopen": 0,
    "test_cf_consistency": 1,
    # use small value for write_buffer_size so that RocksDB triggers flush
    # more frequently
    "write_buffer_size": 1024 * 1024,
    "enable_pipelined_write": lambda: random.randint(0, 1),
}

# Overrides for --txn runs (transaction stress test).
txn_params = {
    "use_txn": 1,
    # Avoid lambda to set it once for the entire test
    "txn_write_policy": random.randint(0, 2),
    "unordered_write": random.randint(0, 1),
    "disable_wal": 0,
    # OpenReadOnly after checkpoint is not currently compatible with
    # WritePrepared txns
    "checkpoint_one_in": 0,
    # pipelined write is not currently compatible with WritePrepared txns
    "enable_pipelined_write": 0,
}
  183. def finalize_and_sanitize(src_params):
  184. dest_params = dict([(k, v() if callable(v) else v)
  185. for (k, v) in src_params.items()])
  186. if dest_params.get("compression_type") != "zstd" or \
  187. dest_params.get("compression_max_dict_bytes") == 0:
  188. dest_params["compression_zstd_max_train_bytes"] = 0
  189. if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
  190. dest_params["memtablerep"] = "skip_list"
  191. if dest_params["mmap_read"] == 1 or not is_direct_io_supported(
  192. dest_params["db"]):
  193. dest_params["use_direct_io_for_flush_and_compaction"] = 0
  194. dest_params["use_direct_reads"] = 0
  195. # DeleteRange is not currnetly compatible with Txns
  196. if dest_params.get("test_batches_snapshots") == 1 or \
  197. dest_params.get("use_txn") == 1:
  198. dest_params["delpercent"] += dest_params["delrangepercent"]
  199. dest_params["delrangepercent"] = 0
  200. # Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
  201. if dest_params.get("unordered_write", 0) == 1:
  202. dest_params["txn_write_policy"] = 1
  203. dest_params["allow_concurrent_memtable_write"] = 1
  204. if dest_params.get("disable_wal", 0) == 1:
  205. dest_params["atomic_flush"] = 1
  206. dest_params["sync"] = 0
  207. if dest_params.get("open_files", 1) != -1:
  208. # Compaction TTL and periodic compactions are only compatible
  209. # with open_files = -1
  210. dest_params["compaction_ttl"] = 0
  211. dest_params["periodic_compaction_seconds"] = 0
  212. if dest_params.get("compaction_style", 0) == 2:
  213. # Disable compaction TTL in FIFO compaction, because right
  214. # now assertion failures are triggered.
  215. dest_params["compaction_ttl"] = 0
  216. dest_params["periodic_compaction_seconds"] = 0
  217. if dest_params["partition_filters"] == 1:
  218. if dest_params["index_type"] != 2:
  219. dest_params["partition_filters"] = 0
  220. else:
  221. dest_params["use_block_based_filter"] = 0
  222. if dest_params.get("atomic_flush", 0) == 1:
  223. # disable pipelined write when atomic flush is used.
  224. dest_params["enable_pipelined_write"] = 0
  225. return dest_params
  226. def gen_cmd_params(args):
  227. params = {}
  228. params.update(default_params)
  229. if args.test_type == 'blackbox':
  230. params.update(blackbox_default_params)
  231. if args.test_type == 'whitebox':
  232. params.update(whitebox_default_params)
  233. if args.simple:
  234. params.update(simple_default_params)
  235. if args.test_type == 'blackbox':
  236. params.update(blackbox_simple_default_params)
  237. if args.test_type == 'whitebox':
  238. params.update(whitebox_simple_default_params)
  239. if args.cf_consistency:
  240. params.update(cf_consistency_params)
  241. if args.txn:
  242. params.update(txn_params)
  243. for k, v in vars(args).items():
  244. if v is not None:
  245. params[k] = v
  246. return params
  247. def gen_cmd(params, unknown_params):
  248. finalzied_params = finalize_and_sanitize(params)
  249. cmd = ['./db_stress'] + [
  250. '--{0}={1}'.format(k, v)
  251. for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)]
  252. if k not in set(['test_type', 'simple', 'duration', 'interval',
  253. 'random_kill_odd', 'cf_consistency', 'txn'])
  254. and v is not None] + unknown_params
  255. return cmd
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
    """Repeatedly run db_stress for `interval` seconds, kill it, and scan its
    stderr for errors, until `duration` seconds have elapsed.

    Exits the process with status 2 if any run emits a non-WARNING stderr
    line. The database directory is removed only on overall success.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('blackbox')
    exit_time = time.time() + cmd_params['duration']

    print("Running blackbox-crash-test with \n"
          + "interval_between_crash=" + str(cmd_params['interval']) + "\n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    while time.time() < exit_time:
        run_had_errors = False
        killtime = time.time() + cmd_params['interval']

        # Python 2 idiom: dict(items + items) merges, with 'db' winning.
        cmd = gen_cmd(dict(
            cmd_params.items() +
            {'db': dbname}.items()), unknown_args)

        child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
        print("Running db_stress with pid=%d: %s\n\n"
              % (child.pid, ' '.join(cmd)))

        # Poll once per second until it's time to kill this instance.
        stop_early = False
        while time.time() < killtime:
            if child.poll() is not None:
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
                stop_early = True
                break
            time.sleep(1)

        if not stop_early:
            if child.poll() is not None:
                # Exited on its own in the window between the last poll and
                # now; don't kill, just report.
                print("WARNING: db_stress ended before kill: exitcode=%d\n"
                      % child.returncode)
            else:
                child.kill()
                print("KILLED %d\n" % child.pid)
                time.sleep(1)  # time to stabilize after a kill

        # Any stderr line not prefixed with WARNING counts as a failure.
        while True:
            line = child.stderr.readline().strip()
            if line == '':
                break
            elif not line.startswith('WARNING'):
                run_had_errors = True
                print('stderr has error message:')
                print('***' + line + '***')

        if run_had_errors:
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args, unknown_args):
    """Run db_stress repeatedly with self-kill (kill_random_test) enabled.

    For the first half of `duration` only check_mode 0 (kill testing, cycling
    through three kill modes) runs; in the second half the four check modes
    rotate: 0 = kill test, 1 = universal compaction, 2 = FIFO compaction,
    3 = plain run. Exits with status 1 on an unexpected exit code and 2 when
    'error'/'fail' appears in the output.
    """
    cmd_params = gen_cmd_params(args)
    dbname = get_dbname('whitebox')

    cur_time = time.time()
    exit_time = cur_time + cmd_params['duration']
    half_time = cur_time + cmd_params['duration'] / 2

    print("Running whitebox-crash-test with \n"
          + "total-duration=" + str(cmd_params['duration']) + "\n")

    total_check_mode = 4
    check_mode = 0
    kill_random_test = cmd_params['random_kill_odd']
    kill_mode = 0

    while time.time() < exit_time:
        if check_mode == 0:
            additional_opts = {
                # use large ops per thread since we will kill it anyway
                "ops_per_thread": 100 * cmd_params['ops_per_thread'],
            }
            # run with kill_random_test, with three modes.
            # Mode 0 covers all kill points. Mode 1 covers less kill points but
            # increases change of triggering them. Mode 2 covers even less
            # frequent kill points and further increases triggering change.
            if kill_mode == 0:
                additional_opts.update({
                    "kill_random_test": kill_random_test,
                })
            elif kill_mode == 1:
                # Lower odds (i.e. more frequent kills) when the WAL is off,
                # and blacklist the highest-frequency write kill points.
                if cmd_params.get('disable_wal', 0) == 1:
                    my_kill_odd = kill_random_test / 50 + 1
                else:
                    my_kill_odd = kill_random_test / 10 + 1
                additional_opts.update({
                    "kill_random_test": my_kill_odd,
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    + "WritableFileWriter::WriteBuffered",
                })
            elif kill_mode == 2:
                # TODO: May need to adjust random odds if kill_random_test
                # is too small.
                additional_opts.update({
                    "kill_random_test": (kill_random_test / 5000 + 1),
                    "kill_prefix_blacklist": "WritableFileWriter::Append,"
                    "WritableFileWriter::WriteBuffered,"
                    "PosixMmapFile::Allocate,WritableFileWriter::Flush",
                })
            # Run kill mode 0, 1 and 2 by turn.
            kill_mode = (kill_mode + 1) % 3
        elif check_mode == 1:
            # normal run with universal compaction mode
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
                "compaction_style": 1,
            }
        elif check_mode == 2:
            # normal run with FIFO compaction mode
            # ops_per_thread is divided by 5 because FIFO compaction
            # style is quite a bit slower on reads with lot of files
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'] / 5,
                "compaction_style": 2,
            }
        else:
            # normal run
            additional_opts = {
                "kill_random_test": None,
                "ops_per_thread": cmd_params['ops_per_thread'],
            }

        # Python 2 idiom: dict(items + items) merges with later dicts winning.
        cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()
                           + {'db': dbname}.items()), unknown_args)

        # Python 2 print statement (file targets python2 per its shebang).
        print "Running:" + ' '.join(cmd) + "\n"  # noqa: E999 T25377293 Grandfathered in

        popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        stdoutdata, stderrdata = popen.communicate()
        retncode = popen.returncode
        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
            check_mode, additional_opts['kill_random_test'], retncode))
        print msg
        print stdoutdata

        expected = False
        if additional_opts['kill_random_test'] is None and (retncode == 0):
            # we expect zero retncode if no kill option
            expected = True
        elif additional_opts['kill_random_test'] is not None and retncode <= 0:
            # When kill option is given, the test MIGHT kill itself.
            # If it does, negative retncode is expected. Otherwise 0.
            expected = True

        if not expected:
            print "TEST FAILED. See kill option and exit code above!!!\n"
            sys.exit(1)

        # Count 'error' occurrences, discounting the benign summary line.
        stdoutdata = stdoutdata.lower()
        errorcount = (stdoutdata.count('error') -
                      stdoutdata.count('got errors 0 times'))
        print "#times error occurred in output is " + str(errorcount) + "\n"

        if (errorcount > 0):
            print "TEST FAILED. Output has 'error'!!!\n"
            sys.exit(2)

        if (stdoutdata.find('fail') >= 0):
            print "TEST FAILED. Output has 'fail'!!!\n"
            sys.exit(2)

        # First half of the duration, keep doing kill test. For the next half,
        # try different modes.
        if time.time() > half_time:
            # we need to clean up after ourselves -- only do this on test
            # success
            shutil.rmtree(dbname, True)
            os.mkdir(dbname)
            # Start the next mode from a fresh DB: drop the expected-state
            # file so db_stress regenerates it.
            cmd_params.pop('expected_values_path', None)
            check_mode = (check_mode + 1) % total_check_mode

        time.sleep(1)  # time to stabilize after a kill
  416. def main():
  417. parser = argparse.ArgumentParser(description="This script runs and kills \
  418. db_stress multiple times")
  419. parser.add_argument("test_type", choices=["blackbox", "whitebox"])
  420. parser.add_argument("--simple", action="store_true")
  421. parser.add_argument("--cf_consistency", action='store_true')
  422. parser.add_argument("--txn", action='store_true')
  423. all_params = dict(default_params.items()
  424. + blackbox_default_params.items()
  425. + whitebox_default_params.items()
  426. + simple_default_params.items()
  427. + blackbox_simple_default_params.items()
  428. + whitebox_simple_default_params.items())
  429. for k, v in all_params.items():
  430. parser.add_argument("--" + k, type=type(v() if callable(v) else v))
  431. # unknown_args are passed directly to db_stress
  432. args, unknown_args = parser.parse_known_args()
  433. test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
  434. if test_tmpdir is not None and not os.path.isdir(test_tmpdir):
  435. print('%s env var is set to a non-existent directory: %s' %
  436. (_TEST_DIR_ENV_VAR, test_tmpdir))
  437. sys.exit(1)
  438. if args.test_type == 'blackbox':
  439. blackbox_crash_main(args, unknown_args)
  440. if args.test_type == 'whitebox':
  441. whitebox_crash_main(args, unknown_args)
  442. if __name__ == '__main__':
  443. main()