db_write_buffer_manager_test.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_test_util.h"
#include "db/write_thread.h"
#include "port/stack_trace.h"

namespace ROCKSDB_NAMESPACE {

class DBWriteBufferManagerTest : public DBTestBase,
                                 public testing::WithParamInterface<bool> {
 public:
  DBWriteBufferManagerTest()
      : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
  bool cost_cache_;
};
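
// Test that a single DB with multiple column families sharing one
// WriteBufferManager can keep writing after buffer_size_ has been exceeded:
// the final write must go through and end any stall that is in effect.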
TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();
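  // The test parameter toggles charging memtable memory to the block cache:
  // WriteBufferManager's second constructor argument is a cache to charge
  // memory reservations against (nullptr disables charging), and the third
  // (true here) allows the manager to stall writes once buffer_size_ is
  // exceeded.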
  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(3));
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(0));

  // Write to "Default", "cf2" and "cf3".
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write completed.
  // This makes sure the write will go through and, if a stall was in effect,
  // it will end.
  ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
}

// Test that a single DB with multiple writer threads gets blocked when the
// WriteBufferManager exceeds buffer_size_ and a flush is waiting to be
// finished.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(3));
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(0));

  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write completed.

  std::unordered_set<WriteThread::Writer*> w_set;
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  int num_writers = 4;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::atomic<int> thread_num(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        InstrumentedMutexLock lock(&mutex);
        wait_count_db++;
        cv.SignalAll();
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        InstrumentedMutexLock lock(&mutex);
        WriteThread::Writer* w = static_cast<WriteThread::Writer*>(arg);
        w_set.insert(w);
        // Allow the flush to continue if all writer threads are blocked.
        if (w_set.size() == (unsigned long)num_writers) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s = true;
  std::function<void(int)> writer = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    Status tmp = Put(cf, Slice(key), DummyString(1), wo);
    InstrumentedMutexLock lock(&mutex);
    s = s && tmp.ok();
  };

  // Flow:
  // The main_writer thread will write but will be blocked (the flush is on
  // hold and buffer_size_ has been exceeded, so a stall is in effect).
  // |
  // |
  // Multiple writer threads will be created to write across multiple column
  // families, and they will be blocked.
  // |
  // |
  // The last writer thread will write and, once it is blocked, will signal
  // the flush to continue and clear the stall.
  threads.emplace_back(writer, 1);
  // Wait until the first thread (main_writer) writing to the DB is blocked,
  // then create the multiple writers, which will be blocked from getting
  // added to the queue because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }
  for (int i = 0; i < num_writers; i++) {
    threads.emplace_back(writer, i % 4);
  }
  for (auto& t : threads) {
    t.join();
  }

  ASSERT_TRUE(s);
  // Number of DBs blocked.
  ASSERT_EQ(wait_count_db, 1);
  // Number of writer threads blocked.
  ASSERT_EQ(w_set.size(), num_writers);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test that multiple DBs get blocked when the WriteBufferManager limit is
// exceeded and a flush is waiting to be finished while the DBs keep trying
// to write.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 3;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
  // WriteBufferManager limit exceeded.
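  // (Roughly 3 x 20000 bytes across the shared DBs plus 30000 bytes in db_,
  // together with per-arena-block allocation overhead, is enough to push
  // total memtable memory past the 100000-byte buffer_size_.)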
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          cv.Signal();
          // Once the last DB is blocked, signal the flush to continue.
          if (wait_count_db == num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s = true;
  // Write to a DB.
  std::function<void(DB*)> write_db = [&](DB* db) {
    Status tmp = db->Put(wo, Key(3), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s = s && tmp.ok();
  };

  // Flow:
  // db_ will write and will be blocked (the flush is on hold, so a stall is
  // in effect).
  // |
  // Multiple writers, one per DB, will be created to write and they will be
  // blocked.
  // |
  // |
  // The last writer will write and, once it is blocked, will signal the
  // flush to continue and clear the stall.
  threads.emplace_back(write_db, db_);
  // Wait until the first DB is blocked, then create the multiple writers for
  // the different DBs, which will be blocked from getting added to the queue
  // because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }
  for (int i = 0; i < num_dbs; i++) {
    threads.emplace_back(write_db, dbs[i]);
  }
  for (auto& t : threads) {
    t.join();
  }
  ASSERT_TRUE(s);
  ASSERT_EQ(num_dbs + 1, wait_count_db);

  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test that multiple threads writing across multiple DBs and multiple column
// families get blocked when a stall by the WriteBufferManager is in effect.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 3;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write to db_ completed.

  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::unordered_set<WriteThread::Writer*> w_set;
  std::vector<port::Thread> writer_threads;
  std::atomic<int> thread_num(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          thread_num.fetch_add(1);
          cv.Signal();
          // Allow the flush to continue if all writer threads are blocked.
          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        WriteThread::Writer* w = static_cast<WriteThread::Writer*>(arg);
        {
          InstrumentedMutexLock lock(&mutex);
          w_set.insert(w);
          thread_num.fetch_add(1);
          // Allow the flush to continue if all writer threads are blocked.
          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
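  // Both callbacks above bump thread_num: the (num_dbs + 1) DBs blocked at
  // WBMStallInterface::BlockDB plus the num_dbs writers to db_'s column
  // families blocked at WriteThread::WriteStall::Wait add up to
  // 2 * num_dbs + 1, at which point the flush is released.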
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;
  // Write to multiple column families of db_.
  std::function<void(int)> write_cf = [&](int cf) {
    Status tmp = Put(cf, Key(3), DummyString(1), wo);
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };
  // Write to multiple DBs.
  std::function<void(DB*)> write_db = [&](DB* db) {
    Status tmp = db->Put(wo, Key(3), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s2 = s2 && tmp.ok();
  };

  // Flow:
  // A thread will write to db_ and will be blocked (the flush is on hold and
  // buffer_size_ has been exceeded, so a stall is in effect).
  // |
  // |
  // Multiple writer threads writing to the different DBs and to db_ across
  // multiple column families will be created, and they will be blocked due
  // to the stall.
  // |
  // |
  // The last writer thread will write and, once it is blocked, will signal
  // the flush to continue and clear the stall.
  threads.emplace_back(write_db, db_);
  // Wait until the first thread is blocked, then create the multiple writer
  // threads.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_dbs; i++) {
    // Write to multiple column families of db_.
    writer_threads.emplace_back(write_cf, i % 3);
    // Write to the different DBs.
    threads.emplace_back(write_db, dbs[i]);
  }
  for (auto& t : threads) {
    t.join();
  }
  for (auto& t : writer_threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);
  // Number of DBs blocked.
  ASSERT_EQ(num_dbs + 1, wait_count_db);
  // Number of writer threads blocked.
  ASSERT_EQ(w_set.size(), num_dbs);

  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test multiple threads writing across multiple column families of db_ with
// different values of WriteOptions.no_slowdown.
TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(3));
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(0));

  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write to db_ completed.

  std::unordered_set<WriteThread::Writer*> w_slowdown_set;
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  int num_writers = 4;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::atomic<int> thread_num(0);
  std::atomic<int> w_no_slowdown(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          cv.SignalAll();
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        {
          InstrumentedMutexLock lock(&mutex);
          WriteThread::Writer* w = static_cast<WriteThread::Writer*>(arg);
          w_slowdown_set.insert(w);
          // Allow the flush to continue if all writer threads are blocked.
          if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load(
                                          std::memory_order_relaxed) ==
              (unsigned long)num_writers) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;
  std::function<void(int)> write_slow_down = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = false;
    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };
  std::function<void(int)> write_no_slow_down = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = true;
    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
    {
      InstrumentedMutexLock lock(&mutex);
      s2 = s2 && !tmp.ok();
      w_no_slowdown.fetch_add(1);
      // Allow the flush to continue if all writer threads are blocked.
      if (w_slowdown_set.size() +
              (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) ==
          (unsigned long)num_writers) {
        TEST_SYNC_POINT(
            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
      }
    }
  };
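  // Writers with no_slowdown = true never reach
  // WriteThread::WriteStall::Wait: during a stall their Put() fails fast
  // with an Incomplete status. That is why write_no_slow_down counts itself
  // here and re-checks the wake-up condition instead of relying on the
  // sync-point callback.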
  // Flow:
  // The main_writer thread will write but will be blocked (the flush is on
  // hold and buffer_size_ has been exceeded, so a stall is in effect).
  // |
  // |
  // Multiple writer threads will be created to write across multiple column
  // families with different values of WriteOptions.no_slowdown. Some of them
  // will be blocked and some will return with an Incomplete status.
  // |
  // |
  // The last writer thread will write and, once it is blocked or returns, it
  // will signal the flush to continue and clear the stall.
  threads.emplace_back(write_slow_down, 1);
  // Wait until the first thread (main_writer) writing to the DB is blocked,
  // then create the multiple writers, which will be blocked from getting
  // added to the queue because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_writers; i += 2) {
    threads.emplace_back(write_no_slow_down, (i) % 4);
    threads.emplace_back(write_slow_down, (i + 1) % 4);
  }
  for (auto& t : threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);
  // Number of DBs blocked.
  ASSERT_EQ(wait_count_db, 1);
  // Number of writer threads blocked.
  ASSERT_EQ(w_slowdown_set.size(), num_writers / 2);
  // Number of writer threads with WriteOptions.no_slowdown = true.
  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test multiple threads writing to db_ and to different DBs with different
// values of WriteOptions.no_slowdown.
TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 4;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write to db_ completed.

  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::unordered_set<WriteThread::Writer*> w_slowdown_set;
  std::vector<port::Thread> writer_threads;
  std::atomic<int> thread_num(0);
  std::atomic<int> w_no_slowdown(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        InstrumentedMutexLock lock(&mutex);
        wait_count_db++;
        cv.Signal();
        // Allow the flush to continue if all writer threads are blocked.
        if (w_slowdown_set.size() +
                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                                wait_count_db) ==
            (unsigned long)(2 * num_dbs + 1)) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        WriteThread::Writer* w = static_cast<WriteThread::Writer*>(arg);
        InstrumentedMutexLock lock(&mutex);
        w_slowdown_set.insert(w);
        // Allow the flush to continue if all writer threads are blocked.
        if (w_slowdown_set.size() +
                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                                wait_count_db) ==
            (unsigned long)(2 * num_dbs + 1)) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;
  std::function<void(DB*)> write_slow_down = [&](DB* db) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = false;
    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };
  std::function<void(DB*)> write_no_slow_down = [&](DB* db) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = true;
    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
    {
      InstrumentedMutexLock lock(&mutex);
      s2 = s2 && !tmp.ok();
      w_no_slowdown.fetch_add(1);
      if (w_slowdown_set.size() +
              (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                              wait_count_db) ==
          (unsigned long)(2 * num_dbs + 1)) {
        TEST_SYNC_POINT(
            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
      }
    }
  };
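  // Wake-up accounting for this test: (num_dbs / 2 + 1) DBs block at
  // WBMStallInterface::BlockDB, num_dbs / 2 writers to db_ block at
  // WriteThread::WriteStall::Wait, and num_dbs no_slowdown writers fail
  // fast, for a total of 2 * num_dbs + 1 before the flush is released.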
  // Flow:
  // The first thread will write but will be blocked (the flush is on hold
  // and buffer_size_ has been exceeded, so a stall is in effect).
  // |
  // |
  // Multiple writer threads will be created to write to db_ and to the
  // different DBs with different values of WriteOptions.no_slowdown. Some of
  // them will be blocked and some will return with an Incomplete status.
  // |
  // |
  // The last writer thread will write and, once it is blocked or returns, it
  // will signal the flush to continue and clear the stall.
  threads.emplace_back(write_slow_down, db_);
  // Wait until the first thread writing to the DB is blocked, then create
  // the multiple writers.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_dbs; i += 2) {
    // Additional writers to db_.
    writer_threads.emplace_back(write_slow_down, db_);
    writer_threads.emplace_back(write_no_slow_down, db_);
    // Write to different DBs.
    threads.emplace_back(write_slow_down, dbs[i]);
    threads.emplace_back(write_no_slow_down, dbs[i + 1]);
  }

  for (auto& t : threads) {
    t.join();
  }
  for (auto& t : writer_threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);
  // Number of DBs blocked.
  ASSERT_EQ((num_dbs / 2) + 1, wait_count_db);
  // Number of writer threads writing to db_ blocked from getting added to
  // the queue.
  ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2);
  // Number of threads with WriteOptions.no_slowdown = true.
  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs);

  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Tests that a `WriteBufferManager` constructed with `allow_stall == false`
// does not thrash memtable switching when full and a CF receives multiple
// writes. Instead, we expect to switch a CF's memtable for flush only when
// that CF does not have any pending or running flush.
//
// This test uses multiple DBs each with a single CF instead of a single DB
// with multiple CFs. That way we can control which CF is considered for
// switch by writing to that CF's DB.
//
// Not supported in LITE mode due to `GetProperty()` being unavailable.
TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
  Options options = CurrentOptions();
  options.arena_block_size = 4 << 10;   // 4KB
  options.write_buffer_size = 1 << 20;  // 1MB
  std::shared_ptr<Cache> cache =
      NewLRUCache(4 << 20 /* capacity (4MB) */, 2 /* num_shard_bits */);
  ASSERT_LT(cache->GetUsage(), 256 << 10 /* 256KB */);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(new WriteBufferManager(
        512 << 10 /* buffer_size (512KB) */, cache, false /* allow_stall */));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(512 << 10 /* buffer_size (512KB) */,
                               nullptr /* cache */, false /* allow_stall */));
  }
  Reopen(options);

  std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
  DB* shared_wbm_db = nullptr;
  ASSERT_OK(DestroyDB(dbname, options));
  ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));

  // The last write will make the WBM need a flush, but it won't flush yet.
  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
  ASSERT_FALSE(options.write_buffer_manager->ShouldFlush());
  ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
  ASSERT_TRUE(options.write_buffer_manager->ShouldFlush());

  // Flushes will be pending, not running, because flush threads are blocked.
  test::SleepingBackgroundTask sleeping_task_high;
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                 &sleeping_task_high, Env::Priority::HIGH);
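  // The sleeping task occupies the high-priority background pool that runs
  // flushes, so any flush scheduled below stays pending rather than running.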

  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(
        shared_wbm_db->Put(WriteOptions(), Key(1), DummyString(1 /* len */)));
    std::string prop;
    ASSERT_TRUE(
        shared_wbm_db->GetProperty("rocksdb.num-immutable-mem-table", &prop));
    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
    ASSERT_TRUE(
        shared_wbm_db->GetProperty("rocksdb.mem-table-flush-pending", &prop));
    ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
  }
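  // Iteration 0 populates shared_wbm_db's empty memtable; iteration 1 sees
  // the full write buffer and switches it (one immutable memtable, one
  // pending flush); iteration 2 must not switch again because a flush is
  // already pending. That is the no-thrash behavior under test.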

  // Clean up DBs.
  sleeping_task_high.WakeUp();
  sleeping_task_high.WaitUntilDone();
  ASSERT_OK(shared_wbm_db->Close());
  ASSERT_OK(DestroyDB(dbname, options));
  delete shared_wbm_db;
}

TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) {
  constexpr int kBigValue = 10000;

  Options options = CurrentOptions();
  options.write_buffer_manager.reset(
      new WriteBufferManager(1, nullptr /* cache */, true /* allow_stall */));
  DestroyAndReopen(options);

  // Pause the flush thread so that
  // (a) the only way to exit the write stall below is to change
  //     `allow_stall`
  // (b) the write stall is "stable", without being interfered with by
  //     flushes, so that we can check it without flakiness
  std::unique_ptr<test::SleepingBackgroundTask> sleeping_task(
      new test::SleepingBackgroundTask());
  env_->SetBackgroundThreads(1, Env::HIGH);
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                 sleeping_task.get(), Env::Priority::HIGH);
  sleeping_task->WaitUntilSleeping();

  // Test 1: test setting `allow_stall` from true to false
  //
  // Assert existence of a write stall
  WriteOptions wo_no_slowdown;
  wo_no_slowdown.no_slowdown = true;
  Status s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown);
  ASSERT_TRUE(s.IsIncomplete());
  ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"WBMStallInterface::BlockDB",
        "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::"
        "ChangeParameter"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Test `SetAllowStall()`
  port::Thread thread1(
      [&] { ASSERT_OK(Put(Key(0), DummyString(kBigValue))); });
  port::Thread thread2([&] {
    TEST_SYNC_POINT(
        "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::"
        "ChangeParameter");
    options.write_buffer_manager->SetAllowStall(false);
  });

  // Verify `allow_stall` is successfully set to false in thread2.
  // Otherwise, thread1's write will be stalled and this test will hang
  // forever.
  thread1.join();
  thread2.join();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  // Test 2: test setting `allow_stall` from false to true
  //
  // Assert no write stall
  ASSERT_OK(Put(Key(0), DummyString(kBigValue), wo_no_slowdown));

  // Test `SetAllowStall()`
  options.write_buffer_manager->SetAllowStall(true);

  // Verify `allow_stall` is successfully set to true.
  // Otherwise the following write would not be stalled and would therefore
  // succeed.
  s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown);
  ASSERT_TRUE(s.IsIncomplete());
  ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos);

  sleeping_task->WakeUp();
}

INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
                        testing::Bool());

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}