// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "util/rate_limiter.h"

#include "monitoring/statistics.h"
#include "port/port.h"
#include "rocksdb/env.h"
#include "test_util/sync_point.h"
#include "util/aligned_buffer.h"

namespace ROCKSDB_NAMESPACE {

size_t RateLimiter::RequestToken(size_t bytes, size_t alignment,
                                 Env::IOPriority io_priority, Statistics* stats,
                                 RateLimiter::OpType op_type) {
  if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) {
    bytes = std::min(bytes, static_cast<size_t>(GetSingleBurstBytes()));

    if (alignment > 0) {
      // Here we may actually require more than burst and block, but we
      // cannot write less than one page at a time on direct I/O, so it may
      // be better not to use the rate limiter at all in that case.
      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
    }
    Request(bytes, io_priority, stats, op_type);
  }
  return bytes;
}
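
// Illustrative arithmetic for the alignment handling above: with
// alignment = 4096, a 5000-byte request is truncated down to one full page
// (4096 bytes), while a 1000-byte request is rounded up to 4096, since
// direct I/O cannot transfer less than a page.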

// Pending request
struct GenericRateLimiter::Req {
  explicit Req(int64_t _bytes, port::Mutex* _mu)
      : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {}
  int64_t request_bytes;
  int64_t bytes;
  port::CondVar cv;
  bool granted;
};
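
// Note: request_bytes is the not-yet-satisfied remainder of the request
// (Refill() decrements it on partial grants), while bytes preserves the
// original size for the total_bytes_through_ accounting.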

GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
                                       int64_t refill_period_us,
                                       int32_t fairness, RateLimiter::Mode mode,
                                       Env* env, bool auto_tuned)
    : RateLimiter(mode),
      refill_period_us_(refill_period_us),
      rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2
                                     : rate_bytes_per_sec),
      refill_bytes_per_period_(
          CalculateRefillBytesPerPeriod(rate_bytes_per_sec_)),
      env_(env),
      stop_(false),
      exit_cv_(&request_mutex_),
      requests_to_wait_(0),
      available_bytes_(0),
      next_refill_us_(NowMicrosMonotonic(env_)),
      fairness_(fairness > 100 ? 100 : fairness),
      rnd_((uint32_t)time(nullptr)),
      leader_(nullptr),
      auto_tuned_(auto_tuned),
      num_drains_(0),
      prev_num_drains_(0),
      max_bytes_per_sec_(rate_bytes_per_sec),
      tuned_time_(NowMicrosMonotonic(env_)) {
  total_requests_[0] = 0;
  total_requests_[1] = 0;
  total_bytes_through_[0] = 0;
  total_bytes_through_[1] = 0;
}
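
// Note: in auto-tuned mode the limiter starts at half the configured rate
// (see the rate_bytes_per_sec_ initializer above), which gives Tune()
// headroom in both directions; max_bytes_per_sec_ retains the full
// configured value as the upper bound.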

GenericRateLimiter::~GenericRateLimiter() {
  MutexLock g(&request_mutex_);
  stop_ = true;
  requests_to_wait_ = static_cast<int32_t>(queue_[Env::IO_LOW].size() +
                                           queue_[Env::IO_HIGH].size());
  for (auto& r : queue_[Env::IO_HIGH]) {
    r->cv.Signal();
  }
  for (auto& r : queue_[Env::IO_LOW]) {
    r->cv.Signal();
  }
  while (requests_to_wait_ > 0) {
    exit_cv_.Wait();
  }
}
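
// Note: the destructor wakes every queued request; each waiter observes
// stop_ in Request(), decrements requests_to_wait_, and signals exit_cv_,
// so destruction blocks until both queues drain.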

// This API allows the user to dynamically change the rate limiter's bytes
// per second.
void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) {
  assert(bytes_per_second > 0);
  rate_bytes_per_sec_ = bytes_per_second;
  refill_bytes_per_period_.store(
      CalculateRefillBytesPerPeriod(bytes_per_second),
      std::memory_order_relaxed);
}
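
// Note: refill_bytes_per_period_ is an atomic updated with relaxed
// ordering, presumably because Request() reads it in its initial assert
// before acquiring request_mutex_.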

void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
                                 Statistics* stats) {
  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
  TEST_SYNC_POINT("GenericRateLimiter::Request");
  TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1",
                           &rate_bytes_per_sec_);
  MutexLock g(&request_mutex_);

  if (auto_tuned_) {
    static const int kRefillsPerTune = 100;
    std::chrono::microseconds now(NowMicrosMonotonic(env_));
    if (now - tuned_time_ >=
        kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) {
      Tune();
    }
  }

  if (stop_) {
    return;
  }

  ++total_requests_[pri];

  if (available_bytes_ >= bytes) {
    // Refill thread assigns quota and notifies requests waiting on
    // the queue under mutex. So if we get here, there is quota left over
    // from the last refill and the request can be satisfied immediately.
    available_bytes_ -= bytes;
    total_bytes_through_[pri] += bytes;
    return;
  }

  // Request cannot be satisfied at this moment, enqueue
  Req r(bytes, &request_mutex_);
  queue_[pri].push_back(&r);
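
  // Each enqueued request loops below until its quota is granted. At most
  // one waiter at a time (the "leader") does a timed wait that expires at
  // the next refill deadline; every other waiter blocks until it is
  // signaled, either with a grant or as the next leader candidate.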
  do {
    bool timedout = false;
    // Leader election, candidates can be:
    // (1) a new incoming request,
    // (2) a previous leader, whose quota has not been assigned yet due
    //     to lower priority,
    // (3) a previous waiter at the front of queue, who got notified by
    //     previous leader
    if (leader_ == nullptr &&
        ((!queue_[Env::IO_HIGH].empty() &&
          &r == queue_[Env::IO_HIGH].front()) ||
         (!queue_[Env::IO_LOW].empty() &&
          &r == queue_[Env::IO_LOW].front()))) {
      leader_ = &r;
      int64_t delta = next_refill_us_ - NowMicrosMonotonic(env_);
      delta = delta > 0 ? delta : 0;
      if (delta == 0) {
        timedout = true;
      } else {
        int64_t wait_until = env_->NowMicros() + delta;
        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
        ++num_drains_;
        timedout = r.cv.TimedWait(wait_until);
      }
    } else {
      // Not at the front of its queue, or a leader has already been elected
      r.cv.Wait();
    }

    // request_mutex_ is held from now on
    if (stop_) {
      --requests_to_wait_;
      exit_cv_.Signal();
      return;
    }

    // Make sure a woken-up request is always at the front of its queue
    assert(r.granted ||
           (!queue_[Env::IO_HIGH].empty() &&
            &r == queue_[Env::IO_HIGH].front()) ||
           (!queue_[Env::IO_LOW].empty() &&
            &r == queue_[Env::IO_LOW].front()));
    assert(leader_ == nullptr ||
           (!queue_[Env::IO_HIGH].empty() &&
            leader_ == queue_[Env::IO_HIGH].front()) ||
           (!queue_[Env::IO_LOW].empty() &&
            leader_ == queue_[Env::IO_LOW].front()));

    if (leader_ == &r) {
      // Woken up from TimedWait()
      if (timedout) {
        // Time to do refill!
        Refill();

        // Re-elect a new leader regardless. This is to simplify the
        // election handling.
        leader_ = nullptr;

        // Notify the front of the queue if the current leader is going away
        if (r.granted) {
          // The current leader has already been granted its quota. Notify
          // the front of the waiting queue so it can participate in the
          // next round of election.
          assert((queue_[Env::IO_HIGH].empty() ||
                  &r != queue_[Env::IO_HIGH].front()) &&
                 (queue_[Env::IO_LOW].empty() ||
                  &r != queue_[Env::IO_LOW].front()));
          if (!queue_[Env::IO_HIGH].empty()) {
            queue_[Env::IO_HIGH].front()->cv.Signal();
          } else if (!queue_[Env::IO_LOW].empty()) {
            queue_[Env::IO_LOW].front()->cv.Signal();
          }
          // Done
          break;
        }
      } else {
        // Spontaneous wake up, need to continue to wait
        assert(!r.granted);
        leader_ = nullptr;
      }
    } else {
      // Woken up by the previous leader:
      // (1) if the requested quota was granted, it is done;
      // (2) if the requested quota was not granted, this thread was picked
      //     as a new leader candidate (the previous leader got its quota).
      //     It still needs to participate in leader election, because a new
      //     request may come in before this thread wakes up, so it may
      //     actually need to Wait() again.
      assert(!timedout);
    }
  } while (!r.granted);
}

void GenericRateLimiter::Refill() {
  TEST_SYNC_POINT("GenericRateLimiter::Refill");
  next_refill_us_ = NowMicrosMonotonic(env_) + refill_period_us_;
  // Carry over the leftover quota from the last period
  auto refill_bytes_per_period =
      refill_bytes_per_period_.load(std::memory_order_relaxed);
  if (available_bytes_ < refill_bytes_per_period) {
    available_bytes_ += refill_bytes_per_period;
  }
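
  // Fairness: with probability 1 / fairness_ the low-priority queue is
  // served first in this round, so a steady stream of high-priority
  // requests cannot starve low-priority ones. With the default fairness of
  // 10 (see NewGenericRateLimiter below), low priority goes first roughly
  // 10% of the time.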
  int use_low_pri_first = rnd_.OneIn(fairness_) ? 0 : 1;
  for (int q = 0; q < 2; ++q) {
    auto use_pri = (use_low_pri_first == q) ? Env::IO_LOW : Env::IO_HIGH;
    auto* queue = &queue_[use_pri];
    while (!queue->empty()) {
      auto* next_req = queue->front();
      if (available_bytes_ < next_req->request_bytes) {
        // Grant a partial quota and exhaust the available bytes so the
        // front request keeps making progress and is not starved.
        next_req->request_bytes -= available_bytes_;
        available_bytes_ = 0;
        break;
      }
      available_bytes_ -= next_req->request_bytes;
      next_req->request_bytes = 0;
      total_bytes_through_[use_pri] += next_req->bytes;
      queue->pop_front();

      next_req->granted = true;
      if (next_req != leader_) {
        // Quota granted, signal the thread
        next_req->cv.Signal();
      }
    }
  }
}
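
// Illustrative arithmetic for the conversion below: at 10 MB/s
// (rate_bytes_per_sec = 10'000'000) with the default 100 ms refill period
// (refill_period_us_ = 100'000), each period is credited
// 10'000'000 * 100'000 / 1'000'000 = 1'000'000 bytes.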
int64_t GenericRateLimiter::CalculateRefillBytesPerPeriod(
    int64_t rate_bytes_per_sec) {
  if (port::kMaxInt64 / rate_bytes_per_sec < refill_period_us_) {
    // Avoid unexpected result in the overflow case. The result now is still
    // inaccurate but is a number that is large enough.
    return port::kMaxInt64 / 1000000;
  } else {
    return std::max(kMinRefillBytesPerPeriod,
                    rate_bytes_per_sec * refill_period_us_ / 1000000);
  }
}
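
// Auto-tuning feedback loop: Tune() compares the fraction of refill
// intervals in which some request had to wait for a refill ("drained"
// intervals, counted via num_drains_) against low/high watermarks.
// Illustrative behavior with the constants below: 0% drained drops the rate
// to the floor max_bytes_per_sec_ / 20; under 50% cuts the rate by ~5%;
// over 90% raises it by ~5%, capped at max_bytes_per_sec_; anything in
// between leaves the rate unchanged.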
Status GenericRateLimiter::Tune() {
  const int kLowWatermarkPct = 50;
  const int kHighWatermarkPct = 90;
  const int kAdjustFactorPct = 5;
  // computed rate limit will be in
  // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`.
  const int kAllowedRangeFactor = 20;

  std::chrono::microseconds prev_tuned_time = tuned_time_;
  tuned_time_ = std::chrono::microseconds(NowMicrosMonotonic(env_));

  int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time +
                               std::chrono::microseconds(refill_period_us_) -
                               std::chrono::microseconds(1)) /
                              std::chrono::microseconds(refill_period_us_);
  // We tune every kRefillsPerTune intervals, so the overflow and division-by-
  // zero conditions should never happen.
  assert(num_drains_ - prev_num_drains_ <= port::kMaxInt64 / 100);
  assert(elapsed_intervals > 0);
  int64_t drained_pct =
      (num_drains_ - prev_num_drains_) * 100 / elapsed_intervals;

  int64_t prev_bytes_per_sec = GetBytesPerSecond();
  int64_t new_bytes_per_sec;
  if (drained_pct == 0) {
    new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor;
  } else if (drained_pct < kLowWatermarkPct) {
    // sanitize to prevent overflow
    int64_t sanitized_prev_bytes_per_sec =
        std::min(prev_bytes_per_sec, port::kMaxInt64 / 100);
    new_bytes_per_sec =
        std::max(max_bytes_per_sec_ / kAllowedRangeFactor,
                 sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct));
  } else if (drained_pct > kHighWatermarkPct) {
    // sanitize to prevent overflow
    int64_t sanitized_prev_bytes_per_sec = std::min(
        prev_bytes_per_sec, port::kMaxInt64 / (100 + kAdjustFactorPct));
    new_bytes_per_sec =
        std::min(max_bytes_per_sec_,
                 sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100);
  } else {
    new_bytes_per_sec = prev_bytes_per_sec;
  }
  if (new_bytes_per_sec != prev_bytes_per_sec) {
    SetBytesPerSecond(new_bytes_per_sec);
  }
  num_drains_ = prev_num_drains_;
  return Status::OK();
}
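
// Typical usage (illustrative sketch; `options` is assumed to be a
// rocksdb::Options/DBOptions instance): attach one limiter so flushes and
// compactions share a single write budget, e.g.
//   options.rate_limiter.reset(
//       NewGenericRateLimiter(10 * 1024 * 1024 /* 10 MB/s */));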
RateLimiter* NewGenericRateLimiter(
    int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */,
    int32_t fairness /* = 10 */,
    RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */,
    bool auto_tuned /* = false */) {
  assert(rate_bytes_per_sec > 0);
  assert(refill_period_us > 0);
  assert(fairness > 0);
  return new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness,
                                mode, Env::Default(), auto_tuned);
}

}  // namespace ROCKSDB_NAMESPACE