writable_file_writer.cc 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #include "file/writable_file_writer.h"
  10. #include <algorithm>
  11. #include <mutex>
  12. #include "db/version_edit.h"
  13. #include "file/file_util.h"
  14. #include "monitoring/histogram.h"
  15. #include "monitoring/iostats_context_imp.h"
  16. #include "port/port.h"
  17. #include "rocksdb/io_status.h"
  18. #include "rocksdb/system_clock.h"
  19. #include "test_util/sync_point.h"
  20. #include "util/crc32c.h"
  21. #include "util/random.h"
  22. #include "util/rate_limiter_impl.h"
  23. namespace ROCKSDB_NAMESPACE {
  24. inline Histograms GetFileWriteHistograms(Histograms file_writer_hist,
  25. Env::IOActivity io_activity) {
  26. if (file_writer_hist == Histograms::SST_WRITE_MICROS ||
  27. file_writer_hist == Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS) {
  28. switch (io_activity) {
  29. case Env::IOActivity::kFlush:
  30. return Histograms::FILE_WRITE_FLUSH_MICROS;
  31. case Env::IOActivity::kCompaction:
  32. return Histograms::FILE_WRITE_COMPACTION_MICROS;
  33. case Env::IOActivity::kDBOpen:
  34. return Histograms::FILE_WRITE_DB_OPEN_MICROS;
  35. default:
  36. break;
  37. }
  38. }
  39. return Histograms::HISTOGRAM_ENUM_MAX;
  40. }
  41. IOStatus WritableFileWriter::Create(const std::shared_ptr<FileSystem>& fs,
  42. const std::string& fname,
  43. const FileOptions& file_opts,
  44. std::unique_ptr<WritableFileWriter>* writer,
  45. IODebugContext* dbg) {
  46. if (file_opts.use_direct_writes &&
  47. 0 == file_opts.writable_file_max_buffer_size) {
  48. return IOStatus::InvalidArgument(
  49. "Direct write requires writable_file_max_buffer_size > 0");
  50. }
  51. std::unique_ptr<FSWritableFile> file;
  52. IOStatus io_s = fs->NewWritableFile(fname, file_opts, &file, dbg);
  53. if (io_s.ok()) {
  54. writer->reset(new WritableFileWriter(std::move(file), fname, file_opts));
  55. }
  56. return io_s;
  57. }
// Appends `data` to the file, staging it in the internal buffer whenever
// possible. `crc32c_checksum` is an optional caller-provided checksum of
// `data` (0 means "not provided"); when usable it is combined into the
// running buffered-data checksum instead of being recomputed. Any error is
// latched via set_seen_error() so subsequent calls fail fast, and the
// logical file size (filesize_) is advanced only on success.
IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data,
                                    uint32_t crc32c_checksum) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  StopWatch sw(clock_, stats_, hist_type_,
               GetFileWriteHistograms(hist_type_, opts.io_activity));
  const IOOptions io_options = FinalizeIOOptions(opts);
  const char* src = data.data();
  size_t left = data.size();
  IOStatus s;
  // Any appended bytes require a future Sync() regardless of outcome below.
  pending_sync_ = true;
  TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2);
  // Calculate the checksum of appended data
  UpdateFileChecksum(data);
  {
    IOSTATS_TIMER_GUARD(prepare_write_nanos);
    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
    // Give the underlying file a chance to pre-allocate space.
    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
                                 io_options, nullptr);
  }
  // See whether we need to enlarge the buffer to avoid the flush
  if (buf_.Capacity() - buf_.CurrentSize() < left) {
    for (size_t cap = buf_.Capacity();
         cap < max_buffer_size_;  // There is still room to increase
         cap *= 2) {
      // See whether the next available size is large enough.
      // Buffer will never be increased to more than max_buffer_size_.
      size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
      if (desired_capacity - buf_.CurrentSize() >= left ||
          (use_direct_io() && desired_capacity == max_buffer_size_)) {
        buf_.AllocateNewBuffer(desired_capacity, true);
        break;
      }
    }
  }
  // Flush only when buffered I/O
  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
    if (buf_.CurrentSize() > 0) {
      if (!buffered_data_with_checksum_) {
        // If we're not calculating checksum of buffered data, fill the
        // buffer before flushing so that the writes are aligned. This will
        // benefit file system performance.
        size_t appended = buf_.Append(src, left);
        left -= appended;
        src += appended;
      }
      s = Flush(io_options);
      if (!s.ok()) {
        set_seen_error(s);
        return s;
      }
    }
    assert(buf_.CurrentSize() == 0);
  }
  if (perform_data_verification_ && buffered_data_with_checksum_ &&
      crc32c_checksum != 0) {
    // Since we want to use the checksum of the input data, we cannot break it
    // into several pieces. We will only write them in the buffer when buffer
    // size is enough. Otherwise, we will directly write it down.
    if (use_direct_io() || (buf_.Capacity() - buf_.CurrentSize()) >= left) {
      if ((buf_.Capacity() - buf_.CurrentSize()) >= left) {
        size_t appended = buf_.Append(src, left);
        if (appended != left) {
          s = IOStatus::Corruption("Write buffer append failure");
        }
        // Fold the caller-supplied checksum into the running buffered
        // checksum without re-reading `src`.
        buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
            buffered_data_crc32c_checksum_, crc32c_checksum, appended);
      } else {
        // Direct I/O with a too-small buffer: chunk through the buffer,
        // recomputing the checksum per chunk (the provided one no longer
        // matches any single write).
        while (left > 0) {
          size_t appended = buf_.Append(src, left);
          buffered_data_crc32c_checksum_ =
              crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
          left -= appended;
          src += appended;
          if (left > 0) {
            s = Flush(io_options);
            if (!s.ok()) {
              break;
            }
          }
        }
      }
    } else {
      assert(buf_.CurrentSize() == 0);
      buffered_data_crc32c_checksum_ = crc32c_checksum;
      s = WriteBufferedWithChecksum(io_options, src, left);
    }
  } else {
    // In this case, either we do not need to do the data verification or
    // caller does not provide the checksum of the data (crc32c_checksum = 0).
    //
    // We never write directly to disk with direct I/O on.
    // or we simply use it for its original purpose to accumulate many small
    // chunks
    if (use_direct_io() || (buf_.Capacity() >= left)) {
      while (left > 0) {
        size_t appended = buf_.Append(src, left);
        if (perform_data_verification_ && buffered_data_with_checksum_) {
          buffered_data_crc32c_checksum_ =
              crc32c::Extend(buffered_data_crc32c_checksum_, src, appended);
        }
        left -= appended;
        src += appended;
        if (left > 0) {
          s = Flush(io_options);
          if (!s.ok()) {
            break;
          }
        }
      }
    } else {
      // Writing directly to file bypassing the buffer
      assert(buf_.CurrentSize() == 0);
      if (perform_data_verification_ && buffered_data_with_checksum_) {
        buffered_data_crc32c_checksum_ = crc32c::Value(src, left);
        s = WriteBufferedWithChecksum(io_options, src, left);
      } else {
        s = WriteBuffered(io_options, src, left);
      }
    }
  }
  TEST_KILL_RANDOM("WritableFileWriter::Append:1");
  if (s.ok()) {
    // filesize_ is the logical size including bytes still sitting in buf_.
    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
    filesize_.store(cur_size + data.size(), std::memory_order_release);
  } else {
    set_seen_error(s);
  }
  return s;
}
// Appends `pad_bytes` zero bytes through the internal buffer, updating the
// file checksum (and buffered-data checksum when verification is on).
// `max_pad_size` is only consulted by the debug-build sanity assert below.
IOStatus WritableFileWriter::Pad(const IOOptions& opts, const size_t pad_bytes,
                                 const size_t max_pad_size) {
  (void)max_pad_size;  // referenced only by the assert in debug builds
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  const IOOptions io_options = FinalizeIOOptions(opts);
  assert(pad_bytes < max_pad_size);
  size_t left = pad_bytes;
  size_t cap = buf_.Capacity() - buf_.CurrentSize();
  // Assume pad_bytes is small compared to buf_ capacity. So we always
  // use buf_ rather than write directly to file in certain cases like
  // Append() does.
  while (left) {
    size_t append_bytes = std::min(cap, left);
    buf_.PadWith(append_bytes, 0);
    left -= append_bytes;
    // Feed the freshly appended zeros into the whole-file checksum.
    Slice data(buf_.BufferStart() + buf_.CurrentSize() - append_bytes,
               append_bytes);
    UpdateFileChecksum(data);
    if (perform_data_verification_) {
      buffered_data_crc32c_checksum_ = crc32c::Extend(
          buffered_data_crc32c_checksum_,
          buf_.BufferStart() + buf_.CurrentSize() - append_bytes, append_bytes);
    }
    if (left > 0) {
      // Buffer is full; flush it before padding the remainder.
      IOStatus s = Flush(io_options);
      if (!s.ok()) {
        set_seen_error(s);
        return s;
      }
    }
    cap = buf_.Capacity() - buf_.CurrentSize();
  }
  // Padding counts as unsynced data and grows the logical file size.
  pending_sync_ = true;
  uint64_t cur_size = filesize_.load(std::memory_order_acquire);
  filesize_.store(cur_size + pad_bytes, std::memory_order_release);
  return IOStatus::OK();
}
// Closes the file. If a previous error was seen, the file is closed without
// flushing and an error is returned. Otherwise the buffer is flushed, the
// file is truncated to the logical size (direct I/O writes whole pages, so
// the tail must be trimmed) and fsync'ed, then closed. The file checksum is
// finalized only if everything succeeded.
IOStatus WritableFileWriter::Close(const IOOptions& opts) {
  IOOptions io_options = FinalizeIOOptions(opts);
  if (seen_error()) {
    // Still close the handle, but report that buffered data was dropped.
    IOStatus interim;
    if (writable_file_.get() != nullptr) {
      interim = writable_file_->Close(io_options, nullptr);
      writable_file_.reset();
    }
    if (interim.ok()) {
      return IOStatus::IOError(
          "File is closed but data not flushed as writer has previous error.");
    } else {
      return interim;
    }
  }
  // Do not quit immediately on failure the file MUST be closed
  // Possible to close it twice now as we MUST close
  // in __dtor, simply flushing is not enough
  // Windows when pre-allocating does not fill with zeros
  // also with unbuffered access we also set the end of data.
  if (writable_file_.get() == nullptr) {
    return IOStatus::OK();
  }
  IOStatus s;
  s = Flush(io_options);  // flush cache to OS
  IOStatus interim;
  // In direct I/O mode we write whole pages so
  // we need to let the file know where data ends.
  if (use_direct_io()) {
    {
      FileOperationInfo::StartTimePoint start_ts;
      if (ShouldNotifyListeners()) {
        start_ts = FileOperationInfo::StartNow();
      }
      uint64_t filesz = filesize_.load(std::memory_order_acquire);
      interim = writable_file_->Truncate(filesz, io_options, nullptr);
      if (ShouldNotifyListeners()) {
        auto finish_ts = FileOperationInfo::FinishNow();
        // NOTE(review): passes `s` (the Flush status) rather than `interim`
        // (the Truncate status) to the finish listener — matches the sync and
        // close notifications below; confirm this is intentional.
        NotifyOnFileTruncateFinish(start_ts, finish_ts, s);
        if (!interim.ok()) {
          NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(),
                          filesz);
        }
      }
    }
    if (interim.ok()) {
      {
        FileOperationInfo::StartTimePoint start_ts;
        if (ShouldNotifyListeners()) {
          start_ts = FileOperationInfo::StartNow();
        }
        interim = writable_file_->Fsync(io_options, nullptr);
        if (ShouldNotifyListeners()) {
          auto finish_ts = FileOperationInfo::FinishNow();
          NotifyOnFileSyncFinish(start_ts, finish_ts, s,
                                 FileOperationType::kFsync);
          if (!interim.ok()) {
            NotifyOnIOError(interim, FileOperationType::kFsync, file_name());
          }
        }
      }
    }
    // Keep the first error: only promote `interim` if `s` is still OK.
    if (!interim.ok() && s.ok()) {
      s = interim;
    }
  }
  TEST_KILL_RANDOM("WritableFileWriter::Close:0");
  {
    FileOperationInfo::StartTimePoint start_ts;
    if (ShouldNotifyListeners()) {
      start_ts = FileOperationInfo::StartNow();
    }
    interim = writable_file_->Close(io_options, nullptr);
    if (ShouldNotifyListeners()) {
      auto finish_ts = FileOperationInfo::FinishNow();
      NotifyOnFileCloseFinish(start_ts, finish_ts, s);
      if (!interim.ok()) {
        NotifyOnIOError(interim, FileOperationType::kClose, file_name());
      }
    }
  }
  if (!interim.ok() && s.ok()) {
    s = interim;
  }
  writable_file_.reset();
  TEST_KILL_RANDOM("WritableFileWriter::Close:1");
  if (s.ok()) {
    // Finalize the checksum exactly once; GetFileChecksum() asserts on this.
    if (checksum_generator_ != nullptr && !checksum_finalized_) {
      checksum_generator_->Finalize();
      checksum_finalized_ = true;
    }
  } else {
    set_seen_error(s);
  }
  return s;
}
// write out the cached data to the OS cache or storage if direct I/O
// enabled
//
// After draining buf_ it also calls the underlying file's Flush() and, for
// buffered I/O with bytes_per_sync_ set, range-syncs older data to bound the
// amount of dirty page cache. Errors are latched via set_seen_error().
IOStatus WritableFileWriter::Flush(const IOOptions& opts) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  const IOOptions io_options = FinalizeIOOptions(opts);
  IOStatus s;
  TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2);
  if (buf_.CurrentSize() > 0) {
    if (use_direct_io()) {
      // Direct I/O only writes when there is unsynced data; WriteDirect*
      // pads to page boundaries and keeps the tail for rewriting.
      if (pending_sync_) {
        if (perform_data_verification_ && buffered_data_with_checksum_) {
          s = WriteDirectWithChecksum(io_options);
        } else {
          s = WriteDirect(io_options);
        }
      }
    } else {
      if (perform_data_verification_ && buffered_data_with_checksum_) {
        s = WriteBufferedWithChecksum(io_options, buf_.BufferStart(),
                                      buf_.CurrentSize());
      } else {
        s = WriteBuffered(io_options, buf_.BufferStart(), buf_.CurrentSize());
      }
    }
    if (!s.ok()) {
      set_seen_error(s);
      return s;
    }
  }
  {
    FileOperationInfo::StartTimePoint start_ts;
    if (ShouldNotifyListeners()) {
      start_ts = FileOperationInfo::StartNow();
    }
    s = writable_file_->Flush(io_options, nullptr);
    if (ShouldNotifyListeners()) {
      auto finish_ts = std::chrono::steady_clock::now();
      NotifyOnFileFlushFinish(start_ts, finish_ts, s);
      if (!s.ok()) {
        NotifyOnIOError(s, FileOperationType::kFlush, file_name());
      }
    }
  }
  if (!s.ok()) {
    set_seen_error(s);
    return s;
  }
  // sync OS cache to disk for every bytes_per_sync_
  // TODO: give log file and sst file different options (log
  // files could be potentially cached in OS for their whole
  // life time, thus we might not want to flush at all).
  // We try to avoid sync to the last 1MB of data. For two reasons:
  // (1) avoid rewrite the same page that is modified later.
  // (2) for older version of OS, write can block while writing out
  //     the page.
  // Xfs does neighbor page flushing outside of the specified ranges. We
  // need to make sure sync range is far from the write offset.
  if (!use_direct_io() && bytes_per_sync_) {
    const uint64_t kBytesNotSyncRange =
        1024 * 1024;  // recent 1MB is not synced.
    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
    if (cur_size > kBytesNotSyncRange) {
      uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
      assert(offset_sync_to >= last_sync_size_);
      if (offset_sync_to > 0 &&
          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
        s = RangeSync(io_options, last_sync_size_,
                      offset_sync_to - last_sync_size_);
        if (!s.ok()) {
          set_seen_error(s);
        }
        last_sync_size_ = offset_sync_to;
      }
    }
  }
  return s;
}
  405. std::string WritableFileWriter::GetFileChecksum() {
  406. if (checksum_generator_ != nullptr) {
  407. assert(checksum_finalized_);
  408. return checksum_generator_->GetChecksum();
  409. } else {
  410. return kUnknownFileChecksum;
  411. }
  412. }
  413. const char* WritableFileWriter::GetFileChecksumFuncName() const {
  414. if (checksum_generator_ != nullptr) {
  415. return checksum_generator_->Name();
  416. } else {
  417. return kUnknownFileChecksumFuncName;
  418. }
  419. }
// Thin forwarding wrapper: derives IOOptions from the given WriteOptions
// via the shared PrepareIOFromWriteOptions() helper.
IOStatus WritableFileWriter::PrepareIOOptions(const WriteOptions& wo,
                                              IOOptions& opts) {
  return PrepareIOFromWriteOptions(wo, opts);
}
  424. IOStatus WritableFileWriter::Sync(const IOOptions& opts, bool use_fsync) {
  425. if (seen_error()) {
  426. return GetWriterHasPreviousErrorStatus();
  427. }
  428. IOOptions io_options = FinalizeIOOptions(opts);
  429. IOStatus s = Flush(io_options);
  430. if (!s.ok()) {
  431. set_seen_error(s);
  432. return s;
  433. }
  434. TEST_KILL_RANDOM("WritableFileWriter::Sync:0");
  435. if (!use_direct_io() && pending_sync_) {
  436. s = SyncInternal(io_options, use_fsync);
  437. if (!s.ok()) {
  438. set_seen_error(s);
  439. return s;
  440. }
  441. }
  442. TEST_KILL_RANDOM("WritableFileWriter::Sync:1");
  443. pending_sync_ = false;
  444. return IOStatus::OK();
  445. }
// Syncs the file WITHOUT flushing the internal buffer first. Only legal when
// the underlying file's Sync is thread-safe, since this may race with
// concurrent appends. Note pending_sync_ is intentionally left untouched.
IOStatus WritableFileWriter::SyncWithoutFlush(const IOOptions& opts,
                                              bool use_fsync) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  IOOptions io_options = FinalizeIOOptions(opts);
  if (!writable_file_->IsSyncThreadSafe()) {
    return IOStatus::NotSupported(
        "Can't WritableFileWriter::SyncWithoutFlush() because "
        "WritableFile::IsSyncThreadSafe() is false");
  }
  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
  IOStatus s = SyncInternal(io_options, use_fsync);
  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
  if (!s.ok()) {
    set_seen_error(s);
  }
  return s;
}
// Performs the actual Fsync/Sync call on the underlying file, with iostats
// timing, listener notification, and perf-level save/restore. Does NOT latch
// the error itself — callers handle set_seen_error().
IOStatus WritableFileWriter::SyncInternal(const IOOptions& opts,
                                          bool use_fsync) {
  // Caller is supposed to check seen_error_
  IOStatus s;
  IOSTATS_TIMER_GUARD(fsync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
  // The CPU timer guard below may bump the perf level; remember the current
  // one so it can be restored before returning.
  auto prev_perf_level = GetPerfLevel();
  IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
  FileOperationInfo::StartTimePoint start_ts;
  if (ShouldNotifyListeners()) {
    start_ts = FileOperationInfo::StartNow();
  }
  if (use_fsync) {
    s = writable_file_->Fsync(opts, nullptr);
  } else {
    s = writable_file_->Sync(opts, nullptr);
  }
  if (ShouldNotifyListeners()) {
    auto finish_ts = std::chrono::steady_clock::now();
    NotifyOnFileSyncFinish(
        start_ts, finish_ts, s,
        use_fsync ? FileOperationType::kFsync : FileOperationType::kSync);
    if (!s.ok()) {
      NotifyOnIOError(
          s, (use_fsync ? FileOperationType::kFsync : FileOperationType::kSync),
          file_name());
    }
  }
  SetPerfLevel(prev_perf_level);
  // The caller will be responsible to call set_seen_error(s) if s is not OK.
  return s;
}
// Syncs `nbytes` of the file starting at `offset` (used by Flush() to bound
// dirty page cache when bytes_per_sync_ is configured). Errors are latched
// via set_seen_error() before listeners are notified.
IOStatus WritableFileWriter::RangeSync(const IOOptions& opts, uint64_t offset,
                                       uint64_t nbytes) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  IOSTATS_TIMER_GUARD(range_sync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
  FileOperationInfo::StartTimePoint start_ts;
  if (ShouldNotifyListeners()) {
    start_ts = FileOperationInfo::StartNow();
  }
  IOStatus s = writable_file_->RangeSync(offset, nbytes, opts, nullptr);
  if (!s.ok()) {
    set_seen_error(s);
  }
  if (ShouldNotifyListeners()) {
    auto finish_ts = std::chrono::steady_clock::now();
    NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s);
    if (!s.ok()) {
      NotifyOnIOError(s, FileOperationType::kRangeSync, file_name(), nbytes,
                      offset);
    }
  }
  return s;
}
// This method writes to disk the specified data and makes use of the rate
// limiter if available
//
// Writes in rate-limiter-sized chunks via the underlying file's Append().
// On any Append failure buf_ is cleared so a retry/Close cannot re-send
// possibly-written bytes (see the comment inside). On success, buf_ is
// emptied and flushed_size_ is advanced per chunk.
IOStatus WritableFileWriter::WriteBuffered(const IOOptions& opts,
                                           const char* data, size_t size) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  IOStatus s;
  assert(!use_direct_io());
  const char* src = data;
  size_t left = size;
  DataVerificationInfo v_info;
  char checksum_buf[sizeof(uint32_t)];
  Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority;
  while (left > 0) {
    // Check how much the rate limiter allows this round; IO_TOTAL disables
    // rate limiting entirely.
    size_t allowed = left;
    if (rate_limiter_ != nullptr &&
        rate_limiter_priority_used != Env::IO_TOTAL) {
      allowed = rate_limiter_->RequestToken(left, 0 /* alignment */,
                                            rate_limiter_priority_used, stats_,
                                            RateLimiter::OpType::kWrite);
    }
    {
      IOSTATS_TIMER_GUARD(write_nanos);
      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
      FileOperationInfo::StartTimePoint start_ts;
      uint64_t old_size = writable_file_->GetFileSize(opts, nullptr);
      if (ShouldNotifyListeners()) {
        start_ts = FileOperationInfo::StartNow();
        old_size = next_write_offset_;
      }
      {
        auto prev_perf_level = GetPerfLevel();
        IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
        if (perform_data_verification_) {
          // Per-chunk handoff checksum computed over exactly this Append.
          Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf);
          v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
          s = writable_file_->Append(Slice(src, allowed), opts, v_info,
                                     nullptr);
        } else {
          s = writable_file_->Append(Slice(src, allowed), opts, nullptr);
        }
        if (!s.ok()) {
          // If writable_file_->Append() failed, then the data may or may not
          // exist in the underlying memory buffer, OS page cache, remote file
          // system's buffer, etc. If WritableFileWriter keeps the data in
          // buf_, then a future Close() or write retry may send the data to
          // the underlying file again. If the data does exist in the
          // underlying buffer and gets written to the file eventually despite
          // returning error, the file may end up with two duplicate pieces of
          // data. Therefore, clear the buf_ at the WritableFileWriter layer
          // and let caller determine error handling.
          buf_.Size(0);
          buffered_data_crc32c_checksum_ = 0;
        }
        SetPerfLevel(prev_perf_level);
      }
      if (ShouldNotifyListeners()) {
        auto finish_ts = std::chrono::steady_clock::now();
        NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
        if (!s.ok()) {
          NotifyOnIOError(s, FileOperationType::kAppend, file_name(), allowed,
                          old_size);
        }
      }
      if (!s.ok()) {
        set_seen_error(s);
        return s;
      }
    }
    IOSTATS_ADD(bytes_written, allowed);
    TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
    left -= allowed;
    src += allowed;
    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
    flushed_size_.store(cur_size + allowed, std::memory_order_release);
  }
  // All chunks written: the staged data is now fully on its way to the file.
  buf_.Size(0);
  buffered_data_crc32c_checksum_ = 0;
  if (!s.ok()) {
    set_seen_error(s);
  }
  return s;
}
// Like WriteBuffered(), but hands off the precomputed running checksum
// (buffered_data_crc32c_checksum_) with a SINGLE Append of the whole span —
// the data cannot be chunked without invalidating that checksum, so the rate
// limiter is drained up front for the full size.
IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts,
                                                       const char* data,
                                                       size_t size) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  IOStatus s;
  assert(!use_direct_io());
  assert(perform_data_verification_ && buffered_data_with_checksum_);
  const char* src = data;
  size_t left = size;
  DataVerificationInfo v_info;
  char checksum_buf[sizeof(uint32_t)];
  Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority;
  // Check how much is allowed. Here, we loop until the rate limiter allows to
  // write the entire buffer.
  // TODO: need to be improved since it sort of defeats the purpose of the rate
  // limiter
  size_t data_size = left;
  if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
    while (data_size > 0) {
      size_t tmp_size;
      tmp_size =
          rate_limiter_->RequestToken(data_size, 0, rate_limiter_priority_used,
                                      stats_, RateLimiter::OpType::kWrite);
      data_size -= tmp_size;
    }
  }
  {
    IOSTATS_TIMER_GUARD(write_nanos);
    TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
    FileOperationInfo::StartTimePoint start_ts;
    uint64_t old_size = writable_file_->GetFileSize(opts, nullptr);
    if (ShouldNotifyListeners()) {
      start_ts = FileOperationInfo::StartNow();
      old_size = next_write_offset_;
    }
    {
      auto prev_perf_level = GetPerfLevel();
      IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
      // Hand off the running checksum of the whole span in one shot.
      EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
      v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
      s = writable_file_->Append(Slice(src, left), opts, v_info, nullptr);
      SetPerfLevel(prev_perf_level);
    }
    if (ShouldNotifyListeners()) {
      auto finish_ts = std::chrono::steady_clock::now();
      NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s);
      if (!s.ok()) {
        NotifyOnIOError(s, FileOperationType::kAppend, file_name(), left,
                        old_size);
      }
    }
    if (!s.ok()) {
      // If writable_file_->Append() failed, then the data may or may not
      // exist in the underlying memory buffer, OS page cache, remote file
      // system's buffer, etc. If WritableFileWriter keeps the data in
      // buf_, then a future Close() or write retry may send the data to
      // the underlying file again. If the data does exist in the
      // underlying buffer and gets written to the file eventually despite
      // returning error, the file may end up with two duplicate pieces of
      // data. Therefore, clear the buf_ at the WritableFileWriter layer
      // and let caller determine error handling.
      buf_.Size(0);
      buffered_data_crc32c_checksum_ = 0;
      set_seen_error(s);
      return s;
    }
  }
  IOSTATS_ADD(bytes_written, left);
  TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0");
  // Buffer write is successful, reset the buffer current size to 0 and reset
  // the corresponding checksum value
  buf_.Size(0);
  buffered_data_crc32c_checksum_ = 0;
  uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
  flushed_size_.store(cur_size + left, std::memory_order_release);
  if (!s.ok()) {
    set_seen_error(s);
  }
  return s;
}
  688. void WritableFileWriter::UpdateFileChecksum(const Slice& data) {
  689. if (checksum_generator_ != nullptr) {
  690. checksum_generator_->Update(data.data(), data.size());
  691. }
  692. }
  693. // Currently, crc32c checksum is used to calculate the checksum value of the
  694. // content in the input buffer for handoff. In the future, the checksum might be
  695. // calculated from the existing crc32c checksums of the in WAl and Manifest
  696. // records, or even SST file blocks.
  697. // TODO: effectively use the existing checksum of the data being writing to
  698. // generate the crc32c checksum instead of a raw calculation.
  699. void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data,
  700. size_t size,
  701. char* buf) {
  702. uint32_t v_crc32c = crc32c::Extend(0, data, size);
  703. EncodeFixed32(buf, v_crc32c);
  704. }
// This flushes the accumulated data in the buffer. We pad data with zeros if
// necessary to the whole page.
// However, during automatic flushes padding would not be necessary.
// We always use RateLimiter if available. We move (Refit) any buffer bytes
// that are left over the whole number of pages to be written again on the
// next flush because we can only write on aligned offsets.
IOStatus WritableFileWriter::WriteDirect(const IOOptions& opts) {
  if (seen_error()) {
    // Misuse: callers must not keep writing after a prior failure. Loud in
    // debug builds; release builds just return an error status.
    assert(false);
    return IOStatus::IOError("Writer has previous error.");
  }
  assert(use_direct_io());
  IOStatus s;
  const size_t alignment = buf_.Alignment();
  assert((next_write_offset_ % alignment) == 0);

  // Calculate whole page final file advance if all writes succeed
  const size_t file_advance =
      TruncateToPageBoundary(alignment, buf_.CurrentSize());

  // Calculate the leftover tail, we write it here padded with zeros BUT we
  // will write it again in the future either on Close() OR when the current
  // whole page fills out.
  const size_t leftover_tail = buf_.CurrentSize() - file_advance;

  // Round up and pad
  buf_.PadToAlignmentWith(0);

  const char* src = buf_.BufferStart();
  uint64_t write_offset = next_write_offset_;
  size_t left = buf_.CurrentSize();
  DataVerificationInfo v_info;
  char checksum_buf[sizeof(uint32_t)];
  Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority;

  while (left > 0) {
    // Check how much is allowed; the rate limiter may grant less than `left`,
    // in which case we write in chunks and loop.
    size_t size = left;
    if (rate_limiter_ != nullptr &&
        rate_limiter_priority_used != Env::IO_TOTAL) {
      size = rate_limiter_->RequestToken(left, buf_.Alignment(),
                                         rate_limiter_priority_used, stats_,
                                         RateLimiter::OpType::kWrite);
    }

    {
      IOSTATS_TIMER_GUARD(write_nanos);
      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
      FileOperationInfo::StartTimePoint start_ts;
      if (ShouldNotifyListeners()) {
        start_ts = FileOperationInfo::StartNow();
      }
      // direct writes must be positional
      if (perform_data_verification_) {
        // Hand off a checksum of exactly this chunk so the FS can verify it.
        Crc32cHandoffChecksumCalculation(src, size, checksum_buf);
        v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
        s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
                                             opts, v_info, nullptr);
      } else {
        s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
                                             opts, nullptr);
      }

      if (ShouldNotifyListeners()) {
        auto finish_ts = std::chrono::steady_clock::now();
        NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
        if (!s.ok()) {
          NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
                          size, write_offset);
        }
      }
      if (!s.ok()) {
        // Undo the alignment padding so the buffer again holds only real
        // data; the unwritten bytes stay buffered.
        buf_.Size(file_advance + leftover_tail);
        set_seen_error(s);
        return s;
      }
    }

    IOSTATS_ADD(bytes_written, size);
    left -= size;
    src += size;
    write_offset += size;
    // Load+store rather than an atomic RMW: presumably a single writer
    // updates flushed_size_ while other threads only read it — TODO confirm.
    uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
    flushed_size_.store(cur_size + size, std::memory_order_release);
    assert((next_write_offset_ % alignment) == 0);
  }

  if (s.ok()) {
    // Move the tail to the beginning of the buffer
    // This never happens during normal Append but rather during
    // explicit call to Flush()/Sync() or Close()
    buf_.RefitTail(file_advance, leftover_tail);
    // This is where we start writing next time which may or not be
    // the actual file size on disk. They match if the buffer size
    // is a multiple of whole pages otherwise filesize_ is leftover_tail
    // behind
    next_write_offset_ += file_advance;
  } else {
    set_seen_error(s);
  }
  return s;
}
// Direct-I/O flush variant used when a running crc32c of the buffered data
// (buffered_data_crc32c_checksum_) is maintained: the whole buffer is written
// with a single PositionedAppend, handing off the combined checksum.
IOStatus WritableFileWriter::WriteDirectWithChecksum(const IOOptions& opts) {
  if (seen_error()) {
    return GetWriterHasPreviousErrorStatus();
  }
  assert(use_direct_io());
  assert(perform_data_verification_ && buffered_data_with_checksum_);
  IOStatus s;
  const size_t alignment = buf_.Alignment();
  assert((next_write_offset_ % alignment) == 0);

  // Calculate whole page final file advance if all writes succeed
  const size_t file_advance =
      TruncateToPageBoundary(alignment, buf_.CurrentSize());
  // Calculate the leftover tail, we write it here padded with zeros BUT we
  // will write it again in the future either on Close() OR when the current
  // whole page fills out.
  const size_t leftover_tail = buf_.CurrentSize() - file_advance;

  // Round up, pad, and combine the checksum: extend the running buffer
  // checksum over the zero padding so it covers exactly the bytes written.
  size_t last_cur_size = buf_.CurrentSize();
  buf_.PadToAlignmentWith(0);
  size_t padded_size = buf_.CurrentSize() - last_cur_size;
  const char* padded_start = buf_.BufferStart() + last_cur_size;
  uint32_t padded_checksum = crc32c::Value(padded_start, padded_size);
  buffered_data_crc32c_checksum_ = crc32c::Crc32cCombine(
      buffered_data_crc32c_checksum_, padded_checksum, padded_size);

  const char* src = buf_.BufferStart();
  uint64_t write_offset = next_write_offset_;
  size_t left = buf_.CurrentSize();
  DataVerificationInfo v_info;
  char checksum_buf[sizeof(uint32_t)];
  Env::IOPriority rate_limiter_priority_used = opts.rate_limiter_priority;

  // Check how much is allowed. Here, we loop until the rate limiter allows to
  // write the entire buffer.
  // TODO: need to be improved since it sort of defeats the purpose of the rate
  // limiter
  size_t data_size = left;
  if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
    while (data_size > 0) {
      size_t size;
      size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
                                         rate_limiter_priority_used, stats_,
                                         RateLimiter::OpType::kWrite);
      data_size -= size;
    }
  }

  {
    IOSTATS_TIMER_GUARD(write_nanos);
    TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
    FileOperationInfo::StartTimePoint start_ts;
    if (ShouldNotifyListeners()) {
      start_ts = FileOperationInfo::StartNow();
    }
    // direct writes must be positional
    EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
    v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
    s = writable_file_->PositionedAppend(Slice(src, left), write_offset, opts,
                                         v_info, nullptr);

    if (ShouldNotifyListeners()) {
      auto finish_ts = std::chrono::steady_clock::now();
      NotifyOnFileWriteFinish(write_offset, left, start_ts, finish_ts, s);
      if (!s.ok()) {
        NotifyOnIOError(s, FileOperationType::kPositionedAppend, file_name(),
                        left, write_offset);
      }
    }
    if (!s.ok()) {
      // Shrink the buffer back to the unpadded data and recompute the running
      // checksum so it again matches exactly the bytes left in the buffer.
      buf_.Size(file_advance + leftover_tail);
      buffered_data_crc32c_checksum_ =
          crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
      set_seen_error(s);
      return s;
    }
  }

  IOSTATS_ADD(bytes_written, left);
  assert((next_write_offset_ % alignment) == 0);
  // Load+store rather than an atomic RMW: presumably a single writer updates
  // flushed_size_ while other threads only read it — TODO confirm.
  uint64_t cur_size = flushed_size_.load(std::memory_order_acquire);
  flushed_size_.store(cur_size + left, std::memory_order_release);

  if (s.ok()) {
    // Move the tail to the beginning of the buffer
    // This never happens during normal Append but rather during
    // explicit call to Flush()/Sync() or Close(). Also the buffer checksum
    // will be recalculated accordingly.
    buf_.RefitTail(file_advance, leftover_tail);
    // Adjust the checksum value to align with the data in the buffer
    buffered_data_crc32c_checksum_ =
        crc32c::Value(buf_.BufferStart(), buf_.CurrentSize());
    // This is where we start writing next time which may or not be
    // the actual file size on disk. They match if the buffer size
    // is a multiple of whole pages otherwise filesize_ is leftover_tail
    // behind
    next_write_offset_ += file_advance;
  } else {
    set_seen_error(s);
  }
  return s;
}
  897. Env::IOPriority WritableFileWriter::DecideRateLimiterPriority(
  898. Env::IOPriority writable_file_io_priority,
  899. Env::IOPriority op_rate_limiter_priority) {
  900. if (writable_file_io_priority == Env::IO_TOTAL &&
  901. op_rate_limiter_priority == Env::IO_TOTAL) {
  902. return Env::IO_TOTAL;
  903. } else if (writable_file_io_priority == Env::IO_TOTAL) {
  904. return op_rate_limiter_priority;
  905. } else if (op_rate_limiter_priority == Env::IO_TOTAL) {
  906. return writable_file_io_priority;
  907. } else {
  908. return op_rate_limiter_priority;
  909. }
  910. }
  911. IOOptions WritableFileWriter::FinalizeIOOptions(const IOOptions& opts) const {
  912. Env::IOPriority op_rate_limiter_priority = opts.rate_limiter_priority;
  913. IOOptions io_options(opts);
  914. if (writable_file_.get() != nullptr) {
  915. io_options.rate_limiter_priority =
  916. WritableFileWriter::DecideRateLimiterPriority(
  917. writable_file_->GetIOPriority(), op_rate_limiter_priority);
  918. }
  919. return io_options;
  920. }
  921. } // namespace ROCKSDB_NAMESPACE